# Scraping UFC Stats

This Python script scrapes data about UFC (Ultimate Fighting Championship) fights from the website "bestfightodds.com". It uses the Pandas, BeautifulSoup, and Selenium libraries for data manipulation, HTML parsing, and simulated browser interaction, respectively.

The program operates in this way:

1. It begins by importing the necessary libraries for the tasks and setting the working directory.

2. It reads event data from an existing CSV file ("Final_Hand_Done_BFO_Urls_with UFC_Stats_Urls.csv") located in the set directory.

3. The script pulls event URLs from the loaded file and prepares a list of all unique URLs.

4. The program verifies these URLs against the completed UFC events on the UFC stats website. If it finds new events that haven't been accounted for in the initial CSV, it identifies these events, and it fetches their corresponding URLs from "bestfightodds.com" using a Google search.

5. If it encounters issues while fetching these URLs, the program flags the errors and, depending on the problem, may require human intervention to resolve them.

6. For each event, the program downloads odds information by sending GET requests to the corresponding URL. The script uses user-agent headers with these requests to prevent them from being blocked. The scraped data is then processed and added to a Pandas DataFrame for data manipulation.

7. After gathering data for all the URLs, the bot attempts to download all the odds change data. It accomplishes this by using the Selenium WebDriver to imitate browser interactions for each URL.

8. After collecting all data, the script summarizes the data acquisition process. It saves all the gathered data into CSV files for subsequent analysis.

It is also important to note that the script is written to handle errors and exceptions. If an operation fails at any point, it captures the essential failure information and proceeds with the next iteration. This handling lets the user debug the issue afterwards and makes the script more robust and reliable for future runs.

In [3]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
import matplotlib.ticker as mtick
import sqlite3
import seaborn as sns
from matplotlib.pyplot import figure
from bs4 import BeautifulSoup
import requests     
import shutil       
import datetime
from scipy.stats import norm
import warnings
warnings.filterwarnings('ignore')
import requests
import json
from random import randint
import  random
import os
os.chdir('/Users/travisroyce/Library/CloudStorage/OneDrive-Personal/Data Science/Personal_Projects/Sports/UFC_Prediction_V2')
from cmath import nan
from selenium import webdriver
from selenium.webdriver.common.keys import Keys
from bs4 import BeautifulSoup
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from sklearn.pipeline import Pipeline, make_pipeline, FeatureUnion
import pickle
from sklearn.metrics import fbeta_score
from bs4 import BeautifulSoup
import time

### Initial Scraping Functions

In [4]:
# Define function with url parameter
def get_event_date_from_ufcstats(url):
    """Return the date text found on the page at ``url``.

    The page is downloaded, the first ``<li>`` with class
    ``b-list__box-list-item`` is located, and everything after its first
    colon is returned with surrounding whitespace removed.

    Nothing is caught here: a failed request or a missing element raises.
    NOTE: a later cell defines this name again with error handling; that
    later definition is the one in effect once both cells have run.
    """
    response = requests.get(url)
    parsed = BeautifulSoup(response.content, 'html.parser')

    # Flatten the list item's text onto a single line.
    raw_text = parsed.find('li', class_='b-list__box-list-item').text
    flattened = raw_text.replace('\n', '')

    # Skip the colon itself plus the one character that follows it.
    value_start = flattened.find(':') + 2
    return flattened[value_start:].strip()

In [5]:
def get_details_from_ufcstats(url):
    """Return the text that follows 'Details:' on the page at ``url``.

    The text is taken from the first ``<div>`` with class
    ``b-fight-details__content``. Newlines are removed and the result is
    stripped of leading/trailing whitespace. Request or lookup failures
    propagate to the caller.
    """
    response = requests.get(url)
    parsed = BeautifulSoup(response.content, 'html.parser')

    content_text = parsed.find('div', class_='b-fight-details__content').text
    one_line = content_text.replace('\n', '')

    # Start right after the 8-character 'Details:' label.
    label = 'Details:'
    start = one_line.find(label) + len(label)
    return one_line[start:].strip()

In [6]:
# Smoke test: fetch the details text for one fight page (performs a live HTTP request).
get_details_from_ufcstats('http://www.ufcstats.com/fight-details/7e8fd03e070d0c25')

'Guillotine Choke From Bottom Guard'

In [7]:
def get_event_title_from_ufcstats(url):
    """Return the event title shown on the page at ``url``.

    The title is the text of the first ``<h2>`` with class
    ``b-content__title``, with newlines removed and surrounding whitespace
    stripped. Request or lookup failures propagate to the caller.
    """
    response = requests.get(url)
    parsed = BeautifulSoup(response.content, 'html.parser')
    heading = parsed.find('h2', class_='b-content__title')
    return heading.text.replace('\n', '').strip()

In [8]:
# Smoke test: read the event title from one fight page (performs a live HTTP request).
get_event_title_from_ufcstats('http://www.ufcstats.com/fight-details/7e8fd03e070d0c25')

'UFC 279: Diaz vs. Ferguson'

In [9]:
def get_event_url_from_ufcstats(url):
    """Return the href of the link inside the title heading of ``url``.

    The heading is the first ``<h2>`` with class ``b-content__title``; the
    first ``<a>`` inside it supplies the returned ``href``. Request or
    lookup failures propagate to the caller.
    """
    response = requests.get(url)
    parsed = BeautifulSoup(response.content, 'html.parser')

    # heading -> first anchor inside it -> its href attribute
    heading = parsed.find('h2', class_='b-content__title')
    return heading.find('a')['href']

In [10]:
# Smoke test: resolve the event page URL from one fight page (performs a live HTTP request).
get_event_url_from_ufcstats('http://www.ufcstats.com/fight-details/7e8fd03e070d0c25')

'http://www.ufcstats.com/event-details/93bf96be327fcd98'

In [11]:
def get_fight_urls(urls):
    """Collect fight-detail links from one or more event pages.

    Parameters
    ----------
    urls : iterable of str
        Event page URLs to scan.

    Returns
    -------
    pandas.DataFrame or str
        One row per anchor whose ``href`` contains ``'fight-details'``, with
        the columns ``'Fight_url'``, ``'Event_title'`` and ``'Date'``. If
        fetching or parsing any page fails, the string ``'No Fighties'`` is
        returned instead and the rows gathered so far are discarded; this
        sentinel is kept for existing callers.
    """
    columns = ['Fight_url', 'Event_title', 'Date']

    # Rows are accumulated in a plain list and turned into a frame once:
    # DataFrame.append no longer exists in pandas 2.x, and appending row by
    # row was quadratic anyway.
    rows = []
    for event_url in urls:
        try:
            response = requests.get(event_url)
            soup = BeautifulSoup(response.text, 'html.parser')

            title = soup.find('h2', class_='b-content__title').text.strip()

            # Keep what follows the 'Date:' label (5 chars + 1 separator).
            date_text = soup.find('li', class_='b-list__box-list-item').text.strip()
            date = date_text[date_text.find('Date:') + 6:].strip()

            # href=True skips anchors without an href, which used to raise
            # a TypeError and throw away the whole result.
            for anchor in soup.find_all('a', href=True):
                href = anchor['href']
                if 'fight-details' in href:
                    rows.append({
                        'Fight_url': href,
                        'Event_title': title,
                        'Date': date
                    })
        except Exception:
            # Exception (not a bare except) so Ctrl-C still interrupts.
            return 'No Fighties'

    return pd.DataFrame(rows, columns=columns)


In [12]:
# Smoke test for get_fight_urls: list the fight links of a single event page
# (performs a live HTTP request).
get_fight_urls(['http://www.ufcstats.com/event-details/93bf96be327fcd98'])

Unnamed: 0,Fight_url,Event_title,Date
0,http://www.ufcstats.com/fight-details/7e8fd03e...,UFC 279: Diaz vs. Ferguson,"September 10, 2022"
1,http://www.ufcstats.com/fight-details/0111e6a6...,UFC 279: Diaz vs. Ferguson,"September 10, 2022"
2,http://www.ufcstats.com/fight-details/d3e23d7d...,UFC 279: Diaz vs. Ferguson,"September 10, 2022"
3,http://www.ufcstats.com/fight-details/989b8d36...,UFC 279: Diaz vs. Ferguson,"September 10, 2022"
4,http://www.ufcstats.com/fight-details/d869eaea...,UFC 279: Diaz vs. Ferguson,"September 10, 2022"
5,http://www.ufcstats.com/fight-details/0e6d6240...,UFC 279: Diaz vs. Ferguson,"September 10, 2022"
6,http://www.ufcstats.com/fight-details/301f230c...,UFC 279: Diaz vs. Ferguson,"September 10, 2022"
7,http://www.ufcstats.com/fight-details/77acb967...,UFC 279: Diaz vs. Ferguson,"September 10, 2022"
8,http://www.ufcstats.com/fight-details/bf334cb4...,UFC 279: Diaz vs. Ferguson,"September 10, 2022"
9,http://www.ufcstats.com/fight-details/db3e26d0...,UFC 279: Diaz vs. Ferguson,"September 10, 2022"


In [13]:
def get_winner_from_ufcstats(url):
    """Return the name of the fighter marked with 'W' on a fight page.

    The first two ``<div>`` elements with class ``b-fight-details__person``
    are read; triple spaces and newlines are removed from their text.

    Parameters
    ----------
    url : str
        Fight-details page URL.

    Returns
    -------
    str or float
        The text after the first three characters of the first person block
        that starts with ``'W'``. NaN is returned when neither block starts
        with ``'W'`` or when anything goes wrong while fetching/parsing.
    """
    try:
        page = requests.get(url)
        soup = BeautifulSoup(page.content, 'html.parser')
        people = soup.find_all('div', class_='b-fight-details__person')

        # Fewer than two blocks raises here and is reported as NaN below.
        first, second = (
            person.text.replace('   ', '').replace('\n', '').strip()
            for person in people[:2]
        )

        # Previously the second fighter was returned whenever the first was
        # not marked 'W', even if the second was not marked 'W' either.
        for text in (first, second):
            if text.startswith('W'):
                # presumably the marker plus separator span three characters
                # before the name; verify against the page markup
                return text[3:]
        return nan

    except Exception:
        # Exception (not a bare except) so Ctrl-C still interrupts.
        return nan

In [14]:
# Smoke test: look up the winner of one fight (performs a live HTTP request).
get_winner_from_ufcstats('http://www.ufcstats.com/fight-details/7e8fd03e070d0c25')

'Nate Diaz'

In [15]:
def get_event_date_from_ufcstats(url):
    """Return the date text found on the page at ``url``, or NaN on failure.

    The first ``<li>`` with class ``b-list__box-list-item`` is located and
    everything after its first colon is returned, with newlines removed and
    surrounding whitespace stripped.

    Parameters
    ----------
    url : str
        Page URL to download.

    Returns
    -------
    str or float
        The extracted text, or NaN if the request or the parsing fails.
    """
    try:
        page = requests.get(url)
        soup = BeautifulSoup(page.content, 'html.parser')

        text = soup.find('li', class_='b-list__box-list-item').text
        text = text.replace('\n', '')

        # Skip the colon itself plus the one character that follows it.
        start = text.find(':') + 2
        return text[start:].strip()
    except Exception:
        # Exception (not a bare except) so KeyboardInterrupt/SystemExit
        # are no longer swallowed during long scraping loops.
        return nan

In [16]:
def get_event_details(event_id):
    """Scrape the fight table of one event and save it as a CSV file.

    Parameters
    ----------
    event_id : str
        Identifier appended to ``http://www.ufcstats.com/event-details/``.

    Returns
    -------
    pandas.DataFrame
        The first HTML table of the event page, with the combined
        two-fighter columns split into per-fighter columns plus
        ``fight_num``, ``event_id`` and ``fight_link``.

    Side effects
    ------------
    Writes the frame to ``data/ufc_stats/events2/<event_id>.csv`` (relative
    to the current working directory). The page is requested twice: once by
    ``pd.read_html`` and once to collect the links.

    Raises
    ------
    ValueError
        If the number of distinct fight links differs from the number of
        table rows. Request and parsing errors also propagate.
    """
    event_url = 'http://www.ufcstats.com/event-details/' + event_id

    df = pd.read_html(event_url)[0]

    # Each of these cells holds both fighters' values separated by two
    # spaces; split them into one column per fighter.
    paired_columns = [
        ('Fighter', 'Fighter1', 'Fighter2'),
        ('Kd', 'F1_Kd', 'F2_Kd'),
        ('Str', 'F1_Str', 'F2_Str'),
        ('Td', 'F1_Td', 'F2_Td'),
        ('Sub', 'F1_Sub', 'F2_Sub'),
    ]
    for source, first_col, second_col in paired_columns:
        halves = df[source].str.split('  ')
        df[first_col] = halves.str[0]
        df[second_col] = halves.str[1]

    # 1-based position of the fight within the table
    df['fight_num'] = df.index + 1
    df['event_id'] = event_id

    # Collect fight links. href=True skips anchors without an href, which
    # would otherwise raise KeyError.
    page = requests.get(event_url)
    soup = BeautifulSoup(page.content, 'html.parser')
    hrefs = [anchor['href'] for anchor in soup.find_all('a', href=True)]
    # dict.fromkeys drops duplicates while keeping first-seen order.
    fight_links = list(dict.fromkeys(h for h in hrefs if 'fight-details' in h))

    # Relies on the links appearing in the same order as the table rows.
    df['fight_link'] = fight_links

    df = df.drop([source for source, _, _ in paired_columns], axis=1)
    df.to_csv('data/ufc_stats/events2/' + event_id + '.csv')
    return df

In [17]:
# Smoke test: scrape one event and preview its first three rows.
# Note that get_event_details also writes the event's CSV file as a side effect.
test_1 = get_event_details('a23e63184c65f5b8').head(3)
test_1

Unnamed: 0,W/L,Weight class,Method,Round,Time,Fighter1,Fighter2,F1_Kd,F2_Kd,F1_Str,F2_Str,F1_Td,F2_Td,F1_Sub,F2_Sub,fight_num,event_id,fight_link
0,win,Bantamweight,KO/TKO Kick,4,2:17,Marlon Vera,Dominick Cruz,3,0,61,92,0,2,0,0,1,a23e63184c65f5b8,http://www.ufcstats.com/fight-details/11cc331f...
1,win,Featherweight,M-DEC,3,5:00,Nate Landwehr,David Onama,0,1,91,71,3,1,2,0,2,a23e63184c65f5b8,http://www.ufcstats.com/fight-details/efd7efba...
2,win,Women's Strawweight,U-DEC,3,5:00,Yazmin Jauregui,Iasmin Lucindo,0,0,86,66,0,0,0,0,3,a23e63184c65f5b8,http://www.ufcstats.com/fight-details/833d67f4...


In [18]:
def _split_fighter_pair(frame, source, first_col, second_col):
    """Split a two-fighter cell on its double-space separator into two columns."""
    halves = frame[source].str.split('  ')
    frame[first_col] = halves.str[0]
    frame[second_col] = halves.str[1]


def _add_landed_attempted(frame, stem):
    """Turn the 'A_<stem>' / 'B_<stem>' "x of y" text into numeric columns.

    Adds, in this order, A_<stem>_land, A_<stem>_att, B_<stem>_land and
    B_<stem>_att. Placeholder '---', NaN and infinities all become 0.
    """
    targets = []
    for side in ('A', 'B'):
        pieces = frame[side + '_' + stem].str.split(' of ')
        for suffix, position in (('land', 0), ('att', 1)):
            target = side + '_' + stem + '_' + suffix
            frame[target] = pieces.str[position]
            targets.append(target)

    for target in targets:
        # A dict is required here; the original passed the set {'---', 0},
        # which never performed the intended replacement.
        cleaned = frame[target].replace({'---': 0})
        numeric = pd.to_numeric(cleaned)
        frame[target] = numeric.replace([np.inf, -np.inf, np.nan], 0)


def get_fight_totals(fight_details_url):
    """Scrape the totals table of one fight page and save it as CSV.

    Parameters
    ----------
    fight_details_url : str
        URL of a fight-details page; the part after ``'fight-details/'`` is
        used as the fight id in the output file name.

    Returns
    -------
    pandas.DataFrame
        The first HTML table of the page with per-fighter (``A_`` / ``B_``)
        columns for knockdowns, strikes, takedowns, submission attempts,
        reversals and control time, plus ``details``, ``event_title``,
        ``event_url``, ``date`` and ``Winner``.

    Side effects
    ------------
    Writes ``data/ufc_stats/fight_totals3/<fightid>_totals.csv`` (relative
    to the current working directory) and performs several HTTP requests
    (``pd.read_html`` plus the ``get_*_from_ufcstats`` helpers).

    Notes
    -----
    The ``*_percent`` / ``*_Percent`` ratio columns are plain divisions, so a
    zero number of attempts yields NaN or inf, exactly as before.
    """
    fightid = fight_details_url[fight_details_url.find('fight-details/') + 14:]
    totals = pd.read_html(fight_details_url)[0]

    # Cells that consist solely of a placeholder are mapped to 0. Replace
    # matches whole values only, so "--  --" style pairs are left alone.
    for col in totals.columns:
        totals[col] = totals[col].astype(str).replace({'---': 0})
        totals[col] = totals[col].astype(str).replace({'--': 0})

    # NOTE(review): takes tokens 0-1 and 2-3 of a single-space split, i.e.
    # assumes each name is exactly two words — confirm against real pages.
    name_tokens = totals['Fighter'].str.split(' ')
    totals['Fighter_A'] = name_tokens.str[0] + ' ' + name_tokens.str[1]
    totals['Fighter_B'] = name_tokens.str[2] + ' ' + name_tokens.str[3]

    _split_fighter_pair(totals, 'KD', 'A_Kd', 'B_Kd')

    # Significant strikes: "landed of attempted" plus the reported percentage
    _split_fighter_pair(totals, 'Sig. str.', 'A_Sig_strike', 'B_Sig_strike')
    _add_landed_attempted(totals, 'Sig_strike')

    _split_fighter_pair(totals, 'Sig. str. %',
                        'A_Sig_strike_percent', 'B_Sig_strike_percent')
    for col in ('A_Sig_strike_percent', 'B_Sig_strike_percent'):
        without_sign = totals[col].str.replace('%', '')
        without_sign = without_sign.astype(str).replace({'---': 0})
        totals[col] = pd.to_numeric(without_sign) / 100

    # Total strikes
    _split_fighter_pair(totals, 'Total str.', 'A_Total_Strikes', 'B_Total_Strikes')
    _add_landed_attempted(totals, 'Total_Strikes')
    totals['A_Total_Strikes_percent'] = totals['A_Total_Strikes_land'] / totals['A_Total_Strikes_att']
    totals['B_Total_Strikes_percent'] = totals['B_Total_Strikes_land'] / totals['B_Total_Strikes_att']

    # Takedowns
    _split_fighter_pair(totals, 'Td', 'A_Takedowns', 'B_Takedowns')
    _add_landed_attempted(totals, 'Takedowns')
    totals['A_Takedown_percent'] = totals['A_Takedowns_land'] / totals['A_Takedowns_att']
    totals['B_Takedown_percent'] = totals['B_Takedowns_land'] / totals['B_Takedowns_att']

    # Submission attempts (split the same way as the strike columns)
    _split_fighter_pair(totals, 'Sub. att', 'A_Sub_Attempts', 'B_Sub_Attempts')
    _add_landed_attempted(totals, 'Sub_Attempts')
    totals['A_Sub_Success_Percent'] = totals['A_Sub_Attempts_land'] / totals['A_Sub_Attempts_att']
    totals['B_Sub_Success_Percent'] = totals['B_Sub_Attempts_land'] / totals['B_Sub_Attempts_att']

    # Reversals
    _split_fighter_pair(totals, 'Rev.', 'A_Rev', 'B_Rev')
    totals['A_Rev'] = pd.to_numeric(totals['A_Rev'])
    totals['B_Rev'] = pd.to_numeric(totals['B_Rev'])

    # Control time: "m:ss" -> minutes, seconds and total seconds
    _split_fighter_pair(totals, 'Ctrl', 'A_Ctrl_time', 'B_Ctrl_time')
    for side in ('A', 'B'):
        clock = totals[side + '_Ctrl_time'].str.split(':')
        totals[side + '_Ctrl_time_min'] = clock.str[0].replace({'--': 0})
        totals[side + '_Ctrl_time_sec'] = clock.str[1].replace({'--': 0})
    for side in ('A', 'B'):
        totals[side + '_Ctrl_time_tot'] = (
            pd.to_numeric(totals[side + '_Ctrl_time_min']) * 60
            + pd.to_numeric(totals[side + '_Ctrl_time_sec'])
        )

    # Event-level context scraped with the BeautifulSoup helpers
    totals['details'] = get_details_from_ufcstats(fight_details_url)
    totals['event_title'] = get_event_title_from_ufcstats(fight_details_url)
    event_url = get_event_url_from_ufcstats(fight_details_url)
    totals['event_url'] = event_url
    totals['date'] = get_event_date_from_ufcstats(event_url)

    try:
        totals['Winner'] = get_winner_from_ufcstats(fight_details_url)
    except Exception:
        totals['Winner'] = np.nan

    # Drop the raw combined columns and the intermediate text columns
    bad_cols = ['Fighter', 'KD', 'Sig. str. %', 'Sig. str.', 'Total str.', 'Td', 'Td %', 'Sub. att',
                'Rev.', 'Ctrl', 'A_Sig_strike', 'B_Sig_strike', 'A_Takedowns', 'B_Takedowns',
                'A_Sub_Attempts', 'B_Sub_Attempts', 'A_Ctrl_time', 'B_Ctrl_time']
    totals.drop(columns=bad_cols, inplace=True)

    # Save
    totals.to_csv('data/ufc_stats/fight_totals3/' + fightid + '_totals.csv')

    return totals
    


In [19]:
# More Concise Version:
# def get_fight_totals(fight_details_url):

#     # function returns the fight totals for a given fightid
#     fightid = fight_details_url[fight_details_url.find('fight-details/')+14:]
#     df = pd.read_html(fight_details_url)

#     # Part 1: Total Fight Stats
#     totals = df[0]
#     # replace '---' and '--' with 0
#     totals = totals.replace(['---', '--'], 0)

#     # Creating new columns with fighters A and B information
#     for col in ['Fighter', 'KD', 'Sig. str.', 'Total str.', 'Td', 'Sub. att', 'Rev.', 'Ctrl']:
#         col_split = totals[col].str.split('  ').apply(pd.Series)
#         totals[f'Fighter_A_{col}'], totals[f'Fighter_B_{col}'] = col_split[0], col_split[1]

#     totals['Fighter_A'], totals['Fighter_B'] = totals['Fighter_A_Fighter'].str.split(' ').str[:2].str.join(' '), totals['Fighter_B_Fighter'].str.split(' ').str[:2].str.join(' ')

#     # Drop original columns
#     bad_cols = ['Fighter', 'KD', 'Sig. str.', 'Total str.', 'Td', 'Sub. att', 'Rev.', 'Ctrl']
#     totals.drop(columns=bad_cols, inplace=True)

#     # Create helper function to split columns with ' of ' and calculate percentage
#     def split_and_calculate_percentage(data, column):
#         data[f'{column}_land'], data[f'{column}_att'] = data[column].str.split(' of ').str
#         data[f'{column}_land'], data[f'{column}_att'] = pd.to_numeric(data[f'{column}_land']), pd.to_numeric(data[f'{column}_att'])
#         data[f'{column}_percent'] = data[f'{column}_land'] / data[f'{column}_att']
#         data[f'{column}_percent'] = data[f'{column}_percent'].replace([np.inf, -np.inf, np.nan], 0)

#     # Calculate percentage for Sig. str., Total str., Td and Sub. att
#     for col in ['Fighter_A_Sig. str.', 'Fighter_B_Sig. str.', 'Fighter_A_Total str.', 'Fighter_B_Total str.', 'Fighter_A_Td', 'Fighter_B_Td', 'Fighter_A_Sub. att', 'Fighter_B_Sub. att']:
#         split_and_calculate_percentage(totals, col)

#     # Calculate Control Time in seconds
#     for fighter in ['Fighter_A', 'Fighter_B']:
#         totals[f'{fighter}_Ctrl_min'], totals[f'{fighter}_Ctrl_sec'] = totals[f'{fighter}_Ctrl'].str.split(':').str
#         totals[f'{fighter}_Ctrl_tot'] = pd.to_numeric(totals[f'{fighter}_Ctrl_min']) * 60 + pd.to_numeric(totals[f'{fighter}_Ctrl_sec'])

#     # Save fight details to CSV
#     fight_info_cols = ['details', 'event_title', 'event_url', 'date']
#     for col in fight_info_cols:
#         totals[col] = totals['Fighter_A_Fighter'].apply(lambda _: '')

#     totals['details'] = get_details_from_ufcstats(fight_details_url)
#     totals['event_title'] = get_event_title_from_ufcstats(fight_details_url)
#     totals['event_url'] = get_event_url_from_ufcstats(fight_details_url)
#     totals['date'] = get_event_date_from_ufcstats(totals['event_url'])

#     try:
#         totals['Winner'] = get_winner_from_ufcstats(fight_details_url)
#     except Exception:
#         totals['Winner'] = np.nan

#     totals.to_csv('data/ufc_stats/fight_totals3/' + fightid + '_totals.csv')

#     return totals

In [20]:
# Smoke test: scrape the totals of one fight (live HTTP requests; get_fight_totals
# also writes the fight's CSV file as a side effect).
thomp_holland= get_fight_totals('http://www.ufcstats.com/fight-details/a419e6df765aa755')

In [21]:
# Columns of the scraped table that hold both fighters' "X of Y" strike counts.
# Each one produces six output columns: A/B x (land, att, percent).
_SIG_STRIKE_CATEGORIES = ('Head', 'Body', 'Leg', 'Distance', 'Clinch', 'Ground')


def _split_landed_attempted(counts):
    """Split a Series of 'X of Y' strings into numeric (landed, attempted) Series.

    NaN and +inf are replaced with 0 in both results.  The cleaned Series are
    returned (and later assigned) instead of being modified with
    ``ss[col].replace(..., inplace=True)``: that chained in-place form acts on
    a temporary object and does not update the DataFrame when pandas
    Copy-on-Write is active.
    """
    pieces = counts.str.split(' of ')
    landed = pd.to_numeric(pieces.str[0]).replace({np.nan: 0, np.inf: 0})
    attempted = pd.to_numeric(pieces.str[1]).replace({np.nan: 0, np.inf: 0})
    return landed, attempted


def get_significant_strikes(url):
    """Scrape the significant-strikes table of a fight page and save it as CSV.

    Reads every HTML table found at ``url`` with ``pd.read_html`` and uses the
    third one (index 2).  For each of Head, Body, Leg, Distance, Clinch and
    Ground it adds ``{A|B}_<Category>_Strikes_land``, ``..._att`` and
    ``..._percent`` columns, adds fight/event metadata obtained from the
    ``get_*_from_ufcstats`` helpers, drops the raw scraped columns and writes
    the result to ``data/ufc_stats/sig_strikes3/<fightid>_sigstrikes.csv``.

    Parameters
    ----------
    url : str
        Fight-details URL; the text after ``'fight-details/'`` is used as the
        fight id in the output file name.

    Returns
    -------
    pandas.DataFrame
        The processed table (the same object that was written to disk).
    """
    tables = pd.read_html(url)
    # Part 2: Significant Strikes
    ss = tables[2]

    # NOTE(review): this assumes each fighter's name is exactly two
    # space-separated words; names with more or fewer words would be split
    # incorrectly. Kept as in the original - confirm against real pages.
    name_parts = ss['Fighter'].str.split(' ')
    ss['Fighter_A'] = name_parts.str[0] + ' ' + name_parts.str[1]
    ss['Fighter_B'] = name_parts.str[2] + ' ' + name_parts.str[3]

    for category in _SIG_STRIKE_CATEGORIES:
        # Each cell holds both fighters' counts separated by two spaces,
        # e.g. "<A landed> of <A attempted>  <B landed> of <B attempted>".
        both_fighters = ss[category].str.split('  ')
        a_land, a_att = _split_landed_attempted(both_fighters.str[0])
        b_land, b_att = _split_landed_attempted(both_fighters.str[1])

        stem_a = f'A_{category}_Strikes'
        stem_b = f'B_{category}_Strikes'
        # Assignment order fixes the column order of the saved CSV.
        ss[f'{stem_a}_land'] = a_land
        ss[f'{stem_a}_att'] = a_att
        ss[f'{stem_b}_land'] = b_land
        ss[f'{stem_b}_att'] = b_att
        # Not guarded against zero attempts: 0 / 0 yields NaN here.
        ss[f'{stem_a}_percent'] = a_land / a_att
        ss[f'{stem_b}_percent'] = b_land / b_att

    # add stuff with beautiful soup
    ss['details'] = get_details_from_ufcstats(url)
    ss['event_title'] = get_event_title_from_ufcstats(url)
    event_url = get_event_url_from_ufcstats(url)
    ss['event_url'] = event_url

    # 14 == len('fight-details/'): keep everything after that marker.
    fightid = url[url.find('fight-details/')+14:]

    # Drop the raw scraped columns; only the derived ones are kept.
    bad_cols = ['Fighter', 'Sig. str', 'Sig. str. %', 'Head', 'Body', 'Leg',
                'Distance', 'Clinch', 'Ground']
    ss.drop(columns=bad_cols, inplace=True)

    # Save
    ss.to_csv('data/ufc_stats/sig_strikes3/' + fightid + '_sigstrikes.csv')

    return ss

In [22]:
# Load the per-fighter odds table that was saved earlier.
all_event_odds = pd.read_csv('data/final/odds/All_Odds_by_Fighter_WithChange.csv')

# Collect each distinct value of the 'event_ufcstats_url' column,
# print how many there are, then display them.

all_event_odds_urls = all_event_odds['event_ufcstats_url'].unique()
print(len(all_event_odds_urls))
all_event_odds_urls

518


array(['http://ufcstats.com/event-details/dfdd0c5dd0d4bc23',
       'http://ufcstats.com/event-details/aa79d5399571068e',
       'http://ufcstats.com/event-details/e7bc606d269896aa',
       'http://ufcstats.com/event-details/3c48019bc387b80c',
       'http://ufcstats.com/event-details/e69c5ce12f4e762b',
       'http://ufcstats.com/event-details/35585d970300d45a',
       'http://ufcstats.com/event-details/e5c38954c006f15c',
       'http://ufcstats.com/event-details/46effbd1135423c5',
       'http://ufcstats.com/event-details/3f7c14c7eca7195d',
       'http://ufcstats.com/event-details/67ec58d7cf599835',
       'http://ufcstats.com/event-details/4f732e58ed907eff',
       'http://ufcstats.com/event-details/269d103c96a4c3a5',
       'http://ufcstats.com/event-details/bad28b7b34f334de',
       'http://ufcstats.com/event-details/31da66df48c0c1a0',
       'http://ufcstats.com/event-details/ac9521250dc1a14c',
       'http://ufcstats.com/event-details/3bc27ec15facbcf3',
       'http://ufcstats.

In [23]:
# Convert the array of unique event URLs into a plain Python list.
all_event_urls = all_event_odds_urls.tolist()

In [24]:
# Get all the fight links from each event page and download them to the events folder

#TODO: Do we need to do this for all events, or just NEW events?

# events_with_fight_links = pd.DataFrame()
# m = 1
# errors = []

# for event_url in all_event_urls:
#     try:
#         event_id = event_url.split('/')[-1]
#         get_event_details(event_id)
#         print(f'{m} of {len(all_event_urls)}')
#         m += 1
#     except:
#         errors.append(event_url)
#         print(f'ERROR: {m} of {len(all_event_urls)}')
#         m += 1

# print(len(errors))
# errors


In [25]:
events_folder = os.listdir('data/ufc_stats/events2/')

# Read every per-event CSV and stack them into one table.
# DataFrame.append was removed in pandas 2.0 (and copied the whole frame on
# every iteration), so the pieces are collected first and concatenated once.
event_frames = []

for event in events_folder:
    event_df = pd.read_csv('data/ufc_stats/events2/' + event)
    event_frames.append(event_df)

# pd.concat raises on an empty list; fall back to an empty frame in that case.
all_events = pd.concat(event_frames) if event_frames else pd.DataFrame()

# Remove fully identical rows, then renumber the rows 0..n-1.
all_events.drop_duplicates(inplace=True)
all_events.reset_index(drop=True, inplace=True)
all_events

Unnamed: 0.1,Unnamed: 0,W/L,Weight class,Method,Round,Time,Fighter1,Fighter2,F1_Kd,F2_Kd,F1_Str,F2_Str,F1_Td,F2_Td,F1_Sub,F2_Sub,fight_num,event_id,fight_link
0,0,win,Middleweight,KO/TKO Punch,2,3:33,Israel Adesanya,Robert Whittaker,2,0,40,32,0,0,0,0,1,3cf68c1d17f66af7,http://www.ufcstats.com/fight-details/2556b752...
1,1,win,Lightweight,U-DEC,3,5:00,Dan Hooker,Al Iaquinta,1,0,98,37,0,0,0,0,2,3cf68c1d17f66af7,http://www.ufcstats.com/fight-details/0697d552...
2,2,win,Heavyweight,SUB Arm Triangle,2,3:14,Serghei Spivac,Tai Tuivasa,0,0,23,21,6,0,1,0,3,3cf68c1d17f66af7,http://www.ufcstats.com/fight-details/8cd7ca0e...
3,3,win,Welterweight,S-DEC,3,5:00,Dhiego Lima,Luke Jumeau,0,0,32,24,2,0,0,0,4,3cf68c1d17f66af7,http://www.ufcstats.com/fight-details/fd0fd9a2...
4,4,win,Heavyweight,KO/TKO Punch,1,2:10,Yorgan De Castro,Justin Tafa,1,0,4,6,0,0,0,0,5,3cf68c1d17f66af7,http://www.ufcstats.com/fight-details/9dfac33c...
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5882,7,win,Lightweight,SUB Guillotine Choke,1,2:25,Joel Alvarez,Joe Duffy,0,0,18,14,0,1,1,0,8,ddbd0d6259ce57cc,http://www.ufcstats.com/fight-details/b76c0b0e...
5883,8,win,Bantamweight,U-DEC,3,5:00,Brett Johns,Montel Jackson,0,1,11,26,8,2,1,0,9,ddbd0d6259ce57cc,http://www.ufcstats.com/fight-details/b5f70d0c...
5884,9,win,Bantamweight,SUB Triangle Choke,1,4:42,Amir Albazi,Malcolm Gordon,0,0,12,3,1,0,1,0,10,ddbd0d6259ce57cc,http://www.ufcstats.com/fight-details/b9b0da84...
5885,10,win,Lightweight,U-DEC,3,5:00,Arman Tsarukyan,Davi Ramos,0,0,91,33,1,0,0,0,11,ddbd0d6259ce57cc,http://www.ufcstats.com/fight-details/ecb24300...


In [26]:
# Add a fight_id column to all_events: the text after the last '/' in each
# fight_link.
all_events['fight_id'] = all_events['fight_link'].str.split('/').str[-1]

In [27]:
# Display the table to check the newly added fight_id column.
all_events

Unnamed: 0.1,Unnamed: 0,W/L,Weight class,Method,Round,Time,Fighter1,Fighter2,F1_Kd,F2_Kd,F1_Str,F2_Str,F1_Td,F2_Td,F1_Sub,F2_Sub,fight_num,event_id,fight_link,fight_id
0,0,win,Middleweight,KO/TKO Punch,2,3:33,Israel Adesanya,Robert Whittaker,2,0,40,32,0,0,0,0,1,3cf68c1d17f66af7,http://www.ufcstats.com/fight-details/2556b752...,2556b7520536ce1d
1,1,win,Lightweight,U-DEC,3,5:00,Dan Hooker,Al Iaquinta,1,0,98,37,0,0,0,0,2,3cf68c1d17f66af7,http://www.ufcstats.com/fight-details/0697d552...,0697d5520b74a8df
2,2,win,Heavyweight,SUB Arm Triangle,2,3:14,Serghei Spivac,Tai Tuivasa,0,0,23,21,6,0,1,0,3,3cf68c1d17f66af7,http://www.ufcstats.com/fight-details/8cd7ca0e...,8cd7ca0e7b861b3d
3,3,win,Welterweight,S-DEC,3,5:00,Dhiego Lima,Luke Jumeau,0,0,32,24,2,0,0,0,4,3cf68c1d17f66af7,http://www.ufcstats.com/fight-details/fd0fd9a2...,fd0fd9a2d6ef8c4f
4,4,win,Heavyweight,KO/TKO Punch,1,2:10,Yorgan De Castro,Justin Tafa,1,0,4,6,0,0,0,0,5,3cf68c1d17f66af7,http://www.ufcstats.com/fight-details/9dfac33c...,9dfac33cd3ae4afd
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5882,7,win,Lightweight,SUB Guillotine Choke,1,2:25,Joel Alvarez,Joe Duffy,0,0,18,14,0,1,1,0,8,ddbd0d6259ce57cc,http://www.ufcstats.com/fight-details/b76c0b0e...,b76c0b0ed2926737
5883,8,win,Bantamweight,U-DEC,3,5:00,Brett Johns,Montel Jackson,0,1,11,26,8,2,1,0,9,ddbd0d6259ce57cc,http://www.ufcstats.com/fight-details/b5f70d0c...,b5f70d0c88f12645
5884,9,win,Bantamweight,SUB Triangle Choke,1,4:42,Amir Albazi,Malcolm Gordon,0,0,12,3,1,0,1,0,10,ddbd0d6259ce57cc,http://www.ufcstats.com/fight-details/b9b0da84...,b9b0da84b243213f
5885,10,win,Lightweight,U-DEC,3,5:00,Arman Tsarukyan,Davi Ramos,0,0,91,33,1,0,0,0,11,ddbd0d6259ce57cc,http://www.ufcstats.com/fight-details/ecb24300...,ecb24300bb3bef8c


In [28]:
# Fight ids that already have a totals file on disk: take every file name in
# the fight_totals3 folder and remove the '_totals.csv' part.
downloaded_fights = [
    file_name.replace('_totals.csv', '')
    for file_name in os.listdir('data/ufc_stats/fight_totals3/')
]
downloaded_fights

['89820fae001dd151',
 'df33799f117000cb',
 '18aff757c54687f1',
 'd019250cc6d93527',
 '4655930eb83446c7',
 '821c27f0dbb27e86',
 '59bb17087b12ad35',
 'cc6934588298958e',
 '64eb4ae171613218',
 '296cc69bcc3c1635',
 '1fe58cdab57d233b',
 '11706648d34ff3e8',
 '23c6a428df1569bc',
 '32b0e450b11b32fe',
 'd4d6c5ff6bef93ce',
 'c1322d09e8b6efca',
 'e67e15cc6578d1c3',
 'eed8c9955cad1e30',
 '0e9091311ca565ce',
 'f680e6ebe3bdfe3e',
 'caf3ca7fc0195412',
 '382d626d45c36f14',
 'de2069fea664c4b7',
 'db68ff7bf2487971',
 '37a7dc68f3a0e65d',
 '8857bf28823b1b2d',
 'c4e16d57dd9a1b39',
 '7ec61e2b0728e6d4',
 '87126427cfcaee52',
 'ae07b35f2797242e',
 'da26301ae3e7a9b1',
 '8a7b0cd5ad9ca4fe',
 'e857ca0bc3b9fdcd',
 '534c07488396e124',
 '63c08ecea35e8bee',
 '8f8a6c578308d014',
 '5755a68d39867abc',
 '226884e46a10865d',
 '68137e9ff22d426b',
 '783761d95060ffc4',
 '3a1d4016452477f5',
 '91daeeeea0a83e49',
 '2310fbcb6fdf3b47',
 'd763ad79532b27ff',
 '516868e246064e2b',
 'b695fc80bd538172',
 '9882c1dac77c0367',
 '5cd18690137

In [29]:
# Find all fights listed in all_events whose id is not among the files already
# present in fight_totals3 (downloaded_fights).
# A set makes each membership test O(1) instead of scanning the whole list for
# every fight; order and duplicates of all_events['fight_id'] are preserved.
downloaded_fight_ids = set(downloaded_fights)
fights_to_dl = [fight for fight in all_events['fight_id'] if fight not in downloaded_fight_ids]
print(len(fights_to_dl))
fights_to_dl

112


['f557405652165aa3',
 '4dc5a41f357a7d85',
 '0548a24a9bf8d7a0',
 'ea29714dcfb07ce2',
 'cd137988da724076',
 '288f7fb936c1523c',
 '962e74250884e4c1',
 'f5a6acc99bc5c634',
 'fe370c518cf7873e',
 '454265014520a3f8',
 'dffc65202afb881d',
 'bd2e14ee959f9b77',
 'ea006bae605e2a6b',
 '4eaf54ad3c82b6fa',
 'dfa8fd0b82b37fa1',
 '5098d052525da54f',
 '1c29c53ff032c5c1',
 '0d41a3fa5efef91b',
 '1b9a127f9dadaad9',
 '1be27d95bba03f17',
 'e3a5a7ede88a2555',
 'ac73c6888ee9322c',
 '2a48d0a98c21e736',
 'c3ef3cb03edde8bb',
 '07cb64236ae7aaea',
 '2874ea5d2f783e23',
 '582806c33ce6dcf6',
 '9124740fe7816d70',
 'ced01368259428f5',
 '42922bab8a3e1828',
 '8b296724a6844865',
 'c22de92b9d9030dc',
 'c2cdeb207cce5ceb',
 'f18d44292036d5de',
 'fcc1191761e365f9',
 '7b909acede0a26eb',
 '7f16e7725245bb2d',
 'e4ed294e16e16d81',
 '222bb2c67a87d312',
 'f9d2137f380e66bf',
 '3e1c27f17433f9e4',
 'a3d75cc363a6544b',
 '09a7f0979a58e6ea',
 '0aae15358d8606b9',
 'bfff5a3777841344',
 '56809feb4fe7ea61',
 '5d481ff494bf40da',
 '65c89e0c92e

In [30]:
# Build the fight-details URL for every fight id that still has to be downloaded.
fights_to_dl_urls = [
    'http://www.ufcstats.com/fight-details/' + fight_id
    for fight_id in fights_to_dl
]

fights_to_dl_urls

['http://www.ufcstats.com/fight-details/f557405652165aa3',
 'http://www.ufcstats.com/fight-details/4dc5a41f357a7d85',
 'http://www.ufcstats.com/fight-details/0548a24a9bf8d7a0',
 'http://www.ufcstats.com/fight-details/ea29714dcfb07ce2',
 'http://www.ufcstats.com/fight-details/cd137988da724076',
 'http://www.ufcstats.com/fight-details/288f7fb936c1523c',
 'http://www.ufcstats.com/fight-details/962e74250884e4c1',
 'http://www.ufcstats.com/fight-details/f5a6acc99bc5c634',
 'http://www.ufcstats.com/fight-details/fe370c518cf7873e',
 'http://www.ufcstats.com/fight-details/454265014520a3f8',
 'http://www.ufcstats.com/fight-details/dffc65202afb881d',
 'http://www.ufcstats.com/fight-details/bd2e14ee959f9b77',
 'http://www.ufcstats.com/fight-details/ea006bae605e2a6b',
 'http://www.ufcstats.com/fight-details/4eaf54ad3c82b6fa',
 'http://www.ufcstats.com/fight-details/dfa8fd0b82b37fa1',
 'http://www.ufcstats.com/fight-details/5098d052525da54f',
 'http://www.ufcstats.com/fight-details/1c29c53ff032c5c1

In [31]:
# Download fight_totals and sig_strikes for every missing fight.
# Failed URLs are collected in `errors`; progress is printed as "<m> of <total>".
m = 1
errors = []

for fight in fights_to_dl_urls:
    try:
        get_fight_totals(fight)
        get_significant_strikes(fight)
    except Exception as exc:
        # `except Exception` (not a bare `except`) so that Ctrl-C /
        # KeyboardInterrupt can still stop a long run; the reason is printed
        # so failures can be diagnosed instead of silently swallowed.
        # NOTE(review): if only the second call fails, the first call has
        # already run for this fight - check whether a later "missing fights"
        # pass would still pick it up.
        errors.append(fight)
        print(f'ERROR: {m} of {len(fights_to_dl_urls)} - {fight}: {exc!r}')
    else:
        print(f'{m} of {len(fights_to_dl_urls)}')
    # Count the fight whether it succeeded or failed.
    m += 1


1 of 112
2 of 112
3 of 112
4 of 112
5 of 112
6 of 112
7 of 112
8 of 112
9 of 112
10 of 112
11 of 112
12 of 112
13 of 112
14 of 112
15 of 112
16 of 112
17 of 112
18 of 112
19 of 112
20 of 112
21 of 112
22 of 112
23 of 112
24 of 112
25 of 112
26 of 112
27 of 112
28 of 112
29 of 112
30 of 112
31 of 112
32 of 112
33 of 112
34 of 112
35 of 112
36 of 112
37 of 112
38 of 112
39 of 112
40 of 112
41 of 112
42 of 112
43 of 112
44 of 112
45 of 112
46 of 112
47 of 112
48 of 112
49 of 112
50 of 112
51 of 112
52 of 112
53 of 112
54 of 112
55 of 112
56 of 112
57 of 112
58 of 112
59 of 112
60 of 112
61 of 112
62 of 112
63 of 112
64 of 112
65 of 112
66 of 112
67 of 112
68 of 112
69 of 112
70 of 112
71 of 112
72 of 112
73 of 112
74 of 112
75 of 112
76 of 112
77 of 112
78 of 112
79 of 112
80 of 112
81 of 112
82 of 112
83 of 112
84 of 112
85 of 112
86 of 112
87 of 112
88 of 112
89 of 112
90 of 112
91 of 112
92 of 112
93 of 112
94 of 112
95 of 112
96 of 112
97 of 112
98 of 112
99 of 112
100 of 112
101 of 1

In [32]:
# Aggregate all the fight data from all the events into one table
# (rows renumbered 0..n-1; unlike all_events above, duplicates are not removed).
all_event_data = pd.DataFrame()
# NOTE(review): m and errors are reset here but not used in this cell; kept in
# case later cells rely on them.
m = 1
errors = []

event_files = os.listdir('data/ufc_stats/events2/')

# DataFrame.append was removed in pandas 2.0 and re-copied the accumulated
# frame on every iteration; collect the pieces and concatenate once instead.
frames_to_stack = []
for event_file in event_files:
    data = pd.read_csv('data/ufc_stats/events2/' + event_file)
    frames_to_stack.append(data)

# pd.concat raises on an empty list, so keep the empty frame when no files exist.
if frames_to_stack:
    all_event_data = pd.concat(frames_to_stack, ignore_index=True)

all_event_data

Unnamed: 0.1,Unnamed: 0,W/L,Weight class,Method,Round,Time,Fighter1,Fighter2,F1_Kd,F2_Kd,F1_Str,F2_Str,F1_Td,F2_Td,F1_Sub,F2_Sub,fight_num,event_id,fight_link
0,0,win,Middleweight,KO/TKO Punch,2,3:33,Israel Adesanya,Robert Whittaker,2,0,40,32,0,0,0,0,1,3cf68c1d17f66af7,http://www.ufcstats.com/fight-details/2556b752...
1,1,win,Lightweight,U-DEC,3,5:00,Dan Hooker,Al Iaquinta,1,0,98,37,0,0,0,0,2,3cf68c1d17f66af7,http://www.ufcstats.com/fight-details/0697d552...
2,2,win,Heavyweight,SUB Arm Triangle,2,3:14,Serghei Spivac,Tai Tuivasa,0,0,23,21,6,0,1,0,3,3cf68c1d17f66af7,http://www.ufcstats.com/fight-details/8cd7ca0e...
3,3,win,Welterweight,S-DEC,3,5:00,Dhiego Lima,Luke Jumeau,0,0,32,24,2,0,0,0,4,3cf68c1d17f66af7,http://www.ufcstats.com/fight-details/fd0fd9a2...
4,4,win,Heavyweight,KO/TKO Punch,1,2:10,Yorgan De Castro,Justin Tafa,1,0,4,6,0,0,0,0,5,3cf68c1d17f66af7,http://www.ufcstats.com/fight-details/9dfac33c...
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5882,7,win,Lightweight,SUB Guillotine Choke,1,2:25,Joel Alvarez,Joe Duffy,0,0,18,14,0,1,1,0,8,ddbd0d6259ce57cc,http://www.ufcstats.com/fight-details/b76c0b0e...
5883,8,win,Bantamweight,U-DEC,3,5:00,Brett Johns,Montel Jackson,0,1,11,26,8,2,1,0,9,ddbd0d6259ce57cc,http://www.ufcstats.com/fight-details/b5f70d0c...
5884,9,win,Bantamweight,SUB Triangle Choke,1,4:42,Amir Albazi,Malcolm Gordon,0,0,12,3,1,0,1,0,10,ddbd0d6259ce57cc,http://www.ufcstats.com/fight-details/b9b0da84...
5885,10,win,Lightweight,U-DEC,3,5:00,Arman Tsarukyan,Davi Ramos,0,0,91,33,1,0,0,0,11,ddbd0d6259ce57cc,http://www.ufcstats.com/fight-details/ecb24300...


In [33]:
# Write the combined fights table to disk. index=False is not passed, so the
# row index is written as an extra leading column.
all_event_data.to_csv('data/final/events/All_Events_Fights_and_FightUrls.csv')