<a href="https://colab.research.google.com/github/tylerlum/ufc_automated_scoring_system/blob/main/UFC_Automated_Scoring_System.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# UFC Automated Scoring System

The goal of this notebook is to:
* Explore the FightMetrics webpage to scrape the fight and fighter information we need
* Store the fight and fighter data into csv files
* Preprocess the data
* Train and evaluate a neural network to predict fight outcomes

## All bouts and all fighters

In [None]:
import pandas as pd
from tqdm import tqdm

In [None]:
def get_all_fighter_tables():
  '''Get pandas table of all UFC fighters'''
  from string import ascii_lowercase

  all_fighters_tables = []
  for c in tqdm(ascii_lowercase):
    all_fighters_url = f"http://ufcstats.com/statistics/fighters?char={c}&page=all"
    all_fighters_table = pd.read_html(all_fighters_url)[0]
    all_fighters_tables.append(all_fighters_table)

  all_fighters_tables = pd.concat(all_fighters_tables)
  return all_fighters_tables

In [None]:
ALL_FIGHTER_TABLES = get_all_fighter_tables()

100%|██████████| 26/26 [00:23<00:00,  1.11it/s]


In [None]:
ALL_FIGHTER_TABLES.head()

Unnamed: 0,First,Last,Nickname,Ht.,Wt.,Reach,Stance,W,L,D,Belt
0,,,,,,,,,,,
1,Tom,Aaron,,--,155 lbs.,--,,5.0,3.0,0.0,
2,Danny,Abbadi,The Assassin,"5' 11""",155 lbs.,--,Orthodox,4.0,6.0,0.0,
3,David,Abbott,Tank,"6' 0""",265 lbs.,--,Switch,10.0,15.0,0.0,
4,Shamil,Abdurakhimov,Abrek,"6' 3""",235 lbs.,"76.0""",Orthodox,20.0,5.0,0.0,


## Going to future pages

In [None]:
from urllib.request import urlopen
import numpy as np
from string import ascii_uppercase
from dateutil import parser
from datetime import datetime
import pandas as pd
from tqdm import tqdm

In [None]:
ALL_PAST_EVENTS_URL = "http://ufcstats.com/statistics/events/completed?page=all"

In [None]:
def get_all_events(all_past_events_url):
    '''Takes in URL to all past events. Returns list of http strings, each one representing a UFC event'''
    def find_latest_index_with_char(string, chars):
        for i in reversed(range(len(string))):
            if string[i] in chars:
                return i
        return -1

    all_past_events_page = urlopen(all_past_events_url)
    all_past_events_html_bytes = all_past_events_page.read()
    all_past_events_html = all_past_events_html_bytes.decode("utf-8")
    all_past_events_tables = pd.read_html(all_past_events_url)[0]
    all_past_events_tables = all_past_events_tables[all_past_events_tables["Name/date"].notna()]

    event_names = []
    for i, row in all_past_events_tables.iterrows():
        date_index = find_latest_index_with_char(row["Name/date"], ascii_uppercase)
        event_name = row["Name/date"][:date_index-1].strip()
        date = parser.parse(row["Name/date"][date_index:])
        if date < datetime.now():
            event_names.append(event_name)

    all_http_strings = []
    for event_name in event_names:
        new_substring = all_past_events_html[:all_past_events_html.index(event_name)]
        http_index = new_substring.rfind("http://")
        http_string = new_substring[http_index:]
        http_string = http_string[:http_string.find("\"")]
        all_http_strings.append(http_string)
    return all_http_strings

In [None]:
def get_fighters(fighters_string, all_fighter_tables):
    '''Parses string containing two fighter names. Uses all_fighter_tables to remove ambiguity in parsing. Returns each fighter name'''
    for i, row in all_fighter_tables.iterrows():
        fighter_name = f'{row["First"]} {row["Last"]}'
        if fighters_string.startswith(fighter_name):
            first_fighter = fighter_name
            second_fighter = fighters_string[len(fighter_name)+1:]
            break
    return first_fighter, second_fighter

In [None]:
def get_all_fights_in_event(past_event_url, get_results=False):
    '''Takes in a single URL to a past event. Returns list of http strings, each one representing a UFC fight'''
    past_event_page = urlopen(past_event_url)
    past_event_html_bytes = past_event_page.read()
    past_event_html = past_event_html_bytes.decode("utf-8")
    past_event_tables = pd.read_html(past_event_url)[0]

    body = past_event_html[past_event_html.index("<body "):]
    fight_http_strings = []
    for i, row in past_event_tables.iterrows():
        result = row["W/L"].split(' ')[0]
        before_result = body[:body.index(result)]
        begin = before_result[before_result.rfind("http://"):]
        http = begin[:begin.find("\"")]
        if get_results:
            if result != "win": 
                fight_http_strings.append((http, None))
            else:
                winner, loser = get_fighters(row["Fighter"], ALL_FIGHTER_TABLES)
                fight_http_strings.append((http, winner))
        else:
            fight_http_strings.append(http)
        body = body[body.index(result)+len(result):]
    return fight_http_strings

In [None]:
def get_all_fights(all_event_http_strings, num_events=10):
    '''Takes in list of URLs to past events. Returns list of http strings, each one representing a UFC fight'''
    all_fight_http_strings = []
    for i, event_http_string in enumerate(tqdm(all_event_http_strings)):
        if i == num_events:
            break
        fight_http_strings = get_all_fights_in_event(event_http_string, get_results=True)
        all_fight_http_strings.extend(fight_http_strings)
    return all_fight_http_strings

In [None]:
def process_fight(fight_url):
    '''Takes in a URL to a fight. Returns a pandas dataframe representing the fight statistics'''
    def parse_string(row_string):
        string_split = row_string.split(" ")
        first_fighter_stat = " ".join(string_split[:len(string_split)//2])
        second_fighter_stat = " ".join(string_split[len(string_split)//2+1:])
        return first_fighter_stat, second_fighter_stat

    # Read in stat tables
    fight_tables = pd.read_html(fight_url)

    # There are multiple tables of data
    # rbr = round by round
    summary_table, rbr_summary_table, strike_location_table, rbr_strike_location_table = fight_tables

    fighters_string = summary_table["Fighter"][0]
    print(fighters_string)
    fighter1, fighter2 = get_fighters(fighters_string, ALL_FIGHTER_TABLES)
    
    new_columns = ["Fighter 1 Name", "Fighter 2 Name"]
    for i, column in enumerate(rbr_summary_table):
        if column[0] == "Fighter":
            continue
        new_columns.append(f"Fighter 1 {column[0]}")
        new_columns.append(f"Fighter 2 {column[0]}")

    new_rows = []
    for i, row in rbr_summary_table.iterrows():
        new_row = []
        new_row.append(fighter1)
        new_row.append(fighter2)
        for column in rbr_summary_table:
            if column[0] == "Fighter":
                continue
            stat1, stat2 = parse_string(row[column[0]][0])
            new_row.append(stat1)
            new_row.append(stat2)
        new_rows.append(new_row)

    df = pd.DataFrame(new_rows, columns=new_columns)
    return df 

In [None]:
def convert_fight_url_to_datapoint(fight_urls):
    '''Takes in list of URLS, each representing a fight. Returns datapoints'''
    fight_http_string, winner = FIGHT_HTTP_STRINGS[idx]
    fight_table = process_fight(fight_http_string)
    if winner is None:
        label = 0
    elif fight_table["Fighter 1 Name"][0] == winner:
        label = 1
    elif fight_table["Fighter 2 Name"][0] == winner:
        label = -1
    else:
        print(f'ERROR: fight_table["Fighter 1 Name"]={fight_table["Fighter 1 Name"]}, fight_table["Fighter 2 Name"]={fight_table["Fighter 2 Name"]}, winner={winner}')
    # fight_table['Result'] = label
    return fight_table

In [None]:
# Events
ALL_EVENT_HTTP_STRINGS = get_all_events(ALL_PAST_EVENTS_URL)
print(f"Got {len(ALL_EVENT_HTTP_STRINGS)} events")

# Fights
FIGHT_HTTP_STRINGS = get_all_fights(ALL_EVENT_HTTP_STRINGS, num_events=5)
print(f"Got {len(FIGHT_HTTP_STRINGS)} fights")



  0%|          | 0/551 [00:00<?, ?it/s][A[A

Got 551 events




  0%|          | 1/551 [00:02<26:15,  2.86s/it][A[A

  0%|          | 2/551 [00:05<26:41,  2.92s/it][A[A

  1%|          | 3/551 [00:09<29:08,  3.19s/it][A[A

  1%|          | 4/551 [00:12<28:50,  3.16s/it][A[A

  1%|          | 5/551 [00:16<29:29,  3.24s/it][A[A

Got 59 fights


In [None]:
x = convert_fight_url_to_datapoint(FIGHT_HTTP_STRINGS)
x.head()

Serghei Spivac Jared Vanderaa


Unnamed: 0,Fighter 1 Name,Fighter 2 Name,Fighter 1 KD,Fighter 2 KD,Fighter 1 Sig. str.,Fighter 2 Sig. str.,Fighter 1 Sig. str. %,Fighter 2 Sig. str. %,Fighter 1 Total str.,Fighter 2 Total str.,Fighter 1 Td %,Fighter 2 Td %,Fighter 1 Td %.1,Fighter 2 Td %.1,Fighter 1 Sub. att,Fighter 2 Sub. att,Fighter 1 Rev.,Fighter 2 Rev.,Fighter 1 Ctrl,Fighter 2 Ctrl
0,Serghei Spivac,Jared Vanderaa,0,0,15 of 20,10 of 15,75%,66%,38 of 46,18 of 23,2 of 2,0 of 0,2 of 2,0 of 0,0,0,0,0,4:26,0:00
1,Serghei Spivac,Jared Vanderaa,0,0,36 of 52,3 of 9,69%,33%,69 of 89,9 of 15,1 of 1,0 of 0,1 of 1,0 of 0,0,0,0,0,4:03,0:00


In [None]:
idx = 11

In [None]:
fight_http_string, winner = FIGHT_HTTP_STRINGS[idx]
fight_table = process_fight(fight_http_string)
if winner is None:
    label = 0
elif fight_table["Fighter 1 Name"][0] == winner:
    label = 1
elif fight_table["Fighter 2 Name"][0] == winner:
    label = -1
else:
    print(f'ERROR: fight_table["Fighter 1 Name"]={fight_table["Fighter 1 Name"]}, fight_table["Fighter 2 Name"]={fight_table["Fighter 2 Name"]}, winner={winner}')

Serghei Spivac Jared Vanderaa
label = 1, winner = Serghei Spivac


In [None]:
for i in range(len(FIGHT_HTTP_STRINGS)):
  fight_http_string, winner = FIGHT_HTTP_STRINGS[i]
  fight_table = process_fight(fight_http_string)
  if winner is None:
      label = 0
  elif fight_table["Fighter 1 Name"][0] == winner:
      label = 1
  elif fight_table["Fighter 2 Name"][0] == winner:
      label = -1
  else:
      print(f'ERROR: fight_table["Fighter 1 Name"]={fight_table["Fighter 1 Name"]}, fight_table["Fighter 2 Name"]={fight_table["Fighter 2 Name"]}, winner={winner}')
  print(f"label = {label}, winner = {winner}")

Curtis Blaydes Derrick Lewis
label = -1, winner = Derrick Lewis
Ketlen Vieira Yana Kunitskaya
label = -1, winner = Yana Kunitskaya
Charles Rosa Darrick Minner
label = -1, winner = Darrick Minner
Aleksei Oleinik Chris Daukaus
label = -1, winner = Chris Daukaus
Phil Hawes Nassourdine Imavov
label = 1, winner = Phil Hawes
Andrei Arlovski Tom Aspinall
label = -1, winner = Tom Aspinall
Jared Gordon Danny Chavez
label = 1, winner = Jared Gordon
Eddie Wineland John Castaneda
label = -1, winner = John Castaneda
Nate Landwehr Julian Erosa
label = -1, winner = Julian Erosa
Shana Dobson Casey O'Neill
label = -1, winner = Casey O'Neill
Aiemann Zahabi Drako Rodriguez
label = 1, winner = Aiemann Zahabi
Serghei Spivac Jared Vanderaa
label = 1, winner = Serghei Spivac
Kamaru Usman Gilbert Burns
label = 1, winner = Kamaru Usman
Maycee Barber Alexa Grasso
label = -1, winner = Alexa Grasso
Kelvin Gastelum Ian Heinisch
label = 1, winner = Kelvin Gastelum
Ricky Simon Brian Kelleher
label = 1, winner = Rick

KeyboardInterrupt: ignored

In [None]:
process_fight(FIGHT_HTTP_STRINGS[idx][0])

Eddie Wineland John Castaneda


Unnamed: 0,Fighter 1 Name,Fighter 2 Name,Fighter 1 KD,Fighter 2 KD,Fighter 1 Sig. str.,Fighter 2 Sig. str.,Fighter 1 Sig. str. %,Fighter 2 Sig. str. %,Fighter 1 Total str.,Fighter 2 Total str.,Fighter 1 Td %,Fighter 2 Td %,Fighter 1 Td %.1,Fighter 2 Td %.1,Fighter 1 Sub. att,Fighter 2 Sub. att,Fighter 1 Rev.,Fighter 2 Rev.,Fighter 1 Ctrl,Fighter 2 Ctrl
0,Eddie Wineland,John Castaneda,0,1,17 of 66,42 of 57,25%,73%,17 of 66,44 of 59,1 of 1,0 of 0,1 of 1,0 of 0,0,0,0,0,0:12,0:04


In [None]:
process_fight(all_fight_http_strings[idx][0])

NameError: ignored

In [None]:
print(f"Winner is {all_fight_http_strings[idx][1]}")

In [None]:
process_fight(all_fight_http_strings[idx][0])