In [1]:
import numpy as np
import pandas as pd
import os

from utils import *

# Print NumPy arrays in full: an infinite threshold disables summarisation.
np.set_printoptions(threshold=np.inf)
# Make pandas raise (instead of warn) on chained assignment so that writes
# to an accidental copy fail loudly in the cells below.
pd.options.mode.chained_assignment = 'raise'

In [2]:
# Load the raw fight records. Eliminate draws and fights whose win method is
# 'Overturned' or 'DQ', drop the 'time' column, and replace the dash
# placeholders ('-', '--', '---') with 0.
file_path = "../data/raw_data.json"
alldata = pd.read_json(file_path)

# One combined mask selects the same rows as filtering in two successive steps.
valid_result = (alldata['winner'] != 'draw') & ~alldata['win_method'].isin(['Overturned', 'DQ'])
alldata = (
    alldata
    .loc[valid_result]
    .drop(columns='time')
    .replace(['-', '--', '---'], 0)
)

# Order the fights by date and use the date as the index of the working frame.
fightdata = (
    alldata
    .sort_values(by='date')
    .reset_index(drop=True)
    .copy()
    .set_index('date')
)
fightdata.head()

Unnamed: 0_level_0,event_title,f1,f2,win_method,round,weight_class,winner,f1_total_str,f1_td,f1_td_pct,...,f2_rev,f2_ctrl,f2_sig_str,f2_sig_str_pct,f2_head,f2_body,f2_leg,f2_distance,f2_clinch,f2_ground
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1994-03-11,UFC 2: No Way Out,Johnny Rhodes,David Levicki,KO/TKO,1,Open Weight Bout,f1,74 of 86,1 of 1,100%,...,0,0,4 of 5,80%,4 of 5,0 of 0,0 of 0,1 of 2,2 of 2,1 of 1
1994-03-11,UFC 2: No Way Out,Royce Gracie,Patrick Smith,KO/TKO,1,UFC 2 Tournament Title Bout,f1,11 of 11,1 of 2,50%,...,0,0,1 of 2,50%,0 of 0,1 of 2,0 of 0,0 of 1,1 of 1,0 of 0
1994-03-11,UFC 2: No Way Out,Jason DeLucia,Scott Baker,Submission,1,Open Weight Bout,f1,20 of 25,0 of 1,0%,...,2,0,0 of 2,0%,0 of 2,0 of 0,0 of 0,0 of 2,0 of 0,0 of 0
1994-03-11,UFC 2: No Way Out,Royce Gracie,Remco Pardoel,Submission,1,Open Weight Bout,f1,0 of 0,1 of 2,50%,...,0,0,0 of 0,0,0 of 0,0 of 0,0 of 0,0 of 0,0 of 0,0 of 0
1994-03-11,UFC 2: No Way Out,Scott Morris,Sean Daugherty,Submission,1,Open Weight Bout,f1,2 of 2,1 of 1,100%,...,0,0,0 of 4,0%,0 of 2,0 of 0,0 of 2,0 of 3,0 of 1,0 of 0


In [3]:
def split_of(data, column):
    """Split a '{landed} of {attempted}' column into two integer columns.

    Adds ``<column>_landed`` and ``<column>_attempted`` to ``data`` in place.
    The source column itself is left untouched.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame containing ``column``; it is modified in place.
    column : str
        Name of the column whose cells look like ``"74 of 86"``.

    Returns
    -------
    None

    Raises
    ------
    ValueError
        If a cell is neither a string nor the number 0.
    """
    def parse(cell):
        # Regular case: "<landed> of <attempted>"; int() tolerates the spaces.
        if isinstance(cell, str):
            parts = cell.split('of')
            return int(parts[0]), int(parts[1])
        # Placeholder cells may already have been replaced by the number 0
        # before this function runs; treat that as nothing landed/attempted.
        if cell == 0:
            return 0, 0
        raise ValueError(
            "cannot split %r in column %r: expected '<landed> of <attempted>' or 0"
            % (cell, column)
        )

    # Parse every cell once, then write both halves positionally.
    pairs = [parse(cell) for cell in data[column]]
    data[column + '_landed'] = [landed for landed, _ in pairs]
    data[column + '_attempted'] = [attempted for _, attempted in pairs]
    

# Columns whose cells have the form "<landed> of <attempted>".
of_cols = [
    'f1_total_str', 'f1_td', 'f1_sig_str',
    'f1_head', 'f1_body', 'f1_leg',
    'f1_distance', 'f1_clinch', 'f1_ground',
    'f2_total_str', 'f2_td', 'f2_sig_str',
    'f2_head', 'f2_body', 'f2_leg',
    'f2_distance', 'f2_clinch', 'f2_ground',
]

# Add the landed/attempted pair for every such column ...
for col in of_cols:
    split_of(fightdata, col)
# ... then remove the original text columns from the same frame object.
fightdata.drop(columns=of_cols, inplace=True)

print(fightdata.columns)

Index(['event_title', 'f1', 'f2', 'win_method', 'round', 'weight_class',
       'winner', 'f1_td_pct', 'f1_sub', 'f1_rev', 'f1_ctrl', 'f1_sig_str_pct',
       'f2_td_pct', 'f2_sub', 'f2_rev', 'f2_ctrl', 'f2_sig_str_pct',
       'f1_total_str_landed', 'f1_total_str_attempted', 'f1_td_landed',
       'f1_td_attempted', 'f1_sig_str_landed', 'f1_sig_str_attempted',
       'f1_head_landed', 'f1_head_attempted', 'f1_body_landed',
       'f1_body_attempted', 'f1_leg_landed', 'f1_leg_attempted',
       'f1_distance_landed', 'f1_distance_attempted', 'f1_clinch_landed',
       'f1_clinch_attempted', 'f1_ground_landed', 'f1_ground_attempted',
       'f2_total_str_landed', 'f2_total_str_attempted', 'f2_td_landed',
       'f2_td_attempted', 'f2_sig_str_landed', 'f2_sig_str_attempted',
       'f2_head_landed', 'f2_head_attempted', 'f2_body_landed',
       'f2_body_attempted', 'f2_leg_landed', 'f2_leg_attempted',
       'f2_distance_landed', 'f2_distance_attempted', 'f2_clinch_landed',
       'f2_cli

In [4]:
# Convert column data types to float, handling percentage and time columns first.

def _pct_to_float(x):
    """Return x as a float, stripping '%' characters first when x is a string."""
    if type(x) is str:
        return float(x.strip('%'))
    return float(x)


def make_seconds(x):
    """Convert a 'minutes:seconds' string to seconds; return non-strings unchanged."""
    if type(x) is not str:
        return x
    parts = x.split(':')
    return int(parts[0]) * 60 + int(parts[1])


# Percentage columns are recognised by 'pct' appearing in the column name.
pct_columns = [name for name in fightdata.columns if 'pct' in name]
for name in pct_columns:
    fightdata[name] = fightdata[name].apply(_pct_to_float)

# Control-time columns.
for name in ('f1_ctrl', 'f2_ctrl'):
    fightdata[name] = fightdata[name].apply(make_seconds)

# Cast everything from the ninth column onward to float64. Going by the column
# listing printed by the previous cell, the first eight columns are the seven
# descriptive fields plus 'f1_td_pct' -- NOTE(review): this positional slice
# depends on the column order staying as it is.
for name in list(fightdata.columns[8:]):
    fightdata[name] = fightdata[name].astype('float64')


In [5]:
# Print the total count of missing values, then pickle the processed frame.
missing_total = fightdata.isna().sum().sum()
print(missing_total)
fightdata.to_pickle("../data/processed_fightdata.pkl")

0
