In [1]:
import numpy as np
import pandas as pd
import pickle 
import os
from tqdm import tqdm

from utils import *

%matplotlib inline
# Print NumPy arrays in full rather than summarising long ones with "...".
np.set_printoptions(threshold=np.inf)
# Turn pandas' chained-assignment warning into an exception so such writes fail loudly.
pd.set_option('mode.chained_assignment', 'raise')

In [2]:
# Load the processed fight table (indexed by date, as shown in the preview below).
# NOTE: unpickling can execute arbitrary code, so only load this file from a trusted source.
fightdata = pd.read_pickle("../data/processed_fightdata.pkl")
fightdata.head()

Unnamed: 0_level_0,event_title,f1,f2,win_method,round,weight_class,winner,f1_td_pct,f1_sub,f1_rev,...,f2_body_landed,f2_body_attempted,f2_leg_landed,f2_leg_attempted,f2_distance_landed,f2_distance_attempted,f2_clinch_landed,f2_clinch_attempted,f2_ground_landed,f2_ground_attempted
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1994-03-11,UFC 2: No Way Out,Johnny Rhodes,David Levicki,KO/TKO,1,Open Weight Bout,f1,100.0,0.0,0.0,...,0.0,0.0,0.0,0.0,1.0,2.0,2.0,2.0,1.0,1.0
1994-03-11,UFC 2: No Way Out,Royce Gracie,Patrick Smith,KO/TKO,1,UFC 2 Tournament Title Bout,f1,50.0,0.0,0.0,...,1.0,2.0,0.0,0.0,0.0,1.0,1.0,1.0,0.0,0.0
1994-03-11,UFC 2: No Way Out,Jason DeLucia,Scott Baker,Submission,1,Open Weight Bout,f1,0.0,5.0,4.0,...,0.0,0.0,0.0,0.0,0.0,2.0,0.0,0.0,0.0,0.0
1994-03-11,UFC 2: No Way Out,Royce Gracie,Remco Pardoel,Submission,1,Open Weight Bout,f1,50.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1994-03-11,UFC 2: No Way Out,Scott Morris,Sean Daugherty,Submission,1,Open Weight Bout,f1,100.0,1.0,0.0,...,0.0,0.0,0.0,2.0,0.0,3.0,0.0,1.0,0.0,0.0


In [3]:
#Create %landed, %defended, and num absorbed stats for each type of stat
def pct(x, y):
    """Return x expressed as a percentage of y."""
    return (x/y) * 100


def pct_diff(x, y):
    """Return the part of x that is not covered by y, as a percentage of x."""
    return ((x - y)/ x) * 100


# (fighter prefix, opponent prefix) for both corners of a fight
corners = [('f1', 'f2'), ('f2', 'f1')]

# Stats that have both landed and attempted counts: accuracy, absorbed and defence
fs1 = ['sig_str', 'td', 'head', 'body', 'leg']
for ftr, oftr in corners:
    for f in fs1:
        own_landed = fightdata[joinnames(ftr, f, 'landed')]
        own_attempted = fightdata[joinnames(ftr, f, 'attempted')]
        opp_landed = fightdata[joinnames(oftr, f, 'landed')]
        opp_attempted = fightdata[joinnames(oftr, f, 'attempted')]

        fightdata[joinnames(ftr, f, 'pct')] = pct(own_landed, own_attempted)
        # What a fighter absorbed is exactly what the opponent landed
        fightdata[joinnames(ftr, f, 'absorbed')] = opp_landed
        # Defence = share of the opponent's attempts that did not land
        fightdata[joinnames(ftr, f, 'def')] = pct_diff(opp_attempted, opp_landed)

# Positional stats: share of the fighter's landed significant strikes per position
fs2 = ['distance', 'ground', 'clinch']
for ftr, oftr in corners:
    sig_str_landed = fightdata[joinnames(ftr, 'sig_str', 'landed')]
    for f in fs2:
        position_landed = fightdata[joinnames(ftr, f, 'landed')]
        fightdata[joinnames(ftr, f, 'pct')] = pct(position_landed, sig_str_landed)

# Divisions by zero above leave NaN; treat those as 0
fightdata = fightdata.fillna(0)
print(fightdata.columns)

Index(['event_title', 'f1', 'f2', 'win_method', 'round', 'weight_class',
       'winner', 'f1_td_pct', 'f1_sub', 'f1_rev', 'f1_ctrl', 'f1_sig_str_pct',
       'f2_td_pct', 'f2_sub', 'f2_rev', 'f2_ctrl', 'f2_sig_str_pct',
       'f1_total_str_landed', 'f1_total_str_attempted', 'f1_td_landed',
       'f1_td_attempted', 'f1_sig_str_landed', 'f1_sig_str_attempted',
       'f1_head_landed', 'f1_head_attempted', 'f1_body_landed',
       'f1_body_attempted', 'f1_leg_landed', 'f1_leg_attempted',
       'f1_distance_landed', 'f1_distance_attempted', 'f1_clinch_landed',
       'f1_clinch_attempted', 'f1_ground_landed', 'f1_ground_attempted',
       'f2_total_str_landed', 'f2_total_str_attempted', 'f2_td_landed',
       'f2_td_attempted', 'f2_sig_str_landed', 'f2_sig_str_attempted',
       'f2_head_landed', 'f2_head_attempted', 'f2_body_landed',
       'f2_body_attempted', 'f2_leg_landed', 'f2_leg_attempted',
       'f2_distance_landed', 'f2_distance_attempted', 'f2_clinch_landed',
       'f2_cli

In [5]:
#Read features to include in fighterinfo table
# One feature name per line; surrounding whitespace and blank lines are discarded.
with open("../data/fighterinfo_features.txt", 'r') as f:
    lines = list(f)
    per_fighter_feats = [name for name in map(str.strip, lines) if name]

In [6]:
#Create an array of unique fighter names (from either corner) and an index of unique fight dates
fighters = pd.concat([fightdata['f1'], fightdata['f2']]).unique()
dates = fightdata.index.unique()

In [7]:
#Initialize fighterinfo dataframe
# One row per fight date and one column per (fighter, feature) pair, all NaN until
# the population loop fills them in.
# np.nan is used here because the np.NaN alias was removed in NumPy 2.0.
fighterinfo_init = np.full((len(dates), len(fighters)*len(per_fighter_feats)), np.nan)
index = dates
columns = pd.MultiIndex.from_product([fighters, per_fighter_feats], names=['fighter', 'features'])
fighterinfo = pd.DataFrame(fighterinfo_init, index=index, columns=columns)
print(fighterinfo.shape)
fighterinfo.head()

(540, 79990)


fighter,Johnny Rhodes,Johnny Rhodes,Johnny Rhodes,Johnny Rhodes,Johnny Rhodes,Johnny Rhodes,Johnny Rhodes,Johnny Rhodes,Johnny Rhodes,Johnny Rhodes,...,Jamie Pickett,Jamie Pickett,Jamie Pickett,Jamie Pickett,Jamie Pickett,Jamie Pickett,Jamie Pickett,Jamie Pickett,Jamie Pickett,Jamie Pickett
features,wins,losses,curr_win_strk,curr_loss_strk,sig_str_lnd_pf,sig_str_att_pf,sig_str_acc_pct,sig_str_abs_pf,sig_str_def_pct,td_lnd_pf,...,leg_def_pct,distance_lnd_pf,distance_acc_pct,ground_lnd_pf,ground_acc_pct,clinch_lnd_pf,clinch_acc_pct,sub_lnd_pf,ctrl_lnd_pf,rev_lnd_pf
date,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2,Unnamed: 14_level_2,Unnamed: 15_level_2,Unnamed: 16_level_2,Unnamed: 17_level_2,Unnamed: 18_level_2,Unnamed: 19_level_2,Unnamed: 20_level_2,Unnamed: 21_level_2
1994-03-11,,,,,,,,,,,...,,,,,,,,,,
1994-09-09,,,,,,,,,,,...,,,,,,,,,,
1994-12-16,,,,,,,,,,,...,,,,,,,,,,
1995-04-07,,,,,,,,,,,...,,,,,,,,,,
1995-07-14,,,,,,,,,,,...,,,,,,,,,,


In [8]:
#Populate fighterinfo table by iterating through rows of fightdata 
def avg(x, y):
    """Return the arithmetic mean of two values."""
    return (x+y)/2


def update_fight_feats(fighterinfo, row, date, ftrs, starts, ends):
    """Update the fight-stat features of both fighters from 'row' (one fight of fightdata).

    For every fighter in `ftrs` and every combination of `starts` x `ends`, the value
    stored in `fighterinfo` at `date` is blended 50/50 with the value from this fight.
    If the fighter has no stored value yet (NaN), the fight's value is stored as-is;
    averaging it with a made-up 0 would halve every debut stat.

    Args:
        fighterinfo: DataFrame indexed by date with (fighter, feature) MultiIndex columns;
            modified in place.
        row: the fightdata row for this fight.
        date: index label of the fighterinfo row to update.
        ftrs: iterable of (corner prefix, fighter name) pairs, e.g. ('f1', 'Royce Gracie').
        starts: stat name stems, e.g. 'sig_str'.
        ends: iterable of (fighterinfo suffix, fightdata suffix) pairs, e.g. ('lnd_pf', 'landed').
            The fightdata suffix may be None; presumably `joinnames` skips None parts --
            TODO confirm against utils.
    """
    for ftr in ftrs:
        ftr_idx, ftr_name = ftr[0], ftr[1]
        for start in starts:
            for end in ends:
                fi_end, fd_end = end[0], end[1]
                fi_feat = joinnames(start, fi_end)
                fd_feat = joinnames(ftr_idx, start, fd_end)
                fight_val = row[fd_feat]
                prev_feat = fighterinfo.loc[date, (ftr_name, fi_feat)]
                if pd.isnull(prev_feat):
                    # First recorded fight: seed the running value with the observation
                    new_feat = fight_val
                else:
                    new_feat = avg(prev_feat, fight_val)
                fighterinfo.loc[date, (ftr_name, fi_feat)] = new_feat

def update_wins_losses_strks(fighterinfo, row, date, winner_name, loser_name):
    """Record one fight result in the win/loss/streak columns of `fighterinfo` at `date`.

    The winner gains a win, keeps their loss count, extends their win streak and has
    their loss streak reset to 0; the loser gets the mirror image. Missing (NaN)
    counters are treated as 0. `row` is accepted for interface symmetry but not read.
    """
    def stored(name, feat):
        # Current counter for this fighter, with "never set" read as 0
        value = fighterinfo.loc[date, (name, feat)]
        return 0 if pd.isnull(value) else value

    def store(name, feat, value):
        fighterinfo.loc[date, (name, feat)] = value

    # Winner: +1 win, losses unchanged, win streak grows, loss streak resets
    store(winner_name, 'wins', stored(winner_name, 'wins') + 1)
    store(winner_name, 'losses', stored(winner_name, 'losses'))
    store(winner_name, 'curr_win_strk', stored(winner_name, 'curr_win_strk') + 1)
    store(winner_name, 'curr_loss_strk', 0)

    # Loser: wins unchanged, +1 loss, win streak resets, loss streak grows
    store(loser_name, 'wins', stored(loser_name, 'wins'))
    store(loser_name, 'losses', stored(loser_name, 'losses') + 1)
    store(loser_name, 'curr_win_strk', 0)
    store(loser_name, 'curr_loss_strk', stored(loser_name, 'curr_loss_strk') + 1)

    
#Main loop to iterate through fightdata
# The (starts, ends) feature-name groups are the same for every fight, so build them once.
feat_groups = [
    # 5 end variant feats
    (['sig_str', 'td', 'head', 'body', 'leg'],
     [('lnd_pf', 'landed'), ('att_pf', 'attempted'), ('acc_pct', 'pct'), ('abs_pf', 'absorbed'), ('def_pct', 'def')]),
    # 2 end variant feats
    (['distance', 'ground', 'clinch'],
     [('lnd_pf', 'landed'), ('acc_pct', 'pct')]),
    # 1 end variant feats
    (['sub', 'ctrl', 'rev'],
     [('lnd_pf', None)]),
]

prev_date = fighterinfo.index[0]
for date, row in tqdm(fightdata.iterrows(), total=len(fightdata)):

    winner = row['winner']
    winner_name = row[winner]
    loser = 'f2' if winner == 'f1' else 'f1'
    loser_name = row[loser]

    #Make current date stats same as prev date stats so they can be updated.
    #Several fights share a date; copying a row onto itself is a no-op, so only
    #carry stats forward when the date actually changes.
    if date != prev_date:
        fighterinfo.loc[date] = fighterinfo.loc[prev_date]
        prev_date = date

    #First update wins, losses, and streaks
    update_wins_losses_strks(fighterinfo, row, date, winner_name, loser_name)

    #Create ftrs list for fight features update
    ftrs = [(winner, winner_name), (loser, loser_name)]

    #Then update every group of fight stat features, in the same order as before
    for starts, ends in feat_groups:
        update_fight_feats(fighterinfo, row, date, ftrs, starts, ends)

5787it [02:18, 41.92it/s]


In [10]:
#Save fighterinfo data
# Persist the populated table as a pickle (the recorded run of the population loop took over two minutes).
fighterinfo.to_pickle("../data/fighterinfo.pkl")

In [11]:
#function to display stats from fighterinfo
def disp_fighterinfo(fighter, startdate):
    """Show all stored features of `fighter` from `startdate` (YYYY-MM-DD) through the last date in fighterinfo."""
    fighter_history = fighterinfo.loc[startdate:, fighter]
    display_full(fighter_history)

#disp_fighterinfo("Jon Jones", "2020-01-01")