In [77]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
import seaborn as sns
%matplotlib inline

In [179]:
# Load the raw fight data from a CSV file located relative to the notebook's working directory.
df = pd.read_csv('./data/ufc-master.csv')

In [180]:
# Dimensions of the loaded frame as (rows, columns).
df.shape

(4307, 113)

In [181]:
# Preview the first rows to see which columns are available.
df.head()

Unnamed: 0,R_fighter,B_fighter,R_odds,B_odds,R_ev,B_ev,date,location,country,Winner,...,B_Featherweight_rank,B_Bantamweight_rank,B_Flyweight_rank,B_Pound-for-Pound_rank,better_rank,finish,finish_details,finish_round,finish_round_time,total_fight_time_secs
0,Robert Whittaker,Darren Till,-130,107,76.923077,107.0,7/25/2020,"Abu Dhabi, Abu Dhabi, United Arab Emirates",United Arab Emirates,Red,...,,,,,Red,U-DEC,,5.0,5:00,1500.0
1,Mauricio Rua,Rogerio Nogueira,-190,150,52.631579,150.0,7/25/2020,"Abu Dhabi, Abu Dhabi, United Arab Emirates",United Arab Emirates,Red,...,,,,,Red,S-DEC,,3.0,5:00,900.0
2,Fabricio Werdum,Alexander Gustafsson,260,-335,260.0,29.850746,7/25/2020,"Abu Dhabi, Abu Dhabi, United Arab Emirates",United Arab Emirates,Red,...,,,,,Blue,SUB,Armbar,1.0,2:30,150.0
3,Carla Esparza,Marina Rodriguez,145,-182,145.0,54.945055,7/25/2020,"Abu Dhabi, Abu Dhabi, United Arab Emirates",United Arab Emirates,Red,...,,,,,Red,S-DEC,,3.0,5:00,900.0
4,Paul Craig,Gadzhimurad Antigulov,-137,110,72.992701,110.0,7/25/2020,"Abu Dhabi, Abu Dhabi, United Arab Emirates",United Arab Emirates,Red,...,,,,,neither,SUB,Triangle Choke,1.0,2:06,126.0


I'm going to start by taking a closer look at the data. First, let's separate the numeric and categorical variables and check the number of null values.
The B_odds and R_odds values are strings, so I will convert them to int.

In [182]:
# Cast both betting-odds columns to integers in a single pass.
df = df.astype({'B_odds': int, 'R_odds': int})

In [183]:
# Split the column names by dtype: float/int columns are treated as numeric
# features, object columns as categorical ones.
num_vars = df.select_dtypes(['float', 'int']).columns
cat_vars = df.select_dtypes('object').columns

In [184]:
# Count the missing values in each numeric column.
df[num_vars].isna().sum()

R_odds                             0
B_odds                             0
R_ev                               0
B_ev                               0
B_avg_SIG_STR_landed             930
B_avg_SIG_STR_pct                765
B_avg_SUB_ATT                    832
B_avg_TD_landed                  833
B_avg_TD_pct                     842
B_Height_cms                       0
B_Reach_cms                        0
R_avg_SIG_STR_landed             455
R_avg_SIG_STR_pct                357
R_avg_SUB_ATT                    357
R_avg_TD_landed                  357
R_avg_TD_pct                     367
R_Height_cms                       0
R_Reach_cms                        0
height_dif                         0
reach_dif                          0
sig_str_dif                        0
avg_sub_att_dif                    0
avg_td_dif                         0
B_match_weightclass_rank        3551
R_match_weightclass_rank        3145
R_Women's Flyweight_rank        4271
R_Women's Featherweight_rank    4301
R

Some fighters used to fight in weight classes other than their current one, so their former ranks are available as well. To keep things simple, I will drop those columns and keep only the rank in the weight class of the fight.

In [185]:
# Drop the per-division ranking columns (e.g. "R_Women's Flyweight_rank") and keep only
# the rank in the weight class of the fight itself (*_match_weightclass_rank).
# Selecting by name rather than by position keeps this cell correct if the set or order
# of numeric columns changes, and makes it safe to re-run.
num_vars = num_vars[
    ~(num_vars.str.endswith('_rank') & ~num_vars.str.endswith('match_weightclass_rank'))
]

In [186]:
# Display the currently selected numeric columns.
df[num_vars]

Unnamed: 0,R_odds,B_odds,R_ev,B_ev,B_avg_SIG_STR_landed,B_avg_SIG_STR_pct,B_avg_SUB_ATT,B_avg_TD_landed,B_avg_TD_pct,B_Height_cms,...,R_Reach_cms,height_dif,reach_dif,sig_str_dif,avg_sub_att_dif,avg_td_dif,B_match_weightclass_rank,R_match_weightclass_rank,finish_round,total_fight_time_secs
0,-130,107,76.923077,107.000000,2.410000,0.46,0.000000,0.660000,0.41,182.88,...,185.42,0.00,2.54,-2.360000,0.000000,0.300000,5.0,1.0,5.0,1500.0
1,-190,150,52.631579,150.000000,2.390000,0.34,0.800000,0.670000,0.44,187.96,...,193.04,2.54,-2.54,-1.240000,0.000000,-1.500000,15.0,,3.0,900.0
2,260,-335,260.000000,29.850746,4.020000,0.40,0.400000,1.560000,0.39,195.58,...,195.58,2.54,5.08,0.410000,-0.700000,-0.150000,7.0,14.0,1.0,150.0
3,145,-182,145.000000,54.945055,5.600000,0.50,0.200000,0.480000,0.33,167.64,...,160.02,12.70,5.08,3.300000,-0.100000,-2.800000,9.0,7.0,3.0,900.0
4,-137,110,72.992701,110.000000,2.320000,0.51,3.500000,8.700000,0.41,180.34,...,193.04,-10.16,-15.24,-0.010000,2.000000,7.050000,,,1.0,126.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4302,-155,135,64.516129,135.000000,,,,,,177.80,...,177.80,0.00,2.54,-13.666667,0.000000,0.000000,,,1.0,44.0
4303,-210,175,47.619048,175.000000,,,,,,177.80,...,180.34,7.62,7.62,-18.000000,-1.000000,-4.666667,,,1.0,121.0
4304,-260,220,38.461538,220.000000,8.000000,0.34,1.000000,1.000000,1.00,190.50,...,198.12,-2.54,-2.12,-4.000000,1.000000,1.000000,,,1.0,47.0
4305,-420,335,23.809524,335.000000,,,,,,182.88,...,177.80,10.16,7.62,-40.500000,0.000000,-3.500000,,,3.0,900.0


The match weight-class rank is also not available for all fighters. Only the top 15 are ranked, and the lower the rank, the better. We cannot replace NaN values with 0 here, so I will set them to a larger value (20) and also add a new column indicating whether a fighter belongs to the top 15.

In [187]:
# Missing match ranks are replaced by the placeholder value 20.
df = df.fillna({'B_match_weightclass_rank': 20, 'R_match_weightclass_rank': 20})

In [188]:
# Flag fighters that hold a ranking: a rank below the placeholder 20 yields 1, anything
# else (including a missing value, which compares False) yields 0.
df['B_is_in_top15'] = df['B_match_weightclass_rank'].lt(20).astype('int64')
df['R_is_in_top15'] = df['R_match_weightclass_rank'].lt(20).astype('int64')
# Register the flags as numeric features. Names that are already present are skipped so
# that re-running this cell does not add duplicate entries to num_vars.
num_vars = num_vars.append(
    pd.Index([col for col in ('B_is_in_top15', 'R_is_in_top15') if col not in num_vars])
)

Some rows have "finish_round" and "total_fight_time_secs" empty, which is weird. These rows also do not have any details about how match finished. I guess I will put zeroes here.

What is left with NaNs is the submission and takedown data, which do not occur in all of the fights. It seems legitimate to replace missing values with 0 here.

In [189]:
# Replace every remaining missing value in the numeric feature columns with 0.
df = df.fillna(dict.fromkeys(num_vars, 0))

In [190]:
# Count the missing values in each categorical column.
df[cat_vars].isna().sum()

R_fighter               0
B_fighter               0
date                    0
location                0
country                 0
Winner                  0
weight_class            0
gender                  0
B_Stance                0
R_Stance                0
better_rank             0
finish                225
finish_details       2311
finish_round_time     225
dtype: int64

Same 225 missing values in "finish" and "finish_round_time". "finish_details" missing values seem to be quite legit. Most of the fights end with decision, and there is nothing more to say about it. 

I will fill missing values with the constant string 'undefined' for "finish" and "finish_details". Bearing in mind that there might be something special about these 225 fights, I will create a new column indicating this.

"finish_round_time" intuitively does not seem to be categorical. I want to convert it to a single number (seconds) and use it as a numeric variable.

In [191]:
# Missing finish information is replaced by the sentinel string 'undefined';
# is_missing_finish is 1 exactly where "finish" holds that sentinel.
df = df.fillna({'finish': 'undefined', 'finish_details': 'undefined'})
df['is_missing_finish'] = df['finish'].eq('undefined').astype('int64')

In [192]:
def min_to_sec(s):
    """Convert a ``'minutes:seconds'`` clock string into a number of seconds.

    Parameters
    ----------
    s : str, float or None
        Clock value such as ``'2:06'``. Missing values (``None``, NaN or a
        blank string) are treated as zero seconds, which matches the
        ``'0:0'`` placeholder this notebook uses for unknown finish times.

    Returns
    -------
    int
        ``minutes * 60 + seconds``.

    Raises
    ------
    ValueError
        If a part of a non-missing value is not an integer.
    IndexError
        If a non-missing value contains no ``':'`` separator.
    """
    # NaN is the only float that is not equal to itself.
    if s is None or (isinstance(s, float) and s != s):
        return 0
    text = str(s).strip()
    if not text:
        return 0
    parts = text.split(':')
    return int(parts[0]) * 60 + int(parts[1])

# Rows without a recorded finish time get the placeholder '0:0'; every clock string is
# then converted to seconds and stored in a new column.
df = df.fillna({'finish_round_time': '0:0'})
df['finish_round_seconds'] = df['finish_round_time'].map(min_to_sec)

In [193]:
# Register the engineered columns as numeric features. Names that are already present
# are skipped so that re-running this cell does not add duplicate entries.
num_vars = num_vars.append(
    pd.Index([col for col in ('is_missing_finish', 'finish_round_seconds') if col not in num_vars])
)

Time to handle the categorical variables. I am not going to remove anything from the original dataset, but rather continue building the two Index objects (num_vars and cat_vars) which I will use later during model building. 

It's pretty clear that the fighters' names hardly influence the results, so I will not include them. The same applies to the city where the fight is held. I will leave the country in place just in case.

I also want to convert the "date" column to datetime format for possible future use during EDA or some auxiliary manipulations. This column will not participate in model building.

In [194]:
# Parse the month/day/year strings (e.g. '7/25/2020') into datetimes. The explicit
# format fully determines the parsing, so no yearfirst/dayfirst hints are passed.
df['date'] = pd.to_datetime(df['date'], format='%m/%d/%Y')

"gender" can clearly be a boolean variable (1 for male, 0 otherwise). I will also turn 'Winner' and 'better_rank' into boolean variables: 1 means Blue, 0 means anything else (Red, or 'neither' for 'better_rank').

In [208]:
# Binary encodings: 1 where gender is 'MALE', 1 where Blue won, 1 where Blue holds the
# better rank; every other value (e.g. 'Red' or 'neither') becomes 0.
df['gender_bool'] = df['gender'].eq('MALE').astype('int64')
df['Winner_bool'] = df['Winner'].eq('Blue').astype('int64')
df['better_rank_bool'] = df['better_rank'].eq('Blue').astype('int64')

In [199]:
# Restrict the categorical feature list to these six columns.
cat_vars = pd.Index([
    'country',
    'weight_class',
    'B_Stance',
    'R_Stance',
    'finish',
    'finish_details',
])

In [200]:
# Register the boolean encodings as numeric features. Names that are already present
# are skipped so that re-running this cell does not add duplicate entries.
num_vars = num_vars.append(
    pd.Index([col for col in ('gender_bool', 'better_rank_bool') if col not in num_vars])
)

In [201]:
# Preview the first rows of the selected categorical columns.
df[cat_vars].head()

Unnamed: 0,country,weight_class,B_Stance,R_Stance,finish,finish_details
0,United Arab Emirates,Middleweight,Southpaw,Orthodox,U-DEC,undefined
1,United Arab Emirates,Light Heavyweight,Southpaw,Orthodox,S-DEC,undefined
2,United Arab Emirates,Heavyweight,Orthodox,Orthodox,SUB,Armbar
3,United Arab Emirates,Women's Strawweight,Orthodox,Orthodox,S-DEC,undefined
4,United Arab Emirates,Light Heavyweight,Orthodox,Orthodox,SUB,Triangle Choke


In [202]:
# Print each categorical column name followed by the frequency of its values.
for ind in cat_vars:
    print(ind, df[ind].value_counts(), sep='\n')

country
 USA                     2450
 Brazil                   400
 Canada                   337
 United Kingdom           165
 Australia                160
USA                       100
 Sweden                    72
 Mexico                    70
 China                     61
 Germany                   54
 Japan                     53
United Arab Emirates       51
 Singapore                 45
 Russia                    36
 New Zealand               33
 United Arab Emirates      29
 Netherlands               25
 South Korea               24
 Poland                    23
 Ireland                   19
 Uruguay                   13
 Chile                     13
 Czech Republic            13
 Croatia                   13
 Denmark                   13
 Philippines               12
 Argentina                 12
Brazil                     11
Name: country, dtype: int64
weight_class
Lightweight              789
Welterweight             759
Middleweight             516
Featherweight           

Some country names have leading whitespace, need to clean this up.

In [205]:
# Remove surrounding whitespace so that e.g. ' USA' and 'USA' collapse into one category.
df['country'] = df['country'].astype(str).str.strip()

All columns seem to have reasonably low cardinality, so we can apply One Hot encoding (or dummy variables) when we'll build the model.

## IT'S TIME  (in Bruce Buffer's voice)##

I will start my analysis trying to answer the following question: what influences fight result the most?

Do I have to look for something complicated or is there a simple winning strategy?

The main idea of this whole analysis is to see whether I can bet on UFC fight results with some profit. What if I always stick to the same strategy? Say, always bet that the fighter with more past wins will win?