In [159]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import GradientBoostingClassifier, RandomForestClassifier
from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import OneHotEncoder, StandardScaler, MinMaxScaler
import seaborn as sns
%matplotlib inline

import warnings
warnings.filterwarnings('ignore')

In [160]:
df = pd.read_csv('./data/ufc-master.csv')

In [161]:
df.shape

(4307, 113)

In [162]:
df.head()

Unnamed: 0,R_fighter,B_fighter,R_odds,B_odds,R_ev,B_ev,date,location,country,Winner,...,B_Featherweight_rank,B_Bantamweight_rank,B_Flyweight_rank,B_Pound-for-Pound_rank,better_rank,finish,finish_details,finish_round,finish_round_time,total_fight_time_secs
0,Robert Whittaker,Darren Till,-130,107,76.923077,107.0,7/25/2020,"Abu Dhabi, Abu Dhabi, United Arab Emirates",United Arab Emirates,Red,...,,,,,Red,U-DEC,,5.0,5:00,1500.0
1,Mauricio Rua,Rogerio Nogueira,-190,150,52.631579,150.0,7/25/2020,"Abu Dhabi, Abu Dhabi, United Arab Emirates",United Arab Emirates,Red,...,,,,,Red,S-DEC,,3.0,5:00,900.0
2,Fabricio Werdum,Alexander Gustafsson,260,-335,260.0,29.850746,7/25/2020,"Abu Dhabi, Abu Dhabi, United Arab Emirates",United Arab Emirates,Red,...,,,,,Blue,SUB,Armbar,1.0,2:30,150.0
3,Carla Esparza,Marina Rodriguez,145,-182,145.0,54.945055,7/25/2020,"Abu Dhabi, Abu Dhabi, United Arab Emirates",United Arab Emirates,Red,...,,,,,Red,S-DEC,,3.0,5:00,900.0
4,Paul Craig,Gadzhimurad Antigulov,-137,110,72.992701,110.0,7/25/2020,"Abu Dhabi, Abu Dhabi, United Arab Emirates",United Arab Emirates,Red,...,,,,,neither,SUB,Triangle Choke,1.0,2:06,126.0


I'll start by taking a closer look at the data. First, let's separate the numeric and categorical variables and check the number of null values.
The B_odds and R_odds values are strings, so I will convert them to int.

In [163]:
# Cast the odds columns of both corners to integers.
odds_columns = ['B_odds', 'R_odds']
df[odds_columns] = df[odds_columns].astype(int)

In [164]:
# Split the column names into a numeric group and a categorical (object) group.
# NOTE(review): which integer widths the 'int' selector matches may depend on
# the platform / pandas version — confirm the resulting column set is the
# intended one (include='number' would select every numeric column).
numeric_frame = df.select_dtypes(include=['float', 'int'])
categorical_frame = df.select_dtypes(include=['object'])
num_vars = numeric_frame.columns
cat_vars = categorical_frame.columns

In [165]:
df[num_vars].isna().sum()

R_odds                             0
B_odds                             0
R_ev                               0
B_ev                               0
B_avg_SIG_STR_landed             930
B_avg_SIG_STR_pct                765
B_avg_SUB_ATT                    832
B_avg_TD_landed                  833
B_avg_TD_pct                     842
B_Height_cms                       0
B_Reach_cms                        0
R_avg_SIG_STR_landed             455
R_avg_SIG_STR_pct                357
R_avg_SUB_ATT                    357
R_avg_TD_landed                  357
R_avg_TD_pct                     367
R_Height_cms                       0
R_Reach_cms                        0
height_dif                         0
reach_dif                          0
sig_str_dif                        0
avg_sub_att_dif                    0
avg_td_dif                         0
B_match_weightclass_rank        3551
R_match_weightclass_rank        3145
R_Women's Flyweight_rank        4271
R_Women's Featherweight_rank    4301
R

Some fighters used to fight in different weights than the current one, thus those also have their former ranks available. In order to keep it simple, I will drop those columns and only keep fight weight rank.

In [166]:
# Drop the per-division "<corner>_<division>_rank" columns and keep only the
# rank in the bout's own weight class. Selecting by name instead of by the
# hard-coded positions 25..50 keeps the cell correct if the column order
# changes and makes it safe to run more than once.
num_vars = pd.Index([
    col for col in num_vars
    if not col.endswith('_rank') or col.endswith('_match_weightclass_rank')
])

In [167]:
df[num_vars]

Unnamed: 0,R_odds,B_odds,R_ev,B_ev,B_avg_SIG_STR_landed,B_avg_SIG_STR_pct,B_avg_SUB_ATT,B_avg_TD_landed,B_avg_TD_pct,B_Height_cms,...,R_Reach_cms,height_dif,reach_dif,sig_str_dif,avg_sub_att_dif,avg_td_dif,B_match_weightclass_rank,R_match_weightclass_rank,finish_round,total_fight_time_secs
0,-130,107,76.923077,107.000000,2.410000,0.46,0.000000,0.660000,0.41,182.88,...,185.42,0.00,2.54,-2.360000,0.000000,0.300000,5.0,1.0,5.0,1500.0
1,-190,150,52.631579,150.000000,2.390000,0.34,0.800000,0.670000,0.44,187.96,...,193.04,2.54,-2.54,-1.240000,0.000000,-1.500000,15.0,,3.0,900.0
2,260,-335,260.000000,29.850746,4.020000,0.40,0.400000,1.560000,0.39,195.58,...,195.58,2.54,5.08,0.410000,-0.700000,-0.150000,7.0,14.0,1.0,150.0
3,145,-182,145.000000,54.945055,5.600000,0.50,0.200000,0.480000,0.33,167.64,...,160.02,12.70,5.08,3.300000,-0.100000,-2.800000,9.0,7.0,3.0,900.0
4,-137,110,72.992701,110.000000,2.320000,0.51,3.500000,8.700000,0.41,180.34,...,193.04,-10.16,-15.24,-0.010000,2.000000,7.050000,,,1.0,126.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4302,-155,135,64.516129,135.000000,,,,,,177.80,...,177.80,0.00,2.54,-13.666667,0.000000,0.000000,,,1.0,44.0
4303,-210,175,47.619048,175.000000,,,,,,177.80,...,180.34,7.62,7.62,-18.000000,-1.000000,-4.666667,,,1.0,121.0
4304,-260,220,38.461538,220.000000,8.000000,0.34,1.000000,1.000000,1.00,190.50,...,198.12,-2.54,-2.12,-4.000000,1.000000,1.000000,,,1.0,47.0
4305,-420,335,23.809524,335.000000,,,,,,182.88,...,177.80,10.16,7.62,-40.500000,0.000000,-3.500000,,,3.0,900.0


Match weight-class rank is also not available for all fighters: only the top 15 are ranked, and the lower the rank, the better. We cannot replace NaN with 0 here, because 0 would look better than any real rank, so I will set it to a larger value (20) and also add a new column indicating whether a fighter belongs to the top 15.

In [168]:
# Fighters without a rank get the placeholder rank 20.
rank_columns = ['B_match_weightclass_rank', 'R_match_weightclass_rank']
df[rank_columns] = df[rank_columns].fillna(20)

In [169]:
# Flag is 1 when the stored rank is below the placeholder value 20, else 0.
df['B_is_in_top15'] = df['B_match_weightclass_rank'].lt(20).astype('int64')
df['R_is_in_top15'] = df['R_match_weightclass_rank'].lt(20).astype('int64')

Some rows have "finish_round" and "total_fight_time_secs" empty, which is weird. These rows also do not have any details about how match finished. I guess I will put zeroes here.

What we have left with NaNs are the submission and takedown stats, which do not occur in all of the fights. It seems reasonable to replace the missing values with 0 here.

In [170]:
# Every remaining gap in the selected numeric columns becomes 0.
numeric_filled = df[num_vars].fillna(0)
df[num_vars] = numeric_filled

In [171]:
df[cat_vars].isna().sum()

R_fighter               0
B_fighter               0
date                    0
location                0
country                 0
Winner                  0
weight_class            0
gender                  0
B_Stance                0
R_Stance                0
better_rank             0
finish                225
finish_details       2311
finish_round_time     225
dtype: int64

The same number of values (225) is missing in "finish" and "finish_round_time". The missing "finish_details" values seem legitimate: most fights end by decision, and there is nothing more to say about them. 
I will fill the missing values with the constant string 'undefined' for "finish" and "finish_details". 

"finish_round_time" intuitively does not belong with the categorical variables. I want to convert it to a single number (seconds) and use it as a numeric variable.

In [172]:
# Missing finish descriptions are replaced by the constant 'undefined'.
finish_text_columns = ['finish', 'finish_details']
df[finish_text_columns] = df[finish_text_columns].fillna('undefined')

In [173]:
def min_to_sec(s):
    """Convert a 'minutes:seconds' value into a total number of seconds.

    The argument is turned into a string and split on ':'; the first piece is
    read as whole minutes and the second piece as whole seconds.
    """
    pieces = str(s).split(':')
    minutes = int(pieces[0])
    seconds = int(pieces[1])
    return seconds + 60 * minutes

# Rows without a recorded time get '0:0', which converts to 0 seconds.
round_time_text = df['finish_round_time'].fillna('0:0')
df['finish_round_time'] = round_time_text
df['finish_round_seconds'] = round_time_text.map(min_to_sec)

Time to handle the categorical variables. I am not going to remove anything from the original dataset, but will rather continue building the two Index objects (num_vars and cat_vars) that I will use later during model building. 

It's pretty clear that the fighters' names hardly influence the results, so I will not include them. The same applies to the location where the fight is held. 

I also want to convert the "date" column to datetime format for possible future use during EDA or some auxiliary manipulations. This column will not participate in model building.

In [174]:
# Dates are written month/day/year (e.g. 7/25/2020). The explicit format
# already fixes the field order, so the conflicting `yearfirst=True` hint
# has been removed.
df['date'] = pd.to_datetime(df['date'], format='%m/%d/%Y')

"gender" clearly can be boolean variable. I will also turn 'Winner' and 'better_rank' into boolean. 1 will be Red, 0 - Blue.

In [175]:
# Encode three text columns as 0/1 flags:
#   gender_bool      -> 1 where gender is 'MALE', 0 otherwise
#   winner_bool      -> 1 where Winner is 'Blue', 0 otherwise
#   better_rank_bool -> 1 where better_rank is 'Blue', 0 otherwise
df['gender_bool'] = df['gender'].eq('MALE').astype('int64')
df['winner_bool'] = df['Winner'].eq('Blue').astype('int64')
df['better_rank_bool'] = df['better_rank'].eq('Blue').astype('int64')

In [227]:
# Categorical columns inspected in the following cells.
# NOTE(review): the saved outputs of the next two cells still list a
# 'country' column that is not part of this index — re-run them to refresh.
cat_vars = pd.Index([
    'weight_class',
    'B_Stance',
    'R_Stance',
    'finish',
    'finish_details',
])

In [177]:
# Numeric columns, assembled group by group (the resulting order matters
# only for display and coefficient positions).
odds_cols = ['B_odds', 'R_odds', 'B_ev', 'R_ev']
blue_stat_cols = [
    'B_avg_SIG_STR_landed', 'B_avg_SIG_STR_pct', 'B_avg_SUB_ATT',
    'B_avg_TD_landed', 'B_avg_TD_pct', 'B_Height_cms', 'B_Reach_cms',
]
red_stat_cols = [
    'R_avg_SIG_STR_landed', 'R_avg_SIG_STR_pct', 'R_avg_SUB_ATT',
    'R_avg_TD_landed', 'R_avg_TD_pct', 'R_Height_cms', 'R_Reach_cms',
]
diff_cols = ['height_dif', 'reach_dif', 'sig_str_dif', 'avg_sub_att_dif', 'avg_td_dif']
rank_cols = ['B_match_weightclass_rank', 'R_match_weightclass_rank']
fight_length_cols = ['finish_round', 'total_fight_time_secs']
engineered_cols = [
    'B_is_in_top15', 'R_is_in_top15', 'finish_round_seconds',
    'gender_bool', 'better_rank_bool',
]
num_vars = pd.Index(
    odds_cols + blue_stat_cols + red_stat_cols + diff_cols
    + rank_cols + fight_length_cols + engineered_cols
)

In [178]:
df[cat_vars].head()

Unnamed: 0,country,weight_class,B_Stance,R_Stance,finish,finish_details
0,United Arab Emirates,Middleweight,Southpaw,Orthodox,U-DEC,undefined
1,United Arab Emirates,Light Heavyweight,Southpaw,Orthodox,S-DEC,undefined
2,United Arab Emirates,Heavyweight,Orthodox,Orthodox,SUB,Armbar
3,United Arab Emirates,Women's Strawweight,Orthodox,Orthodox,S-DEC,undefined
4,United Arab Emirates,Light Heavyweight,Orthodox,Orthodox,SUB,Triangle Choke


In [179]:
# Print each categorical column's name followed by its value frequencies.
for column_name in cat_vars:
    print(column_name, df[column_name].value_counts(), sep='\n')

country
 USA                     2450
 Brazil                   400
 Canada                   337
 United Kingdom           165
 Australia                160
USA                       100
 Sweden                    72
 Mexico                    70
 China                     61
 Germany                   54
 Japan                     53
United Arab Emirates       51
 Singapore                 45
 Russia                    36
 New Zealand               33
 United Arab Emirates      29
 Netherlands               25
 South Korea               24
 Poland                    23
 Ireland                   19
 Czech Republic            13
 Denmark                   13
 Chile                     13
 Croatia                   13
 Uruguay                   13
 Philippines               12
 Argentina                 12
Brazil                     11
Name: country, dtype: int64
weight_class
Lightweight              789
Welterweight             759
Middleweight             516
Featherweight           

Some country names have leading whitespace, need to clean this up.

In [180]:
# Stringify every country value and trim surrounding whitespace.
df['country'] = [str(name).strip() for name in df['country']]

All columns seem to have reasonably low cardinality, so we can apply One Hot encoding (or dummy variables) when we'll build the model.

## IT'S TIME  (in Bruce Buffer's voice)##

I will start my analysis trying to answer the following question: what influences fight result the most?

Do I have to look for something complicated or is there a simple winning strategy?

The main idea of this whole analysis is to see whether I can bet on UFC fight results with some profit. What if I always keep the same strategy? Say, always bet that the fighter with more past wins will win?

But what does affect the match result in the end? Let's build the model!

The task can be treated as binary classification. Each row describes both fighters through fixed column prefixes: B_ for the fighter in the blue corner and R_ for the fighter in the red corner. The target variable 'winner_bool' is 1 if Blue wins and 0 if Red wins. Thus the B_ columns can be treated as 'fighter data' and the R_ columns as 'opponent data', and we want to know whether the fighter wins or not.

Some of the variables should be removed before building the model. If we want to predict the result of a match, we cannot use information about whether the match finished by decision or submission, or in which round it finished. Numeric values should also be scaled to a common range, and categorical values should be encoded.

In [265]:
# Feature lists used for modelling: the finish / fight-length columns are
# left out here.
cat_vars = pd.Index(['weight_class', 'B_Stance', 'R_Stance'])

odds_cols = ['B_odds', 'R_odds', 'B_ev', 'R_ev']
blue_stat_cols = [
    'B_avg_SIG_STR_landed', 'B_avg_SIG_STR_pct', 'B_avg_SUB_ATT',
    'B_avg_TD_landed', 'B_avg_TD_pct', 'B_Height_cms', 'B_Reach_cms',
]
red_stat_cols = [
    'R_avg_SIG_STR_landed', 'R_avg_SIG_STR_pct', 'R_avg_SUB_ATT',
    'R_avg_TD_landed', 'R_avg_TD_pct', 'R_Height_cms', 'R_Reach_cms',
]
diff_cols = ['height_dif', 'reach_dif', 'sig_str_dif', 'avg_sub_att_dif', 'avg_td_dif']
rank_cols = ['B_match_weightclass_rank', 'R_match_weightclass_rank']
engineered_cols = ['B_is_in_top15', 'R_is_in_top15', 'gender_bool', 'better_rank_bool']
num_vars = pd.Index(
    odds_cols + blue_stat_cols + red_stat_cols + diff_cols + rank_cols + engineered_cols
)

In [266]:
# Feature matrix: the numeric columns followed by the one-hot encoded
# categorical columns. The target is the 0/1 'winner_bool' column.
categorical_dummies = pd.get_dummies(df[cat_vars])
X = pd.concat([df[num_vars], categorical_dummies], axis=1)
y = df['winner_bool']

In [267]:
# Min-max scale every column of the feature matrix.
# NOTE(review): the scaler is fitted on all rows before cross-validation, so
# the validation folds influence the scaling — consider fitting it inside a
# Pipeline per fold instead.
scaler = MinMaxScaler()
scaler.fit(X)
X = scaler.transform(X)

We can now build simple Logistic Regression model. Instead of splitting into train and test sets I will use k-fold cross-validation strategy with 3 folds.

In [224]:
# Mean score of a default logistic regression over 3 cross-validation folds.
log_reg = LogisticRegression()
fold_scores = cross_val_score(log_reg, X, y, cv=3)
fold_scores.mean()

0.6473217318723128

This does not seem to give too good accuracy. Regression is not the only model we can build. I also want to try Random Forest and Gradient Boosting.

In [206]:
# Gradient boosting is randomized; fixing the seed makes the reported score
# reproducible across re-runs. 123 matches the seed used for the train/test
# split later in this notebook.
gb_clf = GradientBoostingClassifier(random_state=123)
cross_val_score(gb_clf, X, y, cv=3).mean()

0.6357099181815534

In [207]:
# Random forests are randomized; fixing the seed makes the reported score
# reproducible across re-runs. 123 matches the seed used for the train/test
# split later in this notebook.
rf_clf = RandomForestClassifier(random_state=123)
cross_val_score(rf_clf, X, y, cv=3).mean()

0.6466238971979852

I tried tuning some hyperparameters here, which brought no significant improvement, so I will stick to the default values. It looks like this is the maximum we can get without additional preprocessing and maybe feature engineering.

Even so, I want to check which columns make the most contribution.

In [268]:
# Rebuild the unscaled feature frame (X was turned into a scaled array
# above) so that column names are available for the coefficient table.
X = pd.get_dummies(df[cat_vars])
X = pd.concat([df[num_vars], X], axis=1)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=123)

# Take explicit copies before assigning columns: the frames returned by the
# split are derived from X, and writing into them otherwise raises a
# SettingWithCopyWarning (hidden here by the global warnings filter).
X_train = X_train.copy()
X_test = X_test.copy()

# Fit the scaler on the training rows only, then reuse it for the test rows.
scaler = MinMaxScaler()
X_train[num_vars] = scaler.fit_transform(X_train[num_vars])
X_test[num_vars] = scaler.transform(X_test[num_vars])

In [269]:
def get_coef(model_coef, df_columns):
    """Rank features by the magnitude of their fitted coefficients.

    The previous version ignored both arguments and read the globals
    ``log_reg`` and ``X_train``; the table is now built from the arguments.

    Parameters
    ----------
    model_coef : array-like
        Model coefficients. Either a flat sequence with one value per feature
        or a 2-D array such as ``LogisticRegression.coef_``, in which case the
        first row is used.
    df_columns : sequence of str
        Feature names, in the same order as the coefficients.

    Returns
    -------
    pandas.DataFrame
        Columns ``column``, ``coef`` and ``abs_coefs``, sorted by ``abs_coefs``
        in descending order; the index keeps each feature's original position.
    """
    coefs = np.asarray(model_coef)
    if coefs.ndim > 1:
        # e.g. coef_ of a binary classifier has shape (1, n_features)
        coefs = coefs[0]
    coef_df = pd.DataFrame({'column': list(df_columns), 'coef': coefs})
    coef_df['abs_coefs'] = np.abs(coefs)
    return coef_df.sort_values('abs_coefs', ascending=False)

In [270]:
# Fit a default logistic regression on the training split, report its
# accuracy on the held-out split and show the ten largest coefficients.
log_reg = LogisticRegression().fit(X_train, y_train)
y_pred = log_reg.predict(X_test)
test_accuracy = accuracy_score(y_test, y_pred)
print(test_accuracy)
coef_df = get_coef(log_reg.coef_, X_train.columns)
coef_df.head(10)

0.6635730858468677


Unnamed: 0,column,coef,abs_coefs
2,B_ev,-2.907721,2.907721
3,R_ev,1.961047,1.961047
1,R_odds,1.25315,1.25315
0,B_odds,-0.890719,0.890719
14,R_avg_TD_landed,-0.725293,0.725293
22,avg_td_dif,0.558659,0.558659
10,B_Reach_cms,0.532418,0.532418
6,B_avg_SUB_ATT,0.49843,0.49843
19,reach_dif,0.490708,0.490708
4,B_avg_SIG_STR_landed,0.462259,0.462259


What a surprise (no)! Most meaningful variables turn out to be the ones related to odds. 

Interestingly enough, if we set logistic regression to use the L1 penalty, which can drive coefficients to exactly zero and thereby eliminate features rather than just shrink them, it removes the "_odds" features.

In [273]:
# Same evaluation as before, but with an L1-penalised logistic regression
# (liblinear solver).
log_reg = LogisticRegression(penalty='l1', solver='liblinear').fit(X_train, y_train)
y_pred = log_reg.predict(X_test)
test_accuracy = accuracy_score(y_test, y_pred)
print(test_accuracy)
coef_df = get_coef(log_reg.coef_, X_train.columns)
coef_df.head(10)

0.662799690641918


Unnamed: 0,column,coef,abs_coefs
2,B_ev,-4.958829,4.958829
3,R_ev,2.544951,2.544951
14,R_avg_TD_landed,-0.857469,0.857469
6,B_avg_SUB_ATT,0.566767,0.566767
10,B_Reach_cms,0.509071,0.509071
4,B_avg_SIG_STR_landed,0.488276,0.488276
17,R_Reach_cms,-0.425189,0.425189
7,B_avg_TD_landed,0.417028,0.417028
40,weight_class_Women's Flyweight,-0.378045,0.378045
11,R_avg_SIG_STR_landed,0.297286,0.297286
