# UFC Fight Predictor

In [1]:
from   category_encoders          import *
import numpy as np
import pandas as pd
# Saving model
import pickle
import warnings
warnings.filterwarnings('ignore')

from   sklearn.compose            import *
from   sklearn.ensemble           import RandomForestClassifier, ExtraTreesClassifier
from   sklearn.metrics            import accuracy_score 
from   sklearn.pipeline           import Pipeline
from   sklearn.preprocessing      import *
from   sklearn.model_selection    import RandomizedSearchCV
from   sklearn.model_selection    import train_test_split
# Imputing
from   sklearn.impute             import KNNImputer
# Feature importance
from   sklearn.inspection         import permutation_importance

# Research Question / Hypothesis
### Research Question
- How accurately could we predict who wins a fight in the UFC given a variety of fighter features?
- What are the most important factors/features that lead to a fighter securing a win?

### Hypothesis
- The most important features should be the most obvious ones: a fighter's fight record, their age, and higher averages for strikes, submissions, and takedowns. I will run a feature-importance analysis (permutation importance) to assess this hypothesis.

## Exploratory Data Analysis

In [2]:
# Load the raw fight dataset. low_memory=False makes pandas infer every column's
# dtype from the whole file in a single pass instead of chunk by chunk, so the
# columns that mix ranks and fighter names get one consistent dtype.
df = pd.read_csv('ufc-master.csv', low_memory=False)
df.shape

(4566, 137)

#### Mixed Types for Columns (79, 80)

In [3]:
# Distinct raw values in the blue corner's rank column (shows the mixed content).
df['B_match_weightclass_rank'].unique()

array(['6', '4', nan, '13', 'Dustin Poirier', 'Dan Hooker', 'Jessica Eye',
       'Andrew Sanchez', 'Marina Rodriguez', 'Brad Tavares',
       'Julianna Pena', 'Khalil Rountree Jr.', 'Nik Lentz', 'Amir Albazi',
       '9', '3', '15', '12', '8', '1', '2', '10', '5', '14', '7', '11'],
      dtype=object)

In [4]:
# Distinct raw values in the red corner's rank column (shows the mixed content).
df['R_match_weightclass_rank'].unique()

array(['5', '2', '10', nan, 'Conor McGregor', 'Michael Chandler',
       'Joanne Calderwood', 'Makhmud Muradov', 'Amanda Ribas',
       'Antonio Carlos Junior', 'Sara McMann', 'Marcin Prachnio',
       'Movsar Evloev', 'Zhalgas Zhumagulov', '8', '9', '13', '1', '4',
       '15', '6', '0', '14', '7', '11', '12', '3'], dtype=object)

According to the docs, these columns hold the "Rank in the weightclass this bout takes place in". Some rows contain fighter names instead of ranks, so let's drop both columns for simplicity.

#### Label Column (Winner)

In [5]:
# Distinct label values present in the raw data.
df.loc[:, 'Winner'].unique()

array(['Blue', 'Red'], dtype=object)

## Data Preprocessing
- Remove all columns that contain information only available after the fight is over
- Remove features that are not relevant to prediction
- Remove codependent columns with redundant information

In [6]:
class HandPickedFeatureFilter:
    """Pipeline step that keeps only a fixed, hand-chosen set of columns.

    The selection is stored on the instance as ``picked_feats``: the two odds
    columns and ``gender``, then a blue/red (``B_``/``R_``) pair for each
    per-fighter statistic, and finally the ``'Winner'`` label as last entry.
    """

    def __init__(self):
        # Statistics that exist once per corner; each expands to 'B_<stat>', 'R_<stat>'.
        per_fighter_stats = (
            'avg_SIG_STR_landed', 'avg_SIG_STR_pct',
            'avg_SUB_ATT',
            'avg_TD_landed', 'avg_TD_pct',
            'losses', 'wins',
            'Stance',
            'Height_cms', 'Reach_cms',
            'age',
        )
        paired_cols = [f'{corner}_{stat}'
                       for stat in per_fighter_stats
                       for corner in ('B', 'R')]
        self.picked_feats = ['R_odds', 'B_odds', 'gender'] + paired_cols + ['Winner']

    def fit(self, X, y=None, **fit_params):
        """Nothing to learn; returns ``self`` so the step can be chained."""
        return self

    def transform(self, X, **transform_params):
        """Return a new frame holding only the picked columns found in ``X``.

        Columns are ordered as in ``picked_feats``; ``X`` itself is not modified.
        """
        return X.copy().filter(items=self.picked_feats)

class EncodeVars:
    """Pipeline step that encodes the label and tidies the stance text.

    * ``Winner``: ``'Blue'`` -> 0 and ``'Red'`` -> 1, ignoring case and
      surrounding whitespace. Every other value (missing values, ``'neither'``
      in any casing, non-string entries) becomes ``NaN``.
    * ``B_Stance`` and ``R_Stance``: surrounding whitespace is removed from
      string values; missing or non-string values are passed through unchanged.

    Columns that are not present in the input are skipped, so the step can also
    be applied to frames that carry no label.
    """

    # Lower-cased label -> numeric code.
    WINNER_CODES = {'blue': 0, 'red': 1}
    # Both corners are cleaned so the same stance never appears under two spellings.
    STANCE_COLS = ('B_Stance', 'R_Stance')

    def __init__(self):
        pass

    @classmethod
    def _encode_win(cls, x):
        """Map one raw ``Winner`` value to 0, 1 or ``NaN``."""
        # isinstance covers every flavour of missing value (np.nan, float('nan'),
        # None), which an identity check against np.nan does not.
        if not isinstance(x, str):
            return np.nan
        return cls.WINNER_CODES.get(x.strip().lower(), np.nan)

    @staticmethod
    def _strip_text(x):
        """Strip surrounding whitespace from strings; leave anything else as is."""
        return x.strip() if isinstance(x, str) else x

    def transform(self, X, **transform_params):
        """Return an encoded copy of ``X``; the input frame is not modified."""
        df_copy = X.copy()
        if 'Winner' in df_copy.columns:
            df_copy['Winner'] = df_copy['Winner'].apply(self._encode_win)
        for col in self.STANCE_COLS:
            if col in df_copy.columns:
                df_copy[col] = df_copy[col].apply(self._strip_text)
        return df_copy

    def fit(self, X, y=None, **fit_params):
        """Nothing to learn; returns ``self`` so the step can be chained."""
        return self

In [7]:
# Select the hand-picked columns and encode them in a single pass, then split
# the prepared frame into features and label by column name.
pipe = Pipeline([('hand_pick', HandPickedFeatureFilter()),
                 ('encode_vars', EncodeVars())])
prepared = pipe.transform(df)
# A bout whose winner did not encode to 0/1 has no usable label; a NaN target
# would make classifier fitting fail, so such rows are removed here.
prepared = prepared.dropna(subset=['Winner'])
X = prepared.drop(columns='Winner')
y = prepared['Winner'].astype('int64')

In [8]:
# Summarise the feature frame: per-column non-null counts and dtypes.
X.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4566 entries, 0 to 4565
Data columns (total 25 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   R_odds                4566 non-null   int64  
 1   B_odds                4566 non-null   int64  
 2   gender                4566 non-null   object 
 3   B_avg_SIG_STR_landed  3636 non-null   float64
 4   R_avg_SIG_STR_landed  4111 non-null   float64
 5   B_avg_SIG_STR_pct     3801 non-null   float64
 6   R_avg_SIG_STR_pct     4209 non-null   float64
 7   B_avg_SUB_ATT         3734 non-null   float64
 8   R_avg_SUB_ATT         4209 non-null   float64
 9   B_avg_TD_landed       3733 non-null   float64
 10  R_avg_TD_landed       4209 non-null   float64
 11  B_avg_TD_pct          3724 non-null   float64
 12  R_avg_TD_pct          4199 non-null   float64
 13  B_losses              4566 non-null   int64  
 14  R_losses              4566 non-null   int64  
 15  B_wins               

## Create Data Processing Pipeline

In [9]:
# Boolean mask indexed by X's column names: True where the column dtype is object.
categorical_cols = X.dtypes.eq(object)

In [10]:
def UFCPipe(clf):
    """Wrap a classifier in the notebook's preprocessing pipeline.

    Columns flagged by the module-level ``categorical_cols`` mask are one-hot
    encoded (categories unseen during fitting are ignored). All remaining
    columns are KNN-imputed with 14 neighbours and then standardised.

    Parameters
    ----------
    clf : estimator
        Installed as the final ``'clf'`` step of the returned pipeline.

    Returns
    -------
    Pipeline
        An unfitted pipeline with the steps ``'preprocessing'`` and ``'clf'``.
    """
    numeric_steps = [
        ('imputer', KNNImputer(n_neighbors=14, missing_values=np.nan)),
        ('scaler', StandardScaler()),
    ]
    categorical_steps = [
        ('encoder', OneHotEncoder(handle_unknown='ignore')),
    ]

    # Each entry is (name, transformer, column mask).
    column_routes = [
        ('categorical', Pipeline(categorical_steps), categorical_cols),
        ('continuous', Pipeline(numeric_steps), ~categorical_cols),
    ]

    return Pipeline([
        ('preprocessing', ColumnTransformer(column_routes)),
        ('clf', clf),
    ])

## Hyperparameter Tuning

#### Train-test Split

In [11]:
# Hold out 20% of the rows for the final evaluation; the fixed seed makes the
# shuffle repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=11,
)

In [12]:
# Shapes of the four split parts, in the order train/test features, train/test labels.
tuple(part.shape for part in (X_train, X_test, y_train, y_test))

((3652, 25), (914, 25), (3652,), (914,))

In [13]:
# Search space shared by both tree ensembles. The 'clf__' prefix targets the
# final 'clf' step of the pipeline built by UFCPipe.
tree_param_space = {
    'clf__n_estimators': range(100, 700),
    'clf__criterion': ['gini', 'entropy'],
    # An integer min_samples_split must be at least 2 in scikit-learn; 1 is rejected.
    'clf__min_samples_split': [2, 3, 4, 5],
    # This key used to be listed twice per dict; Python keeps only the last
    # value of a duplicated key, so [1, 2, 3, 4] is what the search actually used.
    'clf__min_samples_leaf': [1, 2, 3, 4],
    # "auto" was an alias of "sqrt" for these classifiers and is no longer
    # accepted by current scikit-learn, so it is dropped from the choices.
    'clf__max_features': ['sqrt', 'log2'],
}

# Candidate estimator -> its own copy of the search space.
candidates = {
    ExtraTreesClassifier(n_jobs=-1, random_state=11): dict(tree_param_space),
    RandomForestClassifier(n_jobs=-1, random_state=11): dict(tree_param_space),
}

In [14]:
# for candidate, hypers in candidates.items():
#     pipe = UFCPipe(candidate)
#     rscv = RandomizedSearchCV(
#         estimator=pipe,
#         param_distributions=hypers,
#         n_jobs=-1,
#         n_iter=150
#     )
    
#     rscv.fit(X_train, y_train)
    
#     best_params = {
#         k.replace('clf__', ''): v
#         for k, v in rscv.best_params_.items()   
#     }
    
#     print(f'{candidate} has best parameters:\n\n{best_params}\n\nWith best score {rscv.best_score_}')

In [15]:
# Hyperparameters used for the final random forest (presumably copied from the
# randomized search above - confirm when re-running it).
# max_features is spelled 'sqrt': the earlier 'auto' value meant 'sqrt' for
# classifiers and is no longer accepted by current scikit-learn.
best_hypers = {
    'n_estimators': 522,
    'min_samples_split': 2,
    'min_samples_leaf': 4,
    'max_features': 'sqrt',
    'criterion': 'gini',
}

## Test Model on Hold-out Data

In [16]:
# Fit the tuned random forest on the training split, then score it once on the
# held-out split.
pipe = UFCPipe(RandomForestClassifier(n_jobs=-1, random_state=11, **best_hypers))
y_pred = pipe.fit(X_train, y_train).predict(X_test)
acc_test = accuracy_score(y_test, y_pred)
print(f'Final testing accuracy {round(acc_test*100,2)}%')

Final testing accuracy 64.66%


## Feature Importance

#### Permutation Importance

In [17]:
# r = permutation_importance(pipe, X_test, y_test,
#                            n_repeats=30,
#                            random_state=0)

# for i in r.importances_mean.argsort()[::-1]:
#     print(f"{np.array(X.columns)[i]:<8}"
#           f"{r.importances_mean[i]:.3f}"
#           f" +/- {r.importances_std[i]:.3f}")

## Train Final Model with all Data and Save

In [18]:
# Rebuild the tuned pipeline and fit it on the full X / y (no hold-out) for the
# model that gets saved.
pipe = UFCPipe(RandomForestClassifier(n_jobs=-1, random_state=11, **best_hypers))
pipe.fit(X, y)
print('Fit')

Fit


In [19]:
# with open('model.pkl', 'wb') as f:
#     pickle.dump(pipe, f)