In [260]:
import pandas as pd
import numpy as np
import sklearn.feature_selection as sklearnfs
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
import matplotlib.pyplot as plt
from sklearn.preprocessing import StandardScaler
from sklearn.feature_selection import SelectKBest, f_classif
from sklearn.impute import SimpleImputer

In [261]:
data = pd.read_csv('./mma_data/ufc_historical_data.csv')

In [263]:
data.axes

[RangeIndex(start=0, stop=6012, step=1),
 Index(['Winner', 'B_avg_KD', 'B_avg_opp_KD', 'B_avg_SIG_STR_pct',
        'B_avg_opp_SIG_STR_pct', 'B_avg_TD_pct', 'B_avg_opp_TD_pct',
        'B_avg_SUB_ATT', 'B_avg_opp_SUB_ATT', 'B_avg_REV',
        ...
        'R_win_by_Decision_Unanimous', 'R_win_by_KO/TKO', 'R_win_by_Submission',
        'R_win_by_TKO_Doctor_Stoppage', 'R_Stance', 'R_Height_cms',
        'R_Reach_cms', 'R_Weight_lbs', 'B_age', 'R_age'],
       dtype='object', length=135)]

In [None]:
imputer = SimpleImputer(missing_values=np.nan, strategy='median')

In [262]:
data = data.drop(['R_fighter', 'B_fighter', 'Referee', 'date', 'location', 'title_bout', 'weight_class','B_draw','R_draw'], axis=1)

In [253]:
processed_data = pd.read_csv('./mma_data/ufc_historical_data_preprocessed.csv')

In [255]:
processed_data.info(verbose=True, show_counts=True)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5902 entries, 0 to 5901
Data columns (total 160 columns):
 #    Column                           Non-Null Count  Dtype  
---   ------                           --------------  -----  
 0    Winner                           5902 non-null   object 
 1    title_bout                       5902 non-null   bool   
 2    B_avg_KD                         5902 non-null   float64
 3    B_avg_opp_KD                     5902 non-null   float64
 4    B_avg_SIG_STR_pct                5902 non-null   float64
 5    B_avg_opp_SIG_STR_pct            5902 non-null   float64
 6    B_avg_TD_pct                     5902 non-null   float64
 7    B_avg_opp_TD_pct                 5902 non-null   float64
 8    B_avg_SUB_ATT                    5902 non-null   float64
 9    B_avg_opp_SUB_ATT                5902 non-null   float64
 10   B_avg_REV                        5902 non-null   float64
 11   B_avg_opp_REV                    5902 non-null   float64
 12   B_av

In [None]:
# check the balance of the data between classes
data['Winner'].value_counts()

In [None]:
data.info(verbose=True, show_counts=True)

In [191]:
# columns to remove: R_fighter (fighter name), B_fighter(fighter name), Referee (name), date, location
data = data.drop(['R_fighter', 'B_fighter', 'Referee', 'date', 'location', 'title_bout', 'weight_class','B_draw','R_draw'], axis=1)

In [192]:
# drop rows with missing values
data = data.dropna().copy()

In [193]:
# drop fights resulting in draw from dataset 
data = data[data.Winner != "Draw"]

In [194]:
data.info(verbose=True, show_counts=True)
data['Winner'].value_counts()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 3845 entries, 0 to 5884
Data columns (total 135 columns):
 #    Column                        Non-Null Count  Dtype  
---   ------                        --------------  -----  
 0    Winner                        3845 non-null   object 
 1    B_avg_KD                      3845 non-null   float64
 2    B_avg_opp_KD                  3845 non-null   float64
 3    B_avg_SIG_STR_pct             3845 non-null   float64
 4    B_avg_opp_SIG_STR_pct         3845 non-null   float64
 5    B_avg_TD_pct                  3845 non-null   float64
 6    B_avg_opp_TD_pct              3845 non-null   float64
 7    B_avg_SUB_ATT                 3845 non-null   float64
 8    B_avg_opp_SUB_ATT             3845 non-null   float64
 9    B_avg_REV                     3845 non-null   float64
 10   B_avg_opp_REV                 3845 non-null   float64
 11   B_avg_SIG_STR_att             3845 non-null   float64
 12   B_avg_SIG_STR_landed          3845 non-null   

Red     2411
Blue    1434
Name: Winner, dtype: int64

In [195]:
categorical_cols = ['B_Stance', 'R_Stance']
numeric_cols = list(set(data.columns) - set(categorical_cols) - {'Winner'})

In [None]:
data.info(verbose=True, show_counts=True)

In [None]:
# potentially take top x features based on anova score given
# given by sklearnfs.f_classif
fstat, pval = sklearn.feature_selection.f_classif(data[numeric_cols], data['Winner'])

In [196]:
data = pd.get_dummies(data, columns=categorical_cols)

In [None]:
data.info(verbose=True)

In [197]:
data_train, data_test = train_test_split(data, test_size=0.2, random_state=888, stratify=data['Winner'])

In [249]:
categorical_cols = ['B_Stance_Open Stance','B_Stance_Orthodox','B_Stance_Southpaw','B_Stance_Switch','R_Stance_Open Stance', 'R_Stance_Orthodox', 'R_Stance_Southpaw', 'R_Stance_Switch']
numeric_cols = list(set(data.columns) - set(categorical_cols) - {'Winner'})

selector = SelectKBest(f_classif, k=30)
data_train_20best = selector.fit_transform(data_train[numeric_cols], data_train['Winner'])
data_train_20best.shape




scaler = StandardScaler()
scaler.fit(data_train_20best)

def get_features_and_target_arrays(df, numeric_cols, cat_cols, scaler):
    X_numeric_scaled = scaler.transform(df[numeric_cols])
    X_categorical = df[cat_cols].to_numpy()
    X = np.hstack((X_categorical, X_numeric_scaled))
    y = df['Winner']
    return X, y

# X, y = get_features_and_target_arrays(data_train, numeric_cols, categorical_cols, scaler)

X_numeric_scaled = scaler.transform(data_train_20best)
X_categorical = data_train[categorical_cols]
X = np.hstack((X_categorical, X_numeric_scaled))
y = data_train['Winner']

In [250]:
clf = LogisticRegression(penalty='none', max_iter=10000)
clf.fit(X_numeric_scaled,y)

LogisticRegression(max_iter=10000, penalty='none')

In [251]:
# X_test, y_test = get_features_and_target_arrays(data_test, numeric_cols, categorical_cols, scaler)
X_test_num_20 = selector.transform(data_test[numeric_cols])

test_scaler = StandardScaler()
test_scaler.fit(X_test_num_20)
X_test_numeric_scaled = test_scaler.transform(X_test_num_20)
X_test_cat = data_test[categorical_cols]
X_test = np.hstack((X_test_cat, X_test_numeric_scaled))
y_test = data_test['Winner']

In [252]:
clf.score(X_test_numeric_scaled, y_test)

0.659297789336801

In [209]:
predictions = clf.predict(X_test)

In [210]:
from sklearn.metrics import plot_roc_curve, roc_curve, accuracy_score, classification_report, f1_score, multilabel_confusion_matrix, precision_recall_fscore_support, precision_score, recall_score, roc_auc_score, zero_one_loss

In [211]:
accuracy_score(y_test, predictions)

0.6436931079323797

In [212]:
print(classification_report(y_test, predictions))

              precision    recall  f1-score   support

        Blue       0.55      0.25      0.34       287
         Red       0.66      0.88      0.76       482

    accuracy                           0.64       769
   macro avg       0.61      0.56      0.55       769
weighted avg       0.62      0.64      0.60       769



In [213]:
multilabel_confusion_matrix(y_test, predictions)

array([[[423,  59],
        [215,  72]],

       [[ 72, 215],
        [ 59, 423]]])

In [214]:
roc_auc_score(y_test, clf.predict_proba(X_test)[:,1])

0.6596642907745023

In [215]:
from sklearn.ensemble import RandomForestClassifier

In [216]:
rf_clf = RandomForestClassifier(random_state=888)
rf_clf.fit(X,y)

RandomForestClassifier(random_state=888)

In [217]:
rf_clf.score(X_test, y_test)

0.6449934980494149

In [218]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.gaussian_process import GaussianProcessClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import SVC
from sklearn.gaussian_process.kernels import RBF

In [220]:
knn_clf = KNeighborsClassifier(n_neighbors=5)

In [221]:
knn_clf.fit(X,y)

KNeighborsClassifier()

In [222]:
knn_clf.score(X_test,y_test)

0.6293888166449935

In [223]:
gpc = GaussianProcessClassifier(kernel = 1.0 * RBF(1.0))

In [224]:
gpc.fit(X,y)

GaussianProcessClassifier(kernel=1**2 * RBF(length_scale=1))

In [225]:
gpc.score(X_test,y_test)

0.6605981794538361

In [226]:
gnb = GaussianNB()
gnb.fit(X,y)

GaussianNB()

In [227]:
gnb.score(X_test, y_test)

0.5799739921976593

In [228]:
from sklearn.neural_network import MLPClassifier

In [229]:
mlp = MLPClassifier(random_state=1, max_iter=10000).fit(X,y)

In [230]:
mlp.score(X_test, y_test)

0.5955786736020806