In [1]:
import pandas as pd
import numpy as np
import datetime
from sklearn.model_selection import train_test_split, StratifiedKFold, GridSearchCV, cross_val_score, RandomizedSearchCV
from sklearn.preprocessing import PolynomialFeatures, StandardScaler, OneHotEncoder
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import RandomForestClassifier, StackingClassifier
from xgboost import XGBClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, confusion_matrix, roc_auc_score, f1_score
from sklearn.utils import resample
from scipy.stats import randint, uniform
from sklearn.ensemble import VotingClassifier
from sklearn.calibration import CalibratedClassifierCV

RANDOM_STATE = 42

In [2]:
DATA_PATH = 'Fight9Data.csv'
df = pd.read_csv(DATA_PATH)

print(f"Dataset shape: {df.shape}")
display(df.head())

print(df.describe(include='all'))

print(df.isnull().sum())

Dataset shape: (41, 41)


Unnamed: 0,fighter1,fighter2,fight_year,fighter1_wins,fighter2_wins,fighter1_losses,fighter2_losses,fighter1_last5,fighter2_last5,fighter1_SLpM,...,fighter1_height,fighter2_height,fighter1_reach,fighter2_reach,fighter1_subs,fighter2_subs,weight_class,round_finished,fight_rounds,outcome
0,Jared Cannonier,Caio Borralho,2024,17,16,7,1,11.0,19,4.49,...,1.8,1.88,77,75,0.117647,0.235294,185,6,5,0
1,Tony Ferguson,Michael Chiesa,2024,26,18,10,7,0.0,4,4.91,...,1.8,1.85,76,75,0.307692,0.631579,170,1,3,0
2,Leon Edwards,Belal Muhammad,2024,22,23,3,3,6.0,13,2.68,...,1.83,1.78,74,72,0.136364,0.041667,170,6,5,0
3,Dricus Du Plessis,Israel Adesanya,2024,21,24,2,3,18.0,7,6.18,...,1.83,1.93,76,80,0.5,0.0,185,4,5,1
4,Islam Makhachev,Dustin Poirier,2024,25,30,1,8,24.0,15,2.65,...,1.78,1.75,70,72,0.461538,0.233333,155,5,5,1


                fighter1           fighter2  fight_year  fighter1_wins  \
count                 41                 41        41.0      41.000000   
unique                39                 36         NaN            NaN   
top     Charles Oliveira  Aljamain Sterling         NaN            NaN   
freq                   2                  2         NaN            NaN   
mean                 NaN                NaN      2024.0      20.243902   
std                  NaN                NaN         0.0       6.696195   
min                  NaN                NaN      2024.0       6.000000   
25%                  NaN                NaN      2024.0      16.000000   
50%                  NaN                NaN      2024.0      18.000000   
75%                  NaN                NaN      2024.0      25.000000   
max                  NaN                NaN      2024.0      34.000000   

        fighter2_wins  fighter1_losses  fighter2_losses  fighter1_last5  \
count       41.000000        41.0000

In [11]:
# Data Preprocessing
X = df.drop(columns=['fighter1', 'fighter2', 'fight_year', 'outcome'])
y = df['outcome']

# Split into train/validation -0.25 hioldout
X_train, X_val, y_train, y_val = train_test_split(
    X, y,
    test_size=0.25,
    stratify=y,
    random_state=RANDOM_STATE
)

numeric_features = X_train.select_dtypes(include=['int64', 'float64']).columns.tolist()
categorical_features = ['weight_class']

# preprocessing transformers
numeric_transformer = Pipeline([
    ('imputer', SimpleImputer(strategy='mean')),
    # ('poly', PolynomialFeatures(degree=2, include_bias=False)),  # remove this
    ('scaler', StandardScaler())
])

categorical_transformer = Pipeline([
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown='ignore'))
])

# Combine into a ColumnTransformer
preprocessor = ColumnTransformer([
    ('num', numeric_transformer, numeric_features),
    ('cat', categorical_transformer, categorical_features)
])

# Baseline logistic regression pipeline
grid_pipe = Pipeline([
    ('preprocessor', preprocessor),
    ('classifier', LogisticRegression(random_state=RANDOM_STATE, max_iter=1000))
])

# Evaluate baseline
grid_pipe.fit(X_train, y_train)
print(f"Baseline validation accuracy: {grid_pipe.score(X_val, y_val):.3f}")

Baseline validation accuracy: 0.636


In [4]:
current_year = datetime.datetime.now().year

# Calculate ages
df['fighter1_age'] = current_year - df['fighter1_birth_year']
df['fighter2_age'] = current_year - df['fighter2_birth_year']

# Comparative metrics
df['age_diff'] = df['fighter1_age'] - df['fighter2_age']
df['reach_diff'] = df['fighter1_reach'] - df['fighter2_reach']
df['height_diff'] = df['fighter1_height'] - df['fighter2_height']

# Performance rate differences
df['SLpM_diff'] = df['fighter1_SLpM'] - df['fighter2_SLpM']
df['StrAcc_diff'] = df['fighter1_StrAcc'] - df['fighter2_StrAcc']
df['TDAvg_diff'] = df['fighter1_TDAvg'] - df['fighter2_TDAvg']

# Win-rate ratio
epsilon = 1e-6
df['win_rate1'] = df['fighter1_wins'] / (df['fighter1_wins'] + df['fighter1_losses'] + epsilon)
df['win_rate2'] = df['fighter2_wins'] / (df['fighter2_wins'] + df['fighter2_losses'] + epsilon)
df['win_rate_ratio'] = df['win_rate1'] / (df['win_rate2'] + epsilon)

# extract engineered features
def get_feature_engineered_data(df):
    feats = [
        'age_diff', 'reach_diff', 'height_diff',
        'SLpM_diff', 'StrAcc_diff', 'TDAvg_diff',
        'win_rate_ratio', 'round_finished', 'fight_rounds',
        'weight_class'  # removed 'days_since_last_fight1' since no date info
    ]
    return df[feats]

raw_feats = [
    'fighter1_wins','fighter1_losses','fighter1_last5','fighter1_SLpM',
    'fighter1_StrAcc','fighter1_SApM','fighter1_StrDef','fighter1_TDAvg',
    'fighter1_TDAcc','fighter1_TDDef','fighter1_SubAvg','fighter1_KDAvg',
    'fighter1_ranking','fighter1_height','fighter1_reach','fighter1_subs',
    'fighter2_wins','fighter2_losses','fighter2_last5','fighter2_SLpM',
    'fighter2_StrAcc','fighter2_SApM','fighter2_StrDef','fighter2_TDAvg',
    'fighter2_TDAcc','fighter2_TDDef','fighter2_SubAvg','fighter2_KDAvg',
    'fighter2_ranking','fighter2_height','fighter2_reach','fighter2_subs',
    'round_finished','fight_rounds'
]

eng_feats = [
    'age_diff','reach_diff','height_diff',
    'SLpM_diff','StrAcc_diff','TDAvg_diff',
    'win_rate_ratio'
]

def get_feature_data(df):
    return df[ raw_feats + eng_feats + ['round_finished','fight_rounds','weight_class'] ]

# 4) Build X_fe from all of them
X_fe = get_feature_data(df)

In [5]:
# Define raw stats + engineered diffs
raw_feats = [
    'fighter1_wins','fighter1_losses','fighter1_last5','fighter1_SLpM',
    'fighter1_StrAcc','fighter1_SApM','fighter1_StrDef','fighter1_TDAvg',
    'fighter1_TDAcc','fighter1_TDDef','fighter1_SubAvg','fighter1_KDAvg',
    'fighter1_ranking','fighter1_height','fighter1_reach','fighter1_subs',
    'fighter2_wins','fighter2_losses','fighter2_last5','fighter2_SLpM',
    'fighter2_StrAcc','fighter2_SApM','fighter2_StrDef','fighter2_TDAvg',
    'fighter2_TDAcc','fighter2_TDDef','fighter2_SubAvg','fighter2_KDAvg',
    'fighter2_ranking','fighter2_height','fighter2_reach','fighter2_subs',
    'round_finished','fight_rounds'
]
eng_feats = [
    'age_diff','reach_diff','height_diff',
    'SLpM_diff','StrAcc_diff','TDAvg_diff',
    'win_rate_ratio'
]

def get_feature_data(df):
    return df[raw_feats + eng_feats + ['weight_class']]

# Build X, y
X = get_feature_data(df)
y = df['outcome'].astype(int)

# Train/validation split
X_train, X_val, y_train, y_val = train_test_split(
    X, y, test_size=0.25, stratify=y, random_state=RANDOM_STATE
)

# Identify feature types
numeric_features = X_train.select_dtypes(include=['int64','float64']).columns.tolist()
categorical_features = ['weight_class']

# Update preprocessin
numeric_transformer2 = Pipeline([
    ('imputer', SimpleImputer(strategy='mean')),
    ('scaler', StandardScaler())
])
categorical_transformer2 = Pipeline([
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown='ignore'))
])
preprocessor2 = ColumnTransformer([
    ('num', numeric_transformer2, numeric_features),
    ('cat', categorical_transformer2, categorical_features)
])

# Baseline logistic pipeline on the full feature set
baseline_full = Pipeline([
    ('preprocessor', preprocessor2),
    ('classifier', LogisticRegression(random_state=RANDOM_STATE, max_iter=1000))
])
baseline_full.fit(X_train, y_train)
print(f"Full‐feature logistic accuracy: {baseline_full.score(X_val, y_val):.3f}")


Full‐feature logistic accuracy: 0.636


In [6]:
# hyperparameter tuning

cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=RANDOM_STATE)

def build_preprocessor():
    return ColumnTransformer([
        ('num', numeric_transformer2, numeric_features),
        ('cat', categorical_transformer2, categorical_features)
    ])

# definding RF pipeline
rf_pipe = Pipeline([
    ('preprocessor', build_preprocessor()),
    ('classifier', RandomForestClassifier(
        class_weight='balanced',
        random_state=RANDOM_STATE
    ))
])

# hyperparamter grid tuned for accuracy
param_grid_acc = {
    'classifier__n_estimators': [100, 250, 500, 1000],
    'classifier__max_depth': [None, 5, 10, 20],
    'classifier__min_samples_leaf': [1, 2, 5],
    'classifier__max_features': ['sqrt', 'log2', 0.8]
}

# gridsearchcv optimization 
gs_acc = GridSearchCV(
    rf_pipe,
    param_grid_acc,
    scoring='accuracy',
    cv=cv,
    n_jobs=-1
)
gs_acc.fit(X_train, y_train)

# Extract and train estimator
best = gs_acc.best_estimator_
best.fit(X_train, y_train)

print("Best params for accuracy:", gs_acc.best_params_)
print("Best CV accuracy:", gs_acc.best_score_)


  _data = np.array(data, dtype=dtype, copy=copy,


Best params for accuracy: {'classifier__max_depth': None, 'classifier__max_features': 'sqrt', 'classifier__min_samples_leaf': 2, 'classifier__n_estimators': 100}
Best CV accuracy: 0.7


In [7]:
# Pull best params from accuracy‐gridsearch
best_params_acc = gs_acc.best_params_.copy()

rf_params = {k.replace('classifier__',''): v for k,v in best_params_acc.items()}
# override/add the OOB and gentle class_weight
rf_params.update({
    'class_weight': {0:1, 1:1.1},
    'oob_score': True,
    'bootstrap': True,
    'random_state': RANDOM_STATE
})

# improved rf pipeline
rf_pipe_improved = Pipeline([
    ('preprocessor', build_preprocessor()),
    ('classifier', RandomForestClassifier(**rf_params))
])

rf_pipe_improved.fit(X_train, y_train)

# check OOB and plain validation accuracy
print(f"OOB accuracy: {rf_pipe_improved.named_steps['classifier'].oob_score_:.3f}")
print(f"Validation accuracy: {rf_pipe_improved.score(X_val, y_val):.3f}")

# soft‐voting ensemble of three RFs for stability
rf2 = RandomForestClassifier(
    **{**rf_params, **{'n_estimators':500, 'max_depth':15, 'random_state':RANDOM_STATE+1}}
)
rf3 = RandomForestClassifier(
    **{**rf_params, **{'n_estimators':1500, 'max_depth':None, 'random_state':RANDOM_STATE+2}}
)

voting = VotingClassifier(
    estimators=[
        ('imp', rf_pipe_improved),
        ('mid', Pipeline([('preprocessor', build_preprocessor()), ('classifier', rf2)])),
        ('wide', Pipeline([('preprocessor', build_preprocessor()), ('classifier', rf3)]))
    ],
    voting='soft',
    n_jobs=-1
)
voting.fit(X_train, y_train)
print(f"Voting ensemble validation accuracy: {voting.score(X_val, y_val):.3f}")

# calibrate the improved RF to tighten probabilities
calibrated = CalibratedClassifierCV(rf_pipe_improved, cv=cv, method='isotonic')
calibrated.fit(X_train, y_train)
print(f"Calibrated RF validation accuracy: {calibrated.score(X_val, y_val):.3f}")


OOB accuracy: 0.667
Validation accuracy: 0.636
Voting ensemble validation accuracy: 0.636
Calibrated RF validation accuracy: 0.545


In [8]:
# cross validation
y = df['outcome'].astype(int)
X = get_feature_data(df)

# Random Forest vs XG Boost
models = {
    'RandomForest': RandomForestClassifier(random_state=RANDOM_STATE),
    'XGBoost': XGBClassifier(random_state=RANDOM_STATE, eval_metric='logloss')
}

for name, model in models.items():
    pipe = Pipeline([
        ('preprocessor', build_preprocessor()),
        ('classifier', model)
    ])
    scores = cross_val_score(pipe, X, y, cv=cv, scoring='roc_auc')
    print(f"{name} CV AUC: {scores.mean():.3f} ± {scores.std():.3f}")

# SVM baseline
from sklearn.svm import SVC
svc_pipe = Pipeline([
    ('preprocessor', build_preprocessor()),
    ('classifier', SVC(kernel='rbf', probability=True, random_state=RANDOM_STATE))
])
svc_scores = cross_val_score(svc_pipe, X, y, cv=cv, scoring='roc_auc')
print(f"SVM CV AUC: {svc_scores.mean():.3f} ± {svc_scores.std():.3f}")

# PCA test
from sklearn.decomposition import PCA
pipe_pca = Pipeline([
    ('preprocessor', build_preprocessor()),
    ('pca', PCA(n_components=5)),
    ('classifier', RandomForestClassifier(random_state=RANDOM_STATE))
])
pca_scores = cross_val_score(pipe_pca, X, y, cv=cv, scoring='roc_auc')
print(f"PCA + RF CV AUC: {pca_scores.mean():.3f} ± {pca_scores.std():.3f}")


RandomForest CV AUC: 0.653 ± 0.307
XGBoost CV AUC: 0.623 ± 0.231
SVM CV AUC: 0.690 ± 0.239
PCA + RF CV AUC: 0.700 ± 0.268


In [12]:
best_pipe = best 

# Baseline predictions at the default 0.5 threshold
y_pred_default = best_pipe.predict(X_val)
print("Classification Report (threshold=0.5):")
print(classification_report(y_val, y_pred_default,
                            target_names=['fighter2 wins','fighter1 wins']))
print("Confusion Matrix (threshold=0.5):")
print(confusion_matrix(y_val, y_pred_default))

# Probability outputs for threshold search
probs = best_pipe.predict_proba(X_val)[:, 1]

# Search thresholds from 0.01 to 0.99
thresholds = np.linspace(0.01, 0.99, 99)
results = []
for th in thresholds:
    preds = (probs >= th).astype(int)
    if preds.min() == preds.max():  # skip if all one class
        continue
    acc = (preds == y_val).mean()
    f1w = f1_score(y_val, preds, average='weighted')
    f1m = f1_score(y_val, preds, average='macro')
    results.append((th, acc, f1w, f1m))

# Pick threshold that maximizes accuracy
best_th, best_acc, best_f1w, best_f1m = max(results, key=lambda x: x[1])
print(f"\nOptimal threshold for accuracy: {best_th:.2f} → Accuracy: {best_acc:.3f}")
print(f"Weighted-F1 at this cutoff: {best_f1w:.3f}")
print(f"Macro-F1    at this cutoff: {best_f1m:.3f}")

# Confusion & classification report
preds_th = (probs >= best_th).astype(int)
print("\nClassification Report (optimal threshold):")
print(classification_report(y_val, preds_th,
                            target_names=['fighter2 wins','fighter1 wins']))
print("Confusion Matrix (optimal threshold):")
print(confusion_matrix(y_val, preds_th))

# Cross-validation sanity check for F1 and accuracy
from sklearn.model_selection import cross_val_score
cv_acc = cross_val_score(best_pipe, X, y, cv=cv, scoring='accuracy')
cv_f1w = cross_val_score(best_pipe, X, y, cv=cv, scoring='f1_weighted')
cv_f1m = cross_val_score(best_pipe, X, y, cv=cv, scoring='f1_macro')
print(f"\nCV accuracy:     {cv_acc.mean():.3f} ± {cv_acc.std():.3f}")
print(f"CV weighted-F1:  {cv_f1w.mean():.3f} ± {cv_f1w.std():.3f}")
print(f"CV macro-F1:     {cv_f1m.mean():.3f} ± {cv_f1m.std():.3f}")


Classification Report (threshold=0.5):
               precision    recall  f1-score   support

fighter2 wins       0.40      0.50      0.44         4
fighter1 wins       0.67      0.57      0.62         7

     accuracy                           0.55        11
    macro avg       0.53      0.54      0.53        11
 weighted avg       0.57      0.55      0.55        11

Confusion Matrix (threshold=0.5):
[[2 2]
 [3 4]]

Optimal threshold for accuracy: 0.41 → Accuracy: 0.727
Weighted-F1 at this cutoff: 0.717
Macro-F1    at this cutoff: 0.686

Classification Report (optimal threshold):
               precision    recall  f1-score   support

fighter2 wins       0.67      0.50      0.57         4
fighter1 wins       0.75      0.86      0.80         7

     accuracy                           0.73        11
    macro avg       0.71      0.68      0.69        11
 weighted avg       0.72      0.73      0.72        11

Confusion Matrix (optimal threshold):
[[2 2]
 [1 6]]

CV accuracy:     0.572 ±

In [10]:
# stacking estimators using tuned RF params
estimators = [
    ('rf', RandomForestClassifier(
        n_estimators=gs_acc.best_params_['classifier__n_estimators'],
        max_depth=gs_acc.best_params_['classifier__max_depth'],
        min_samples_leaf=gs_acc.best_params_['classifier__min_samples_leaf'],
        max_features=gs_acc.best_params_['classifier__max_features'],
        class_weight='balanced',
        random_state=RANDOM_STATE
    )),
    ('xgb', XGBClassifier(
        learning_rate=0.1,
        max_depth=5,
        eval_metric='logloss',
        random_state=RANDOM_STATE
    ))
]

stack = StackingClassifier(
    estimators=estimators,
    final_estimator=LogisticRegression(),
    cv=cv,
    n_jobs=-1
)

# Fit and evaluate stacking
stack.fit(X_train, y_train)
print(f"Stacking validation accuracy: {stack.score(X_val, y_val):.3f}")

# Threshold optimization for F1-score
y_prob = stack.predict_proba(X_val)[:, 1]
thresholds = np.linspace(0.1, 0.9, 81)
scores = [(th, f1_score(y_val, y_prob >= th)) for th in thresholds]
best_th, best_f1 = max(scores, key=lambda x: x[1])
print(f"Optimal threshold: {best_th:.2f} with F1 = {best_f1:.3f}")


Stacking validation accuracy: 0.636
Optimal threshold: 0.10 with F1 = 0.778
