In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LogisticRegression

DATA_PATH = 'Fight9Sheet1.csv'
RANDOM_STATE = 42

In [2]:
# Load dataset
df = pd.read_csv('Fight9Sheet1.csv')

# Inspect dimensions
print(f"Dataset shape: {df.shape}")
df.head()

# Summary statistics
print(df.describe(include='all'))
# Check missing values
print(df.isnull().sum())

Dataset shape: (42, 41)
               fighter1         fighter2  fight_year  fighter1_wins  \
count                42               42        42.0      42.000000   
unique               39               36         NaN            NaN   
top     Islam Makhachev  Joaquin Buckley         NaN            NaN   
freq                  2                2         NaN            NaN   
mean                NaN              NaN      2024.0      20.380952   
std                 NaN              NaN         0.0       6.673400   
min                 NaN              NaN      2024.0       6.000000   
25%                 NaN              NaN      2024.0      16.000000   
50%                 NaN              NaN      2024.0      18.500000   
75%                 NaN              NaN      2024.0      25.750000   
max                 NaN              NaN      2024.0      34.000000   

        fighter2_wins  fighter1_losses  fighter2_losses  fighter1_last5  \
count       42.000000        42.000000        42

In [3]:
# Load full two-fighter dataset / preprocessing
df = pd.read_csv(DATA_PATH)

df = df.dropna(subset=['outcome', 'round_finished'])

df['outcome'] = df['outcome'].astype(int)  # 1 if fighter1 wins, 0 otherwise

# Drop identifier columns
X = df.drop(columns=['fighter1', 'fighter2', 'fight_year', 'outcome'])
y = df['outcome']

# Train/validation split
X_train, X_val, y_train, y_val = train_test_split(
    X, y, stratify=y, random_state=RANDOM_STATE
)

# Identify features
numeric_features = X_train.select_dtypes(include=['int64', 'float64']).columns.tolist()
categorical_features = ['weight_class']

numeric_transformer = Pipeline([
    ('imputer', SimpleImputer(strategy='mean')),
    ('scaler', StandardScaler())
])

categorical_transformer = Pipeline([
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown='ignore'))
])

# Combine into a ColumnTransformer
preprocessor = ColumnTransformer([
    ('num', numeric_transformer, numeric_features),
    ('cat', categorical_transformer, categorical_features)
])

# Build and evaluate baseline pipeline
clf = Pipeline([
    ('preprocessor', preprocessor),
    ('classifier', LogisticRegression(random_state=RANDOM_STATE, max_iter=1000))
])

clf.fit(X_train, y_train)
print(f"Baseline validation accuracy: {clf.score(X_val, y_val):.3f}")

Baseline validation accuracy: 0.600


In [4]:
import datetime

current_year = datetime.datetime.now().year
df['fighter1_age'] = current_year - df['fighter1_birth_year']
df['fighter2_age'] = current_year - df['fighter2_birth_year']

# Comparative features
df['age_diff'] = df['fighter1_age'] - df['fighter2_age']
df['reach_diff'] = df['fighter1_reach'] - df['fighter2_reach']
df['height_diff'] = df['fighter1_height'] - df['fighter2_height']

# Performance rate differences
df['SLpM_diff'] = df['fighter1_SLpM'] - df['fighter2_SLpM']
df['StrAcc_diff'] = df['fighter1_StrAcc'] - df['fighter2_StrAcc']
df['TDAvg_diff'] = df['fighter1_TDAvg'] - df['fighter2_TDAvg']

# ratios...add small epsilon to avoid divide-by-zero
epsilon = 1e-6
df['win_rate1'] = df['fighter1_wins'] / (df['fighter1_wins'] + df['fighter1_losses'] + epsilon)
df['win_rate2'] = df['fighter2_wins'] / (df['fighter2_wins'] + df['fighter2_losses'] + epsilon)
df['win_rate_ratio'] = df['win_rate1'] / (df['win_rate2'] + epsilon)

# Update feature set
def get_feature_engineered_data(df):
    feats = [
        'age_diff', 'reach_diff', 'height_diff',
        'SLpM_diff', 'StrAcc_diff', 'TDAvg_diff',
        'win_rate_ratio', 'round_finished', 'fight_rounds',
        'weight_class'
    ]
    return df[feats]

X_fe = get_feature_engineered_data(df)

In [5]:
# Re-define train/validation splits
X = get_feature_engineered_data(df)
y = df['outcome']
X_train, X_val, y_train, y_val = train_test_split(
    X, y, stratify=y, random_state=RANDOM_STATE
)

# Update  lists
numeric_features = X.select_dtypes(include=['int64', 'float64']).columns.tolist()
categorical_features = ['weight_class']

# Re-create preprocessor and pipeline
preprocessor = ColumnTransformer([
    ('num', numeric_transformer, numeric_features),
    ('cat', categorical_transformer, categorical_features)
])

clf = Pipeline([
    ('preprocessor', preprocessor),
    ('classifier', LogisticRegression(random_state=RANDOM_STATE, max_iter=1000))
])

clf.fit(X_train, y_train)
print(f"Engineered features accuracy: {clf.score(X_val, y_val):.3f}")

Engineered features accuracy: 0.700


In [6]:
from sklearn.model_selection import StratifiedKFold, GridSearchCV
from sklearn.metrics import roc_auc_score  # added import

param_grid = {
    'classifier__C': [0.01, 0.1, 1, 10],
    'classifier__penalty': ['l1', 'l2'],
    'classifier__solver': ['liblinear']
}
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=RANDOM_STATE)
gs = GridSearchCV(
    clf, param_grid, scoring='roc_auc', cv=cv, n_jobs=-1
)
gs.fit(X_train, y_train)
print(f"Best params: {gs.best_params_}")
print(f"CV AUC: {gs.best_score_:.3f}")
print(f"Validation AUC: {roc_auc_score(y_val, gs.predict_proba(X_val)[:,1]):.3f}")

Best params: {'classifier__C': 0.01, 'classifier__penalty': 'l2', 'classifier__solver': 'liblinear'}
CV AUC: 0.758
Validation AUC: 0.792


In [7]:
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from sklearn.model_selection import cross_val_score  # added import

models = {
    'RandomForest': RandomForestClassifier(random_state=RANDOM_STATE),
    'XGBoost': XGBClassifier(random_state=RANDOM_STATE, use_label_encoder=False, eval_metric='logloss')
}
for name, model in models.items():
    pipe = Pipeline([
        ('preprocessor', preprocessor),
        ('classifier', model)
    ])
    score = cross_val_score(pipe, X, y, cv=cv, scoring='roc_auc')
    print(f"{name} CV AUC: {score.mean():.3f} ± {score.std():.3f}")

RandomForest CV AUC: 0.656 ± 0.264


Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


XGBoost CV AUC: 0.543 ± 0.181


Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


In [11]:
from sklearn.metrics import classification_report, confusion_matrix

# random forest pipe
best_pipe = Pipeline([
    ('preprocessor', preprocessor),
    ('classifier', RandomForestClassifier(random_state=RANDOM_STATE))
])
best_pipe.fit(X_train, y_train)

# prediction on validation set
y_pred = best_pipe.predict(X_val)
# Classification report
print(classification_report(y_val, y_pred, target_names=['fighter2 wins','fighter1 wins']))

# confusion matrix
cm = confusion_matrix(y_val, y_pred)
print("Confusion Matrix:")
print(cm)

               precision    recall  f1-score   support

fighter2 wins       0.25      0.25      0.25         4
fighter1 wins       0.50      0.50      0.50         6

     accuracy                           0.40        10
    macro avg       0.38      0.38      0.38        10
 weighted avg       0.40      0.40      0.40        10

Confusion Matrix:
[[1 3]
 [3 3]]
