In [370]:
import pandas as pd
import numpy as np

df = pd.read_csv("data/ufc-master.csv")
final_df = pd.read_csv("data/Out_374 - Out_374.csv")

In [371]:
import pandas as pd

def preprocess(df):
    df = df.copy()

    # --- Parse date ---
    df["Date"] = pd.to_datetime(df["Date"], errors="coerce")
    #df = df.dropna(subset=["Date"])

    # --- Columns to drop ---
    cols_to_drop1 = [
        'BlueExpectedValue','RedExpectedValue', 'Finish','FinishDetails','FinishRound','FinishRoundTime','TotalFightTimeSecs']
    df = df.drop(columns=cols_to_drop1, errors='ignore')
    df = df.drop(columns=df.filter(like="Odds").columns)

    # --- Difference features ---
    diff_pairs = {
        "LoseStreakDif": ("RedCurrentLoseStreak", "BlueCurrentLoseStreak"),
        "WinStreakDif": ("RedCurrentWinStreak", "BlueCurrentWinStreak"),
        "LongestWinStreakDif": ("RedLongestWinStreak", "BlueLongestWinStreak"),
        "WinDif": ("RedWins", "BlueWins"),
        "LossDif": ("RedLosses", "BlueLosses"),
        "TotalRoundDif": ("RedTotalRoundsFought", "BlueTotalRoundsFought"),
        "TotalTitleBoutDif": ("RedTotalTitleBouts", "BlueTotalTitleBouts"),
        "KODif": ("RedWinsByKO", "BlueWinsByKO"),
        "SubDif": ("RedWinsBySubmission", "BlueWinsBySubmission"),
        "HeightDif": ("RedHeightCms", "BlueHeightCms"),
        "ReachDif": ("RedReachCms", "BlueReachCms"),
        "AgeDif": ("RedAge", "BlueAge"),
        "SigStrDif": ("RedAvgSigStrLanded", "BlueAvgSigStrLanded"),
        "AvgSubAttDif": ("RedAvgSubAtt", "BlueAvgSubAtt"),
        "AvgTDDif": ("RedAvgTDLanded", "BlueAvgTDLanded"),
    }

    for newcol, (r, b) in diff_pairs.items():
        if r in df.columns and b in df.columns:
            df[newcol] = df[r] - df[b]
            #df = df.drop(columns=[r,b])

    # Date -> numeric
    df["Date"] = pd.to_datetime(df["Date"], errors='coerce')
    df["Date"] = df["Date"].dt.year * 12 + df["Date"].dt.month

    # Drop fighter name/text columns
    for col in ["RedFighter", "BlueFighter", "EmptyArena"]:
        df = df.drop(columns=[col], errors='ignore')

    # One-hot encoding list
    dummies = [
        "BlueStance","RedStance","Winner","TitleBout","WeightClass",
        "Gender","BetterRank","Finish","FinishDetails", 'Location','Country'
    ]

    cols_to_encode = [c for c in dummies if c in df.columns]
    df = pd.get_dummies(df, columns=cols_to_encode, drop_first=True, dtype=int)

    # Fill NAs
    for col in df.columns:
        if df[col].isna().sum() > 0:
            df[col] = df[col].fillna(df[col].mean())

    # Drop all KO/sub breakdowns
    drop_ko = list(df.filter(regex="^RedWinsBy").columns) + \
              list(df.filter(regex="^BlueWinsBy").columns)

    df = df.drop(columns=drop_ko, errors="ignore")

    return df

# Store row count BEFORE combining
n_train = df.shape[0]

# Combine df + final_df
combined = pd.concat([df, final_df], axis=0, ignore_index=True)

# Preprocess once on the combined dataset
processed = preprocess(combined)

# Split back to original pieces
df_processed = processed.iloc[:n_train].reset_index(drop=True)
final_df_processed = processed.iloc[n_train:].reset_index(drop=True)

print(df_processed.shape, final_df_processed.shape)


(6528, 289) (5, 289)


In [372]:
from sklearn.ensemble import RandomForestClassifier,GradientBoostingClassifier
from sklearn.datasets import make_classification
from sklearn.model_selection import train_test_split

X = df_processed.drop('Winner_Red', axis =1)
y = df_processed['Winner_Red']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=42)

In [373]:
clf = GradientBoostingClassifier()
clf.fit(X_train, y_train)

from sklearn.metrics import accuracy_score
y_pred = clf.predict(X_test)

print(accuracy_score(y_test, y_pred))

# Get feature importances
importances = clf.feature_importances_
feature_names = X_train.columns  # if using pandas DataFrame

# Sort and display nicely
feat_imp = pd.DataFrame({
    "Feature": feature_names,
    "Importance": importances
}).sort_values(by="Importance", ascending=False)

#print(feat_imp)

X_final = final_df_processed.drop("Winner_Red",axis=1)
preds = clf.predict(X_final)
class_probabilities = clf.predict_proba(X_final)

for idx, p in enumerate(preds):
    row = final_df.iloc[idx]

    red = row["RedFighter"]
    blue = row["BlueFighter"]

    winner = red if p == 1 else blue
    print(f"Fight {idx+1}: {red} vs {blue} → Predicted Winner: {winner} , {round(class_probabilities[idx][p],2)}%")

0.6188725490196079
Fight 1: Arman Tsarukyan vs Dan Hooker → Predicted Winner: Arman Tsarukyan , 0.7%
Fight 2: Belal Muhammad vs Ian Machado Garry → Predicted Winner: Ian Machado Garry , 0.62%
Fight 3: Volkan Oezdemir vs Alonzo Menifield → Predicted Winner: Alonzo Menifield , 0.53%
Fight 4: Waldo Cortes-Acosta vs Shamil Gaziev → Predicted Winner: Waldo Cortes-Acosta , 0.7%
Fight 5: Tagir Ulanbekov vs Kyoji Horiguchi → Predicted Winner: Kyoji Horiguchi , 0.65%
