In [5]:
import pandas as pd
import numpy as np

# Read both CSV inputs in one pass: `df` is the main fight table and
# `final_df` is the second, separately stored table.
df, final_df = (
    pd.read_csv(csv_path)
    for csv_path in ("data/ufc-master.csv", "data/Out_374 - Out_374.csv")
)

# Show the main table as the cell output.
df

Unnamed: 0,RedFighter,BlueFighter,RedOdds,BlueOdds,RedExpectedValue,BlueExpectedValue,Date,Location,Country,Winner,...,FinishDetails,FinishRound,FinishRoundTime,TotalFightTimeSecs,RedDecOdds,BlueDecOdds,RSubOdds,BSubOdds,RKOOdds,BKOOdds
0,Alexandre Pantoja,Kai Asakura,-250.0,215.0,40.0000,215.0,2024-12-07,"Las Vegas, Nevada, USA",USA,Red,...,Rear Naked Choke,2.0,2:05,425.0,300.0,800.0,150.0,2500.0,400.0,350.0
1,Shavkat Rakhmonov,Ian Machado Garry,-210.0,295.0,47.6190,295.0,2024-12-07,"Las Vegas, Nevada, USA",USA,Red,...,,5.0,5:00,1500.0,250.0,650.0,180.0,3000.0,240.0,700.0
2,Ciryl Gane,Alexander Volkov,-380.0,300.0,26.3158,300.0,2024-12-07,"Las Vegas, Nevada, USA",USA,Red,...,,3.0,5:00,900.0,-160.0,450.0,1100.0,3000.0,350.0,1100.0
3,Bryce Mitchell,Kron Gracie,-950.0,625.0,10.5263,625.0,2024-12-07,"Las Vegas, Nevada, USA",USA,Red,...,Elbows,3.0,0:39,639.0,-200.0,1100.0,380.0,1400.0,500.0,4000.0
4,Nate Landwehr,Dooho Choi,-130.0,110.0,76.9231,110.0,2024-12-07,"Las Vegas, Nevada, USA",USA,Blue,...,Elbows,3.0,3:21,801.0,275.0,550.0,500.0,700.0,300.0,250.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6523,Duane Ludwig,Darren Elkins,-155.0,135.0,64.5161,135.0,2010-03-21,"Broomfield, Colorado, USA",USA,Blue,...,,1.0,0:44,44.0,,,,,,
6524,John Howard,Daniel Roberts,-210.0,175.0,47.6190,175.0,2010-03-21,"Broomfield, Colorado, USA",USA,Red,...,Punch,1.0,2:01,121.0,,,,,,
6525,Brendan Schaub,Chase Gormley,-260.0,220.0,38.4615,220.0,2010-03-21,"Broomfield, Colorado, USA",USA,Red,...,Punches,1.0,0:47,47.0,,,,,,
6526,Mike Pierce,Julio Paulino,-420.0,335.0,23.8095,335.0,2010-03-21,"Broomfield, Colorado, USA",USA,Red,...,,3.0,5:00,900.0,,,,,,


In [6]:
import pandas as pd

def preprocess(df):
    """Turn raw fight rows into a numeric feature table.

    The input frame is copied first, so the caller's data is never modified.

    Steps, in order:
      1. Parse ``Date`` (unparseable values become NaT) and replace it with
         the month index ``year * 12 + month``.
      2. Drop the finish / fight-time / location columns and the
         fighter-name and ``EmptyArena`` columns, when present.
      3. Add Red-minus-Blue difference columns for every pair in
         ``diff_pairs`` whose two source columns both exist. A difference
         column that already exists is overwritten.
      4. One-hot encode the categorical columns that are present
         (``drop_first=True``, integer 0/1 output); ``Winner`` therefore
         becomes ``Winner_Red`` when both winners occur in the data.
      5. Fill missing values: 1000 for any column whose name contains
         "rank" (case-insensitive), the column mean otherwise, and 0 when
         the mean is undefined because the whole column is missing.
      6. Drop every ``RedWinsBy*`` / ``BlueWinsBy*`` column.

    Parameters
    ----------
    df : pandas.DataFrame
        Raw fight data. ``Date`` is required; every other column handled
        here is optional.

    Returns
    -------
    pandas.DataFrame
        A new frame with the same number of rows as ``df``.

    Notes
    -----
    Text columns that are neither dropped nor encoded are not handled by the
    imputation step: ``Series.mean`` is expected to raise on such a column
    if it still contains missing values.
    """
    df = df.copy()

    # --- Date -> month index (parsed once; bad values become NaT -> NaN) ---
    parsed_date = pd.to_datetime(df["Date"], errors="coerce")
    df["Date"] = parsed_date.dt.year * 12 + parsed_date.dt.month

    # --- Columns removed up front ---
    cols_to_drop1 = [
        'Finish', 'FinishDetails', 'FinishRound', 'FinishRoundTime',
        'TotalFightTimeSecs', 'Location', 'Country',
        # fighter name / text columns
        'RedFighter', 'BlueFighter', 'EmptyArena',
    ]
    df = df.drop(columns=cols_to_drop1, errors='ignore')

    # --- Difference features (Red minus Blue) ---
    diff_pairs = {
        "OddsDif": ("RedOdds", "BlueOdds"),
        "LoseStreakDif": ("RedCurrentLoseStreak", "BlueCurrentLoseStreak"),
        "WinStreakDif": ("RedCurrentWinStreak", "BlueCurrentWinStreak"),
        "LongestWinStreakDif": ("RedLongestWinStreak", "BlueLongestWinStreak"),
        "WinDif": ("RedWins", "BlueWins"),
        "LossDif": ("RedLosses", "BlueLosses"),
        "TotalRoundDif": ("RedTotalRoundsFought", "BlueTotalRoundsFought"),
        "TotalTitleBoutDif": ("RedTotalTitleBouts", "BlueTotalTitleBouts"),
        "KODif": ("RedWinsByKO", "BlueWinsByKO"),
        "SubDif": ("RedWinsBySubmission", "BlueWinsBySubmission"),
        "HeightDif": ("RedHeightCms", "BlueHeightCms"),
        "ReachDif": ("RedReachCms", "BlueReachCms"),
        "AgeDif": ("RedAge", "BlueAge"),
        "SigStrDif": ("RedAvgSigStrLanded", "BlueAvgSigStrLanded"),
        "AvgSubAttDif": ("RedAvgSubAtt", "BlueAvgSubAtt"),
        "AvgTDDif": ("RedAvgTDLanded", "BlueAvgTDLanded"),
    }

    for newcol, (r, b) in diff_pairs.items():
        if r in df.columns and b in df.columns:
            df[newcol] = df[r] - df[b]

    # --- One-hot encoding ---
    # "Finish" / "FinishDetails" are already dropped above, so they are not
    # listed here.
    dummies = [
        "BlueStance", "RedStance", "Winner", "TitleBout", "WeightClass",
        "Gender", "BetterRank",
    ]
    cols_to_encode = [c for c in dummies if c in df.columns]
    df = pd.get_dummies(df, columns=cols_to_encode, drop_first=True, dtype=int)

    # --- Fill NAs ---
    for col in df.columns:
        column = df[col]
        if not column.isna().any():
            continue
        if "rank" in col.lower():
            fill_value = 1000
        else:
            fill_value = column.mean()
            # The mean of a completely empty column is NaN, which would leave
            # the NaNs in place; fall back to a constant instead.
            if pd.isna(fill_value):
                fill_value = 0
        df[col] = column.fillna(fill_value)

    # --- Drop all KO/sub breakdowns (only needed for KODif / SubDif above) ---
    win_method_cols = df.filter(regex=r"^(?:Red|Blue)WinsBy").columns
    df = df.drop(columns=win_method_cols, errors="ignore")

    return df

# Remember how many rows come from `df` so the stacked frame can be cut
# back into its two parts after preprocessing.
n_train = len(df)

# Stack `df` on top of `final_df` and preprocess them in a single pass, so
# both parts end up with exactly the same columns.
combined = pd.concat([df, final_df], axis=0, ignore_index=True)
processed = preprocess(combined)

# First n_train rows -> df_processed, remaining rows -> final_df_processed,
# each re-indexed from zero.
df_processed, final_df_processed = (
    part.reset_index(drop=True)
    for part in (processed.iloc[:n_train], processed.iloc[n_train:])
)

print(df_processed.shape, final_df_processed.shape)


(6528, 114) (5, 114)


In [7]:
from sklearn.ensemble import RandomForestClassifier,GradientBoostingClassifier
from sklearn.datasets import make_classification
from sklearn.model_selection import train_test_split

# Target is the Winner_Red indicator; every other column is a feature.
y = df_processed['Winner_Red']
X = df_processed.drop(columns='Winner_Red')

# Hold out 33% of the rows for evaluation; the fixed seed keeps the split
# identical between runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.33,
    random_state=42,
)

In [8]:
# Fit the gradient-boosting model on the training split. The seed is fixed
# (same value as the train/test split) so that reruns reproduce the same
# accuracy and feature importances.
clf = GradientBoostingClassifier(random_state=42)
clf.fit(X_train, y_train)

from sklearn.metrics import accuracy_score
y_pred = clf.predict(X_test)

# Accuracy on the held-out rows.
print(accuracy_score(y_test, y_pred))

# Get feature importances
importances = clf.feature_importances_

feature_names = X_train.columns  # if using pandas DataFrame

# One row per feature, most important first.
feat_imp = pd.DataFrame({
    "Feature": feature_names,
    "Importance": importances
}).sort_values(by="Importance", ascending=False)

feat_imp

0.6533642691415313


Unnamed: 0,Feature,Importance
2,RedExpectedValue,0.254834
1,BlueOdds,0.128493
89,OddsDif,0.076755
3,BlueExpectedValue,0.037198
10,BlueAvgSigStrPct,0.031979
...,...,...
23,RedCurrentWinStreak,0.000000
22,RedCurrentLoseStreak,0.000000
71,BWFeatherweightRank,0.000000
92,BlueStance_Switch,0.000000


In [9]:

# Disabled inference code. The whole snippet is wrapped in a bare string
# literal, so nothing in it runs; the cell only echoes the string as output.
# If re-enabled, it would predict on final_df_processed with `clf` and print
# one line per row using the fighter names from final_df.
# NOTE(review): the message appends '%' to the rounded predict_proba value
# without scaling it by 100 -- confirm the intended format before re-enabling.
"""
X_final = final_df_processed.drop("Winner_Red",axis=1)
preds = clf.predict(X_final)
class_probabilities = clf.predict_proba(X_final)

for idx, p in enumerate(preds):
    row = final_df.iloc[idx]

    red = row["RedFighter"]
    blue = row["BlueFighter"]

    winner = red if p == 1 else blue
    print(f"Fight {idx+1}: {red} vs {blue} → Predicted Winner: {winner} , {round(class_probabilities[idx][p],2)}%")
"""

'\nX_final = final_df_processed.drop("Winner_Red",axis=1)\npreds = clf.predict(X_final)\nclass_probabilities = clf.predict_proba(X_final)\n\nfor idx, p in enumerate(preds):\n    row = final_df.iloc[idx]\n\n    red = row["RedFighter"]\n    blue = row["BlueFighter"]\n\n    winner = red if p == 1 else blue\n    print(f"Fight {idx+1}: {red} vs {blue} → Predicted Winner: {winner} , {round(class_probabilities[idx][p],2)}%")\n'