In [13]:
import pandas as pd
import sklearn
import numpy as  np


In [14]:
df = pd.read_csv("UFC_processed.csv") # original dataset
##print(f"Initial shape: {df.shape}")

keep = np.array([4,2,1,6,10,11,12,37,38,63,65,66,67,68,69,70,71,72,73,74,75,76,77,78,79,104,105,130,132,133,134,135,136,137,138,139,140,141,142,143,144,145]) - 1
df = df.iloc[:,:] # interesting fields

df["date"] = pd.to_datetime(df["date"]) # date as datetime


df = df.replace(r'^\s*$', np.nan, regex=True) # replace empty string with nan
df = df.fillna(np.nan) # Fill empty and NaNs values with NaN

df = df.round(3) # format data

df.rename(columns={ # some renaming
    "B_win_by_KO/TKO": "B_win_by_KO_TKO",
    "R_win_by_KO/TKO": "R_win_by_KO_TKO"},inplace=True)
##print(f"shape: {df.shape}")

# considering matches with a winner as this will be a binary classification
drawRate = round(len(df["Winner"][df["Winner"]== "Draw"])/len(df.index)*100,3)
print(f"Draw rate is: {drawRate}%, converting to nan")
df["Winner"] = df["Winner"].replace("Draw", np.nan) # convert to nan to be dropped later
##df.dropna()

selected_features = [
    # Core Fighter Performance Stats
    "KD", "SIG_STR_pct", "SIG_STR", "TOTAL_STR", "TD", "TD_pct", 
    "SUB_ATT", "PASS", "REV",
    
    # Fighter-Specific Historical and Career Stats
    "current_win_streak", "current_lose_streak", "wins", "losses", 
    "draw", "total_rounds_fought", "total_time_fought(seconds)", 
    "total_title_bouts",
    
    # Physical Attributes
    "Height_cms", "Reach_cms", "Weight_lbs", "age", "Stance",
    
    # Fight Context Features
    "Format", "no_of_rounds", "title_bout", "Fight_type",
    
    # Outcome History
    "win_by_Decision_Majority", "win_by_Decision_Split", 
    "win_by_Decision_Unanimous", "win_by_KO_TKO", 
    "win_by_Submission", "win_by_TKO_Doctor_Stoppage",
    
    # Opponent-Based Metrics (Prefix R_ and B_ for red and blue corner fighters)
    "R_opp_SIG_STR", "B_opp_SIG_STR", "R_opp_TOTAL_STR", "B_opp_TOTAL_STR",
    "R_opp_TD", "B_opp_TD", "R_opp_PASS", "B_opp_PASS", "R_opp_REV", "B_opp_REV"
]



df.columns

Draw rate is: 0.0%, converting to nan


Index(['date', 'B_fighter', 'R_fighter', 'Winner', 'B_current_lose_streak',
       'B_current_win_streak', 'B_longest_win_streak', 'B_losses',
       'B_total_rounds_fought', 'B_total_title_bouts',
       'B_win_by_Decision_Majority', 'B_win_by_Decision_Split',
       'B_win_by_Decision_Unanimous', 'B_win_by_KO_TKO', 'B_win_by_Submission',
       'B_win_by_TKO_Doctor_Stoppage', 'B_wins', 'B_Height_cms', 'B_Reach_cms',
       'B_Weight_lbs', 'B_age', 'B_Stance_Open_Stance', 'B_Stance_Orthodox',
       'B_Stance_Southpaw', 'B_Stance_Switch', 'R_current_lose_streak',
       'R_current_win_streak', 'R_longest_win_streak', 'R_losses',
       'R_total_rounds_fought', 'R_total_title_bouts',
       'R_win_by_Decision_Majority', 'R_win_by_Decision_Split',
       'R_win_by_Decision_Unanimous', 'R_win_by_KO_TKO', 'R_win_by_Submission',
       'R_win_by_TKO_Doctor_Stoppage', 'R_wins', 'R_Height_cms', 'R_Reach_cms',
       'R_Weight_lbs', 'R_age', 'R_Stance_Open_Stance', 'R_Stance_Orthodox',
      

In [15]:
##shuffling the data 
df_shuffled = df.sample(frac = 1)
df_shuffled.head(n=5)

Unnamed: 0,date,B_fighter,R_fighter,Winner,B_current_lose_streak,B_current_win_streak,B_longest_win_streak,B_losses,B_total_rounds_fought,B_total_title_bouts,...,R_win_by_TKO_Doctor_Stoppage,R_wins,R_Height_cms,R_Reach_cms,R_Weight_lbs,R_age,R_Stance_Open_Stance,R_Stance_Orthodox,R_Stance_Southpaw,R_Stance_Switch
2248,2008-11-15,Jorge Gurgel,Aaron Riley,0,1.0,0.0,2.0,3.0,18.0,0.0,...,0.0,0.0,172.72,175.26,155.0,27.0,0,0,1,0
443,2002-09-27,Ivan Salaverry,Matt Lindland,0,0.0,1.0,1.0,0.0,3.0,0.0,...,0.0,4.0,182.88,187.96,185.0,32.0,0,0,1,0
4320,2019-02-23,Lucie Pudilova,Liz Carmouche,0,1.0,0.0,2.0,2.0,12.0,0.0,...,0.0,4.0,167.64,167.64,125.0,35.0,0,1,0,0
1563,2016-05-29,Adam Milstead,Chris de la Rocha,1,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,190.5,203.2,250.0,37.0,0,1,0,0
1549,2015-06-27,Tony Sims,Steve Montgomery,1,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,193.04,190.5,185.0,24.0,0,0,1,0


In [39]:
##split up the data 
df_shuffled.columns

# x = df_shuffled.drop(["date", "Winner","B_fighter", "R_fighter"], axis = 1).values
y = df_shuffled["Winner"].values
df_shuffled.drop(["date", "Winner","B_fighter", "R_fighter"], axis = 1).info()
x = df_shuffled.drop(["date", "Winner","B_fighter", 
                      "R_fighter", 
                      "B_Stance_Open_Stance", 
                      "B_Stance_Orthodox", 
                      "B_Stance_Southpaw", 
                      "B_Stance_Switch", 
                      "R_Stance_Open_Stance",
                      "R_Stance_Orthodox", 
                      "R_Stance_Southpaw", 
                      "R_Stance_Switch", 
                      "R_total_title_bouts",
                      "B_total_title_bouts",
                      "B_losses",
                      "R_losses",
                      
                      
                      
                        ], axis = 1).values


<class 'pandas.core.frame.DataFrame'>
Index: 4635 entries, 2248 to 1821
Data columns (total 42 columns):
 #   Column                        Non-Null Count  Dtype  
---  ------                        --------------  -----  
 0   B_current_lose_streak         4635 non-null   float64
 1   B_current_win_streak          4635 non-null   float64
 2   B_longest_win_streak          4635 non-null   float64
 3   B_losses                      4635 non-null   float64
 4   B_total_rounds_fought         4635 non-null   float64
 5   B_total_title_bouts           4635 non-null   float64
 6   B_win_by_Decision_Majority    4635 non-null   float64
 7   B_win_by_Decision_Split       4635 non-null   float64
 8   B_win_by_Decision_Unanimous   4635 non-null   float64
 9   B_win_by_KO_TKO               4635 non-null   float64
 10  B_win_by_Submission           4635 non-null   float64
 11  B_win_by_TKO_Doctor_Stoppage  4635 non-null   float64
 12  B_wins                        4635 non-null   float64
 13  B_Hei

In [40]:
import numpy as np

class CustomMinMaxScaler:
    def __init__(self):
        self.min_ = None
        self.max_ = None

    def fit(self, data):
        # Compute min and max values for each feature
        self.min_ = np.min(data, axis=0)
        self.max_ = np.max(data, axis=0)

    def transform(self, data):
        # Scale data to the range [0, 1]
        return (data - self.min_) / (self.max_ - self.min_)

    def fit_transform(self, data):
        self.fit(data)
        return self.transform(data)

In [41]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
import joblib
xTrain, xTest, yTrain, yTest = train_test_split(x,y,test_size = .5)

In [42]:
winners = y = df_shuffled["Winner"].values

scaler = CustomMinMaxScaler()

xTrainScaled = scaler.fit_transform(xTrain)
xTestScaled = scaler.fit_transform(xTest)



In [43]:
from sklearn.linear_model import LogisticRegression
model1 = LogisticRegression(penalty = 'l1', tol = 1, solver = 'liblinear',
                            multi_class = "auto",
                            fit_intercept = False, 
                            max_iter =3)
model1.fit(xTrainScaled, yTrain)
joblib.dump(model1, 'model1.joblib')




['model1.joblib']

In [44]:
testing_model = joblib.load("model1.joblib")
prediction = testing_model.predict(xTestScaled)
# print(prediction)
# for i in prediction:
#     print(i)

In [45]:
print("Accuracy score for Logistic Regression Model: %s" % (sklearn.metrics.accuracy_score(yTest,prediction)))
print("Precision score for logistic Regression Model: %s" % (sklearn.metrics.precision_score(yTest,prediction)))
print("Recall score for Logistic Regression Model: %s" % (sklearn.metrics.recall_score(yTest,prediction)))
print("F1 score for Logistic Regression Model: %s" % (sklearn.metrics.f1_score(yTest,prediction)))

Accuracy score for Logistic Regression Model: 0.6721311475409836
Precision score for logistic Regression Model: 0.4426229508196721
Recall score for Logistic Regression Model: 0.035856573705179286
F1 score for Logistic Regression Model: 0.06633906633906633


In [46]:
from sklearn.neighbors import KNeighborsClassifier
model2 = KNeighborsClassifier(n_neighbors = 4, p =2)
model2.fit(xTrainScaled,yTrain)
joblib.dump(model2, 'model2.joblib')

['model2.joblib']

In [47]:
knnModel = joblib.load("model2.joblib")
knnPrediction = knnModel.predict(xTestScaled)
# for i in range(len(knnPrediction)): 
    # print("Model 2 prediction based on data B for item: %d : %s" % (i + 1, knnPrediction[i]))

In [48]:
knnModel_c = joblib.load("model2.joblib")
knnPrediction_c = knnModel_c.predict(xTestScaled)

C_answers = yTest
TP, FP = 0,0
#for i in range(len(knnPrediction_c)): 
    #print("Model 2 prediction based on data C: %s" % (knnPrediction_c[i]))
print("Accuracy Score for KNN Model: %s" %(sklearn.metrics.accuracy_score(C_answers,knnPrediction_c)))
print("Precision score for KNN Model: %s" %(sklearn.metrics.precision_score(C_answers,knnPrediction_c)))
print("Recall Score for KNN Model: %s" % (sklearn.metrics.recall_score(C_answers,knnPrediction_c)))
print("F1 Score for KNN Model: %s" % (sklearn.metrics.f1_score(C_answers,knnPrediction_c)))
FDR_knn = 1 - sklearn.metrics.precision_score(C_answers, knnPrediction_c)
print("False Detection Rate for KNN Model: %s" %(FDR_knn))
print("Matthews Coeffiecen for KNN Model: %s" % (sklearn.metrics.matthews_corrcoef(C_answers,knnPrediction_c)))

print(f'=====================================================================================================')

Accuracy Score for KNN Model: 0.6647972389991372
Precision score for KNN Model: 0.45918367346938777
Recall Score for KNN Model: 0.17928286852589642
F1 Score for KNN Model: 0.25787965616045844
False Detection Rate for KNN Model: 0.5408163265306123
Matthews Coeffiecen for KNN Model: 0.10932394188138414
