In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from tqdm import tqdm
import math
from datetime import datetime

# Set display options to show all rows and columns
pd.set_option('display.max_rows', None)  # None to show all rows
pd.set_option('display.max_columns', None)  # None to show all columns
pd.set_option('display.width', None)  # Adjust the display width to fit content
pd.set_option('display.max_colwidth', None)  # Show full column content


import os
import warnings

warnings.filterwarnings("ignore")

from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder
from sklearn.metrics import (
    accuracy_score,
    precision_score,
    recall_score,
    f1_score,
    roc_auc_score,
    roc_curve,
    auc,
    confusion_matrix,
    classification_report,
    ConfusionMatrixDisplay,
    multilabel_confusion_matrix,
)
from lightgbm import LGBMClassifier
from openpyxl import load_workbook
import itertools
from sklearn.model_selection import RandomizedSearchCV, GridSearchCV


In [2]:
df = pd.read_csv('Cleaned and manipulated UFC data.csv')
df.head()

Unnamed: 0,fight_id,date,fighter_x_name,fighter_y_name,fighter_x_win,Winner,Fight ID_x,Age (in days)_x,Height_x,ELO_fighter_x,new_ELO_fighter_x,NumberOf_Fight_x,NumberOf_WIN_x,NumberOf_LOSE_x,WIN_RATE_x,Height Feet_x,Height Inches_x,Weight Pounds_x,Reach Inches_x,Stance_x,DOB Month_x,DOB Day_x,DOB Year_x,WIN_AVG_Knockdown Total_x,LOSE_AVG_Knockdown Total_x,NumberOf_WIN_shift_2_x,NumberOf_LOSE_shift_2_x,WIN_RATE_shift_2_x,NumberOf_WIN_shift_3_x,NumberOf_LOSE_shift_3_x,WIN_RATE_shift_3_x,NumberOf_WIN_shift_4_x,NumberOf_LOSE_shift_4_x,WIN_RATE_shift_4_x,Fight ID_y,Age (in days)_y,Height_y,ELO_fighter_y,new_ELO_fighter_y,NumberOf_Fight_y,NumberOf_WIN_y,NumberOf_LOSE_y,WIN_RATE_y,Height Feet_y,Height Inches_y,Weight Pounds_y,Reach Inches_y,Stance_y,DOB Month_y,DOB Day_y,DOB Year_y,WIN_AVG_Knockdown Total_y,LOSE_AVG_Knockdown Total_y,NumberOf_WIN_shift_2_y,NumberOf_LOSE_shift_2_y,WIN_RATE_shift_2_y,NumberOf_WIN_shift_3_y,NumberOf_LOSE_shift_3_y,WIN_RATE_shift_3_y,NumberOf_WIN_shift_4_y,NumberOf_LOSE_shift_4_y,WIN_RATE_shift_4_y,Age (in days)_diff,Height_diff,ELO_fighter_diff,new_ELO_fighter_diff,NumberOf_Fight_diff,NumberOf_WIN_diff,NumberOf_LOSE_diff,WIN_RATE_diff,Weight Pounds_diff,Reach Inches_diff,DOB Month_diff,DOB Day_diff,DOB Year_diff,WIN_AVG_Knockdown Total_diff,LOSE_AVG_Knockdown Total_diff,NumberOf_WIN_shift_2_diff,NumberOf_LOSE_shift_2_diff,WIN_RATE_shift_2_diff,NumberOf_WIN_shift_3_diff,NumberOf_LOSE_shift_3_diff,WIN_RATE_shift_3_diff,NumberOf_WIN_shift_4_diff,NumberOf_LOSE_shift_4_diff,WIN_RATE_shift_4_diff
0,1,04-02-2012,DAN STITTGEN,STEPHEN THOMPSON,0,STEPHEN THOMPSON,1,11401,73,1500.0,1500.0,0,0,0,0.0,6,1,170,,Orthodox,11,17,1980,1.0,0.0,0,0,0.0,0,0,0.0,0,0,0.0,1,10585,72,1500.0,1500.0,0,0,0,0.0,6,0,170,75.0,Orthodox,2,11,1983,0.0,0.0,0,0,0.0,0,0,0.0,0,0,0.0,816,1,0.0,0.0,0,0,0,0.0,0,,9,6,-3,1.0,0.0,0,0,0.0,0,0,0.0,0,0,0.0
1,1,04-02-2012,STEPHEN THOMPSON,DAN STITTGEN,1,STEPHEN THOMPSON,1,10585,72,1500.0,1500.0,0,0,0,0.0,6,0,170,75.0,Orthodox,2,11,1983,0.0,0.0,0,0,0.0,0,0,0.0,0,0,0.0,1,11401,73,1500.0,1500.0,0,0,0,0.0,6,1,170,,Orthodox,11,17,1980,1.0,0.0,0,0,0.0,0,0,0.0,0,0,0.0,-816,-1,0.0,0.0,0,0,0,0.0,0,,-9,-6,3,-1.0,0.0,0,0,0.0,0,0,0.0,0,0,0.0
2,2,04-02-2012,RAFAEL NATAL,MICHAEL KUIPER,1,RAFAEL NATAL,2,10633,72,1500.0,1500.0,0,0,0,0.0,6,0,185,77.0,Orthodox,12,25,1982,0.0,0.0,0,0,0.0,0,0,0.0,0,0,0.0,2,8277,72,1500.0,1500.0,0,0,0,0.0,6,0,185,73.0,Orthodox,6,7,1989,0.181818,0.076923,0,0,0.0,0,0,0.0,0,0,0.0,2356,0,0.0,0.0,0,0,0,0.0,0,4.0,6,18,-7,-0.181818,-0.076923,0,0,0.0,0,0,0.0,0,0,0.0
3,2,04-02-2012,MICHAEL KUIPER,RAFAEL NATAL,0,RAFAEL NATAL,2,8277,72,1500.0,1500.0,0,0,0,0.0,6,0,185,73.0,Orthodox,6,7,1989,0.181818,0.076923,0,0,0.0,0,0,0.0,0,0,0.0,2,10633,72,1500.0,1500.0,0,0,0,0.0,6,0,185,77.0,Orthodox,12,25,1982,0.0,0.0,0,0,0.0,0,0,0.0,0,0,0.0,-2356,0,0.0,0.0,0,0,0,0.0,0,-4.0,-6,-18,7,0.181818,0.076923,0,0,0.0,0,0,0.0,0,0,0.0
4,3,04-02-2012,HENRY MARTINEZ,MATTHEW RIDDLE,0,MATTHEW RIDDLE,3,10408,67,1500.0,1500.0,0,0,0,0.0,5,7,155,69.0,Southpaw,8,7,1983,0.4,0.333333,0,0,0.0,0,0,0.0,0,0,0.0,3,9517,73,1500.0,1500.0,0,0,0,0.0,6,1,170,76.0,Southpaw,1,14,1986,0.5,0.0,0,0,0.0,0,0,0.0,0,0,0.0,891,-6,0.0,0.0,0,0,0,0.0,-15,-7.0,7,-7,-3,-0.1,0.333333,0,0,0.0,0,0,0.0,0,0,0.0


In [3]:
def evaluate_model(y_true, y_pred):
    """
    :param y_true: ground truth values
    :param y_pred: predictions
    """
    y_true = np.array(y_true)
    y_pred = np.array(y_pred)

    report = classification_report(y_true, y_pred)
    print("Classification Report\n", report)

    cm = confusion_matrix(y_true, y_pred)

In [4]:
drop_list = ['Height Feet_x', 'Height Inches_x', 'DOB Month_x', 'DOB Day_x', 'DOB Year_x', 
             'Height Feet_y', 'Height Inches_y', 'DOB Month_y', 'DOB Day_y', 'DOB Year_y']

df1 = df.drop(columns=drop_list, axis=0)

In [5]:
for col in df1.columns:
    if df1[col].dtype == "object":
            df1[col].fillna(df1[col].mode()[0],inplace=True)
    else:
        df1[col].fillna(df1[col].mean(), inplace=True)

In [6]:
test_dates = df1[df1['date'] > '2022-06-04']['date'].unique()
test_dates[:5]

array(['25-02-2012', '21-04-2012', '26-05-2012', '22-06-2012',
       '23-06-2012'], dtype=object)

In [7]:
def train_model(train_data_encoded, test_dates, file_path="backtest_prediction_202501_v2.csv"):
    model_trained = 0
    # train_data_encoded = pd.get_dummies(train_data_encoded, columns=['Stance_x', 'Stance_y'])
    new_future_x_win_list = []
    
    for future_date in test_dates:
        print(future_date)
        train_df = train_data_encoded[train_data_encoded['date']<future_date]
        train_df = train_df.copy()
        
        val_df = train_data_encoded[train_data_encoded['date']>=future_date]
        val_df = val_df.copy()

        train_data_without_leakage = train_data_encoded[train_data_encoded['date']==future_date]
        new_future_x_win = train_data_without_leakage['fighter_x_win']
        new_future_x_win_list = new_future_x_win_list + list(new_future_x_win)
        new_future = train_data_without_leakage.drop(columns=['fighter_x_win'],axis=1)

        if model_trained == 0:
            n_HP_points_to_test = 100
            param_grid = {
            'num_leaves': [5, 20, 31],
            'learning_rate': [0.05, 0.1, 0.2],
            'n_estimators': [50, 100, 150]
            }
            model = LGBMClassifier(max_depth=-1, random_state=314, silent=True, metric='None', n_jobs=4, n_estimators=5000)
            gs = RandomizedSearchCV(
                estimator=model, param_distributions=param_grid, 
                n_iter=n_HP_points_to_test,
                scoring='roc_auc',
                cv=3,
                refit=True,
                random_state=314,
                verbose=True
                )
            
            gs.fit(train_df.drop(['Winner',"fight_id","Fight ID_x","Fight ID_y","date",'fighter_x_name','fighter_y_name', 'fighter_x_win'], axis=1), train_df["fighter_x_win"])

            print("Training Evaluation")
            evaluate_model(train_df["fighter_x_win"],gs.predict(train_df.drop(['Winner',"fight_id","Fight ID_x","Fight ID_y","date",'fighter_x_name','fighter_y_name', 'fighter_x_win'], axis=1)))
            print("Testing Evaluation")
            evaluate_model(val_df["fighter_x_win"],gs.predict(val_df.drop(['Winner',"fight_id","Fight ID_x","Fight ID_y","date",'fighter_x_name','fighter_y_name', 'fighter_x_win'], axis=1)))
            
            model_trained = 1

        y_pred = gs.predict(new_future.drop(['Winner',"fight_id","Fight ID_x","Fight ID_y","date",'fighter_x_name','fighter_y_name'],axis=1))
        y_proba_all = gs.predict_proba(new_future.drop(['Winner',"fight_id","Fight ID_x","Fight ID_y","date",'fighter_x_name','fighter_y_name'],axis=1))[:, 1]
        print(f'Future Date: {future_date} ............ Evaluation')
        evaluate_model(new_future_x_win,y_pred)

        new_future["x_win"]  = y_pred
        new_future["probability"] = y_proba_all
        if os.path.exists(file_path):
            # Load the existing CSV into a DataFrame
            existing_df = pd.read_csv(file_path)
            # Append the new data (future_test DataFrame)       
            updated_df = pd.concat([existing_df, new_future], ignore_index=True)
            # Write the updated DataFrame back to the CSV file
            updated_df.to_csv(file_path, index=False)
        else:
            # If the file doesn't exist, write the future_test DataFrame to a new CSV file
            new_future.to_csv(file_path, index=False)

    
    train_set = train_df.drop(['Winner',"fight_id","Fight ID_x","Fight ID_y","date",'fighter_x_name','fighter_y_name', 'fighter_x_win'], axis=1)

    # Access the best estimator and feature importances
    best_model = gs.best_estimator_

    # If the estimator supports feature_importances_
    if hasattr(best_model, "feature_importances_"):
        # feature_importances = best_model.feature_importances_

        print("Feature Importances:")

        importance_df = pd.DataFrame({
            'Feature': train_set.columns,
            'Importance': best_model.feature_importances_
        }).sort_values(by='Importance', ascending=False)
        
        print(importance_df)
    else:
        print("The estimator does not support feature_importances_.")

    return new_future_x_win_list, importance_df

## FEATURE ENGINEERING

In [8]:
df2 = df.copy()

In [9]:
for col in df2.columns:
    if df2[col].dtype == "object":
            df2[col].fillna(df2[col].mode()[0],inplace=True)
    else:
        df2[col].fillna(df2[col].mean(), inplace=True)

In [10]:
# Feet and inches to meters
df2['Height_x_m'] = (df2['Height Feet_x'] * 12 + df2['Height Inches_x']) * 0.0254  
df2['Height_y_m'] = (df2['Height Feet_y'] * 12 + df2['Height Inches_y']) * 0.0254  

# Pounds to kilograms
df2['Weight_x_kg'] = df2['Weight Pounds_x'] * 0.453592  
df2['Weight_y_kg'] = df2['Weight Pounds_y'] * 0.453592  

# Inches to meters
df2['Reach Meters_x'] = df2['Reach Inches_x']*(0.0254)
df2['Reach Meters_y'] = df2['Reach Inches_y']*(0.0254)

# Height-to-Reach Ratio
df2['Height_to_Reach_Ratio_x'] = df2['Height_x_m'] / df2['Reach Meters_x']
df2['Height_to_Reach_Ratio_y'] = df2['Height_y_m'] / df2['Reach Meters_y']

# BMI Calculation
df2['BMI_x'] = df2['Weight_x_kg'] / (df2['Height_x_m'] ** 2)
df2['BMI_y'] = df2['Weight_y_kg'] / (df2['Height_y_m'] ** 2)

# Drop redundant columns
columns_to_drop = [
    'Height Feet_x', 'Height Inches_x', 'Weight Pounds_x', 'Height_x', 'Reach Inches_x',       
    'Height Feet_y', 'Height Inches_y', 'Weight Pounds_y', 'Height_y', 'Reach Inches_y']
df2.drop(columns=columns_to_drop, inplace=True)

In [11]:
# Calculating Win-to-Loss Ratios for both fighters
df2['Win_to_Loss_Ratio_x'] = df2['NumberOf_WIN_x'] / (df2['NumberOf_LOSE_x'] + 1)
df2['Win_to_Loss_Ratio_y'] = df2['NumberOf_WIN_y'] / (df2['NumberOf_LOSE_y'] + 1)

In [12]:
# Calculating weighted WIN_RATE for fighter_x
df2['Weighted_WIN_RATE_x'] = (
    0.5 * df2['WIN_RATE_shift_2_x'] + 
    0.3 * df2['WIN_RATE_shift_3_x'] + 
    0.2 * df2['WIN_RATE_shift_4_x']
)

# Calculating weighted WIN_RATE for fighter_y
df2['Weighted_WIN_RATE_y'] = (
    0.5 * df2['WIN_RATE_shift_2_y'] + 
    0.3 * df2['WIN_RATE_shift_3_y'] + 
    0.2 * df2['WIN_RATE_shift_4_y']
)

# Dropping the original columns if no longer needed
columns_to_drop = [
    'WIN_RATE_shift_2_x', 'WIN_RATE_shift_3_x', 'WIN_RATE_shift_4_x',
    'WIN_RATE_shift_2_y', 'WIN_RATE_shift_3_y', 'WIN_RATE_shift_4_y'
]
# df2 = df2.drop(columns=columns_to_drop, errors='ignore')

In [13]:
# Define weights
weights = [0.5, 0.3, 0.2]  # Weights for shift 2, 3, and 4
total_weight = sum(weights)
normalized_weights = [w / total_weight for w in weights]  # Normalize the weights

# List of columns for each shift
shift_columns_x = ['WIN_RATE_shift_2_x', 'WIN_RATE_shift_3_x', 'WIN_RATE_shift_4_x']
shift_columns_y = ['WIN_RATE_shift_2_y', 'WIN_RATE_shift_3_y', 'WIN_RATE_shift_4_y']

# Loop for x and y to calculate weighted averages
for prefix, shift_columns in zip(['x', 'y'], [shift_columns_x, shift_columns_y]):
    weight_columns = [f'WIN_RATE_shift_2_{prefix}', f'WIN_RATE_shift_3_{prefix}', f'WIN_RATE_shift_4_{prefix}']
    weighted_average_column = f'Weighted_WIN_RATE_{prefix}'

    # Calculate the weighted average
    df2[weighted_average_column] = sum(df2[shift_columns[i]] * normalized_weights[i] for i in range(3))

# Drop the individual shift columns
df2.drop(columns=shift_columns_x + shift_columns_y, inplace=True)

In [14]:
# Drop the individual DOB components for Fighter x and y
df2.drop(['DOB Year_x', 'DOB Month_x', 'DOB Day_x', 'DOB Year_y', 'DOB Month_y', 'DOB Day_y'], axis=1, inplace=True)

In [15]:
# Convert Age (in days)_k to Age_k (in years)
df2['Age_x'] = df2['Age (in days)_x'] // 365
df2['Age_y'] = df2['Age (in days)_y'] // 365

# Calculate Experience Growth for Fighter x
df2['Experience_Growth_x'] = df2['NumberOf_Fight_x'] / df2['Age_x']
df2['Experience_Growth_y'] = df2['NumberOf_Fight_y'] / df2['Age_y']

# Drop Age_x and Age_y columns after calculation
df2.drop(['Age_x', 'Age_y'], axis=1, inplace=True)


In [16]:
df2.tail()

Unnamed: 0,fight_id,date,fighter_x_name,fighter_y_name,fighter_x_win,Winner,Fight ID_x,Age (in days)_x,ELO_fighter_x,new_ELO_fighter_x,NumberOf_Fight_x,NumberOf_WIN_x,NumberOf_LOSE_x,WIN_RATE_x,Stance_x,WIN_AVG_Knockdown Total_x,LOSE_AVG_Knockdown Total_x,NumberOf_WIN_shift_2_x,NumberOf_LOSE_shift_2_x,NumberOf_WIN_shift_3_x,NumberOf_LOSE_shift_3_x,NumberOf_WIN_shift_4_x,NumberOf_LOSE_shift_4_x,Fight ID_y,Age (in days)_y,ELO_fighter_y,new_ELO_fighter_y,NumberOf_Fight_y,NumberOf_WIN_y,NumberOf_LOSE_y,WIN_RATE_y,Stance_y,WIN_AVG_Knockdown Total_y,LOSE_AVG_Knockdown Total_y,NumberOf_WIN_shift_2_y,NumberOf_LOSE_shift_2_y,NumberOf_WIN_shift_3_y,NumberOf_LOSE_shift_3_y,NumberOf_WIN_shift_4_y,NumberOf_LOSE_shift_4_y,Age (in days)_diff,Height_diff,ELO_fighter_diff,new_ELO_fighter_diff,NumberOf_Fight_diff,NumberOf_WIN_diff,NumberOf_LOSE_diff,WIN_RATE_diff,Weight Pounds_diff,Reach Inches_diff,DOB Month_diff,DOB Day_diff,DOB Year_diff,WIN_AVG_Knockdown Total_diff,LOSE_AVG_Knockdown Total_diff,NumberOf_WIN_shift_2_diff,NumberOf_LOSE_shift_2_diff,WIN_RATE_shift_2_diff,NumberOf_WIN_shift_3_diff,NumberOf_LOSE_shift_3_diff,WIN_RATE_shift_3_diff,NumberOf_WIN_shift_4_diff,NumberOf_LOSE_shift_4_diff,WIN_RATE_shift_4_diff,Height_x_m,Height_y_m,Weight_x_kg,Weight_y_kg,Reach Meters_x,Reach Meters_y,Height_to_Reach_Ratio_x,Height_to_Reach_Ratio_y,BMI_x,BMI_y,Win_to_Loss_Ratio_x,Win_to_Loss_Ratio_y,Weighted_WIN_RATE_x,Weighted_WIN_RATE_y,Experience_Growth_x,Experience_Growth_y
12045,6023,10-12-2024,CORY MCKENNA,JULIA POLASTRI,0,JULIA POLASTRI,6023,9284,1512.999739,1525.626133,5,3,2,0.6,Orthodox,0.0,0.0,1,1,2,1,2,2,6023,9777,1484.736307,1481.509201,1,0,1,0.0,Orthodox,0.333333,0.0,0,1,0,1,0,1,-493,1,28.263433,44.116932,4,3,1,0.6,0,-5.0,4,6,1,-0.333333,0.0,1,0,0.5,2,0,0.666667,2,1,0.5,1.6002,1.5748,52.16308,52.16308,1.4732,1.6002,1.086207,0.984127,20.37111,21.033542,1.0,0.0,0.55,0.0,0.2,0.038462
12046,6024,10-12-2024,CODY HADDON,DAN ARGUETA,0,DAN ARGUETA,6024,9590,1500.0,1500.0,0,0,0,0.0,Orthodox,0.0,0.2,0,0,0,0,0,0,6024,11442,1517.158166,1474.634607,5,3,2,0.6,Southpaw,0.0,0.0,1,1,2,1,3,1,-1852,0,-17.158166,25.365393,-5,-3,-2,-0.6,0,1.0,1,-5,5,0.0,0.2,-1,-1,-0.5,-2,-1,-0.666667,-3,-1,-0.75,1.7018,1.7018,61.23492,61.23492,1.7526,1.7272,0.971014,0.985294,21.143755,21.143755,0.0,1.0,0.0,0.6,0.0,0.16129
12047,6024,10-12-2024,DAN ARGUETA,CODY HADDON,0,CODY HADDON,6024,11442,1517.158166,1474.634607,5,3,2,0.6,Southpaw,0.0,0.0,1,1,2,1,3,1,6024,9590,1500.0,1500.0,0,0,0,0.0,Orthodox,0.0,0.2,0,0,0,0,0,0,1852,0,17.158166,-25.365393,5,3,2,0.6,0,-1.0,-1,5,-5,0.0,-0.2,1,1,0.5,2,1,0.666667,3,1,0.75,1.7018,1.7018,61.23492,61.23492,1.7272,1.7526,0.985294,0.971014,21.143755,21.143755,1.0,0.0,0.6,0.0,0.16129,0.0
12048,6025,10-12-2024,LUCAS ROCHA,CLAYTON CARPENTER,0,CLAYTON CARPENTER,6025,8886,1500.0,1500.0,0,0,0,0.0,Orthodox,1.0,0.333333,0,0,0,0,0,0,6025,10411,1515.328319,1522.186576,1,1,0,1.0,Orthodox,0.0,0.090909,1,0,1,0,1,0,-1525,-3,-15.328319,-22.186576,-1,-1,0,-1.0,0,-2.0,2,3,4,1.0,0.242424,-1,0,-1.0,-1,0,-1.0,-1,0,-1.0,1.6002,1.6764,56.699,56.699,1.6256,1.6764,0.984375,1.0,22.142511,20.175304,0.0,1.0,0.0,1.0,0.0,0.035714
12049,6025,10-12-2024,CLAYTON CARPENTER,LUCAS ROCHA,0,LUCAS ROCHA,6025,10411,1515.328319,1522.186576,1,1,0,1.0,Orthodox,0.0,0.090909,1,0,1,0,1,0,6025,8886,1500.0,1500.0,0,0,0,0.0,Orthodox,1.0,0.333333,0,0,0,0,0,0,1525,3,15.328319,22.186576,1,1,0,1.0,0,2.0,-2,-3,-4,-1.0,-0.242424,1,0,1.0,1,0,1.0,1,0,1.0,1.6764,1.6002,56.699,56.699,1.6764,1.6256,1.0,0.984375,20.175304,22.142511,1.0,0.0,1.0,0.0,0.035714,0.0


In [17]:
df_hist = pd.read_csv('historical_data.csv')
df_hist.head()

Unnamed: 0,Fight ID,Event Title,date,Event Location,Weight Class,Winning Method,Winning Round,Winning Time,Winner First Name,Winner Last Name,Fighter First Name,Fighter Last Name,Height Feet,Height Inches,Weight Pounds,Reach Inches,Stance,Date of Birth,Knockdown Total,Significant Strike Total Attempted,Significant Strike Total Landed,Takedown Total Attempted,Takedown Total Landed,Submission Attempted,Reversal,Ground and Cage Control Time,Significant Strike Head Attempted,Significant Strike Head Landed,Significant Strike Body Attempted,Significant Strike Body Landed,Significant Strike Leg Attempted,Significant Strike Leg Landed,Significant Strike Clinch Attempted,Significant Strike Clinch Landed,Significant Strike Ground Attempted,Significant Strike Ground Landed,Round 1 Knockdown Total,Round 1 Significant Strike Total Attempted,Round 1 Significant Strike Total Landed,Round 1 Takedown Total Attempted,Round 1 Takedown Total Landed,Round 1 Submission Attempted,Round 1 Reversal,Round 1 Ground and Cage Control Time,Round 1 Significant Strike Head Attempted,Round 1 Significant Strike Head Landed,Round 1 Significant Strike Body Attempted,Round 1 Significant Strike Body Landed,Round 1 Significant Strike Leg Attempted,Round 1 Significant Strike Leg Landed,Round 1 Significant Strike Clinch Attempted,Round 1 Significant Strike Clinch Landed,Round 1 Significant Strike Ground Attempted,Round 1 Significant Strike Ground Landed,Round 2 Knockdown Total,Round 2 Significant Strike Total Attempted,Round 2 Significant Strike Total Landed,Round 2 Takedown Total Attempted,Round 2 Takedown Total Landed,Round 2 Submission Attempted,Round 2 Reversal,Round 2 Ground and Cage Control Time,Round 2 Significant Strike Head Attempted,Round 2 Significant Strike Head Landed,Round 2 Significant Strike Body Attempted,Round 2 Significant Strike Body Landed,Round 2 Significant Strike Leg Attempted,Round 2 Significant Strike Leg Landed,Round 2 Significant Strike Clinch Attempted,Round 2 Significant Strike Clinch Landed,Round 2 Significant Strike Ground Attempted,Round 2 Significant Strike Ground Landed,Round 3 Knockdown Total,Round 3 Significant Strike Total Attempted,Round 3 Significant Strike Total Landed,Round 3 Takedown Total Attempted,Round 3 Takedown Total Landed,Round 3 Submission Attempted,Round 3 Reversal,Round 3 Ground and Cage Control Time,Round 3 Significant Strike Head Attempted,Round 3 Significant Strike Head Landed,Round 3 Significant Strike Body Attempted,Round 3 Significant Strike Body Landed,Round 3 Significant Strike Leg Attempted,Round 3 Significant Strike Leg Landed,Round 3 Significant Strike Clinch Attempted,Round 3 Significant Strike Clinch Landed,Round 3 Significant Strike Ground Attempted,Round 3 Significant Strike Ground Landed,Round 4 Knockdown Total,Round 4 Significant Strike Total Attempted,Round 4 Significant Strike Total Landed,Round 4 Takedown Total Attempted,Round 4 Takedown Total Landed,Round 4 Submission Attempted,Round 4 Reversal,Round 4 Ground and Cage Control Time,Round 4 Significant Strike Head Attempted,Round 4 Significant Strike Head Landed,Round 4 Significant Strike Body Attempted,Round 4 Significant Strike Body Landed,Round 4 Significant Strike Leg Attempted,Round 4 Significant Strike Leg Landed,Round 4 Significant Strike Clinch Attempted,Round 4 Significant Strike Clinch Landed,Round 4 Significant Strike Ground Attempted,Round 4 Significant Strike Ground Landed,Round 5 Knockdown Total,Round 5 Significant Strike Total Attempted,Round 5 Significant Strike Total Landed,Round 5 Takedown Total Attempted,Round 5 Takedown Total Landed,Round 5 Submission Attempted,Round 5 Reversal,Round 5 Ground and Cage Control Time,Round 5 Significant Strike Head Attempted,Round 5 Significant Strike Head Landed,Round 5 Significant Strike Body Attempted,Round 5 Significant Strike Body Landed,Round 5 Significant Strike Leg Attempted,Round 5 Significant Strike Leg Landed,Round 5 Significant Strike Clinch Attempted,Round 5 Significant Strike Clinch Landed,Round 5 Significant Strike Ground Attempted,Round 5 Significant Strike Ground Landed,odds
0,1,UFC on FX: Guillard vs Miller,20-01-2012,"Nashville, Tennessee, USA",Bantamweight,KO/TKO,1.0,0.22,Nick,Denis,Joseph,Sandoval,5,7,135,,Southpaw,11-05-1986,1.0,23.0,11.0,0.0,0.0,0.0,0.0,0.01,23.0,11.0,0.0,0.0,0.0,0.0,5.0,4.0,0.0,0.0,1.0,23.0,11.0,0.0,0.0,0.0,0.0,0.01,23.0,11.0,0.0,0.0,0.0,0.0,5.0,4.0,0.0,0.0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,236.0
1,1,UFC on FX: Guillard vs Miller,20-01-2012,"Nashville, Tennessee, USA",Bantamweight,KO/TKO,1.0,0.22,Nick,Denis,Nick,Denis,5,7,135,,Orthodox,11-10-1983,0.0,4.0,3.0,0.0,0.0,0.0,0.0,0.0,4.0,3.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,4.0,3.0,0.0,0.0,0.0,0.0,0.0,4.0,3.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,-292.86
2,2,UFC on FX: Guillard vs Miller,20-01-2012,"Nashville, Tennessee, USA",Featherweight,SUB,1.0,1.37,Daniel,Pineda,Pat,Schilling,5,8,145,,Orthodox,28-08-1988,0.0,7.0,4.0,0.0,0.0,0.0,0.0,0.0,6.0,3.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,7.0,4.0,0.0,0.0,0.0,0.0,0.0,6.0,3.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,119.14
3,2,UFC on FX: Guillard vs Miller,20-01-2012,"Nashville, Tennessee, USA",Featherweight,SUB,1.0,1.37,Daniel,Pineda,Daniel,Pineda,5,7,145,69.0,Orthodox,06-08-1985,0.0,24.0,19.0,1.0,1.0,1.0,0.0,0.45,21.0,16.0,1.0,1.0,2.0,2.0,0.0,0.0,12.0,11.0,0.0,24.0,19.0,1.0,1.0,1.0,0.0,0.45,21.0,16.0,1.0,1.0,2.0,2.0,0.0,0.0,12.0,11.0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,-143.71
4,3,UFC on FX: Guillard vs Miller,20-01-2012,"Nashville, Tennessee, USA",Lightweight,SUB,1.0,4.03,Fabricio,Camoes,Tommy,Hayden,5,9,145,,Southpaw,11-03-1986,0.0,21.0,7.0,3.0,2.0,0.0,0.0,1.2,18.0,5.0,1.0,1.0,2.0,1.0,1.0,1.0,0.0,0.0,0.0,21.0,7.0,3.0,2.0,0.0,0.0,1.2,18.0,5.0,1.0,1.0,2.0,1.0,1.0,1.0,0.0,0.0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,262.86


In [18]:
df_hist['date'] = pd.to_datetime(df_hist['date'], format='%d-%m-%Y')

In [19]:
df_hist = df_hist[df_hist['date'] > '2012-02-01']
df_hist.shape

(12252, 127)

In [20]:
df_hist.date.describe()

count                            12252
mean     2018-11-05 19:05:56.121449728
min                2012-02-04 00:00:00
25%                2015-08-23 00:00:00
50%                2018-12-08 00:00:00
75%                2022-01-22 00:00:00
max                2024-12-14 00:00:00
Name: date, dtype: object

In [21]:
df1.shape

(12050, 76)

In [24]:
# Get value counts of the 'date' column
date_counts = df_hist['date'].value_counts().reset_index()

# Rename the columns
date_counts.columns = ['date', 'count']

# Sort the DataFrame by 'date'
date_counts_sorted = date_counts.sort_values(by='date', ascending=True)


df3 = df2.copy()
df3['date'] = pd.to_datetime(df3['date'], format='%d-%m-%Y')
# Get value counts of the 'date' column
date_counts_clean = df3['date'].value_counts().reset_index()

# Rename the columns
date_counts_clean.columns = ['date', 'count']

# Sort the DataFrame by 'date'
date_counts_clean_sorted = date_counts_clean.sort_values(by='date', ascending=True)


# Merge the two dataframes on the 'date' column, with an outer join to keep all dates
merged_df = pd.merge(date_counts_sorted, date_counts_clean_sorted, on='date', how='outer', suffixes=('_df_hist', '_df2'))

# Display the result
merged_df

Unnamed: 0,date,count_df_hist,count_df2
0,2012-02-04,22.0,22.0
1,2012-02-15,18.0,18.0
2,2012-02-25,24.0,24.0
3,2012-03-02,22.0,22.0
4,2012-04-14,24.0,24.0
5,2012-04-21,24.0,24.0
6,2012-05-05,24.0,24.0
7,2012-05-15,24.0,24.0
8,2012-05-26,24.0,24.0
9,2012-06-01,20.0,20.0


In [25]:
df_hist_final = df_hist[df_hist.date < '2024-10-12']

In [26]:
df_hist_final.shape

(12028, 127)

In [27]:
df2_final = df3[df3.date < '2024-10-12']

In [28]:
df2_final.shape

(12028, 80)

In [29]:
# Combine 'Fighter First Name' and 'Fighter Last Name' into a new column 'fighter_name'
df_hist_final['fighter_x_name'] = df_hist_final['Fighter First Name'] + ' ' + df_hist_final['Fighter Last Name']

In [30]:
df_hist_final.head()

Unnamed: 0,Fight ID,Event Title,date,Event Location,Weight Class,Winning Method,Winning Round,Winning Time,Winner First Name,Winner Last Name,Fighter First Name,Fighter Last Name,Height Feet,Height Inches,Weight Pounds,Reach Inches,Stance,Date of Birth,Knockdown Total,Significant Strike Total Attempted,Significant Strike Total Landed,Takedown Total Attempted,Takedown Total Landed,Submission Attempted,Reversal,Ground and Cage Control Time,Significant Strike Head Attempted,Significant Strike Head Landed,Significant Strike Body Attempted,Significant Strike Body Landed,Significant Strike Leg Attempted,Significant Strike Leg Landed,Significant Strike Clinch Attempted,Significant Strike Clinch Landed,Significant Strike Ground Attempted,Significant Strike Ground Landed,Round 1 Knockdown Total,Round 1 Significant Strike Total Attempted,Round 1 Significant Strike Total Landed,Round 1 Takedown Total Attempted,Round 1 Takedown Total Landed,Round 1 Submission Attempted,Round 1 Reversal,Round 1 Ground and Cage Control Time,Round 1 Significant Strike Head Attempted,Round 1 Significant Strike Head Landed,Round 1 Significant Strike Body Attempted,Round 1 Significant Strike Body Landed,Round 1 Significant Strike Leg Attempted,Round 1 Significant Strike Leg Landed,Round 1 Significant Strike Clinch Attempted,Round 1 Significant Strike Clinch Landed,Round 1 Significant Strike Ground Attempted,Round 1 Significant Strike Ground Landed,Round 2 Knockdown Total,Round 2 Significant Strike Total Attempted,Round 2 Significant Strike Total Landed,Round 2 Takedown Total Attempted,Round 2 Takedown Total Landed,Round 2 Submission Attempted,Round 2 Reversal,Round 2 Ground and Cage Control Time,Round 2 Significant Strike Head Attempted,Round 2 Significant Strike Head Landed,Round 2 Significant Strike Body Attempted,Round 2 Significant Strike Body Landed,Round 2 Significant Strike Leg Attempted,Round 2 Significant Strike Leg Landed,Round 2 Significant Strike Clinch Attempted,Round 2 Significant Strike Clinch Landed,Round 2 Significant Strike Ground Attempted,Round 2 Significant Strike Ground Landed,Round 3 Knockdown Total,Round 3 Significant Strike Total Attempted,Round 3 Significant Strike Total Landed,Round 3 Takedown Total Attempted,Round 3 Takedown Total Landed,Round 3 Submission Attempted,Round 3 Reversal,Round 3 Ground and Cage Control Time,Round 3 Significant Strike Head Attempted,Round 3 Significant Strike Head Landed,Round 3 Significant Strike Body Attempted,Round 3 Significant Strike Body Landed,Round 3 Significant Strike Leg Attempted,Round 3 Significant Strike Leg Landed,Round 3 Significant Strike Clinch Attempted,Round 3 Significant Strike Clinch Landed,Round 3 Significant Strike Ground Attempted,Round 3 Significant Strike Ground Landed,Round 4 Knockdown Total,Round 4 Significant Strike Total Attempted,Round 4 Significant Strike Total Landed,Round 4 Takedown Total Attempted,Round 4 Takedown Total Landed,Round 4 Submission Attempted,Round 4 Reversal,Round 4 Ground and Cage Control Time,Round 4 Significant Strike Head Attempted,Round 4 Significant Strike Head Landed,Round 4 Significant Strike Body Attempted,Round 4 Significant Strike Body Landed,Round 4 Significant Strike Leg Attempted,Round 4 Significant Strike Leg Landed,Round 4 Significant Strike Clinch Attempted,Round 4 Significant Strike Clinch Landed,Round 4 Significant Strike Ground Attempted,Round 4 Significant Strike Ground Landed,Round 5 Knockdown Total,Round 5 Significant Strike Total Attempted,Round 5 Significant Strike Total Landed,Round 5 Takedown Total Attempted,Round 5 Takedown Total Landed,Round 5 Submission Attempted,Round 5 Reversal,Round 5 Ground and Cage Control Time,Round 5 Significant Strike Head Attempted,Round 5 Significant Strike Head Landed,Round 5 Significant Strike Body Attempted,Round 5 Significant Strike Body Landed,Round 5 Significant Strike Leg Attempted,Round 5 Significant Strike Leg Landed,Round 5 Significant Strike Clinch Attempted,Round 5 Significant Strike Clinch Landed,Round 5 Significant Strike Ground Attempted,Round 5 Significant Strike Ground Landed,odds,fighter_x_name
40,21,UFC 143: Diaz vs Condit,2012-02-04,"Las Vegas, Nevada, USA",Welterweight,KO/TKO,1.0,4.13,Stephen,Thompson,Stephen,Thompson,6,0,170,75.0,Orthodox,11-02-1983,0.0,22.0,8.0,0.0,0.0,0.0,0.0,0.0,5.0,0.0,5.0,1.0,12.0,7.0,0.0,0.0,0.0,0.0,0.0,22.0,8.0,0.0,0.0,0.0,0.0,0.0,5.0,0.0,5.0,1.0,12.0,7.0,0.0,0.0,0.0,0.0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,-345.0,Stephen Thompson
41,21,UFC 143: Diaz vs Condit,2012-02-04,"Las Vegas, Nevada, USA",Welterweight,KO/TKO,1.0,4.13,Stephen,Thompson,Dan,Stittgen,6,1,170,,Orthodox,17-11-1980,1.0,38.0,22.0,0.0,0.0,0.0,0.0,0.01,23.0,9.0,9.0,7.0,6.0,6.0,0.0,0.0,2.0,2.0,1.0,38.0,22.0,0.0,0.0,0.0,0.0,0.01,23.0,9.0,9.0,7.0,6.0,6.0,0.0,0.0,2.0,2.0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,273.57,Dan Stittgen
42,22,UFC 143: Diaz vs Condit,2012-02-04,"Las Vegas, Nevada, USA",Middleweight,U-DEC,3.0,5.0,Rafael,Natal,Rafael,Natal,6,0,185,77.0,Orthodox,25-12-1982,0.0,72.0,36.0,16.0,6.0,2.0,0.0,6.24,56.0,25.0,8.0,5.0,8.0,6.0,19.0,11.0,2.0,1.0,0.0,9.0,3.0,7.0,4.0,1.0,0.0,2.28,6.0,0.0,1.0,1.0,2.0,2.0,1.0,1.0,0.0,0.0,0.0,59.0,31.0,7.0,1.0,0.0,0.0,1.23,47.0,24.0,7.0,4.0,5.0,3.0,18.0,10.0,0.0,0.0,0.0,4.0,2.0,2.0,1.0,1.0,0.0,2.33,3.0,1.0,0.0,0.0,1.0,1.0,0.0,0.0,2.0,1.0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,-162.14,Rafael Natal
43,22,UFC 143: Diaz vs Condit,2012-02-04,"Las Vegas, Nevada, USA",Middleweight,U-DEC,3.0,5.0,Rafael,Natal,Michael,Kuiper,6,0,185,73.0,Orthodox,07-06-1989,1.0,62.0,30.0,0.0,0.0,0.0,1.0,3.43,52.0,24.0,4.0,3.0,6.0,3.0,12.0,4.0,24.0,18.0,0.0,11.0,4.0,0.0,0.0,0.0,1.0,1.18,6.0,1.0,1.0,1.0,4.0,2.0,2.0,2.0,3.0,1.0,0.0,29.0,8.0,0.0,0.0,0.0,0.0,0.16,24.0,5.0,3.0,2.0,2.0,1.0,10.0,2.0,0.0,0.0,1.0,22.0,18.0,0.0,0.0,0.0,0.0,2.09,22.0,18.0,0.0,0.0,0.0,0.0,0.0,0.0,21.0,17.0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,134.71,Michael Kuiper
44,23,UFC 143: Diaz vs Condit,2012-02-04,"Las Vegas, Nevada, USA",Welterweight,S-DEC,3.0,5.0,Matthew,Riddle,Henry,Martinez,5,7,155,69.0,Southpaw,07-08-1983,0.0,169.0,67.0,1.0,0.0,0.0,0.0,0.0,149.0,50.0,6.0,3.0,14.0,14.0,0.0,0.0,0.0,0.0,0.0,70.0,29.0,0.0,0.0,0.0,0.0,0.0,61.0,22.0,2.0,0.0,7.0,7.0,0.0,0.0,0.0,0.0,0.0,69.0,27.0,1.0,0.0,0.0,0.0,0.0,63.0,22.0,3.0,2.0,3.0,3.0,0.0,0.0,0.0,0.0,0.0,30.0,11.0,0.0,0.0,0.0,0.0,0.0,25.0,6.0,1.0,1.0,4.0,4.0,0.0,0.0,0.0,0.0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,310.14,Henry Martinez


----------------

In [31]:
df_hist_merging_data = df_hist_final.iloc[:, [2] + list(range(18, df_hist_final.shape[1]))]
df_hist_merging_data.head()

Unnamed: 0,date,Knockdown Total,Significant Strike Total Attempted,Significant Strike Total Landed,Takedown Total Attempted,Takedown Total Landed,Submission Attempted,Reversal,Ground and Cage Control Time,Significant Strike Head Attempted,Significant Strike Head Landed,Significant Strike Body Attempted,Significant Strike Body Landed,Significant Strike Leg Attempted,Significant Strike Leg Landed,Significant Strike Clinch Attempted,Significant Strike Clinch Landed,Significant Strike Ground Attempted,Significant Strike Ground Landed,Round 1 Knockdown Total,Round 1 Significant Strike Total Attempted,Round 1 Significant Strike Total Landed,Round 1 Takedown Total Attempted,Round 1 Takedown Total Landed,Round 1 Submission Attempted,Round 1 Reversal,Round 1 Ground and Cage Control Time,Round 1 Significant Strike Head Attempted,Round 1 Significant Strike Head Landed,Round 1 Significant Strike Body Attempted,Round 1 Significant Strike Body Landed,Round 1 Significant Strike Leg Attempted,Round 1 Significant Strike Leg Landed,Round 1 Significant Strike Clinch Attempted,Round 1 Significant Strike Clinch Landed,Round 1 Significant Strike Ground Attempted,Round 1 Significant Strike Ground Landed,Round 2 Knockdown Total,Round 2 Significant Strike Total Attempted,Round 2 Significant Strike Total Landed,Round 2 Takedown Total Attempted,Round 2 Takedown Total Landed,Round 2 Submission Attempted,Round 2 Reversal,Round 2 Ground and Cage Control Time,Round 2 Significant Strike Head Attempted,Round 2 Significant Strike Head Landed,Round 2 Significant Strike Body Attempted,Round 2 Significant Strike Body Landed,Round 2 Significant Strike Leg Attempted,Round 2 Significant Strike Leg Landed,Round 2 Significant Strike Clinch Attempted,Round 2 Significant Strike Clinch Landed,Round 2 Significant Strike Ground Attempted,Round 2 Significant Strike Ground Landed,Round 3 Knockdown Total,Round 3 Significant Strike Total Attempted,Round 3 Significant Strike Total Landed,Round 3 Takedown Total Attempted,Round 3 Takedown Total Landed,Round 3 Submission Attempted,Round 3 Reversal,Round 3 Ground and Cage Control Time,Round 3 Significant Strike Head Attempted,Round 3 Significant Strike Head Landed,Round 3 Significant Strike Body Attempted,Round 3 Significant Strike Body Landed,Round 3 Significant Strike Leg Attempted,Round 3 Significant Strike Leg Landed,Round 3 Significant Strike Clinch Attempted,Round 3 Significant Strike Clinch Landed,Round 3 Significant Strike Ground Attempted,Round 3 Significant Strike Ground Landed,Round 4 Knockdown Total,Round 4 Significant Strike Total Attempted,Round 4 Significant Strike Total Landed,Round 4 Takedown Total Attempted,Round 4 Takedown Total Landed,Round 4 Submission Attempted,Round 4 Reversal,Round 4 Ground and Cage Control Time,Round 4 Significant Strike Head Attempted,Round 4 Significant Strike Head Landed,Round 4 Significant Strike Body Attempted,Round 4 Significant Strike Body Landed,Round 4 Significant Strike Leg Attempted,Round 4 Significant Strike Leg Landed,Round 4 Significant Strike Clinch Attempted,Round 4 Significant Strike Clinch Landed,Round 4 Significant Strike Ground Attempted,Round 4 Significant Strike Ground Landed,Round 5 Knockdown Total,Round 5 Significant Strike Total Attempted,Round 5 Significant Strike Total Landed,Round 5 Takedown Total Attempted,Round 5 Takedown Total Landed,Round 5 Submission Attempted,Round 5 Reversal,Round 5 Ground and Cage Control Time,Round 5 Significant Strike Head Attempted,Round 5 Significant Strike Head Landed,Round 5 Significant Strike Body Attempted,Round 5 Significant Strike Body Landed,Round 5 Significant Strike Leg Attempted,Round 5 Significant Strike Leg Landed,Round 5 Significant Strike Clinch Attempted,Round 5 Significant Strike Clinch Landed,Round 5 Significant Strike Ground Attempted,Round 5 Significant Strike Ground Landed,odds,fighter_x_name
40,2012-02-04,0.0,22.0,8.0,0.0,0.0,0.0,0.0,0.0,5.0,0.0,5.0,1.0,12.0,7.0,0.0,0.0,0.0,0.0,0.0,22.0,8.0,0.0,0.0,0.0,0.0,0.0,5.0,0.0,5.0,1.0,12.0,7.0,0.0,0.0,0.0,0.0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,-345.0,Stephen Thompson
41,2012-02-04,1.0,38.0,22.0,0.0,0.0,0.0,0.0,0.01,23.0,9.0,9.0,7.0,6.0,6.0,0.0,0.0,2.0,2.0,1.0,38.0,22.0,0.0,0.0,0.0,0.0,0.01,23.0,9.0,9.0,7.0,6.0,6.0,0.0,0.0,2.0,2.0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,273.57,Dan Stittgen
42,2012-02-04,0.0,72.0,36.0,16.0,6.0,2.0,0.0,6.24,56.0,25.0,8.0,5.0,8.0,6.0,19.0,11.0,2.0,1.0,0.0,9.0,3.0,7.0,4.0,1.0,0.0,2.28,6.0,0.0,1.0,1.0,2.0,2.0,1.0,1.0,0.0,0.0,0.0,59.0,31.0,7.0,1.0,0.0,0.0,1.23,47.0,24.0,7.0,4.0,5.0,3.0,18.0,10.0,0.0,0.0,0.0,4.0,2.0,2.0,1.0,1.0,0.0,2.33,3.0,1.0,0.0,0.0,1.0,1.0,0.0,0.0,2.0,1.0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,-162.14,Rafael Natal
43,2012-02-04,1.0,62.0,30.0,0.0,0.0,0.0,1.0,3.43,52.0,24.0,4.0,3.0,6.0,3.0,12.0,4.0,24.0,18.0,0.0,11.0,4.0,0.0,0.0,0.0,1.0,1.18,6.0,1.0,1.0,1.0,4.0,2.0,2.0,2.0,3.0,1.0,0.0,29.0,8.0,0.0,0.0,0.0,0.0,0.16,24.0,5.0,3.0,2.0,2.0,1.0,10.0,2.0,0.0,0.0,1.0,22.0,18.0,0.0,0.0,0.0,0.0,2.09,22.0,18.0,0.0,0.0,0.0,0.0,0.0,0.0,21.0,17.0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,134.71,Michael Kuiper
44,2012-02-04,0.0,169.0,67.0,1.0,0.0,0.0,0.0,0.0,149.0,50.0,6.0,3.0,14.0,14.0,0.0,0.0,0.0,0.0,0.0,70.0,29.0,0.0,0.0,0.0,0.0,0.0,61.0,22.0,2.0,0.0,7.0,7.0,0.0,0.0,0.0,0.0,0.0,69.0,27.0,1.0,0.0,0.0,0.0,0.0,63.0,22.0,3.0,2.0,3.0,3.0,0.0,0.0,0.0,0.0,0.0,30.0,11.0,0.0,0.0,0.0,0.0,0.0,25.0,6.0,1.0,1.0,4.0,4.0,0.0,0.0,0.0,0.0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,310.14,Henry Martinez


In [32]:
# Drop columns from index 73 to second-last
df_hist_merging_data = df_hist_merging_data.drop(df_hist_merging_data.columns[73:-2], axis=1)

# Display the result to verify the drop
print(df_hist_merging_data.head())


         date  Knockdown Total  Significant Strike Total Attempted  \
40 2012-02-04              0.0                                22.0   
41 2012-02-04              1.0                                38.0   
42 2012-02-04              0.0                                72.0   
43 2012-02-04              1.0                                62.0   
44 2012-02-04              0.0                               169.0   

    Significant Strike Total Landed  Takedown Total Attempted  \
40                              8.0                       0.0   
41                             22.0                       0.0   
42                             36.0                      16.0   
43                             30.0                       0.0   
44                             67.0                       1.0   

    Takedown Total Landed  Submission Attempted  Reversal  \
40                    0.0                   0.0       0.0   
41                    0.0                   0.0       0.0   
42   

In [33]:
# Fill NA values with 0
df_hist_merging_data = df_hist_merging_data.fillna(0)

In [34]:
df2_final['fighter_x_name'] = df2_final['fighter_x_name'].str.lower()
df_hist_merging_data['fighter_x_name'] = df_hist_merging_data['fighter_x_name'].str.lower()

In [35]:
# Assuming df_cleaned is the new cleaned dataset and df_old is the old dataset with additional features
df_merged = pd.merge(df2_final, df_hist_merging_data, on=['fighter_x_name','date'], how='left')

print(df_merged.shape)
print(df_merged.tail())

(12028, 153)
       fight_id       date      fighter_x_name    fighter_y_name  \
12023      6012 2024-10-05  ovince saint preux        RYAN SPANN   
12024      6013 2024-10-05       carla esparza  TECIA PENNINGTON   
12025      6013 2024-10-05    tecia pennington     CARLA ESPARZA   
12026      6014 2024-10-05         court mcgee         TIM MEANS   
12027      6014 2024-10-05           tim means       COURT MCGEE   

       fighter_x_win            Winner  Fight ID_x  Age (in days)_x  \
12023              0        RYAN SPANN        6012            15156   
12024              0  TECIA PENNINGTON        6013            13510   
12025              1  TECIA PENNINGTON        6013            12834   
12026              1       COURT MCGEE        6014            14542   
12027              0       COURT MCGEE        6014            14838   

       ELO_fighter_x  new_ELO_fighter_x  NumberOf_Fight_x  NumberOf_WIN_x  \
12023    1525.451318        1578.629084                27              15 

In [36]:
df2_final.shape, df_hist_merging_data.shape,df_merged.shape

((12028, 80), (12028, 75), (12028, 153))

In [37]:
df_merged.tail()

Unnamed: 0,fight_id,date,fighter_x_name,fighter_y_name,fighter_x_win,Winner,Fight ID_x,Age (in days)_x,ELO_fighter_x,new_ELO_fighter_x,NumberOf_Fight_x,NumberOf_WIN_x,NumberOf_LOSE_x,WIN_RATE_x,Stance_x,WIN_AVG_Knockdown Total_x,LOSE_AVG_Knockdown Total_x,NumberOf_WIN_shift_2_x,NumberOf_LOSE_shift_2_x,NumberOf_WIN_shift_3_x,NumberOf_LOSE_shift_3_x,NumberOf_WIN_shift_4_x,NumberOf_LOSE_shift_4_x,Fight ID_y,Age (in days)_y,ELO_fighter_y,new_ELO_fighter_y,NumberOf_Fight_y,NumberOf_WIN_y,NumberOf_LOSE_y,WIN_RATE_y,Stance_y,WIN_AVG_Knockdown Total_y,LOSE_AVG_Knockdown Total_y,NumberOf_WIN_shift_2_y,NumberOf_LOSE_shift_2_y,NumberOf_WIN_shift_3_y,NumberOf_LOSE_shift_3_y,NumberOf_WIN_shift_4_y,NumberOf_LOSE_shift_4_y,Age (in days)_diff,Height_diff,ELO_fighter_diff,new_ELO_fighter_diff,NumberOf_Fight_diff,NumberOf_WIN_diff,NumberOf_LOSE_diff,WIN_RATE_diff,Weight Pounds_diff,Reach Inches_diff,DOB Month_diff,DOB Day_diff,DOB Year_diff,WIN_AVG_Knockdown Total_diff,LOSE_AVG_Knockdown Total_diff,NumberOf_WIN_shift_2_diff,NumberOf_LOSE_shift_2_diff,WIN_RATE_shift_2_diff,NumberOf_WIN_shift_3_diff,NumberOf_LOSE_shift_3_diff,WIN_RATE_shift_3_diff,NumberOf_WIN_shift_4_diff,NumberOf_LOSE_shift_4_diff,WIN_RATE_shift_4_diff,Height_x_m,Height_y_m,Weight_x_kg,Weight_y_kg,Reach Meters_x,Reach Meters_y,Height_to_Reach_Ratio_x,Height_to_Reach_Ratio_y,BMI_x,BMI_y,Win_to_Loss_Ratio_x,Win_to_Loss_Ratio_y,Weighted_WIN_RATE_x,Weighted_WIN_RATE_y,Experience_Growth_x,Experience_Growth_y,Knockdown Total,Significant Strike Total Attempted,Significant Strike Total Landed,Takedown Total Attempted,Takedown Total Landed,Submission Attempted,Reversal,Ground and Cage Control Time,Significant Strike Head Attempted,Significant Strike Head Landed,Significant Strike Body Attempted,Significant Strike Body Landed,Significant Strike Leg Attempted,Significant Strike Leg Landed,Significant Strike Clinch Attempted,Significant Strike Clinch Landed,Significant Strike Ground Attempted,Significant Strike Ground Landed,Round 1 Knockdown Total,Round 1 Significant Strike Total Attempted,Round 1 Significant Strike Total Landed,Round 1 Takedown Total Attempted,Round 1 Takedown Total Landed,Round 1 Submission Attempted,Round 1 Reversal,Round 1 Ground and Cage Control Time,Round 1 Significant Strike Head Attempted,Round 1 Significant Strike Head Landed,Round 1 Significant Strike Body Attempted,Round 1 Significant Strike Body Landed,Round 1 Significant Strike Leg Attempted,Round 1 Significant Strike Leg Landed,Round 1 Significant Strike Clinch Attempted,Round 1 Significant Strike Clinch Landed,Round 1 Significant Strike Ground Attempted,Round 1 Significant Strike Ground Landed,Round 2 Knockdown Total,Round 2 Significant Strike Total Attempted,Round 2 Significant Strike Total Landed,Round 2 Takedown Total Attempted,Round 2 Takedown Total Landed,Round 2 Submission Attempted,Round 2 Reversal,Round 2 Ground and Cage Control Time,Round 2 Significant Strike Head Attempted,Round 2 Significant Strike Head Landed,Round 2 Significant Strike Body Attempted,Round 2 Significant Strike Body Landed,Round 2 Significant Strike Leg Attempted,Round 2 Significant Strike Leg Landed,Round 2 Significant Strike Clinch Attempted,Round 2 Significant Strike Clinch Landed,Round 2 Significant Strike Ground Attempted,Round 2 Significant Strike Ground Landed,Round 3 Knockdown Total,Round 3 Significant Strike Total Attempted,Round 3 Significant Strike Total Landed,Round 3 Takedown Total Attempted,Round 3 Takedown Total Landed,Round 3 Submission Attempted,Round 3 Reversal,Round 3 Ground and Cage Control Time,Round 3 Significant Strike Head Attempted,Round 3 Significant Strike Head Landed,Round 3 Significant Strike Body Attempted,Round 3 Significant Strike Body Landed,Round 3 Significant Strike Leg Attempted,Round 3 Significant Strike Leg Landed,Round 3 Significant Strike Clinch Attempted,Round 3 Significant Strike Clinch Landed,Round 3 Significant Strike Ground Attempted,Round 3 Significant Strike Ground Landed,odds
12023,6012,2024-10-05,ovince saint preux,RYAN SPANN,0,RYAN SPANN,6012,15156,1525.451318,1578.629084,27,15,12,0.555556,Southpaw,0.266667,0.416667,1,1,2,1,2,2,6012,12096,1522.294183,1541.888474,12,7,5,0.583333,Orthodox,0.0,0.6,0,2,0,3,1,3,3060,-2,3.157134,36.740611,15,8,7,-0.027778,0,1.0,-4,-16,-8,0.266667,-0.183333,1,-1,0.5,2,-2,0.666667,1,-1,0.25,1.905,1.9558,92.98636,92.98636,2.032,2.0066,0.9375,0.974684,25.622959,24.309183,1.153846,1.166667,0.55,0.05,0.658537,0.363636,0.0,10.0,7.0,1.0,0.0,0.0,0.0,0.03,3.0,2.0,2.0,0.0,5.0,5.0,0.0,0.0,0.0,0.0,0.0,10.0,7.0,1.0,0.0,0.0,0.0,0.03,3.0,2.0,2.0,0.0,5.0,5.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,265.0
12024,6013,2024-10-05,carla esparza,TECIA PENNINGTON,0,TECIA PENNINGTON,6013,13510,1582.996573,1584.162724,15,10,5,0.666667,Orthodox,0.0,0.0,1,1,2,1,3,1,6013,12834,1520.966052,1547.824865,16,9,7,0.5625,Orthodox,0.111111,0.0,0,2,1,2,2,2,676,0,62.030521,36.337859,-1,1,-2,0.104167,0,3.0,2,-6,-2,-0.111111,0.0,1,-1,0.5,1,-1,0.333333,1,-1,0.25,1.5494,1.5494,52.16308,52.16308,1.6002,1.524,0.968254,1.016667,21.728819,21.728819,1.666667,1.125,0.6,0.2,0.405405,0.457143,0.0,150.0,80.0,0.0,0.0,0.0,0.0,0.0,92.0,28.0,12.0,11.0,46.0,41.0,8.0,5.0,0.0,0.0,0.0,38.0,13.0,0.0,0.0,0.0,0.0,0.0,32.0,8.0,3.0,3.0,3.0,2.0,4.0,3.0,0.0,0.0,0.0,55.0,31.0,0.0,0.0,0.0,0.0,0.0,37.0,14.0,5.0,5.0,13.0,12.0,1.0,0.0,0.0,0.0,0.0,57.0,36.0,0.0,0.0,0.0,0.0,0.0,23.0,6.0,4.0,3.0,30.0,27.0,3.0,2.0,0.0,0.0,107.0
12025,6013,2024-10-05,tecia pennington,CARLA ESPARZA,1,TECIA PENNINGTON,6013,12834,1520.966052,1547.824865,16,9,7,0.5625,Orthodox,0.111111,0.0,0,2,1,2,2,2,6013,13510,1582.996573,1584.162724,15,10,5,0.666667,Orthodox,0.0,0.0,1,1,2,1,3,1,-676,0,-62.030521,-36.337859,1,-1,2,-0.104167,0,-3.0,-2,6,2,0.111111,0.0,-1,1,-0.5,-1,1,-0.333333,-1,1,-0.25,1.5494,1.5494,52.16308,52.16308,1.524,1.6002,1.016667,0.968254,21.728819,21.728819,1.125,1.666667,0.2,0.6,0.457143,0.405405,0.0,123.0,39.0,7.0,4.0,0.0,0.0,3.4,112.0,30.0,6.0,4.0,5.0,5.0,3.0,0.0,10.0,6.0,0.0,26.0,6.0,3.0,1.0,0.0,0.0,2.52,24.0,4.0,2.0,2.0,0.0,0.0,2.0,0.0,0.0,0.0,0.0,43.0,12.0,3.0,2.0,0.0,0.0,0.29,35.0,6.0,4.0,2.0,4.0,4.0,1.0,0.0,0.0,0.0,0.0,54.0,21.0,1.0,1.0,0.0,0.0,0.19,53.0,20.0,0.0,0.0,1.0,1.0,0.0,0.0,10.0,6.0,-132.0
12026,6014,2024-10-05,court mcgee,TIM MEANS,1,COURT MCGEE,6014,14542,1445.311654,1374.006095,19,7,12,0.368421,Orthodox,0.142857,0.25,0,2,0,3,1,3,6014,14838,1509.469532,1527.250276,29,15,14,0.517241,Orthodox,0.4,0.357143,1,1,1,2,1,3,-296,-3,-64.157878,-153.244181,-10,-8,-2,-0.14882,0,0.0,10,-8,0,-0.257143,-0.107143,-1,1,-0.5,-1,1,-0.333333,0,0,0.0,1.8034,1.8796,77.11064,77.11064,1.905,1.905,0.946667,0.986667,23.709925,21.826467,0.538462,1.0,0.05,0.4,0.487179,0.725,0.0,24.0,14.0,1.0,1.0,1.0,0.0,1.58,18.0,9.0,4.0,3.0,2.0,2.0,2.0,0.0,0.0,0.0,0.0,24.0,14.0,1.0,1.0,1.0,0.0,1.58,18.0,9.0,4.0,3.0,2.0,2.0,2.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,160.0
12027,6014,2024-10-05,tim means,COURT MCGEE,0,COURT MCGEE,6014,14838,1509.469532,1527.250276,29,15,14,0.517241,Orthodox,0.4,0.357143,1,1,1,2,1,3,6014,14542,1445.311654,1374.006095,19,7,12,0.368421,Orthodox,0.142857,0.25,0,2,0,3,1,3,296,3,64.157878,153.244181,10,8,2,0.14882,0,0.0,-10,8,0,0.257143,0.107143,1,-1,0.5,1,-1,0.333333,0,0,0.0,1.8796,1.8034,77.11064,77.11064,1.905,1.905,0.986667,0.946667,21.826467,23.709925,1.0,0.538462,0.4,0.05,0.725,0.487179,0.0,13.0,5.0,0.0,0.0,0.0,0.0,0.0,9.0,2.0,3.0,2.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,13.0,5.0,0.0,0.0,0.0,0.0,0.0,9.0,2.0,3.0,2.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,-192.0


In [38]:
# df_merged.to_csv('Merged.csv',index=False)

## Feature Engineering on the Merged Data

In [39]:
# List of feature names to create accuracy
columns_to_check = [
    'Significant Strike Total', 'Takedown Total', 
    'Significant Strike Head', 'Significant Strike Body', 
    'Significant Strike Leg', 'Significant Strike Clinch', 
    'Significant Strike Ground'
]


others = ['Submission', 'Reversal', 'Ground and Cage Control']

In [40]:
# Example: Aggregating round-wise statistics into overall statistics
rounds = [1, 2, 3]

# Loop through the columns and create accuracy features
for col in columns_to_check:
    # Attempted column
    attempted_col = col + ' Attempted'
    # Landed column
    landed_col = col + ' Landed'
    
    # Check if both columns exist in the DataFrame
    if attempted_col in df_merged.columns and landed_col in df_merged.columns:
        # Create accuracy column
        # df_merged[col + ' Accuracy'] = df_merged[landed_col] / df_merged[attempted_col]
        df_merged[col + ' Accuracy'] = np.round(np.where(df_merged[attempted_col] == 0,0,df_merged[landed_col] / df_merged[attempted_col]),3)
        # df_merged.drop([attempted_col, landed_col], axis=1, inplace=True)

    # Calculate accuracy for each round (if applicable)
    for round_num in rounds:
        round_landed_col = f'Round {round_num} {landed_col}'
        round_attempted_col = f'Round {round_num} {attempted_col}'

        df_merged[f'Round {round_num} {col} Accuracy'] = np.round(np.where(df_merged[round_attempted_col] == 0,0,df_merged[round_landed_col] / df_merged[round_attempted_col]),3)
        # df_merged.drop([round_attempted_col, round_landed_col], axis=1, inplace=True)

In [41]:
def american_odds_to_probability(odds):
    if odds > 0:
        return 100 / (odds + 100)
    else:
        return abs(odds) / (abs(odds) + 100)

df_merged['odds_probability'] = df_merged['odds'].apply(american_odds_to_probability)
df_merged.drop(['odds'], axis=1, inplace=True)

In [42]:
df_merged.tail()

Unnamed: 0,fight_id,date,fighter_x_name,fighter_y_name,fighter_x_win,Winner,Fight ID_x,Age (in days)_x,ELO_fighter_x,new_ELO_fighter_x,NumberOf_Fight_x,NumberOf_WIN_x,NumberOf_LOSE_x,WIN_RATE_x,Stance_x,WIN_AVG_Knockdown Total_x,LOSE_AVG_Knockdown Total_x,NumberOf_WIN_shift_2_x,NumberOf_LOSE_shift_2_x,NumberOf_WIN_shift_3_x,NumberOf_LOSE_shift_3_x,NumberOf_WIN_shift_4_x,NumberOf_LOSE_shift_4_x,Fight ID_y,Age (in days)_y,ELO_fighter_y,new_ELO_fighter_y,NumberOf_Fight_y,NumberOf_WIN_y,NumberOf_LOSE_y,WIN_RATE_y,Stance_y,WIN_AVG_Knockdown Total_y,LOSE_AVG_Knockdown Total_y,NumberOf_WIN_shift_2_y,NumberOf_LOSE_shift_2_y,NumberOf_WIN_shift_3_y,NumberOf_LOSE_shift_3_y,NumberOf_WIN_shift_4_y,NumberOf_LOSE_shift_4_y,Age (in days)_diff,Height_diff,ELO_fighter_diff,new_ELO_fighter_diff,NumberOf_Fight_diff,NumberOf_WIN_diff,NumberOf_LOSE_diff,WIN_RATE_diff,Weight Pounds_diff,Reach Inches_diff,DOB Month_diff,DOB Day_diff,DOB Year_diff,WIN_AVG_Knockdown Total_diff,LOSE_AVG_Knockdown Total_diff,NumberOf_WIN_shift_2_diff,NumberOf_LOSE_shift_2_diff,WIN_RATE_shift_2_diff,NumberOf_WIN_shift_3_diff,NumberOf_LOSE_shift_3_diff,WIN_RATE_shift_3_diff,NumberOf_WIN_shift_4_diff,NumberOf_LOSE_shift_4_diff,WIN_RATE_shift_4_diff,Height_x_m,Height_y_m,Weight_x_kg,Weight_y_kg,Reach Meters_x,Reach Meters_y,Height_to_Reach_Ratio_x,Height_to_Reach_Ratio_y,BMI_x,BMI_y,Win_to_Loss_Ratio_x,Win_to_Loss_Ratio_y,Weighted_WIN_RATE_x,Weighted_WIN_RATE_y,Experience_Growth_x,Experience_Growth_y,Knockdown Total,Significant Strike Total Attempted,Significant Strike Total Landed,Takedown Total Attempted,Takedown Total Landed,Submission Attempted,Reversal,Ground and Cage Control Time,Significant Strike Head Attempted,Significant Strike Head Landed,Significant Strike Body Attempted,Significant Strike Body Landed,Significant Strike Leg Attempted,Significant Strike Leg Landed,Significant Strike Clinch Attempted,Significant Strike Clinch Landed,Significant Strike Ground Attempted,Significant Strike Ground Landed,Round 1 Knockdown Total,Round 1 Significant Strike Total Attempted,Round 1 Significant Strike Total Landed,Round 1 Takedown Total Attempted,Round 1 Takedown Total Landed,Round 1 Submission Attempted,Round 1 Reversal,Round 1 Ground and Cage Control Time,Round 1 Significant Strike Head Attempted,Round 1 Significant Strike Head Landed,Round 1 Significant Strike Body Attempted,Round 1 Significant Strike Body Landed,Round 1 Significant Strike Leg Attempted,Round 1 Significant Strike Leg Landed,Round 1 Significant Strike Clinch Attempted,Round 1 Significant Strike Clinch Landed,Round 1 Significant Strike Ground Attempted,Round 1 Significant Strike Ground Landed,Round 2 Knockdown Total,Round 2 Significant Strike Total Attempted,Round 2 Significant Strike Total Landed,Round 2 Takedown Total Attempted,Round 2 Takedown Total Landed,Round 2 Submission Attempted,Round 2 Reversal,Round 2 Ground and Cage Control Time,Round 2 Significant Strike Head Attempted,Round 2 Significant Strike Head Landed,Round 2 Significant Strike Body Attempted,Round 2 Significant Strike Body Landed,Round 2 Significant Strike Leg Attempted,Round 2 Significant Strike Leg Landed,Round 2 Significant Strike Clinch Attempted,Round 2 Significant Strike Clinch Landed,Round 2 Significant Strike Ground Attempted,Round 2 Significant Strike Ground Landed,Round 3 Knockdown Total,Round 3 Significant Strike Total Attempted,Round 3 Significant Strike Total Landed,Round 3 Takedown Total Attempted,Round 3 Takedown Total Landed,Round 3 Submission Attempted,Round 3 Reversal,Round 3 Ground and Cage Control Time,Round 3 Significant Strike Head Attempted,Round 3 Significant Strike Head Landed,Round 3 Significant Strike Body Attempted,Round 3 Significant Strike Body Landed,Round 3 Significant Strike Leg Attempted,Round 3 Significant Strike Leg Landed,Round 3 Significant Strike Clinch Attempted,Round 3 Significant Strike Clinch Landed,Round 3 Significant Strike Ground Attempted,Round 3 Significant Strike Ground Landed,Significant Strike Total Accuracy,Round 1 Significant Strike Total Accuracy,Round 2 Significant Strike Total Accuracy,Round 3 Significant Strike Total Accuracy,Takedown Total Accuracy,Round 1 Takedown Total Accuracy,Round 2 Takedown Total Accuracy,Round 3 Takedown Total Accuracy,Significant Strike Head Accuracy,Round 1 Significant Strike Head Accuracy,Round 2 Significant Strike Head Accuracy,Round 3 Significant Strike Head Accuracy,Significant Strike Body Accuracy,Round 1 Significant Strike Body Accuracy,Round 2 Significant Strike Body Accuracy,Round 3 Significant Strike Body Accuracy,Significant Strike Leg Accuracy,Round 1 Significant Strike Leg Accuracy,Round 2 Significant Strike Leg Accuracy,Round 3 Significant Strike Leg Accuracy,Significant Strike Clinch Accuracy,Round 1 Significant Strike Clinch Accuracy,Round 2 Significant Strike Clinch Accuracy,Round 3 Significant Strike Clinch Accuracy,Significant Strike Ground Accuracy,Round 1 Significant Strike Ground Accuracy,Round 2 Significant Strike Ground Accuracy,Round 3 Significant Strike Ground Accuracy,odds_probability
12023,6012,2024-10-05,ovince saint preux,RYAN SPANN,0,RYAN SPANN,6012,15156,1525.451318,1578.629084,27,15,12,0.555556,Southpaw,0.266667,0.416667,1,1,2,1,2,2,6012,12096,1522.294183,1541.888474,12,7,5,0.583333,Orthodox,0.0,0.6,0,2,0,3,1,3,3060,-2,3.157134,36.740611,15,8,7,-0.027778,0,1.0,-4,-16,-8,0.266667,-0.183333,1,-1,0.5,2,-2,0.666667,1,-1,0.25,1.905,1.9558,92.98636,92.98636,2.032,2.0066,0.9375,0.974684,25.622959,24.309183,1.153846,1.166667,0.55,0.05,0.658537,0.363636,0.0,10.0,7.0,1.0,0.0,0.0,0.0,0.03,3.0,2.0,2.0,0.0,5.0,5.0,0.0,0.0,0.0,0.0,0.0,10.0,7.0,1.0,0.0,0.0,0.0,0.03,3.0,2.0,2.0,0.0,5.0,5.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.7,0.7,0.0,0.0,0.0,0.0,0.0,0.0,0.667,0.667,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.273973
12024,6013,2024-10-05,carla esparza,TECIA PENNINGTON,0,TECIA PENNINGTON,6013,13510,1582.996573,1584.162724,15,10,5,0.666667,Orthodox,0.0,0.0,1,1,2,1,3,1,6013,12834,1520.966052,1547.824865,16,9,7,0.5625,Orthodox,0.111111,0.0,0,2,1,2,2,2,676,0,62.030521,36.337859,-1,1,-2,0.104167,0,3.0,2,-6,-2,-0.111111,0.0,1,-1,0.5,1,-1,0.333333,1,-1,0.25,1.5494,1.5494,52.16308,52.16308,1.6002,1.524,0.968254,1.016667,21.728819,21.728819,1.666667,1.125,0.6,0.2,0.405405,0.457143,0.0,150.0,80.0,0.0,0.0,0.0,0.0,0.0,92.0,28.0,12.0,11.0,46.0,41.0,8.0,5.0,0.0,0.0,0.0,38.0,13.0,0.0,0.0,0.0,0.0,0.0,32.0,8.0,3.0,3.0,3.0,2.0,4.0,3.0,0.0,0.0,0.0,55.0,31.0,0.0,0.0,0.0,0.0,0.0,37.0,14.0,5.0,5.0,13.0,12.0,1.0,0.0,0.0,0.0,0.0,57.0,36.0,0.0,0.0,0.0,0.0,0.0,23.0,6.0,4.0,3.0,30.0,27.0,3.0,2.0,0.0,0.0,0.533,0.342,0.564,0.632,0.0,0.0,0.0,0.0,0.304,0.25,0.378,0.261,0.917,1.0,1.0,0.75,0.891,0.667,0.923,0.9,0.625,0.75,0.0,0.667,0.0,0.0,0.0,0.0,0.483092
12025,6013,2024-10-05,tecia pennington,CARLA ESPARZA,1,TECIA PENNINGTON,6013,12834,1520.966052,1547.824865,16,9,7,0.5625,Orthodox,0.111111,0.0,0,2,1,2,2,2,6013,13510,1582.996573,1584.162724,15,10,5,0.666667,Orthodox,0.0,0.0,1,1,2,1,3,1,-676,0,-62.030521,-36.337859,1,-1,2,-0.104167,0,-3.0,-2,6,2,0.111111,0.0,-1,1,-0.5,-1,1,-0.333333,-1,1,-0.25,1.5494,1.5494,52.16308,52.16308,1.524,1.6002,1.016667,0.968254,21.728819,21.728819,1.125,1.666667,0.2,0.6,0.457143,0.405405,0.0,123.0,39.0,7.0,4.0,0.0,0.0,3.4,112.0,30.0,6.0,4.0,5.0,5.0,3.0,0.0,10.0,6.0,0.0,26.0,6.0,3.0,1.0,0.0,0.0,2.52,24.0,4.0,2.0,2.0,0.0,0.0,2.0,0.0,0.0,0.0,0.0,43.0,12.0,3.0,2.0,0.0,0.0,0.29,35.0,6.0,4.0,2.0,4.0,4.0,1.0,0.0,0.0,0.0,0.0,54.0,21.0,1.0,1.0,0.0,0.0,0.19,53.0,20.0,0.0,0.0,1.0,1.0,0.0,0.0,10.0,6.0,0.317,0.231,0.279,0.389,0.571,0.333,0.667,1.0,0.268,0.167,0.171,0.377,0.667,1.0,0.5,0.0,1.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.6,0.0,0.0,0.6,0.568966
12026,6014,2024-10-05,court mcgee,TIM MEANS,1,COURT MCGEE,6014,14542,1445.311654,1374.006095,19,7,12,0.368421,Orthodox,0.142857,0.25,0,2,0,3,1,3,6014,14838,1509.469532,1527.250276,29,15,14,0.517241,Orthodox,0.4,0.357143,1,1,1,2,1,3,-296,-3,-64.157878,-153.244181,-10,-8,-2,-0.14882,0,0.0,10,-8,0,-0.257143,-0.107143,-1,1,-0.5,-1,1,-0.333333,0,0,0.0,1.8034,1.8796,77.11064,77.11064,1.905,1.905,0.946667,0.986667,23.709925,21.826467,0.538462,1.0,0.05,0.4,0.487179,0.725,0.0,24.0,14.0,1.0,1.0,1.0,0.0,1.58,18.0,9.0,4.0,3.0,2.0,2.0,2.0,0.0,0.0,0.0,0.0,24.0,14.0,1.0,1.0,1.0,0.0,1.58,18.0,9.0,4.0,3.0,2.0,2.0,2.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.583,0.583,0.0,0.0,1.0,1.0,0.0,0.0,0.5,0.5,0.0,0.0,0.75,0.75,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.384615
12027,6014,2024-10-05,tim means,COURT MCGEE,0,COURT MCGEE,6014,14838,1509.469532,1527.250276,29,15,14,0.517241,Orthodox,0.4,0.357143,1,1,1,2,1,3,6014,14542,1445.311654,1374.006095,19,7,12,0.368421,Orthodox,0.142857,0.25,0,2,0,3,1,3,296,3,64.157878,153.244181,10,8,2,0.14882,0,0.0,-10,8,0,0.257143,0.107143,1,-1,0.5,1,-1,0.333333,0,0,0.0,1.8796,1.8034,77.11064,77.11064,1.905,1.905,0.986667,0.946667,21.826467,23.709925,1.0,0.538462,0.4,0.05,0.725,0.487179,0.0,13.0,5.0,0.0,0.0,0.0,0.0,0.0,9.0,2.0,3.0,2.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,13.0,5.0,0.0,0.0,0.0,0.0,0.0,9.0,2.0,3.0,2.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.385,0.385,0.0,0.0,0.0,0.0,0.0,0.0,0.222,0.222,0.0,0.0,0.667,0.667,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.657534


In [43]:
test_dates = df_merged[df_merged['date'] > '2022-06-04']['date'].unique()
test_dates[:5]

<DatetimeArray>
['2022-06-11 00:00:00', '2022-06-18 00:00:00', '2022-06-25 00:00:00',
 '2022-07-02 00:00:00', '2022-07-09 00:00:00']
Length: 5, dtype: datetime64[ns]

In [44]:
train_data_encoded = pd.get_dummies(df_merged, columns=['Stance_x', 'Stance_y'])
new_future_x_win_ls, feature_importance = train_model(train_data_encoded,test_dates)

2022-06-11 00:00:00
Fitting 3 folds for each of 27 candidates, totalling 81 fits
[LightGBM] [Info] Number of positive: 3199, number of negative: 3198
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.008306 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 13572
[LightGBM] [Info] Number of data points in the train set: 6397, number of used features: 177
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500078 -> initscore=0.000313
[LightGBM] [Info] Start training from score 0.000313
[LightGBM] [Info] Number of positive: 3198, number of negative: 3199
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.008313 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 13548
[LightGBM] [Info] Number of data points in the train set: 6397, number of used features: 177
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.499922 -> initscore

--------------------------------

In [45]:
col_list = ['date','fighter_x_win','Winner', 'fight_id', 'Fight ID_x', 'Fight ID_y', 'fighter_x_name', 'fighter_y_name']

for x,y in zip(feature_importance['Feature'], feature_importance['Importance']):
    if y > 0:
        col_list.append(x)

col_list

['date',
 'fighter_x_win',
 'Winner',
 'fight_id',
 'Fight ID_x',
 'Fight ID_y',
 'fighter_x_name',
 'fighter_y_name',
 'odds_probability',
 'Round 1 Ground and Cage Control Time',
 'Age (in days)_diff',
 'Height_to_Reach_Ratio_y',
 'new_ELO_fighter_diff',
 'Height_to_Reach_Ratio_x',
 'Reach Meters_x',
 'Reach Meters_y',
 'LOSE_AVG_Knockdown Total_diff',
 'NumberOf_WIN_diff',
 'Weight Pounds_diff',
 'Ground and Cage Control Time',
 'Significant Strike Body Landed',
 'Significant Strike Head Accuracy',
 'DOB Month_diff',
 'Significant Strike Clinch Attempted',
 'Round 1 Significant Strike Head Landed',
 'ELO_fighter_diff',
 'Takedown Total Accuracy',
 'Round 3 Significant Strike Body Accuracy',
 'Age (in days)_x',
 'Significant Strike Ground Landed',
 'Significant Strike Ground Attempted',
 'Significant Strike Total Accuracy',
 'Significant Strike Head Attempted',
 'Round 1 Significant Strike Clinch Attempted',
 'Significant Strike Leg Accuracy',
 'NumberOf_WIN_shift_4_diff',
 'Round 1 

In [46]:
train_data_encoded[col_list].head()

Unnamed: 0,date,fighter_x_win,Winner,fight_id,Fight ID_x,Fight ID_y,fighter_x_name,fighter_y_name,odds_probability,Round 1 Ground and Cage Control Time,Age (in days)_diff,Height_to_Reach_Ratio_y,new_ELO_fighter_diff,Height_to_Reach_Ratio_x,Reach Meters_x,Reach Meters_y,LOSE_AVG_Knockdown Total_diff,NumberOf_WIN_diff,Weight Pounds_diff,Ground and Cage Control Time,Significant Strike Body Landed,Significant Strike Head Accuracy,DOB Month_diff,Significant Strike Clinch Attempted,Round 1 Significant Strike Head Landed,ELO_fighter_diff,Takedown Total Accuracy,Round 3 Significant Strike Body Accuracy,Age (in days)_x,Significant Strike Ground Landed,Significant Strike Ground Attempted,Significant Strike Total Accuracy,Significant Strike Head Attempted,Round 1 Significant Strike Clinch Attempted,Significant Strike Leg Accuracy,NumberOf_WIN_shift_4_diff,Round 1 Significant Strike Leg Attempted,Round 1 Significant Strike Total Accuracy,new_ELO_fighter_y,Round 1 Takedown Total Attempted,Round 2 Significant Strike Head Accuracy,Significant Strike Head Landed,NumberOf_LOSE_shift_2_y,Round 3 Significant Strike Leg Landed,Round 3 Significant Strike Total Landed,Significant Strike Clinch Landed,Significant Strike Leg Landed,WIN_RATE_x,Round 2 Ground and Cage Control Time,Significant Strike Body Accuracy,ELO_fighter_x,Takedown Total Attempted,Submission Attempted,Round 1 Significant Strike Body Accuracy,Reversal,Round 3 Significant Strike Body Landed,Win_to_Loss_Ratio_y,Round 2 Significant Strike Total Attempted,Stance_y_Orthodox,BMI_x,Round 3 Significant Strike Head Attempted,Round 3 Reversal,new_ELO_fighter_x,Round 1 Significant Strike Body Attempted,Experience_Growth_y,Round 2 Significant Strike Leg Attempted,Round 3 Significant Strike Clinch Accuracy,WIN_RATE_shift_3_diff,Age (in days)_y,Round 1 Significant Strike Leg Accuracy,NumberOf_Fight_diff,Round 3 Takedown Total Attempted,DOB Day_diff,Round 1 Significant Strike Ground Accuracy,Round 3 Ground and Cage Control Time,ELO_fighter_y,Reach Inches_diff,Round 3 Significant Strike Body Attempted,Round 1 Significant Strike Body Landed,Round 2 Significant Strike Body Accuracy,Round 1 Significant Strike Ground Attempted,Round 2 Knockdown Total,Round 2 Significant Strike Total Landed,Round 2 Significant Strike Clinch Landed,Round 2 Reversal,Round 3 Significant Strike Head Landed,Round 1 Significant Strike Clinch Accuracy,Round 2 Significant Strike Total Accuracy,WIN_AVG_Knockdown Total_x,Height_x_m,Significant Strike Total Attempted,Significant Strike Total Landed,NumberOf_LOSE_shift_2_x,Weight_y_kg
0,2012-02-04,0,STEPHEN THOMPSON,1,1,1,dan stittgen,STEPHEN THOMPSON,0.267687,0.01,816,0.96,0.0,1.017997,1.82142,1.905,0.0,0,0,0.01,7.0,0.391,9,0.0,9.0,0.0,0.0,0.0,11401,2.0,2.0,0.579,23.0,0.0,1.0,0,6.0,0.579,1500.0,0.0,0.0,9.0,0,0.0,0.0,0.0,6.0,0.0,0.0,0.778,1500.0,0.0,0.0,0.778,0.0,0.0,0.0,0.0,True,22.428548,0.0,0.0,1500.0,9.0,0.0,0.0,0.0,0.0,10585,1.0,0,0.0,6,1.0,0.0,1500.0,0.0,0.0,7.0,0.0,2.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.8542,38.0,22.0,0,77.11064
1,2012-02-04,1,STEPHEN THOMPSON,1,1,1,stephen thompson,DAN STITTGEN,0.775281,0.0,-816,1.017997,0.0,0.96,1.905,1.82142,0.0,0,0,0.0,1.0,0.0,-9,0.0,0.0,0.0,0.0,0.0,10585,0.0,0.0,0.364,5.0,0.0,0.583,0,12.0,0.364,1500.0,0.0,0.0,0.0,0,0.0,0.0,0.0,7.0,0.0,0.0,0.2,1500.0,0.0,0.0,0.2,0.0,0.0,0.0,0.0,True,23.055889,0.0,0.0,1500.0,5.0,0.0,0.0,0.0,0.0,11401,0.583,0,0.0,-6,0.0,0.0,1500.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.8288,22.0,8.0,0,77.11064
2,2012-02-04,1,RAFAEL NATAL,2,2,2,rafael natal,MICHAEL KUIPER,0.618524,2.28,2356,0.986301,0.0,0.935065,1.9558,1.8542,-0.076923,0,0,6.24,5.0,0.446,6,19.0,0.0,0.0,0.375,0.0,10633,1.0,2.0,0.5,56.0,1.0,0.75,0,2.0,0.333,1500.0,7.0,0.511,25.0,0,1.0,2.0,11.0,6.0,0.0,1.23,0.625,1500.0,16.0,2.0,1.0,0.0,0.0,0.0,59.0,True,25.090233,3.0,0.0,1500.0,1.0,0.0,5.0,0.0,0.0,8277,1.0,0,2.0,18,0.0,2.33,1500.0,4.0,0.0,1.0,0.571,0.0,0.0,31.0,10.0,0.0,1.0,1.0,0.525,0.0,1.8288,72.0,36.0,0,83.91452
3,2012-02-04,0,RAFAEL NATAL,2,2,2,michael kuiper,RAFAEL NATAL,0.426058,1.18,-2356,0.935065,0.0,0.986301,1.8542,1.9558,0.076923,0,0,3.43,3.0,0.462,-6,12.0,1.0,0.0,0.0,0.0,8277,18.0,24.0,0.484,52.0,2.0,0.5,0,4.0,0.364,1500.0,0.0,0.208,24.0,0,0.0,18.0,4.0,3.0,0.0,0.16,0.75,1500.0,0.0,0.0,1.0,1.0,0.0,0.0,29.0,True,25.090233,22.0,0.0,1500.0,1.0,0.0,2.0,0.0,0.0,10633,0.5,0,0.0,-18,0.333,2.09,1500.0,-4.0,0.0,1.0,0.667,3.0,0.0,8.0,2.0,0.0,18.0,1.0,0.276,0.181818,1.8288,62.0,30.0,0,83.91452
4,2012-02-04,0,MATTHEW RIDDLE,3,3,3,henry martinez,MATTHEW RIDDLE,0.243819,0.0,891,0.960526,0.0,0.971014,1.7526,1.9304,0.333333,0,-15,0.0,3.0,0.336,7,0.0,22.0,0.0,0.0,1.0,10408,0.0,0.0,0.396,149.0,0.0,1.0,0,7.0,0.414,1500.0,0.0,0.349,50.0,0,4.0,11.0,0.0,14.0,0.0,0.0,0.5,1500.0,1.0,0.0,0.0,0.0,1.0,0.0,69.0,False,24.276163,25.0,0.0,1500.0,2.0,0.0,3.0,0.0,0.0,9517,1.0,0,0.0,-7,0.0,0.0,1500.0,-7.0,1.0,0.0,0.667,0.0,0.0,27.0,0.0,0.0,6.0,0.0,0.391,0.4,1.7018,169.0,67.0,0,77.11064


In [47]:
new_future_x_win_ls, feature_importance = train_model(train_data_encoded[col_list],test_dates)

2022-06-11 00:00:00
Fitting 3 folds for each of 27 candidates, totalling 81 fits
[LightGBM] [Info] Number of positive: 3199, number of negative: 3198
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.003537 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 9243
[LightGBM] [Info] Number of data points in the train set: 6397, number of used features: 86
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500078 -> initscore=0.000313
[LightGBM] [Info] Start training from score 0.000313
[LightGBM] [Info] Number of positive: 3198, number of negative: 3199
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.001019 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 9205
[LightGBM] [Info] Number of data points in the train set: 6397, number of used features: 86
[LightGBM

In [48]:
col_list = ['date','fighter_x_win','Winner', 'fight_id', 'Fight ID_x', 'Fight ID_y', 'fighter_x_name', 'fighter_y_name']

for x,y in zip(feature_importance['Feature'], feature_importance['Importance']):
    if y > 10:
        col_list.append(x)

col_list

['date',
 'fighter_x_win',
 'Winner',
 'fight_id',
 'Fight ID_x',
 'Fight ID_y',
 'fighter_x_name',
 'fighter_y_name',
 'odds_probability',
 'Round 1 Ground and Cage Control Time',
 'Age (in days)_diff',
 'Height_to_Reach_Ratio_y',
 'Reach Meters_x',
 'Reach Meters_y',
 'new_ELO_fighter_diff',
 'Height_to_Reach_Ratio_x']

In [50]:
new_future_x_win_ls1, feature_importance1 = train_model(train_data_encoded[col_list],test_dates)

2022-06-11 00:00:00
Fitting 3 folds for each of 27 candidates, totalling 81 fits
[LightGBM] [Info] Number of positive: 3199, number of negative: 3198
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000314 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 1385
[LightGBM] [Info] Number of data points in the train set: 6397, number of used features: 8
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500078 -> initscore=0.000313
[LightGBM] [Info] Start training from score 0.000313
[LightGBM] [Info] Number of positive: 3198, number of negative: 3199
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000222 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 1379
[LightGBM] [Info] Number of data points in the train set: 6397, number of used features: 8
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.499922 -> initscore=-0.00

KeyboardInterrupt: 

----------------------
## END 
---------------

In [None]:
final  = pd.read_csv('backtest_prediction_202501_v2.csv')
y_pred = final['x_win']

In [None]:
len(y_pred), len(new_future_x_win_ls)

In [None]:
confusion_matrix(y_pred, new_future_x_win_ls)