# Compare the performance of the different models

## 1. Importing Libraries

In [5]:
import joblib
import numpy as np
import pandas as pd
import config.ConnectionConfig as cc
from sklearn.ensemble import VotingClassifier
from pyspark.sql import SparkSession
from sklearn.preprocessing import StandardScaler

In [6]:
# Project-specific environment preparation (presumably Spark/Hadoop settings — see config.ConnectionConfig).
cc.setupEnvironment()
# Obtain a Spark session from the project helper under the given application name.
spark = cc.startLocalCluster("UFC_Logistic_Regression_Training")
# NOTE(review): the return value is discarded, so this statement has no visible effect in this cell.
spark.getActiveSession()
# Rebind `spark` through the builder. getOrCreate() hands back an already-running session
# when one exists, so the "UFC_Fights" name may not take effect — TODO confirm which name is intended.
spark = SparkSession.builder.appName("UFC_Fights").getOrCreate()

## 2. Load the models
 - MLPClassifier
 - Logistic Regression
 - Hard Voting Ensemble

In [7]:
# Deserialize the three trained classifiers in one pass (same order as listed above).
# NOTE(review): joblib.load unpickles arbitrary objects — only load model files from a trusted source.
model_mlp, model_lr, model_sve = (
    joblib.load(model_path)
    for model_path in (
        '../models/ufc_mlpclassifier_model.pkl',
        '../models/ufc_logistic_regression_model.pkl',
        '../models/ufc_voting_clf.pkl',
    )
)

## 3. Load the data of upcoming fights

In [8]:
# Read the upcoming-fights CSV (first row is the header, column types are inferred) and preview it.
upcoming_fight_info = spark.read.csv(
    path='../processed_data/upcoming_events_fights.csv',
    header=True,
    inferSchema=True,
)
upcoming_fight_info.show()

### 3.1 Load the data of the fighters

In [9]:
# Read the per-fighter statistics CSV (first row is the header, column types are inferred) and preview it.
fighters = spark.read.csv(
    path='../processed_data/fighter_details.csv',
    header=True,
    inferSchema=True,
)
fighters.show()

## 4. Preprocess the data
We want to create a dataframe with all the features of the fighters that are going to fight each other.

In [10]:
# Register both frames as temp views so they can be joined with Spark SQL.
upcoming_fight_info.createOrReplaceTempView("upcoming_fight_info")
fighters.createOrReplaceTempView("fighters")

# Build one wide row per upcoming fight: the event columns plus the statistics of both
# fighters. The fighters view is joined twice on the fighter name (alias f1 for fighter1,
# f2 for fighter2) and every statistic is suffixed with _1 or _2 accordingly.
# Both joins are inner joins, so a fight whose fighter name has no match in `fighters`
# does not appear in the result.
# The trailing backslashes on the FROM/JOIN lines are Python line continuations inside
# the string literal; they are not part of the SQL text.
# NOTE(review): assumes fighter names are unique in `fighters`; duplicates would multiply rows — confirm.
full_upcoming_fight_info = spark.sql("""SELECT
                                    ufi.event,
                                    ufi.event_date,
                                    ufi.event_location,
                                    ufi.fighter1,
                                    f1.fighter_index as fighter_index_1,
                                    f1.AVG_KD as AVG_KD_1,
                                    f1.AVG_SUB_ATT as AVG_SUB_ATT_1,
                                    f1.AVG_TD_Percentage as AVG_TD_Percentage_1,
                                    f1.AVG_Significant_Strike_Percentage as AVG_Significant_Strike_Percentage_1,
                                    f1.AVG_TOTAL_STR as AVG_TOTAL_STR_1,
                                    f1.AVG_ROUND as AVG_ROUND_1,
                                    f1.AVG_CTRL_SECONDS as AVG_CTRL_SECONDS_1,
                                    f1.total_wins as total_wins_1,
                                    f1.total_losses as total_loss_1,
                                    f1.total_draws as total_draws_1,
                                    f1.Win_Percentage as Win_Percentage_1,
                                    f1.Height_CM as Height_CM_1,
                                    f1.Weight_KG as Weight_KG_1,
                                    f1.Reach_Conv as Reach_Conv_1,
                                    ufi.fighter2,
                                    f2.fighter_index as fighter_index_2,
                                    f2.AVG_KD as AVG_KD_2,
                                    f2.AVG_SUB_ATT as AVG_SUB_ATT_2,
                                    f2.AVG_TD_Percentage as AVG_TD_Percentage_2,
                                    f2.AVG_Significant_Strike_Percentage as AVG_Significant_Strike_Percentage_2,
                                    f2.AVG_TOTAL_STR as AVG_TOTAL_STR_2,
                                    f2.AVG_ROUND as AVG_ROUND_2,
                                    f2.AVG_CTRL_SECONDS as AVG_CTRL_SECONDS_2,
                                    f2.total_wins as total_wins_2,
                                    f2.total_losses as total_loss_2,
                                    f2.total_draws as total_draws_2,
                                    f2.Win_Percentage as Win_Percentage_2,
                                    f2.Height_CM as Height_CM_2,
                                    f2.Weight_KG as Weight_KG_2,
                                    f2.Reach_Conv as Reach_Conv_2
                                    FROM upcoming_fight_info as ufi \
                                    JOIN fighters as f1 ON ufi.fighter1 = f1.fighter \
                                    JOIN fighters as f2 ON ufi.fighter2 = f2.fighter
                                    """)
# Preview up to 200 joined rows.
full_upcoming_fight_info.show(200)

In [11]:
# Keep the descriptive columns (event, location, date, fighter names) together with the
# two fighter indices, so predictions can be joined back to readable rows later.
# NOTE(review): an earlier comment here mentioned building an event index as well, but no
# such column is created in this cell.

full_upcoming_fight_info.createOrReplaceTempView("full_upcoming_fight_info")


backlog_table = spark.sql("""
    SELECT
        event,
        event_location,
        event_date,
        fighter1,
        fighter_index_1,
        fighter2,
        fighter_index_2
    FROM 
        full_upcoming_fight_info
""")

# Preview up to 60 rows.
backlog_table.show(60)

In [12]:
# Leave only the model inputs: strip the event and fighter-name columns, then preview.
descriptive_columns = ['event', 'fighter1', 'fighter2', 'event_date', 'event_location']
full_upcoming_fight_info = full_upcoming_fight_info.drop(*descriptive_columns)
full_upcoming_fight_info.show(60)

## 5. Predict the outcome of the fights

In [13]:
# Collect the Spark feature table into pandas so scikit-learn can consume it.
full_upcoming_fight_info_pd = full_upcoming_fight_info.toPandas()

# Standardize every feature column.
# NOTE(review): the scaler is fitted on the upcoming-fight rows themselves. If the models
# were trained on data scaled with a different scaler, that scaler should be persisted
# and reused here instead — confirm against the training notebook.
scaler = StandardScaler().fit(full_upcoming_fight_info_pd)
X = scaler.transform(full_upcoming_fight_info_pd)

# One predicted label per fight from each of the three models.
y_mlp, y_lr, y_sve = (
    clf.predict(X) for clf in (model_mlp, model_lr, model_sve)
)

# Place the three label vectors side by side, one column per model.
full_upcoming_fight_results_pd = pd.DataFrame(
    dict(mlp_prediction=y_mlp, lr_prediction=y_lr, hve_prediction=y_sve)
)
full_upcoming_fight_results_pd

In [14]:
# Majority vote: take the first row-wise mode across the three model columns.
row_modes = full_upcoming_fight_results_pd.mode(axis='columns')
final_pred = row_modes[0].rename('final_prediction')
final_pred

#### Probability of the prediction

In [15]:
# Class-probability estimates from each model for every upcoming fight.
proba_mlp, proba_lr, proba_sve = (
    clf.predict_proba(X) for clf in (model_mlp, model_lr, model_sve)
)

# Unweighted mean of the three probability matrices, each row rescaled to sum to 1.
# NOTE(review): assumes all three models order their classes like model_lr.classes_ — confirm.
overall_proba = (proba_mlp + proba_lr + proba_sve) / 3
row_totals = overall_proba.sum(axis=1, keepdims=True)
overall_proba_normalized = overall_proba / row_totals

# Label the columns with the class names and append the majority-vote label.
overall_proba_df = pd.concat(
    [
        pd.DataFrame(overall_proba_normalized, columns=model_lr.classes_),
        final_pred,
    ],
    axis=1,
)
overall_proba_df

In [16]:
# Attach the majority-vote label to the feature rows (aligned on the row index).
full_upcoming_fight_info_pd = full_upcoming_fight_info_pd.assign(final_prediction=final_pred)
full_upcoming_fight_info_pd

In [17]:

# Attach the averaged probability of the 'W/L' class to each fight's feature row
# (aligned on the row index).
# `assign` overwrites an existing 'W/L' column instead of appending a second one, so
# re-running this cell no longer produces duplicate column names, which would break
# the conversion back to a Spark DataFrame.
full_upcoming_fight_info_pd = full_upcoming_fight_info_pd.assign(
    **{'W/L': overall_proba_df['W/L']}
)
full_upcoming_fight_info_pd

In [18]:

# Move the enriched pandas frame back into Spark and give the 'W/L' probability column
# a name without a slash ('W_L_fighter1').
full_upcoming_fight_info_with_pred = (
    spark.createDataFrame(full_upcoming_fight_info_pd)
    .withColumnRenamed('W/L', 'W_L_fighter1')
)
# Preview the frame including the final predictions.
full_upcoming_fight_info_with_pred.show()

## 6. Join the predictions back to the events and fighters

In [30]:
# Expose the prediction frame and the descriptive backlog to Spark SQL.
full_upcoming_fight_info_with_pred.createOrReplaceTempView("full_upcoming_fight_info_with_pred")
backlog_table.createOrReplaceTempView("backlog_table")

# Re-attach event and fighter names to each prediction by matching on the pair of
# fighter indices (inner join).
final_table = spark.sql("""
    SELECT
        backlog_table.event,
        backlog_table.event_location,
        backlog_table.event_date,
        backlog_table.fighter1,
        backlog_table.fighter_index_1,
        backlog_table.fighter2,
        backlog_table.fighter_index_2,
        full_upcoming_fight_info_with_pred.final_prediction,
        full_upcoming_fight_info_with_pred.W_L_fighter1
    FROM
        backlog_table
    JOIN
        full_upcoming_fight_info_with_pred
    ON
        backlog_table.fighter_index_1 = full_upcoming_fight_info_with_pred.fighter_index_1
        AND backlog_table.fighter_index_2 = full_upcoming_fight_info_with_pred.fighter_index_2
""")

# A stand-alone `final_table.drop(...)` used to sit here. Spark DataFrames are immutable
# and its result was never assigned, so it had no effect and has been removed.

# Columns written to disk: event, both fighter names, the final prediction and the
# 'W/L' probability.
final_table_to_write = final_table.drop('event_date', 'event_location', 'fighter_index_1', 'fighter_index_2')

final_table_to_write.show(200)

In [31]:
# Collect into pandas under a separate `_pd` name (the convention used for the other
# pandas frames in this notebook). Rebinding `final_table_to_write` itself made this
# cell fail on a second run, because a pandas DataFrame has no `toPandas` method.
final_table_to_write_pd = final_table_to_write.toPandas()

# NOTE(review): inputs are read from '../processed_data/', while this writes to
# 'processed_data/' relative to the working directory — confirm the target is intended.
final_table_to_write_pd.to_csv('processed_data/final_table_fights.csv', index=False)

Get events only:

In [20]:
# Expose the joined prediction table to Spark SQL.
final_table.createOrReplaceTempView("final_table")

# One row per distinct (event, location, date) combination.
event_table = spark.sql("""
    SELECT
        DISTINCT
        event,
        event_location,
        event_date
    FROM
        final_table
""")

# Collect the event table into pandas.

event_table_pd = event_table.toPandas()

# Write the event table to CSV without the index column.
# NOTE(review): inputs are read from '../processed_data/', while this writes to
# 'processed_data/' relative to the working directory — confirm the target is intended.

event_table_pd.to_csv('processed_data/upcoming_events.csv', index=False)

### Comparing with individual results

In [21]:
# Full copy of the fighters view, registered below under the name "fighter_table".
fighter_table = spark.sql("""
    SELECT
        *
    FROM
        fighters
""")

# Look up Islam Makhachev and Dustin Poirier by name.

fighter_table.createOrReplaceTempView("fighter_table")

# Row(s) for Islam Makhachev — used as fighter 1.
fighter1 = spark.sql("""
    SELECT
        *
    FROM
        fighter_table
    WHERE
        fighter = 'Islam Makhachev'
""")

# Row(s) for Dustin Poirier — used as fighter 2.
fighter2 = spark.sql("""
    SELECT
        *
    FROM
        fighter_table
    WHERE
        fighter = 'Dustin Poirier'
""")

# Show the statistics found for fighter 1.
fighter1.show()


In [22]:
# Show the statistics found for fighter 2 (Dustin Poirier).
fighter2.show()

In [23]:
from pyspark.sql.functions import lit

# Simulate a single fight between the two fighters looked up above:
# Islam Makhachev is fighter 1 and Dustin Poirier is fighter 2.

# Rename the name column on each side so both names survive the join.
fighter1 = fighter1.withColumnRenamed('fighter', 'fighter1')
fighter2 = fighter2.withColumnRenamed('fighter', 'fighter2')

# Give each side a fixed index (1 and 2).
# NOTE(review): this replaces the fighter_index value that came from the fighters table,
# and fighter_index_1 / fighter_index_2 are later passed to the models as features —
# confirm that overriding the real index is intended.
fighter1 = fighter1.withColumn('fighter_index', lit(1))
fighter2 = fighter2.withColumn('fighter_index', lit(2))

fighter1.createOrReplaceTempView("fighter1")
fighter2.createOrReplaceTempView("fighter2")

# Pair the two fighters into one wide row with the same column layout as the
# upcoming-fight feature table (_1 suffix for fighter 1, _2 for fighter 2).
# The pairing has no join condition, so it is written as an explicit CROSS JOIN: a bare
# JOIN without ON is rejected by Spark versions where implicit cartesian products are
# disabled (spark.sql.crossJoin.enabled=false).
little_table = spark.sql("""
    SELECT
        f1.fighter1,
        f1.fighter_index as fighter_index_1,
        f1.AVG_KD as AVG_KD_1,
        f1.AVG_SUB_ATT as AVG_SUB_ATT_1,
        f1.AVG_TD_Percentage as AVG_TD_Percentage_1,
        f1.AVG_Significant_Strike_Percentage as AVG_Significant_Strike_Percentage_1,
        f1.AVG_TOTAL_STR as AVG_TOTAL_STR_1,
        f1.AVG_ROUND as AVG_ROUND_1,
        f1.AVG_CTRL_SECONDS as AVG_CTRL_SECONDS_1,
        f1.total_wins as total_wins_1,
        f1.total_losses as total_loss_1,
        f1.total_draws as total_draws_1,
        f1.Win_Percentage as Win_Percentage_1,
        f1.Height_CM as Height_CM_1,
        f1.Weight_KG as Weight_KG_1,
        f1.Reach_Conv as Reach_Conv_1,
        f2.fighter2,
        f2.fighter_index as fighter_index_2,
        f2.AVG_KD as AVG_KD_2,
        f2.AVG_SUB_ATT as AVG_SUB_ATT_2,
        f2.AVG_TD_Percentage as AVG_TD_Percentage_2,
        f2.AVG_Significant_Strike_Percentage as AVG_Significant_Strike_Percentage_2,
        f2.AVG_TOTAL_STR as AVG_TOTAL_STR_2,
        f2.AVG_ROUND as AVG_ROUND_2,
        f2.AVG_CTRL_SECONDS as AVG_CTRL_SECONDS_2,
        f2.total_wins as total_wins_2,
        f2.total_losses as total_loss_2,
        f2.total_draws as total_draws_2,
        f2.Win_Percentage as Win_Percentage_2,
        f2.Height_CM as Height_CM_2,
        f2.Weight_KG as Weight_KG_2,
        f2.Reach_Conv as Reach_Conv_2
    FROM fighter1 AS f1
    CROSS JOIN fighter2 AS f2
""")

little_table.show()


In [24]:
# Prepare the simulated Makhachev vs. Poirier fight for the models: remove the two
# name columns and collect the remaining feature row into pandas.
name_columns = ['fighter1', 'fighter2']
little_table = little_table.drop(*name_columns)
little_table_pd = little_table.toPandas()

little_table_pd.head()

In [25]:

# Scale the simulated fight with statistics taken from the upcoming-fight batch.
# Fitting a StandardScaler on this one-row frame alone subtracts the row from itself,
# so every model received an all-zero feature vector and the prediction did not depend
# on the fighters' statistics at all. Fitting on the batch features applies the same
# scaling that was used for the upcoming-fight predictions.
# NOTE(review): ideally the scaler fitted at training time is persisted and reused here — confirm.
batch_features = full_upcoming_fight_info_pd[little_table_pd.columns]
scaler = StandardScaler().fit(batch_features)
X = scaler.transform(little_table_pd)

# Predict the outcome with each of the trained models.
y_mlp = model_mlp.predict(X)
y_lr = model_lr.predict(X)
y_sve = model_sve.predict(X)

# Place the three predictions side by side, one column per model.
little_table_results_pd = pd.DataFrame({
    'mlp_prediction': y_mlp,
    'lr_prediction': y_lr,
    'hve_prediction': y_sve
})

little_table_results_pd

In [26]:
# Majority vote for the simulated fight: first row-wise mode across the three model columns.
vote_modes = little_table_results_pd.mode(axis='columns')
final_pred = vote_modes[0].rename('final_prediction')

final_pred

In [27]:
# Class-probability estimates for the simulated fight from each model.
proba_mlp = model_mlp.predict_proba(X)
proba_lr = model_lr.predict_proba(X)
proba_sve = model_sve.predict_proba(X)

# Unweighted mean of the three probability matrices, each row rescaled to sum to 1.
# NOTE(review): assumes all three models order their classes like model_lr.classes_ — confirm.
overall_proba = (proba_mlp + proba_lr + proba_sve) / 3
overall_proba_normalized = overall_proba / overall_proba.sum(axis=1)[:, np.newaxis]
overall_proba_df = pd.DataFrame(overall_proba_normalized, columns=model_lr.classes_)

# Print the batch predictions for the upcoming fights for comparison.
final_table.drop('event_index', 'fighter_index_1', 'fighter_index_2').show(60)

# Only the last expression of a cell is rendered. The probabilities used to sit in the
# middle of the cell and were never displayed, so they now come last.
overall_proba_df