In [1]:
import joblib
import numpy as np
import pandas as pd
import config.ConnectionConfig as cc
from sklearn.ensemble import VotingClassifier
from pyspark.sql import SparkSession
from sklearn.preprocessing import StandardScaler

In [2]:

# Prepare the environment and obtain a Spark session through the project's
# ConnectionConfig helpers (imported above as `cc`).
cc.setupEnvironment()
# NOTE(review): the application name mentions logistic-regression training, but
# the cells in this notebook only load saved models and predict — confirm the
# name is intended.
spark = cc.startLocalCluster("UFC_Logistic_Regression_Training")
# The return value of this call is discarded, so the line has no visible effect here.
spark.getActiveSession()
# Rebinds `spark`. NOTE(review): presumably getOrCreate() hands back the session
# started above instead of creating a second one, which would make this line
# (and the "UFC_Fights" name) redundant — confirm against cc.startLocalCluster.
spark = SparkSession.builder.appName("UFC_Fights").getOrCreate()

In [3]:
# Restore the three previously saved classifiers from disk; judging by the file
# names they are an MLP, a logistic regression and a voting classifier.
# NOTE(review): joblib.load unpickles arbitrary objects — only load model files
# that come from a trusted source.
model_mlp, model_lr, model_sve = (
    joblib.load(model_file)
    for model_file in (
        'models/ufc_mlpclassifier_model.pkl',
        'models/ufc_logistic_regression_model.pkl',
        'models/ufc_voting_clf.pkl',
    )
)

In [4]:
# Read the upcoming-events fights CSV into Spark (the header row supplies the
# column names, column types are inferred) and print up to 100 rows.
upcoming_fights_csv = 'processed_data/upcoming_events_fights.csv'
upcoming_fight_info = spark.read.csv(
    upcoming_fights_csv,
    inferSchema=True,
    header=True,
)
upcoming_fight_info.show(n=100)

In [5]:
# Read the fighter-details CSV into Spark (header row as column names, inferred
# column types) and preview it with show()'s default row limit.
fighter_details_csv = 'processed_data/fighter_details.csv'
fighters = spark.read.csv(
    fighter_details_csv,
    inferSchema=True,
    header=True,
)
fighters.show()

In [6]:
# Expose both Spark DataFrames to Spark SQL under the names used in the query below.
upcoming_fight_info.createOrReplaceTempView("upcoming_fight_info")
fighters.createOrReplaceTempView("fighters")

# Build one row per upcoming fight by joining the fighters view twice: once on
# ufi.fighter1 (alias f1, output columns suffixed _1) and once on ufi.fighter2
# (alias f2, output columns suffixed _2). Both halves select the same 16 columns
# in the same order.
# Notes:
#   * total_losses is exposed as total_loss_<n> (not total_losses_<n>). A later
#     cell hands every remaining column to the models positionally, so do not
#     rename or reorder columns without checking how the models were trained.
#   * These are inner joins: a fight whose fighter1 or fighter2 has no exact
#     match in fighters.fighter is dropped from the result, and a name matching
#     several rows yields several output rows.
#     NOTE(review): confirm fighters holds exactly one row per fighter name.
#   * The trailing backslashes on the FROM/JOIN lines sit inside a regular
#     triple-quoted string, where backslash-newline simply joins the lines; they
#     do not change the query.
info_upcoming_fights = spark.sql("""SELECT
                                    f1.fighter as fighter_1, 
                                    f1.fighter_index as fighter_index_1,
                                    f1.AVG_KD as AVG_KD_1,
                                    f1.AVG_SUB_ATT as AVG_SUB_ATT_1,
                                    f1.AVG_TD_Percentage as AVG_TD_Percentage_1,
                                    f1.AVG_Significant_Strike_Percentage as AVG_Significant_Strike_Percentage_1,
                                    f1.AVG_TOTAL_STR as AVG_TOTAL_STR_1,
                                    f1.AVG_ROUND as AVG_ROUND_1,
                                    f1.AVG_CTRL_SECONDS as AVG_CTRL_SECONDS_1,
                                    f1.total_wins as total_wins_1,
                                    f1.total_losses as total_loss_1,
                                    f1.total_draws as total_draws_1,
                                    f1.Win_Percentage as Win_Percentage_1,
                                    f1.Height_CM as Height_CM_1,
                                    f1.Weight_KG as Weight_KG_1,
                                    f1.Reach_Conv as Reach_Conv_1,
                                    f2.fighter as fighter_2,
                                    f2.fighter_index as fighter_index_2,
                                    f2.AVG_KD as AVG_KD_2,
                                    f2.AVG_SUB_ATT as AVG_SUB_ATT_2,
                                    f2.AVG_TD_Percentage as AVG_TD_Percentage_2,
                                    f2.AVG_Significant_Strike_Percentage as AVG_Significant_Strike_Percentage_2,
                                    f2.AVG_TOTAL_STR as AVG_TOTAL_STR_2,
                                    f2.AVG_ROUND as AVG_ROUND_2,
                                    f2.AVG_CTRL_SECONDS as AVG_CTRL_SECONDS_2,
                                    f2.total_wins as total_wins_2,
                                    f2.total_losses as total_loss_2,
                                    f2.total_draws as total_draws_2,
                                    f2.Win_Percentage as Win_Percentage_2,
                                    f2.Height_CM as Height_CM_2,
                                    f2.Weight_KG as Weight_KG_2,
                                    f2.Reach_Conv as Reach_Conv_2
                                    FROM upcoming_fight_info as ufi \
                                    JOIN fighters as f1 ON ufi.fighter1 = f1.fighter \
                                    JOIN fighters as f2 ON ufi.fighter2 = f2.fighter
                                    ORDER BY ufi.fighter1
                                    """)
# Print up to 200 of the joined rows for inspection.
info_upcoming_fights.show(200)

In [7]:
# Turn the joined Spark result into the pandas feature table used for prediction.
#
# This cell rebinds `info_upcoming_fights` from a Spark DataFrame to a pandas
# DataFrame. Unguarded, a second run of the cell raised AttributeError because
# pandas has no createOrReplaceTempView. The view is therefore registered only
# while the name still refers to the Spark DataFrame, and the features are
# always re-read from the registered view, which makes the cell safe to re-run.
if not isinstance(info_upcoming_fights, pd.DataFrame):
    # Keep the full result (including fighter names) queryable by later cells.
    info_upcoming_fights.createOrReplaceTempView("info_upcoming_fights")

# Drop the two name columns; every remaining column is passed on as a model input.
info_upcoming_fights = (
    spark.table("info_upcoming_fights")
    .drop('fighter_1', 'fighter_2')
    .toPandas()
)
info_upcoming_fights

In [8]:

# Standardise the feature table and collect each model's prediction per fight.
#
# NOTE(review): a brand-new StandardScaler is fitted on the upcoming fights
# themselves, so the scaling statistics come from this inference set. If the
# models were trained on data scaled by a different, training-time scaler, the
# inputs here are not on the same scale — confirm against the training code and,
# if so, persist and load that fitted scaler instead of refitting.
# NOTE(review): every column of info_upcoming_fights is used as a feature,
# including fighter_index_1 / fighter_index_2 — confirm the models expect them.
scaler = StandardScaler()
X = scaler.fit_transform(info_upcoming_fights)

# Predict outcomes using the trained models
y_mlp = model_mlp.predict(X)
y_lr = model_lr.predict(X)
y_sve = model_sve.predict(X)

# Combine predictions
# One column per model. Note that the third model's column is keyed
# 'hve_prediction' although its variables are named model_sve / y_sve.
full_upcoming_fight_results_pd = pd.DataFrame({
    'mlp_prediction': y_mlp,
    'lr_prediction': y_lr,
    'hve_prediction': y_sve
})

In [9]:
# Display the table of per-model predictions (one row per fight).
full_upcoming_fight_results_pd

In [10]:
# Majority vote: take the most common prediction of the three models per fight.
# The vote is restricted to the three per-model columns so that re-running this
# cell, or running it after further columns such as 'probability' have been
# added to the table, cannot feed non-prediction columns into the vote.
# When several values tie, mode() returns all of them (in sorted order, per the
# pandas documentation) and column 0 selects the first.
prediction_columns = ['mlp_prediction', 'lr_prediction', 'hve_prediction']
full_upcoming_fight_results_pd['final_prediction'] = (
    full_upcoming_fight_results_pd[prediction_columns].mode(axis=1)[0]
)
full_upcoming_fight_results_pd

In [11]:
# Probabilities predicted by the MLP model for the scaled inputs, wrapped in a
# DataFrame whose columns are labelled with model_mlp.classes_.
mlp_class_probabilities = model_mlp.predict_proba(X)
probabilities = pd.DataFrame(
    data=mlp_class_probabilities,
    columns=model_mlp.classes_,
)
probabilities

In [12]:

# Attach the MLP probability column labelled 'W/L' to the prediction table.
# NOTE(review): the columns of `probabilities` are model_mlp.classes_ (presumably
# the class labels), so this lookup raises KeyError unless one of those labels
# is literally 'W/L' — confirm which class column is intended.
# NOTE(review): the probability comes from the MLP alone, whereas
# final_prediction is a vote across three models, so the two can disagree.
full_upcoming_fight_results_pd['probability'] = probabilities['W/L']
# Column-wise concat aligns rows on the index; presumably both frames carry the
# same default 0..n-1 index so rows pair up by position — confirm.
total_fights_data = pd.concat([info_upcoming_fights, full_upcoming_fight_results_pd], axis=1)
total_fights_data

In [13]:
# Publish the combined pandas table to Spark SQL as the view "total_fights_data".
# The Spark DataFrame gets its own name instead of overwriting the pandas
# `total_fights_data`: rebinding the same name meant that a second run of this
# cell passed a Spark DataFrame to createDataFrame and failed.
total_fights_sdf = spark.createDataFrame(total_fights_data)
total_fights_sdf.createOrReplaceTempView("total_fights_data")

In [14]:
# Map the predictions back to fighter names by joining the named view with the
# prediction view on both fighter indices, ordered by the first fighter's name.
# DataFrame.show() prints and returns None, so the query result is assigned
# first and shown afterwards; chaining .show() onto the assignment left
# final_predictions bound to None.
final_predictions = spark.sql("SELECT ufi.fighter_1, ufi.fighter_2, tfd.final_prediction, tfd.probability "
                              "FROM info_upcoming_fights as ufi "
                              "JOIN total_fights_data as tfd "
                              "on ufi.fighter_index_1 = tfd.fighter_index_1 and ufi.fighter_index_2 = tfd.fighter_index_2 "
                              "ORDER BY ufi.fighter_1")
final_predictions.show(100)