# Past Fight Analysis

In [59]:
import config.ConnectionConfig as cc
import pandas as pd
from tqdm import tqdm_notebook
import yaml
import ufc_fights.data_collector.scrape_ufc_stats_library as LIB

In [58]:

# Load the scraper configuration. The context manager closes the file handle,
# which the bare open() call never did.
with open('../data_collector/scrape_ufc_stats_config.yaml') as config_file:
    config = yaml.safe_load(config_file)
events_url = config['completed_events_all_url']

# Fetch the page listing all completed events and parse it into a DataFrame.
soup = LIB.get_soup(events_url)
all_event_details_df = LIB.parse_event_details(soup)
list_of_events_urls = list(all_event_details_df['URL'])

# Seed the list with an empty, column-labelled frame so the final result keeps
# the configured columns even when no event page yields any fights.
fight_details_frames = [pd.DataFrame(columns=config['fight_details_column_names'])]

# Visit every event page and collect its fight details.
for url in tqdm_notebook(list_of_events_urls):
    soup = LIB.get_soup(url)
    fight_details_df = LIB.parse_fight_details(soup)
    fight_details_frames.append(fight_details_df)

# Concatenate once at the end: concatenating inside the loop re-copies the
# accumulated frame on every iteration.
all_fight_details_df = pd.concat(fight_details_frames)
    

In [None]:
    
# Empty frame carrying the configured fight-result column names.
all_fight_results_df = pd.DataFrame(columns=config['fight_results_column_names'])
# URLs of the individual fight pages, in the order they were collected.
list_of_fight_details_urls = [*all_fight_details_df['URL']]

In [None]:
# Per-fight result frames are collected here and concatenated once at the end.
all_fight_results_dfs = []

# NOTE(review): only the first 10 fight URLs are processed. This looks like a
# sampling/debug limit; confirm before treating the output as complete.
for url in list_of_fight_details_urls[:10]:
    soup = LIB.get_soup(url)

    # The helper returns both the results and the stats frame for the fight;
    # only the results frame is accumulated in this cell.
    fight_results_df, fight_stats_df = LIB.parse_organise_fight_results_and_stats(
        soup,
        url,
        config['fight_results_column_names'],
        config['totals_column_names'],
        config['significant_strikes_column_names']
    )

    all_fight_results_dfs.append(fight_results_df)

# pd.concat raises ValueError on an empty list, so fall back to an empty frame
# with the configured columns when there was nothing to parse.
if all_fight_results_dfs:
    all_fight_results_df = pd.concat(all_fight_results_dfs)
else:
    all_fight_results_df = pd.DataFrame(columns=config['fight_results_column_names'])

# Render the combined frame in the notebook output.
display(all_fight_results_df)


In [60]:
# Project helpers from config.ConnectionConfig: set up the environment, then
# start the local cluster whose handle is used as `spark` by the cells below.
# NOTE(review): what setupEnvironment() configures is not visible here; confirm.
cc.setupEnvironment()
spark = cc.startLocalCluster("UFC_Fighter_Stats")
# Last expression of the cell, so its value is echoed in the notebook output.
spark.getActiveSession()

In [61]:
# Load both CSV inputs, treating the first line of each file as the header.
past_fights = spark.read.option("header", True).csv("../data/ufc_fight_results.csv")
fight_total = spark.read.option("header", True).csv("../processed_data/fight_total.csv")

# Row counts: the first is printed, the second is echoed as the cell's value.
past_fights_rows = past_fights.count()
print(past_fights_rows)
fight_total.count()

In [62]:
# Print the first rows of past_fights for a visual check.
past_fights.show()

In [63]:
# Print the first rows of fight_total, then expose the frame to Spark SQL
# under the view name "fight_total".
fight_total.show()
fight_total.createOrReplaceTempView("fight_total")

In [64]:
from pyspark.sql.functions import split

# Break each "A vs. B" bout label into its two fighter names and keep only the
# event, the outcome and the two names.
bout_parts = split('BOUT', ' vs. ')
past_fights = (
    past_fights
    .withColumn('fighter1', bout_parts[0])
    .withColumn('fighter2', bout_parts[1])
    .select('EVENT', 'OUTCOME', 'fighter1', 'fighter2')
)
# Second name for the same reduced frame.
final_past_fight_data = past_fights
# Print the first rows to check the split.
past_fights.show()

In [65]:
# past_fights = past_fights.drop('EVENT')
# Expose the reduced past_fights frame to Spark SQL as the "past_fights" view.
past_fights.createOrReplaceTempView("past_fights")

In [66]:
# Print the column names and types of past_fights.
past_fights.printSchema()

In [67]:
from pyspark.sql.functions import trim

# Trim stray whitespace from the fighter names in fight_total so they compare
# equal to the trimmed names coming from past_fights.
fight_total_spark = spark.sql("""
    SELECT TRIM(fighter1) AS fighter_1,
           TRIM(fighter2) AS fighter_2, *
    FROM fight_total
""")
# Replace the raw name columns with their trimmed counterparts.
fight_total_spark = (
    fight_total_spark
    .drop('fighter1', 'fighter2')
    .withColumnRenamed('fighter_1', 'fighter1')
    .withColumnRenamed('fighter_2', 'fighter2')
)
# Register the view only after the swap. Registering it earlier exposed the
# untrimmed fighter1/fighter2 columns, so any query joining on
# trimmed_fight_total.fighter1 silently compared untrimmed names.
fight_total_spark.createOrReplaceTempView("trimmed_fight_total")
fight_total_spark.show()

In [68]:

# Register a trimmed copy of past_fights with a row_id column so the join
# result below can be ordered like the source rows.
# monotonically_increasing_id() yields increasing, unique ids; they are not
# guaranteed to be consecutive.
spark.sql("""
    SELECT TRIM(fighter1) AS fighter1,
           TRIM(fighter2) AS fighter2,
           monotonically_increasing_id() AS row_id
    FROM past_fights
""").createOrReplaceTempView("trimmed_past_fights")

# Schema of the fighter-1 statistics columns selected below (all strings):
# |-- fighter1_index: string (nullable = true)
# |-- height1: string (nullable = true)
# |-- weight_kg1: string (nullable = true)
# |-- avg_kd1: string (nullable = true)
# |-- avg_sub_att1: string (nullable = true)
# |-- avg_td_percentage1: string (nullable = true)
# |-- avg_significant_strike_percentage1: string (nullable = true)
# |-- avg_total_str1: string (nullable = true)
# |-- avg_round1: string (nullable = true)
# |-- avg_ctrl_seconds1: string (nullable = true)
# |-- total_wins1: string (nullable = true)
# |-- total_losses1: string (nullable = true)
# |-- total_draws1: string (nullable = true)
# |-- win_percentage1: string (nullable = true)
# |-- reach_conv1: string (nullable = true)
# Inner join: only bouts whose (fighter1, fighter2) pair appears in both views
# survive. Names come from past_fights; the *_index ids, statistics and OUTCOME
# come from the fight_total side. Rows are ordered by the past_fights row_id.
joined_df = spark.sql("""
    SELECT pf.fighter1, 
    ft.fighter1_index,
    ft.height1,
    ft.weight_kg1,
    ft.avg_kd1,
    ft.avg_sub_att1,
    ft.avg_td_percentage1,
    ft.avg_significant_strike_percentage1,
    ft.avg_total_str1,
    ft.avg_round1,
    ft.avg_ctrl_seconds1,
    ft.total_wins1,
    ft.total_losses1,
    ft.total_draws1,
    ft.win_percentage1,
    ft.reach_conv1,
    pf.fighter2, 
    ft.fighter2_index,  
    ft.height2,
    ft.weight_kg2,
    ft.avg_kd2,
    ft.avg_sub_att2,
    ft.avg_td_percentage2,
    ft.avg_significant_strike_percentage2,
    ft.avg_total_str2,
    ft.avg_round2,
    ft.avg_ctrl_seconds2,
    ft.total_wins2,
    ft.total_losses2,
    ft.total_draws2,
    ft.win_percentage2,
    ft.reach_conv2,
    ft.OUTCOME
    FROM trimmed_fight_total AS ft
    JOIN trimmed_past_fights AS pf
    ON ft.fighter1 = pf.fighter1 AND ft.fighter2 = pf.fighter2
    ORDER BY pf.row_id
""")

In [69]:
# Print the first rows of the joined frame.
joined_df.show()

In [70]:
# Keep a second name for the joined frame, then print its first rows again.
all_past_fight_data = joined_df
joined_df.show()

In [71]:
# Drop the fighter name columns; joined_df is rebound to the narrower frame.
joined_df = joined_df.drop('fighter1', 'fighter2')

In [72]:
# Print the first rows of joined_df after the name columns were dropped.
joined_df.show()

In [73]:
from pyspark.sql.functions import col

def split_data(joined_df, num_parts):
    """Split a Spark DataFrame into ``num_parts`` consecutive, disjoint parts.

    Rows are numbered in the DataFrame's current order and dealt into
    ``num_parts`` consecutive buckets whose sizes differ by at most one, so
    every input row lands in exactly one part.

    This replaces the earlier ``limit(end).subtract(limit(start))`` approach,
    which had several problems: ``subtract`` is a set difference, so rows that
    were exact duplicates were dropped; the two independent ``limit`` calls may
    not return the same prefix; and the ``total % num_parts`` trailing rows
    were never assigned to any part.

    Args:
        joined_df: Spark DataFrame to split.
        num_parts: Number of parts to produce; must be at least 1.

    Returns:
        A list of ``num_parts`` DataFrames with the same columns as
        ``joined_df``. Trailing parts are empty when the input has fewer rows
        than ``num_parts``.

    Raises:
        ValueError: If ``num_parts`` is smaller than 1.
    """
    if num_parts < 1:
        raise ValueError(f"num_parts must be at least 1, got {num_parts!r}")

    # Imported locally so the function carries its own dependencies.
    from pyspark.sql import Window
    from pyspark.sql import functions as F

    order_col = "__split_order__"
    bucket_col = "__split_bucket__"

    # Tag each row with an id that increases along the current row order.
    numbered = joined_df.withColumn(order_col, F.monotonically_increasing_id())

    # ntile deals the ordered rows into buckets 1..num_parts. The window has no
    # partitioning, so Spark moves all rows to one partition for this step;
    # acceptable for the small frames this is used on.
    bucketed = numbered.withColumn(
        bucket_col, F.ntile(num_parts).over(Window.orderBy(order_col))
    ).drop(order_col)

    # One filtered view per bucket, with the helper column removed again.
    return [
        bucketed.where(F.col(bucket_col) == bucket).drop(bucket_col)
        for bucket in range(1, num_parts + 1)
    ]

In [74]:
# Split the joined data into five parts and bind each part to its own name.
# The unpacking requires split_data to return exactly five frames.
num_parts = 5
part_data = split_data(joined_df, num_parts)
part1, part2, part3, part4, part5 = part_data

In [75]:
# Row count of the first part.
part1.count()

In [76]:
# NOTE(review): total and train_rows are computed but not used by the cells
# visible here.
total = joined_df.count()
train_rows = int(total * 0.8)

# Hold out part 1 for testing; train on parts 5, 4, 3 and 2.
test_data = part1
train_data = part5.union(part4).union(part3).union(part2)

# Features are every column except OUTCOME; the label is OUTCOME alone.
X_train, y_train = train_data.drop('OUTCOME'), train_data.select('OUTCOME')
X_test, y_test = test_data.drop('OUTCOME'), test_data.select('OUTCOME')

# Print the four row counts in the order X_train, y_train, X_test, y_test.
for frame in (X_train, y_train, X_test, y_test):
    print(frame.count())

In [77]:
from sklearn.preprocessing import StandardScaler

# Pull both feature frames into pandas before handing them to scikit-learn.
X_train = X_train.toPandas()
X_test = X_test.toPandas()

# Fit the scaler on the training features only, then apply the same scaling
# to both the training and the test features.
scaler = StandardScaler()
X_train_scaled = scaler.fit(X_train).transform(X_train)
X_test_scaled = scaler.transform(X_test)

In [78]:
import joblib
from sklearn.neural_network import MLPClassifier
# Load the previously saved classifier from disk.
# NOTE: joblib.load unpickles the file, so only load model files from a
# trusted source.
model = joblib.load("../models/ufc_mlpclassifier_model.pkl")

In [79]:
# Flatten the single-column OUTCOME frame into a 1-D label array, then fit the
# model on the scaled training features.
# NOTE(review): this re-fits the model that was just loaded from disk; confirm
# that retraining (rather than reusing the saved weights) is intended.
y_train_flat = y_train.toPandas().values.flatten()
model.fit(X_train_scaled, y_train_flat)

In [80]:
# Score of the fitted model on the data it was trained on.
model.score(X_train_scaled, y_train_flat)

In [81]:
# Predicted label for every row of the scaled test features.
predictions = model.predict(X_test_scaled)

In [82]:
# Per-class probabilities for each test row, labelled with the model's own
# class names so every column can be addressed by outcome label.
class_probabilities = model.predict_proba(X_test_scaled)
probabilities = pd.DataFrame(class_probabilities, columns=model.classes_)
probabilities

In [83]:
# Flatten the held-out OUTCOME column into a 1-D label array and print the
# model's score on the scaled test features.
y_test_flat = y_test.toPandas().values.flatten()
print(f'Test accuracy: {model.score(X_test_scaled, y_test_flat)}')

In [84]:
# Wrap the predicted labels (a NumPy array) in a one-column DataFrame.
predicted_outcome_df = pd.DataFrame(predictions, columns=['predicted_outcome'])
# Put the per-class probabilities and the predicted label side by side.
predictions = pd.concat([probabilities, predicted_outcome_df], axis=1)
predictions

In [85]:
# Append the probability and prediction columns to the unscaled test features.
# pd.concat(axis=1) aligns rows on the index; this assumes both frames carry
# the same default index (TODO confirm).
X_test = pd.concat([X_test, predictions], axis=1)
X_test

In [86]:
# Convert the pandas frame back to a Spark DataFrame and expose it to Spark
# SQL under the view name "X_test".
X_test = spark.createDataFrame(X_test)
X_test.createOrReplaceTempView("X_test")

In [87]:
# Print the row count, then echo the DataFrame as the cell's output.
print(X_test.count())
X_test

In [88]:
# Rebuild the "past_fights" view with whitespace-trimmed fighter names; the
# view is replaced under the same name it is read from.
trimmed_past_fights_query = (
    "SELECT TRIM(fighter1) AS fighter1, "
    "TRIM(fighter2) AS fighter2,"
    "OUTCOME, EVENT "
    "FROM past_fights"
)
trimmed_past_fights_df = spark.sql(trimmed_past_fights_query)
trimmed_past_fights_df.createOrReplaceTempView("past_fights")

In [89]:
# Attach event and fighter names to each test-set prediction:
#   X_test -> fight_total  via the (fighter1_index, fighter2_index) pair
#   fight_total -> past_fights via the fighter names
# The fighter names in the past_fights view are trimmed, while fight_total
# holds the raw CSV values, so the names are trimmed on the fight_total side
# as well; otherwise any name with stray whitespace silently fails to match.
# The GROUP BY has no aggregates and only collapses duplicate rows produced by
# the joins.
# NOTE(review): `W/L` is assumed to be one of the model's class labels (the
# probability columns are named after model.classes_); confirm.
final_df = spark.sql("SELECT pf.EVENT, pf.fighter1, pf.fighter2, "
                     "xt.predicted_outcome, xt.`W/L` as predicted_accuracy, pf.OUTCOME as actual_outcome "
                     "FROM X_test as xt "
                     "JOIN fight_total as ft "
                     "ON xt.fighter1_index = ft.fighter1_index "
                     "AND xt.fighter2_index = ft.fighter2_index "
                     "JOIN past_fights as pf "
                     "ON pf.fighter1 = TRIM(ft.fighter1) AND pf.fighter2 = TRIM(ft.fighter2) "
                     "group by pf.fighter1, pf.fighter2, pf.EVENT, xt.predicted_outcome, pf.OUTCOME, xt.`W/L` ")
# Print up to 2000 rows of the result.
final_df.show(2000)

In [90]:
# Number of rows in the final prediction table.
final_df.count()

In [91]:
# Write this run's predictions to CSV without the pandas index column.
# NOTE(review): the "_6" suffix is hard-coded, presumably bumped by hand for
# each run; confirm before re-running so an earlier file is not overwritten.
final_df.toPandas().to_csv("../processed_data/past_fights_predictions/past_fight_predictions_6.csv", index=False)

In [92]:
# Load the prediction CSVs numbered 1 through 6, one frame per file.
(pred_1, pred_2, pred_3, pred_4, pred_5, pred_6) = (
    pd.read_csv(
        f"../processed_data/past_fights_predictions/past_fight_predictions_{run}.csv",
        sep=',',
    )
    for run in range(1, 7)
)

In [93]:
# Stack the six per-run frames and sort the combined rows by event name.
prediction_frames = [pred_1, pred_2, pred_3, pred_4, pred_5, pred_6]
all_predictions = pd.concat(prediction_frames).sort_values(by='EVENT')
# Non-null count per column of the combined frame.
all_predictions.count()

In [94]:
# Keep only the first row for each (EVENT, fighter1, fighter2) combination.
bout_key = ['EVENT', 'fighter1', 'fighter2']
is_repeat = all_predictions.duplicated(subset=bout_key)
all_predictions = all_predictions[~is_repeat]
# Non-null count per column after de-duplication.
all_predictions.count()

In [96]:
# Write the combined predictions to a single CSV without the pandas index column.
all_predictions.to_csv("../processed_data/past_fights_predictions/all_past_fight_predictions.csv", index=False)