# Predictions on upcoming tennis matches with stacking classifier

In [25]:
import joblib
import pandas as pd
import config.ConnectionConfig as cc
import re

In [26]:
# Prepare the environment and start a local Spark cluster through the
# project's ConnectionConfig helper (its internals are not visible here).
cc.setupEnvironment()
spark = cc.startLocalCluster("Tennis Predictions")
# Last expression of the cell, so the notebook displays the result.
# NOTE(review): presumably `spark` is a SparkSession — confirm.
spark.getActiveSession()

In [27]:
# Load the persisted feature scaler and the trained model from disk.
# NOTE(review): joblib.load unpickles arbitrary objects — only load model
# files that come from a trusted source.
scaler = joblib.load('../../models/tennis_nn_scaler.pkl')
nn_model = joblib.load("../../models/tennis_nn_model.pkl")

In [28]:
# Read the per-player statistics with Spark (first row is the header) and
# convert them straight to a pandas DataFrame, then save a CSV copy.
player_stats = spark.read.csv(
    '../../data/full_player_stats_total.csv',
    header=True,
).toPandas()
player_stats.to_csv('../../data/all_player_stats.csv')

In [29]:
# Load the cleaned upcoming-match table; the CSV's first column becomes the index.
upcoming_matches = pd.read_csv('../../data/upcoming_matches_info_clean.csv', index_col=0)
# Display the frame for inspection.
upcoming_matches

In [30]:
# Encode every surface as a 1-based integer, numbered in order of first
# appearance in this file.
# NOTE(review): the codes depend on the row order of the upcoming-match file —
# confirm they agree with the encoding the model was trained on.
surfaces = upcoming_matches['Surface'].unique()
surface_mapping = dict(zip(surfaces, range(1, len(surfaces) + 1)))
upcoming_matches['Surface'] = upcoming_matches['Surface'].map(surface_mapping)

In [31]:
def clean_player_name(name):
    """Return *name* without bracketed parts and without any whitespace.

    Anything enclosed in ``[...]`` or ``(...)`` is removed (shortest match),
    and every whitespace character is removed as well — including the spaces
    between name parts, so ``"Rafael Nadal (ESP)"`` becomes ``"RafaelNadal"``.
    """
    noise = re.compile(r'\[.*?\]|\(.*?\)|\s')
    return noise.sub('', name).strip()

In [32]:
def clean_player_stats_names(df):
    """Add a ``cleaned_player`` column built from the ``player`` column.

    Each value of ``df['player']`` is passed through ``clean_player_name``.
    The frame is modified in place and the same object is returned.
    """
    normalised_names = df['player'].apply(clean_player_name)
    df['cleaned_player'] = normalised_names
    return df

In [33]:
# Add the normalised-name column that is later used to look up player2's statistics.
player_stats = clean_player_stats_names(player_stats)

In [34]:
# Display the player statistics for inspection.
player_stats

In [35]:
# Keep an independent copy that still has the 'Event' and 'Date' columns;
# it is used for the final report after the working frame is reshaped.
all_upcoming_matches = upcoming_matches.copy()
all_upcoming_matches

In [36]:
# 'Event' is not one of the model features, so drop it from the working frame (in place).
upcoming_matches.drop(columns=['Event'], inplace=True)
upcoming_matches

In [37]:
# Attach player1's statistics (left join on the raw player name).
upcoming_matches = upcoming_matches.merge(
    player_stats,
    how='left',
    left_on='player1',
    right_on='player',
)
# Relabel positionally: the first five columns get the match-level names,
# every column after them is a player statistic and is suffixed with '1'.
upcoming_matches.columns = (
    ['Location', 'Surface', 'Date', 'player1', 'player2']
    + [col + '1' for col in upcoming_matches.columns[5:]]
)

In [38]:
# Force the name and date columns to plain strings before further processing.
for _text_col in ('player1', 'player2', 'Date'):
    upcoming_matches[_text_col] = upcoming_matches[_text_col].astype(str)

In [39]:
# Normalise player2's name with the same function used for the statistics
# table so that the two can be joined on the cleaned name.
upcoming_matches['cleaned_player2'] = upcoming_matches['player2'].apply(clean_player_name)

In [40]:
# Inspect the column names of the working frame.
upcoming_matches.columns

In [41]:
# Attach player2's statistics (left join on the cleaned player name).
# The number of columns already present is recorded before the merge so that
# only the newly joined statistic columns are suffixed with '2'. The existing
# columns keep their names instead of being overwritten from a hard-coded
# list, which would silently mislabel data if the statistics schema changed.
_left_width = upcoming_matches.shape[1]
upcoming_matches = upcoming_matches.merge(
    player_stats,
    left_on='cleaned_player2',
    right_on='cleaned_player',
    how='left',
)
upcoming_matches.columns = (
    list(upcoming_matches.columns[:_left_width])
    + [col + '2' for col in upcoming_matches.columns[_left_width:]]
)
upcoming_matches

In [42]:
# calculate the age of the players based on the Date and the date of birth
upcoming_matches['Date'] = upcoming_matches['Date'].str.replace('‑', '-')
# Convert the 'Date' column to datetime format
upcoming_matches['Date'] = pd.to_datetime(upcoming_matches['Date'], format='%d-%b-%Y')
upcoming_matches['date_of_birth1'] = pd.to_datetime(upcoming_matches['date_of_birth1'])
upcoming_matches['date_of_birth2'] = pd.to_datetime(upcoming_matches['date_of_birth2'])
upcoming_matches['age1'] = (upcoming_matches['Date'] - upcoming_matches['date_of_birth1']).dt.days
upcoming_matches['age2'] = (upcoming_matches['Date'] - upcoming_matches['date_of_birth2']).dt.days
upcoming_matches = upcoming_matches.drop(columns=['Date', 'date_of_birth1', 'date_of_birth2'])
upcoming_matches = upcoming_matches.dropna()
upcoming_matches_stats = upcoming_matches

In [None]:
# Select the model inputs: the same per-player features for player 1 and then
# player 2, followed by the encoded surface. The order is kept exactly.
_per_player_features = [
    'player_id',
    'age',
    'Ranking at that time',
    'Opponent Ranking at that time',
    'Double Fault Ratio',
    'First Serve Percentage',
    'First Serve Points Won',
    'Break Points Won',
    'Sets Won',
    'Sets Lost',
    'Total time',
]
_model_columns = [
    feature + side for side in ('1', '2') for feature in _per_player_features
] + ['Surface']
upcoming_matches = upcoming_matches[_model_columns]
upcoming_matches

In [None]:
# Feature matrix handed to the scaler and the model; show its (rows, columns) shape.
X = upcoming_matches
X.shape

In [None]:
# Apply the persisted scaler and wrap the result in a DataFrame carrying the
# original feature names.
X_scaled = pd.DataFrame(scaler.transform(X), columns=X.columns)
X_scaled

In [None]:
# Predicted class label for every remaining match row.
y_pred = nn_model.predict(X_scaled)
y_pred

In [None]:
# predict the probability of the player1 winning
nn_model.predict_proba(X_scaled)
# print all the possible outcomes
outcomes = nn_model.classes_
# add the probabilities= to a new df
probabilities = nn_model.predict_proba(X_scaled)
probabilities_df = pd.DataFrame(probabilities, columns=outcomes)

In [None]:
# join probabilities to the y_pred
y_pred_df = pd.DataFrame(y_pred, columns=['Prediction'])
y_pred_df = pd.concat([y_pred_df, probabilities_df], axis=1)

In [None]:
# 'Accuracy' holds the larger of the two class probabilities ('L' / 'W'),
# rendered as text with two significant digits; the raw probability columns
# are then no longer needed.
_class_cols = ['L', 'W']
_top_probability = y_pred_df[_class_cols].max(axis=1)
y_pred_df = y_pred_df.drop(columns=_class_cols)
y_pred_df['Accuracy'] = _top_probability.apply("{:.2}".format)
y_pred_df

In [None]:
# Player URL table; the CSV's first column becomes the index.
# NOTE(review): `players` is not referenced by any later cell visible here —
# confirm it is still needed.
players = pd.read_csv('../../data/player_urls.csv', index_col=0)

In [None]:
# join y_pred_df with upcoming_matches_stats
upcoming_matches_stats.reset_index(drop=True, inplace=True)
y_pred_df = pd.concat([upcoming_matches_stats, y_pred_df], axis=1)
y_pred_df = y_pred_df[['player1', 'player2', 'Prediction', 'Accuracy']]
y_pred_df = y_pred_df.loc[:,~y_pred_df.columns.duplicated()]
y_pred_df

In [None]:
# merge all_upcoming_matches with y_pred_df if the player1 and player2 are the same
all_upcoming_matches = all_upcoming_matches.merge(y_pred_df, left_on=['player1', 'player2'], right_on=['player1', 'player2'], how='left')
all_upcoming_matches = all_upcoming_matches.dropna()
all_upcoming_matches

In [None]:
# Keep only the columns that go into the exported report.
all_upcoming_matches = all_upcoming_matches[['Event', 'player1', 'player2', 'Date','Prediction', 'Accuracy']]
all_upcoming_matches

In [None]:
# Write the predictions to CSV (the DataFrame index is written as the first column).
all_upcoming_matches.to_csv('../../data/upcoming_matches_predictions.csv')