# Tennis Matches Neural Network Training Data Preparation

We will prepare the data so that it can be used to train our neural network.

In [20]:
import pandas as pd
import config.ConnectionConfig as cc

In [21]:
# Run the project's environment setup helper, then start the Spark session used by this
# notebook under the application name "Tennis Predictions".
# NOTE(review): presumably startLocalCluster returns a SparkSession — confirm in config.ConnectionConfig.
cc.setupEnvironment()
spark = cc.startLocalCluster("Tennis Predictions")
# Last expression of the cell, so the notebook displays the active session.
spark.getActiveSession()

In [22]:
# Load the cleaned match history through Spark; the first CSV row supplies the column names.
# NOTE(review): no schema is passed, so the columns presumably arrive as strings — confirm before relying on numeric dtypes.
past_matches = spark.read.csv('../data/full_matches_spark_clean.csv', header=True)

In [23]:
# Load the per-player statistics through Spark; the first CSV row supplies the column names.
# NOTE(review): no schema is passed, so the columns presumably arrive as strings — confirm before relying on numeric dtypes.
player_stats = spark.read.csv('../data/full_player_stats_total.csv', header=True)
# Print a preview of the loaded rows.
player_stats.show()

In [24]:
# Earlier pandas-based loading, kept for reference; both frames are now read through Spark instead.
# past_matches = pd.read_csv('../data/all_matches_data_cleaned.csv', index_col=0)
# player_stats = pd.read_csv('../data/player_stats.csv')
# Player metadata is read directly with pandas. Later cells use its 'name', 'rank',
# 'nationality', 'player_id' and 'date_of_birth' columns.
player_ids = pd.read_csv('../data/player_urls.csv')
player_ids

In [25]:
# Collect the Spark match data into a pandas DataFrame; every later cell works on it with pandas.
past_matches = past_matches.toPandas()
past_matches

In [26]:
# Collect the Spark player statistics into a pandas DataFrame and preview the first rows.
player_stats = player_stats.toPandas()
player_stats.head()

In [27]:
player_ids.head()

In [28]:
# Turn each player's birth date into the number of whole days elapsed between that date
# and today. The column keeps the name 'date_of_birth' even though it now holds a day count.
birth_dates = pd.to_datetime(player_ids['date_of_birth'])
elapsed_since_birth = pd.to_datetime('today') - birth_dates
player_ids['date_of_birth'] = elapsed_since_birth.dt.days

In [29]:
def find_matching_player_name(player_name, candidates=None):
    """Return the first known player name that contains every word of *player_name*.

    Matching is case-insensitive and substring-based: each whitespace-separated
    part of *player_name* must occur somewhere inside the candidate name.

    Args:
        player_name: The name to look up.
        candidates: Iterable of known player names to search. Defaults to
            ``player_stats['player']``.

    Returns:
        The first matching candidate, or *player_name* unchanged when no
        candidate matches or when *player_name* contains no words.
    """
    if candidates is None:
        candidates = player_stats['player']

    # Lower-case the searched parts once instead of once per candidate.
    wanted_parts = [part.lower() for part in player_name.split()]
    if not wanted_parts:
        # all() over zero parts is vacuously true and would "match" the first candidate.
        return player_name

    for player in candidates:
        if not isinstance(player, str):
            # Missing values (None / NaN) have no .lower() and can never match.
            continue
        lowered = player.lower()
        if all(part in lowered for part in wanted_parts):
            return player
    return player_name

# Apply the function to create player_name column
#past_matches['Winner'] = past_matches['Winner'].apply(lambda x: find_matching_player_name(x) if pd.notnull(x) else x)
#past_matches['Loser'] = past_matches['Loser'].apply(lambda x: find_matching_player_name(x) if pd.notnull(x) else x)
past_matches

In [30]:
def add_space_in_name(name):
    """Split a run-together name by inserting a space before inner capital letters.

    The first character never gets a leading space. An upper-case letter that
    already follows whitespace, a hyphen or an apostrophe is left alone, so
    names that are already separated are not given doubled spaces and
    compound names such as "Jo-Wilfried" stay intact.

    Args:
        name: The name to split, e.g. ``"RogerFederer"``.

    Returns:
        The name with spaces inserted, e.g. ``"Roger Federer"``.
    """
    pieces = []
    for position, char in enumerate(name):
        if position > 0 and char.isupper():
            previous = name[position - 1]
            # Only separate when the capital is glued to the preceding word.
            if not (previous.isspace() or previous in "-'"):
                pieces.append(' ')
        pieces.append(char)
    return ''.join(pieces)

In [31]:
# Discard every match row that has a missing value in any column.
past_matches = past_matches.dropna()

In [32]:
#for index, row in past_matches.iterrows():
#    if sum(1 for c in row['Winner'] if c.isupper()) > 1 and ' ' not in row['Winner']:
#        past_matches.at[index, 'Winner'] = add_space_in_name(row['Winner'])
#    if sum(1 for c in row['Loser'] if c.isupper()) > 1 and ' ' not in row['Loser']:
#        past_matches.at[index, 'Loser'] = add_space_in_name(row['Loser'])
        
#past_matches

In [33]:
# Drop only the 'player' column; every other column is kept as loaded.
past_matches = past_matches.drop(columns=['player'])
# (disabled) add a column for the result that is always W
#past_matches['Result'] = 'W'
# (disabled) change winner and loser to player 1 and player 2
#past_matches.columns = ['Date', 'Player1', 'Player2', 'Surface', 'Result']
# NOTE(review): later cells read 'Date', 'Player1', 'Player2', 'Surface' and 'Result', so the
# loaded CSV presumably already carries those column names — confirm.
past_matches

Randomize the order so the winner is not always the first player

In [34]:
import numpy as np

# Shuffle the rows, then swap the two players in a random subset of matches so that the
# first player is not always the winner. Both random steps are seeded so that re-running
# the notebook produces the same training set.
past_matches = past_matches.sample(frac=1, random_state=42).reset_index(drop=True)
swap_rng = np.random.default_rng(42)
swap_indices = swap_rng.choice([True, False], size=len(past_matches))
# .values strips the column labels; without it pandas would re-align on column names
# and the assignment would leave the players unswapped.
past_matches.loc[swap_indices, ['Player1', 'Player2']] = past_matches.loc[swap_indices, ['Player2', 'Player1']].values
# After a swap the first player is the one who lost.
# NOTE(review): assumes 'Result' is 'W' for every row before this point — confirm against the input file.
past_matches.loc[swap_indices, 'Result'] = 'L'
# Keep only the columns used from here on.
past_matches = past_matches[['Date', 'Player1', 'Player2', 'Surface', 'Result']]
past_matches

In [35]:
# Left-join the past matches with player_ids, matching 'Player1' against 'name', so each
# match row gains the first player's metadata. Matches without a matching name are kept,
# with missing values in the new columns.
past_matches = past_matches.merge(player_ids, left_on='Player1', right_on='name', how='left')
past_matches

In [36]:
# Metadata columns from player_ids that are not needed after each join.
unused_id_columns = ['name', 'rank', 'nationality']

# Label player 1's id and birth-date day count, then repeat the same left join on
# 'Player2' / 'name' and label the second player's columns in the same way.
past_matches = (
    past_matches
    .drop(columns=unused_id_columns)
    .rename(columns={'player_id': 'player_id1', 'date_of_birth': 'date_of_birth1_days'})
    .merge(player_ids, left_on='Player2', right_on='name', how='left')
    .drop(columns=unused_id_columns)
    .rename(columns={'player_id': 'player_id2', 'date_of_birth': 'date_of_birth2_days'})
)
past_matches

In [37]:
player_stats.columns

In [38]:
# Drop the 'nationality' and 'name' columns from the statistics table.
player_stats = player_stats.drop(columns=['nationality', 'name'])

In [39]:
# Keep the statistics table's birth date under the name 'date_of_birth_full'; later cells
# parse it as a date and subtract it from the match date to get each player's age.
player_stats = player_stats.rename(columns={'date_of_birth': 'date_of_birth_full'})

In [40]:
past_matches.columns

In [41]:
# Left-join player 1's statistics onto each match, matching 'Player1' against 'player';
# the join key from the statistics table is dropped again right after.
past_matches = past_matches.merge(player_stats, left_on='Player1', right_on='player', how='left')
past_matches = past_matches.drop(columns='player')
# Suffix the statistics columns with '1', selecting them by name rather than by position.
# The existing match/id columns keep the labels they already have, so a different column
# order in the input files can no longer attach a label to the wrong data.
player1_stat_names = {col: col + '1' for col in player_stats.columns if col != 'player'}
past_matches = past_matches.rename(columns=player1_stat_names)

In [42]:
past_matches

In [43]:
# drop the columns date_of_birth_full1
#past_matches = past_matches.drop(columns='date_of_birth_full1')

In [44]:
past_matches.columns

In [45]:
# Left-join the statistics table a second time, now matching 'Player2' against 'player'.
# The join key 'player' is not dropped here, so it remains in the result.
past_matches = past_matches.merge(player_stats, left_on='Player2', right_on='player', how='left')
past_matches

In [46]:
# Suffix the columns that the second statistics join just added (including its 'player'
# join key) with '2'. They are selected by name: player 1's columns already end in '1',
# so only the freshly merged, unsuffixed statistics columns match. Relabelling by name
# instead of by position keeps every label attached to its own data even if the column
# order or count of the input files changes.
player2_stat_names = {col: col + '2' for col in player_stats.columns}
past_matches = past_matches.rename(columns=player2_stat_names)
past_matches

In [47]:
#past_matches = past_matches.drop(columns='date_of_birth_full2')
#past_matches = past_matches.drop(columns='player2')

In [48]:
past_matches.columns

In [49]:
past_matches.isna().sum()

In [50]:
# Discard every row with a missing value in any column; after the left joins above this
# removes matches where a player had no matching metadata or statistics.
past_matches = past_matches.dropna()

In [51]:
# show me those rows
past_matches

In [52]:
# Prepare the match date for the age calculation: swap the '‑' character (a non-ASCII
# hyphen variant) for a plain '-' and parse the result as day-abbreviated month-year.
normalised_dates = past_matches['Date'].str.replace('‑', '-')
past_matches['Date'] = pd.to_datetime(normalised_dates, format='%d-%b-%Y')

In [53]:
past_matches.columns

In [54]:
# Parse both players' birth dates into datetimes.
for dob_column in ('date_of_birth_full1', 'date_of_birth_full2'):
    past_matches[dob_column] = pd.to_datetime(past_matches[dob_column])

In [55]:
# Age of each player on the day of the match, expressed in whole days.
match_day = past_matches['Date']
for slot in ('1', '2'):
    past_matches['age' + slot] = (match_day - past_matches['date_of_birth_full' + slot]).dt.days

# The raw date columns are removed once the ages exist.
date_columns = ['Date', 'date_of_birth_full1', 'date_of_birth_full2']
past_matches = past_matches.drop(columns=date_columns)
past_matches

In [56]:
# Select and order the columns: player 1 with all of his stats, then player 2 with all of
# his stats, then the surface and the result. Columns not named here are dropped.
stat_names = [
    'Ranking at that time',
    'Opponent Ranking at that time',
    'Dominance Ratio',
    'Ace Ratio',
    'Double Fault Ratio',
    'First Serve Percentage',
    'First Serve Points Won',
    'Second Serve Points Won',
    'round value',
    'Break Points Won',
    'Break Points Faced',
    'Sets Won',
    'Sets Lost',
    'Total time',
]
selected_columns = []
for slot in ('1', '2'):
    selected_columns += ['Player' + slot, 'player_id' + slot, 'age' + slot]
    selected_columns += [stat + slot for stat in stat_names]
selected_columns += ['Surface', 'Result']

past_matches = past_matches[selected_columns]
past_matches

In [57]:
# Drop the player name columns; the players stay identified by player_id1 / player_id2.
past_matches = past_matches.drop(columns=['Player1', 'Player2'])
past_matches

In [58]:
past_matches.columns

In [59]:
# Keep only the first occurrence of each column name, dropping any repeated columns.
past_matches = past_matches.loc[:,~past_matches.columns.duplicated()]

# Encode the surface column as integer labels

In [60]:
# Replace each surface name with an integer code (1, 2, 3, ...), numbered in the order in
# which the distinct surfaces first appear in the data.
surfaces = past_matches['Surface'].unique()
surface_mapping = dict(zip(surfaces, range(1, len(surfaces) + 1)))
past_matches['Surface'] = past_matches['Surface'].map(surface_mapping)

In [61]:
past_matches.columns

In [62]:
# Feature names shared by both players, without the trailing player number.
per_player_features = [
    'player_id', 'age', 'Ranking at that time', 'Opponent Ranking at that time',
    'Dominance Ratio', 'Ace Ratio', 'Double Fault Ratio', 'First Serve Percentage',
    'First Serve Points Won', 'Second Serve Points Won', 'round value', 'Break Points Won',
    'Break Points Faced', 'Sets Won', 'Sets Lost', 'Total time',
]

# Final layout: all of player 1's features, all of player 2's features, then the surface
# and the result.
new_column_order = (
    [feature + '1' for feature in per_player_features]
    + [feature + '2' for feature in per_player_features]
    + ['Surface', 'Result']
)

# reindex keeps exactly these columns in this order; a listed column that does not exist
# in the frame is created and filled with missing values.
past_matches = past_matches.reindex(columns=new_column_order)

In [63]:
past_matches

In [64]:
# Convert the finished pandas frame back into a Spark DataFrame so it can be written through Spark.
past_matches_spark = spark.createDataFrame(past_matches)

In [65]:
# Write the training set as CSV with a header row, replacing any previous output at this path.
# NOTE(review): Spark presumably writes a directory of part files at this path rather than a single file — confirm for downstream readers.
past_matches_spark.write.csv('../data/final_train_df_spark.csv', header=True, mode="overwrite")

In [66]:
# past_matches.to_csv("../data/final_train_df.csv")