# Data Converter

Convert the data in `all_matches_data_raw.csv` into a dataset that can be used to train our neural network.

In [1]:
import pandas as pd
import config.ConnectionConfig as cc

In [2]:
# Prepare the environment and start a local Spark cluster via the project's
# ConnectionConfig helper (its implementation lives outside this notebook).
cc.setupEnvironment()
spark = cc.startLocalCluster("Tennis Predictions")
# Trailing expression: its value is shown as the cell output.
spark.getActiveSession()

In [3]:
# Read the matches CSV with Spark; header=True takes the column names from the first row.
# NOTE(review): no schema / inferSchema is passed, so every column presumably arrives
# as a string — confirm; the later cells convert the numeric columns explicitly.
matches_data_spark = spark.read.csv("../../data/all_matches_spark.csv", header=True)

In [4]:
# Collect the Spark DataFrame into a pandas DataFrame for the cleaning steps below.
matches_data = matches_data_spark.toPandas()
# Trailing expression: displayed as the cell output.
matches_data

In [5]:
# Rename 'player_name' to 'player', the column name the later cells rely on.
matches_data = matches_data.rename(columns={'player_name': 'player'})

In [6]:
def count_uppercase_chars(name):
    """Return how many characters of ``name`` are uppercase."""
    uppercase_count = 0
    for char in name:
        if char.isupper():
            uppercase_count += 1
    return uppercase_count

In [7]:
def check_uppercase_chars(name):
    """Return the number of uppercase characters in ``name``.

    Despite the "check" in its name, the result is a count, not a boolean.
    """
    return len([char for char in name if char.isupper()])

In [8]:
# Count missing values per column. NOTE: the result is neither stored nor the
# last expression of the cell, so it is computed and then discarded.
matches_data.isna().sum()
# Keep only the rows that have no missing value in any column.
matches_data = matches_data.dropna()

In [9]:
# Load the player list; the first CSV column becomes the index.
# NOTE(review): this path is "../data/..." while the later `player_info` cell reads
# '../../data/player_urls.csv' — confirm which location is intended.
players = pd.read_csv("../data/player_urls.csv", index_col=0)
# Trailing expression: displayed as the cell output.
players

In [10]:
# Function to find matching player name
def find_matching_player_name(player_name, candidates=None):
    """Return the first candidate name that contains every part of ``player_name``.

    The name is split on whitespace and each part is looked up,
    case-insensitively, as a substring of the candidate name.

    Args:
        player_name: Name to look up, e.g. ``"roger federer"``.
        candidates: Optional iterable of known player names. When omitted,
            the ``name`` column of the module-level ``players`` frame is used.

    Returns:
        The first matching candidate, or ``None`` when nothing matches or
        ``player_name`` contains no non-whitespace characters.
    """
    if candidates is None:
        candidates = players['name']

    parts = [part.lower() for part in player_name.split()]
    if not parts:
        # all() over zero parts is vacuously true and would wrongly "match"
        # the very first candidate for an empty or blank name.
        return None

    for player in candidates:
        player_lower = player.lower()
        if all(part in player_lower for part in parts):
            return player

    # No candidate matched: report the name so it can be inspected.
    # (The original guarded this with `if None:`, which never executes.)
    print(player_name)
    return None

# Apply the function to create player_name column
# matches_data['player_name'] = matches_data['player'].apply(lambda x: find_matching_player_name(x) if pd.notnull(x) else None)

In [11]:
# Number of distinct values in the 'player' column (shown as the cell output).
matches_data.player.nunique()

# Give the round in tournament a value

In [12]:
# Numeric value assigned to each round label; labels that are not listed
# here map to NaN in the 'round value' column.
round_mapping = {
    'Q1': 1,
    'Q2': 1,
    'Q3': 1,
    'RR': 1,
    'R128': 1.5,
    'R64': 2,
    'R32': 3,
    'R16': 5,
    'QF': 7,
    'SF': 12,
    'F': 20,
}

round_labels = matches_data['Round in Tournament']
matches_data['round value'] = round_labels.map(round_mapping)
matches_data

In [13]:
# Remove the '%' sign from every percentage column so the values can later
# be converted to numbers.
percentage_columns = [
    'Ace Ratio',
    'Double Fault Ratio',
    'First Serve Percentage',
    'First Serve Points Won',
    'Second Serve Points Won',
]
for percentage_column in percentage_columns:
    matches_data[percentage_column] = matches_data[percentage_column].str.replace('%', '')
matches_data

In [14]:
# Split the 'Break Points Saved' column on '/' into two new columns, named
# 'Break Points Won' and 'Break Points Faced', and append them to the frame.
# NOTE(review): assigning exactly two column names assumes every value holds
# a single '/' — confirm against the data.
break_points = matches_data['Break Points Saved'].str.split('/', expand=True)
break_points.columns = ['Break Points Won', 'Break Points Faced']
matches_data = pd.concat([matches_data, break_points], axis=1)
# Trailing expression: displayed as the cell output.
matches_data

In [15]:
# Count missing values per column (shown as the cell output).
matches_data.isna().sum()

In [16]:
# Keep only the rows with at most three missing values, printing the frame
# shape before and after the filter.
print("Before dropping rows:", matches_data.shape)
matches_data_filtered = matches_data[matches_data.isnull().sum(axis=1) <= 3]
print("After dropping rows:", matches_data_filtered.shape)

In [17]:
# Split 'Time' on ':' into separate 'Hours' and 'Minutes' columns and append
# them to the frame (they are converted to numbers in a later cell).
# NOTE(review): naming exactly two columns assumes every value contains a
# single ':' — confirm against the data.
times_split = matches_data_filtered['Time'].str.split(':', expand=True)
times_split.columns = ['Hours', 'Minutes']
matches_data_filtered = pd.concat([matches_data_filtered, times_split], axis=1)
# Trailing expression: displayed as the cell output.
matches_data_filtered

In [18]:
# Count missing values per column (shown as the cell output).
matches_data_filtered.isna().sum()

In [19]:
# 'Time' has been split into 'Hours' and 'Minutes', so the original column is dropped.
matches_data_filtered = matches_data_filtered.drop('Time', axis=1)

In [20]:
# Strip a trailing "RET" marker (with surrounding whitespace) from the score text.
matches_data_filtered['Set Scores'] = matches_data_filtered['Set Scores'].str.replace(r'\s*RET\s*$', '', regex=True)
# Remove bracketed "[n-m]" tokens from the score text.
# NOTE(review): whitespace around a removed token is left in place — confirm this
# does not produce empty pieces when the scores are later split on ' '.
matches_data_filtered['Set Scores'] = matches_data_filtered['Set Scores'].str.replace(r'\[\d+-\d+\]', '', regex=True)

In [21]:
# Diagnostic: print every match whose score text splits into exactly six
# space-separated pieces.
for index, row in matches_data_filtered.iterrows():
    score_text = row['Set Scores']
    if len(score_text.split(' ')) == 6:
        print(f"Match at index {index}: {score_text}")

In [22]:
# Split the score text on single spaces into one column per set and append
# the result to the frame.
# NOTE(review): assigning five names requires the split to yield exactly five
# columns; the assignment raises otherwise — confirm against the data.
sets = matches_data_filtered['Set Scores'].str.split(' ', expand=True)
sets.columns = ['Set 1', 'Set 2', 'Set 3', 'Set 4', 'Set 5']
matches_data_filtered = pd.concat([matches_data_filtered, sets], axis=1)
# Trailing expression: displayed as the cell output.
matches_data_filtered

In [23]:
import re

def add_space_before_uppercase(s):
    """Insert a space before every ASCII uppercase letter of ``s``.

    Leading and trailing whitespace is stripped from the result, so
    ``"RogerFederer"`` becomes ``"Roger Federer"``. A space is added even
    when one is already present in front of the uppercase letter.
    """
    uppercase_pattern = r'([A-Z])'
    spaced_replacement = r' \1'
    spaced = re.sub(uppercase_pattern, spaced_replacement, s)
    return spaced.strip()

In [24]:
# Apply the uppercase-spacing helper to both name columns.
for name_column in ('Winner', 'Loser'):
    matches_data_filtered[name_column] = matches_data_filtered[name_column].apply(add_space_before_uppercase)

In [25]:
# Display the frame to inspect the re-spaced 'Winner' and 'Loser' names.
matches_data_filtered

Calculate the number of sets won and lost by the player

In [26]:
def is_winner(tennis_match):
    """Tell whether the row's ``player`` is the match winner.

    The comparison ignores case and surrounding whitespace, and it is a
    substring test: the result is True when the 'Winner' text occurs
    anywhere inside the 'player' text.
    """
    winner_name = tennis_match['Winner'].strip().lower()
    player_name = tennis_match['player'].strip().lower()
    return winner_name in player_name

In [27]:
# Evaluate is_winner row by row (axis=1) and store the boolean in 'isWinner'.
matches_data_filtered['isWinner'] = matches_data_filtered.apply(is_winner, axis=1)

In [28]:
# Display the frame to inspect the new 'isWinner' column.
matches_data_filtered

Sum up the points won and lost by the player

In [29]:
# Diagnostic: print every row whose score text still contains a square bracket.
for index, row in matches_data_filtered.iterrows():
    set_scores = row['Set Scores']
    if any(bracket in set_scores for bracket in ('[', ']')):
        print(f"Row at index {index}: {set_scores}")


In [30]:
# For every match, add up the numbers on each side of the '-' in the per-set
# scores and store them from the point of view of `player`: when the player
# won, the first number of each set counts for the player, otherwise the second.
# NOTE(review): the totals are stored in columns named 'Sets Won' / 'Sets Lost'
# although they are sums of the per-set numbers — confirm the intended meaning.
set_columns = ['Set 1', 'Set 2', 'Set 3', 'Set 4', 'Set 5']
sets_data = matches_data_filtered[set_columns]
points_won = []
points_lost = []

for index, row in matches_data_filtered.iterrows():
    player_won = row['isWinner']
    first_total = 0
    second_total = 0
    for column in sets_data.columns:
        set_score = row[column]
        if pd.isnull(set_score):
            # No score in this set column.
            continue
        set_score_parts = set_score.split('-')
        if '-' not in set_score:
            # No separator in the text: count the set as 0-0.
            set_score_parts = ["0", "0"]
        # Drop the "(n)" tiebreak suffix if there is one.
        first_part = set_score_parts[0].split('(')[0]
        second_part = set_score_parts[1].split('(')[0]
        if first_part.isdigit() and second_part.isdigit():
            first_total += int(first_part)
            second_total += int(second_part)
        else:
            # Report the pieces that could not be read as numbers.
            print(first_part)
            print(second_part)
    if player_won:
        points_won.append(first_total)
        points_lost.append(second_total)
    else:
        points_won.append(second_total)
        points_lost.append(first_total)

matches_data_filtered['Sets Won'] = points_won
matches_data_filtered['Sets Lost'] = points_lost
matches_data_filtered

In [31]:
# Convert the 'Hours' and 'Minutes' text columns to numbers so they can be
# combined into a total duration later on.
matches_data_filtered['Hours'] = pd.to_numeric(matches_data_filtered['Hours'])
matches_data_filtered['Minutes'] = pd.to_numeric(matches_data_filtered['Minutes'])

In [32]:
# The set scores have been aggregated into 'Sets Won' / 'Sets Lost'; drop the raw score columns.
matches_data_filtered = matches_data_filtered.drop(['Set Scores', 'Set 1', 'Set 2', 'Set 3', 'Set 4', 'Set 5'], axis=1)

In [33]:
# Express the match duration in minutes, then drop the two helper columns.
matches_data_filtered['Total time'] = matches_data_filtered['Hours'] * 60 + matches_data_filtered['Minutes']
matches_data_filtered = matches_data_filtered.drop(['Hours', 'Minutes'], axis=1)
# Trailing expression: displayed as the cell output.
matches_data_filtered

In [34]:
#matches_data_filtered.to_csv('../../data/full_matches_data_cleaned.csv')

In [35]:
# Display the cleaned frame (shown as the cell output).
matches_data_filtered

In [36]:
# Build the per-match table: Date, Tournament, Surface, Player1, Player2,
# player and Result, where Player1 is always the winner of the match.
#
# Work on an explicit copy of the needed columns. Assigning through a plain
# alias would also add 'Result' to matches_data_filtered, and assigning into a
# column selection of another frame triggers pandas' SettingWithCopyWarning.
all_matches = matches_data_filtered[
    ['Date', 'Tournament', 'Surface', 'Winner', 'Loser', 'player', 'isWinner']
].copy()
all_matches['Result'] = ' W'
# Replace the winner/loser text with the row's own `player` value on the side
# the player was on; the other side keeps its original value.
all_matches['Winner'] = all_matches.apply(
    lambda row: row['player'] if row['isWinner'] else row['Winner'], axis=1)
all_matches['Loser'] = all_matches.apply(
    lambda row: row['player'] if not row['isWinner'] else row['Loser'], axis=1)
# rename column Winner to Player1 and Loser to Player2
all_matches = all_matches.rename(columns={'Winner': 'Player1', 'Loser': 'Player2'})
all_matches = all_matches.drop('isWinner', axis=1)
all_matches

In [37]:
# Convert the pandas table to a Spark DataFrame (rebinding `all_matches`) and
# write it as CSV with a header row, overwriting any existing output at that path.
all_matches = spark.createDataFrame(all_matches)
all_matches.write.csv("../../data/full_matches_spark_clean.csv", header=True, mode='overwrite')

In [38]:
# Count missing values per column (shown as the cell output).
matches_data_filtered.isna().sum()

In [39]:
# drop all rows that contain NaN values
matches_data_filtered = matches_data_filtered.dropna()
# 'isWinner' is not needed for the per-player statistics computed below.
matches_data_filtered = matches_data_filtered.drop('isWinner', axis=1)

In [40]:
import numpy as np

In [41]:
def replace_non_numeric_with_nan(df, column_name):
    """Replace values of ``df[column_name]`` that cannot be read as floats with NaN.

    Every distinct value of the column is passed to ``float()``; values that
    cannot be converted are set to NaN in place and reported on stdout.
    Convertible values are left exactly as they are (no dtype conversion).

    Args:
        df: DataFrame to clean; it is modified in place.
        column_name: Name of the column to inspect.

    Returns:
        The same ``df`` object, to allow ``df = replace_non_numeric_with_nan(df, ...)``.
    """
    for x in df[column_name].unique():
        if pd.isna(x):
            # Already missing; float(None) / float(pd.NA) would raise TypeError.
            continue
        try:
            float(x)
        except (ValueError, TypeError):
            df.loc[df[column_name] == x, column_name] = np.nan
            print(f"Replaced non-numeric value '{x}' with NaN in column '{column_name}'")
    return df

In [42]:
# Blank out (set to NaN) every non-numeric value in the columns that are
# converted to numbers further down.
numeric_columns = [
    "Dominance Ratio",
    "Ace Ratio",
    "Double Fault Ratio",
    "First Serve Percentage",
    "First Serve Points Won",
    "Second Serve Points Won",
    "Break Points Won",
    "Break Points Faced",
    "round value",
    "Sets Won",
    "Sets Lost",
    "Total time",
]
for numeric_column in numeric_columns:
    matches_data_filtered = replace_non_numeric_with_nan(matches_data_filtered, numeric_column)

In [43]:
# Count missing values per column. NOTE: the result is neither stored nor the
# last expression of the cell, so it is computed and then discarded.
matches_data_filtered.isna().sum()
# Drop the rows in which a non-numeric value was replaced with NaN.
matches_data_filtered = matches_data_filtered.dropna()

In [44]:
# Helper that casts one column of a frame to float.
def convert_column_to_float(df, column_name):
    """Cast ``df[column_name]`` to float in place and return the same ``df``."""
    as_float = df[column_name].astype(float)
    df[column_name] = as_float
    return df

In [45]:
# Cast the cleaned columns to numeric dtypes, in the same order as before:
# the two ranking columns become int, every other listed column becomes float.
column_dtypes = [
    ("Dominance Ratio", float),
    ("Ace Ratio", float),
    ('Ranking at that time', int),
    ('Opponent Ranking at that time', int),
    ("Double Fault Ratio", float),
    ("First Serve Percentage", float),
    ("First Serve Points Won", float),
    ("Second Serve Points Won", float),
    ("Break Points Won", float),
    ("Break Points Faced", float),
    ("round value", float),
    ("Sets Won", float),
    ("Sets Lost", float),
    ("Total time", float),
]
for column_name, target_dtype in column_dtypes:
    if target_dtype is float:
        matches_data_filtered = convert_column_to_float(matches_data_filtered, column_name)
    else:
        matches_data_filtered[column_name] = matches_data_filtered[column_name].astype(target_dtype)

In [46]:
# Display the frame after the dtype conversions (shown as the cell output).
matches_data_filtered

In [47]:
# Average every numeric column per player. numeric_only=True is needed because
# the frame still holds text columns (e.g. 'Date', 'Tournament', 'Surface'):
# pandas >= 2.0 raises a TypeError for them instead of silently dropping them.
player_stats = matches_data_filtered.groupby('player').mean(numeric_only=True).reset_index()

In [48]:
# Display the per-player averages (shown as the cell output).
player_stats

In [49]:
# Convert the per-player averages to a Spark DataFrame and write them as CSV
# with a header row, overwriting any existing output at that path.
player_stats_spark = spark.createDataFrame(player_stats)
player_stats_spark.write.csv("../../data/player_stats_spark.csv", header=True, mode='overwrite')

In [50]:
# Load the player list again; the first CSV column becomes the index.
# NOTE(review): the earlier `players` cell reads "../data/player_urls.csv" —
# confirm both cells are meant to point at the same file.
player_info = pd.read_csv('../../data/player_urls.csv', index_col=0)
# Trailing expression: displayed as the cell output.
player_info

In [51]:
# Join the per-player averages with the player list on the player's name.
# merge() defaults to an inner join, so a player missing from either frame is
# left out of the result.
merged_df = player_stats.merge(player_info, left_on='player', right_on='name')
# Trailing expression: displayed as the cell output.
merged_df

In [52]:
# Convert the merged table to a Spark DataFrame and write it as CSV with a
# header row, overwriting any existing output at that path.
merged_df_spark = spark.createDataFrame(merged_df)
merged_df_spark.write.csv('../../data/full_player_stats_total.csv', header=True, mode="overwrite")

In [53]:
# merged_df.to_csv('../../data/full_player_stats_total.csv')