In [1]:
# =============================================================================
# 01_data_preparation.ipynb
# 
# Purpose: This notebook takes the raw ATP match data, cleans it,
# engineers relevant features, and saves the final prepared datasets (X and y)
# for the modeling phase. This is the first step in our ML pipeline.
# =============================================================================

In [2]:
import pandas as pd

In [3]:
# Read the raw ATP match records from the CSV file into a DataFrame.
df_matches = pd.read_csv(filepath_or_buffer='atp_tennis.csv')

In [4]:
# Preview the first five rows of the raw data.
df_matches.head(n=5)

Unnamed: 0,Tournament,Date,Series,Court,Surface,Round,Best of,Player_1,Player_2,Winner,Rank_1,Rank_2,Pts_1,Pts_2,Odd_1,Odd_2,Score
0,Australian Hardcourt Championships,2000-01-03,International,Outdoor,Hard,1st Round,3,Dosedel S.,Ljubicic I.,Dosedel S.,63,77,-1,-1,-1.0,-1.0,6-4 6-2
1,Australian Hardcourt Championships,2000-01-03,International,Outdoor,Hard,1st Round,3,Clement A.,Enqvist T.,Enqvist T.,56,5,-1,-1,-1.0,-1.0,3-6 3-6
2,Australian Hardcourt Championships,2000-01-03,International,Outdoor,Hard,1st Round,3,Escude N.,Baccanello P.,Escude N.,40,655,-1,-1,-1.0,-1.0,6-7 7-5 6-3
3,Australian Hardcourt Championships,2000-01-03,International,Outdoor,Hard,1st Round,3,Knippschild J.,Federer R.,Federer R.,87,65,-1,-1,-1.0,-1.0,1-6 4-6
4,Australian Hardcourt Championships,2000-01-03,International,Outdoor,Hard,1st Round,3,Fromberg R.,Woodbridge T.,Fromberg R.,81,198,-1,-1,-1.0,-1.0,7-6 5-7 6-4


In [5]:
# Earlier rows carry placeholder values (ATP points and odds show up as -1),
# so only rows from index label 15653 onward are treated as modern, reliable
# data for modeling.
clean_start_index = 15_653

In [6]:
# Keep only the rows from the clean-era start label onward. `.loc` slices by
# index label (start label included); `.copy()` gives an independent frame so
# columns added later do not touch df_matches.
df_modern = df_matches.loc[clean_start_index:, :].copy()

In [7]:
# Binary label for the model: 1 when the listed winner is Player_1,
# 0 otherwise (i.e. Player_2 won).
df_modern['Target'] = df_modern['Winner'].eq(df_modern['Player_1']).astype(int)

In [8]:
# Spot-check the label against the player and winner columns.
df_modern.loc[:, ['Player_1', 'Player_2', 'Winner', 'Target']].head(10)

Unnamed: 0,Player_1,Player_2,Winner,Target
15653,Ryderstedt M.,Vinciguerra A.,Vinciguerra A.,0
15654,Youzhny M.,Haehnel J.,Youzhny M.,1
15655,Dlouhy L.,Ferrero J.C.,Ferrero J.C.,0
15656,Berdych T.,Kim K.,Berdych T.,1
15657,Acasuso J.,Almagro N.,Almagro N.,0
15658,Garcia-Lopez G.,Lapentti N.,Garcia-Lopez G.,1
15659,Monaco J.,Nadal R.,Nadal R.,0
15660,Zabaleta M.,Sluiter R.,Zabaleta M.,1
15661,Calleri A.,Martin A.,Martin A.,0
15662,Serra F.,Nieminen J.,Serra F.,1


In [9]:
# Difference in ranking position between the two players (Rank_1 - Rank_2).
# Computed from df_modern itself, like Pts_Difference, so the feature does not
# depend on index alignment with the unfiltered df_matches frame and no work
# is spent on rows that were already filtered out.
df_modern['Rank_Difference'] = df_modern['Rank_1'] - df_modern['Rank_2']

In [10]:
# Gap in ATP points between the two players (Pts_1 minus Pts_2).
df_modern['Pts_Difference'] = df_modern['Pts_1'].sub(df_modern['Pts_2'])

In [11]:
# Columns handed to the model, in the order they appear in X.
final_features = [
    'Rank_1', 'Rank_2',
    'Pts_1', 'Pts_2',
    'Rank_Difference', 'Pts_Difference',
    'Surface',
    'Odd_1', 'Odd_2',
]

# Feature matrix X: the selected columns with missing values replaced by 0.
# Note that the fill applies to every selected column, 'Surface' included.
X = df_modern.loc[:, final_features].fillna(value=0)

# Target vector y: the binary label column.
y = df_modern.loc[:, 'Target']

In [12]:
# Persist the feature matrix and the target vector as pickle files.
X.to_pickle('prepared_X.pkl')
y.to_pickle('prepared_y.pkl')

# Report the dimensions of what was written.
print(f"Feature matrix X has {len(X)} rows and {len(X.columns)} columns.")
print(f"Target vector y has {len(y)} rows.")
print("X and y are prepared and saved to the files prepared_X.pkl and prepared_y.pkl")

Feature matrix X has 50231 rows and 9 columns.
Target vector y has 50231 rows.
X and y are prepared and saved to the files prepared_X.pkl and prepared_y.pkl
