In [2]:
import pandas as pd
import numpy as np
import os

# Input CSVs: one point-level file and one match-level file per tournament year.
path_2023_pts, path_2023_matches = (
    'data/2023-wimbledon-points.csv',
    'data/2023-wimbledon-matches.csv',
)
path_2024_pts, path_2024_matches = (
    'data/2024-wimbledon-points.csv',
    'data/2024-wimbledon-matches.csv',
)

# function to load and clean a single year
def process_year(points_path, matches_path):
    """Load and clean one tournament year of point-by-point data.

    Parameters
    ----------
    points_path : str or path-like
        CSV file with one row per point.
    matches_path : str or path-like
        CSV file with match-level rows; it is returned exactly as read.

    Returns
    -------
    tuple of (pandas.DataFrame, pandas.DataFrame)
        ``(df_pts, df_matches)``. ``df_pts`` contains only those columns of
        ``cols_to_keep`` that exist in the file (in ``cols_to_keep`` order),
        only rows whose ``PointWinner`` is 1 or 2, and an integer
        ``PointNumber``. The row index from the CSV is preserved.

    Raises
    ------
    KeyError
        If the points file has no ``PointWinner`` or ``PointNumber`` column.
    ValueError
        If a remaining ``PointNumber`` value cannot be converted to ``int``.
    """
    # pointwinner and pointserver are usually 1 or 2
    cols_to_keep = [
        'match_id', 'SetNo', 'GameNo', 'PointNumber',
        'PointWinner', 'PointServer', 'P1Score', 'P2Score',
        'GameWinner', 'SetWinner'
    ]
    wanted = set(cols_to_keep)

    # Parse only the needed columns so the unused ones never occupy memory
    # (dropping them after a full read would not lower the peak footprint).
    df_pts = pd.read_csv(points_path, usecols=lambda name: name in wanted)
    df_matches = pd.read_csv(matches_path)

    # read_csv keeps the file's column order; put the columns back into the
    # order of cols_to_keep, skipping any the file does not provide.
    existing_cols = [c for c in cols_to_keep if c in df_pts.columns]
    df_pts = df_pts[existing_cols]

    # Drop rows where PointWinner is not 1 or 2 (e.g. 0, which appears to mark
    # non-point rows). The explicit .copy() makes the result an independent
    # frame, so the column assignment below does not trigger pandas'
    # SettingWithCopyWarning.
    df_pts = df_pts[df_pts['PointWinner'].isin([1, 2])].copy()

    # PointNumber can be read as text when the file contains values such as
    # '0X'; force it to integer now that only real points remain.
    df_pts['PointNumber'] = df_pts['PointNumber'].astype(int)

    return df_pts, df_matches

# Load and clean each tournament year separately.
pts_23, matches_23 = process_year(path_2023_pts, path_2023_matches)
pts_24, matches_24 = process_year(path_2024_pts, path_2024_matches)

# Stack both years into one point-level table and one match-level table,
# renumbering the rows from zero.
all_points = pd.concat([pts_23, pts_24], ignore_index=True)
all_matches = pd.concat([matches_23, matches_24], ignore_index=True)

# Target variable: 1 when the server won the point, 0 otherwise.
# PointServer and PointWinner both hold the player number (1 or 2).
server_took_point = all_points['PointServer'].eq(all_points['PointWinner'])
all_points['server_won'] = server_took_point.astype(int)

# A match counts as "epic" when its highest set number is 5.
match_durations = all_points.groupby('match_id')['SetNo'].max()
went_the_distance = match_durations.eq(5)
epic_match_ids = match_durations.loc[went_the_distance].index

print(f"found {len(epic_match_ids)} matches that went to 5 sets")

# Keep only the points of those matches, ordered chronologically per match.
in_epic_match = all_points['match_id'].isin(epic_match_ids)
epic_points = all_points.loc[in_epic_match].copy()
epic_points = epic_points.sort_values(by=['match_id', 'PointNumber'])

# Persist the result; the 'server_won' column is the binary vector the model needs.
output_filename = 'data/clean_wimbledon_5setters.csv'
epic_points.to_csv(output_filename, index=False)

print(f"saved clean data to {output_filename}")
print(epic_points.head())

found 58 matches that went to 5 sets
saved clean data to data/clean_wimbledon_5setters.csv
                match_id  SetNo  GameNo  PointNumber  PointWinner  \
415  2023-wimbledon-1103      1       1            1            2   
416  2023-wimbledon-1103      1       1            2            2   
417  2023-wimbledon-1103      1       1            3            2   
418  2023-wimbledon-1103      1       1            4            2   
419  2023-wimbledon-1103      1       2            5            1   

     PointServer P1Score P2Score  GameWinner  SetWinner  server_won  
415            2       0      15           0          0           1  
416            2       0      30           0          0           1  
417            2       0      40           0          0           1  
418            2       0       0           2          0           1  
419            1      15       0           0          0           1  


In [3]:
# Show the match id, point number and target for the first ten points.
preview_cols = ['match_id', 'PointNumber', 'server_won']
print(epic_points.loc[:, preview_cols].head(10))

                match_id  PointNumber  server_won
415  2023-wimbledon-1103            1           1
416  2023-wimbledon-1103            2           1
417  2023-wimbledon-1103            3           1
418  2023-wimbledon-1103            4           1
419  2023-wimbledon-1103            5           1
420  2023-wimbledon-1103            6           1
421  2023-wimbledon-1103            7           1
422  2023-wimbledon-1103            8           0
423  2023-wimbledon-1103            9           1
424  2023-wimbledon-1103           10           1
