# Pre-process our tennis data set so we can feed in the data to ML and DL model

Match data source: https://github.com/JeffSackmann/tennis_atp

In [1]:
import pandas as pd
import datetime
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns


%matplotlib inline



# Reading Dataset

In [2]:
DATASET_DIR = '../datasets'
years = np.arange(1998, 2020)

matches_orig = pd.DataFrame()
for year in years:
    matches_orig = matches_orig.append(pd.read_csv(f'{DATASET_DIR}/tennis_atp-master/atp_matches_{year}.csv', parse_dates=["tourney_date"]), ignore_index=True)
    
print(len(matches_orig))
# read in our ATP data set and parse tournament dates
# matches_orig = pd.read_csv(f'{DATASET_DIR}/tennis_atp-master/ATP.csv', parse_dates=["tourney_date"])
matches_orig.sample(10).T

66348


Unnamed: 0,31634,48581,4291,2501,25309,42780,57829,18663,50448,60395
tourney_id,2007-540,2013-D004,1999-403,1998-422,2005-D059,2011-495,2016-M007,2003-540,2013-422,2017-0506
tourney_name,Wimbledon,Davis Cup WG R1: USA vs BRA,Miami Masters,Cincinnati Masters,Davis Cup G2 R2: SLO vs LAT,Dubai,Miami Masters,Wimbledon,Cincinnati Masters,Buenos Aires
surface,Grass,Hard,Hard,Hard,Clay,Hard,Hard,Grass,Hard,Clay
draw_size,,,,,,,,,,
tourney_level,G,D,M,M,D,A,M,G,M,A
tourney_date,2007-06-25 00:00:00,2013-02-01 00:00:00,1999-03-15 00:00:00,1998-08-10 00:00:00,2005-07-15 00:00:00,2011-02-21 00:00:00,2016-03-21 00:00:00,2003-06-23 00:00:00,2013-08-11 00:00:00,2017-02-13 00:00:00
match_num,72,5,54,18,5,10,215,113,54,273
winner_id,103602,105023,102456,102154,104582,104332,104719,102035,104745,105589
winner_seed,5,,19,,,,,,4,
winner_entry,,,,Q,,,Q,,,


# Data Cleaning

In [3]:
# make all column names lower case so it's easier to remember
matches = matches_orig
matches.columns = [col.strip().lower() for col in matches.columns]

# these columns don't have much data from our EDA so we can't impute. Let's drop them
# for rank points - these are used to determine a player's ranking in the ATP at the moment of the tournament so it's duplicate of rank - will drop this as well
drop_columns = ["draw_size","loser_entry", "winner_entry", "loser_seed", "winner_seed", "loser_rank_points", "winner_rank_points"]
matches = matches.drop(drop_columns, axis=1)

# we only care about professional tournaments since we are prediction grand slams so let's filter out non-professional tournaments
matches = matches[~matches.tourney_level.isin(["C", "S", "D"])]

# ATP was formed in 1972. Federer turned Pro in 1998, we will only look at matches since January 1998
matches = matches[matches.tourney_date > datetime.datetime(1997, 12, 31)].copy()

# tourney_id is actually a little bit mis-leading - it has the format of {year}-{tourney_id} or {year} {tourney_id}
# let's split this
print(matches[matches.tourney_id == "1999 495"])

matches = matches.rename({"tourney_id": "year_tourney_id"}, axis=1)
matches["year_tourney_id"] = matches["year_tourney_id"].apply(lambda x: x.replace(" ", "-"))
matches["tourney_year"] = matches.year_tourney_id.apply(lambda x: x.split("-")[0])
matches["tourney_id"] = matches.year_tourney_id.apply(lambda x: x.split("-")[1])

print(matches[matches.year_tourney_id == "1999 495"])




Empty DataFrame
Columns: [tourney_id, tourney_name, surface, tourney_level, tourney_date, match_num, winner_id, winner_name, winner_hand, winner_ht, winner_ioc, winner_age, loser_id, loser_name, loser_hand, loser_ht, loser_ioc, loser_age, score, best_of, round, minutes, w_ace, w_df, w_svpt, w_1stin, w_1stwon, w_2ndwon, w_svgms, w_bpsaved, w_bpfaced, l_ace, l_df, l_svpt, l_1stin, l_1stwon, l_2ndwon, l_svgms, l_bpsaved, l_bpfaced, winner_rank, loser_rank]
Index: []

[0 rows x 42 columns]
Empty DataFrame
Columns: [year_tourney_id, tourney_name, surface, tourney_level, tourney_date, match_num, winner_id, winner_name, winner_hand, winner_ht, winner_ioc, winner_age, loser_id, loser_name, loser_hand, loser_ht, loser_ioc, loser_age, score, best_of, round, minutes, w_ace, w_df, w_svpt, w_1stin, w_1stwon, w_2ndwon, w_svgms, w_bpsaved, w_bpfaced, l_ace, l_df, l_svpt, l_1stin, l_1stwon, l_2ndwon, l_svgms, l_bpsaved, l_bpfaced, winner_rank, loser_rank, tourney_year, tourney_id]
Index: []

[0 rows x

### Clean String Columns 

Let's standarize data that is non-numeric by stripping out leading/trailing spaces and converting to lowercase

We will also remove any special characters

In [4]:
# keys for the data are player names and tournament names - these are strings
# we also have come categorical columns - ie loser_hand, winner_hand, surface, tourney_level, 
# let's convert any non-numerical column data into lower case and strip
# we will also remove any special characters and accents

import unicodedata
import re

# first let's print one of the columns
print("Before lowering...")
print(matches[:5].loser_hand)

lower_columns = [col for col, dt in matches.dtypes.items() if dt == np.object]
for col in lower_columns:
    print(f'cleaning col {col}')
    matches[col] = matches[col].str.strip()
    matches[col] = matches[col].str.lower()
    matches[col] = matches[col].apply(lambda x: unicodedata.normalize('NFKD', str(x)).encode('ascii', 'ignore').decode('utf-8', 'ignore'))
    matches[col] = matches[col].apply(lambda x: re.sub('[^a-zA-Z0-9\s]', ' ', x, flags=re.I | re.A))


# check to make sure we've done this correctly
print("After lowering...")
matches[:5].loser_hand

Before lowering...
0    R
1    R
2    R
3    R
4    R
Name: loser_hand, dtype: object
cleaning col year_tourney_id
cleaning col tourney_name
cleaning col surface
cleaning col tourney_level
cleaning col winner_name
cleaning col winner_hand
cleaning col winner_ioc
cleaning col loser_name
cleaning col loser_hand
cleaning col loser_ioc
cleaning col score
cleaning col round
cleaning col tourney_year
cleaning col tourney_id
After lowering...


0    r
1    r
2    r
3    r
4    r
Name: loser_hand, dtype: object

In [5]:
matches.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 59764 entries, 0 to 66347
Data columns (total 44 columns):
year_tourney_id    59764 non-null object
tourney_name       59764 non-null object
surface            59764 non-null object
tourney_level      59764 non-null object
tourney_date       59764 non-null datetime64[ns]
match_num          59764 non-null int64
winner_id          59764 non-null int64
winner_name        59764 non-null object
winner_hand        59764 non-null object
winner_ht          57500 non-null float64
winner_ioc         59764 non-null object
winner_age         59764 non-null float64
loser_id           59764 non-null int64
loser_name         59764 non-null object
loser_hand         59764 non-null object
loser_ht           55821 non-null float64
loser_ioc          59764 non-null object
loser_age          59764 non-null float64
score              59764 non-null object
best_of            59764 non-null int64
round              59764 non-null object
minutes            578

# Impute Missing Data

We are missing some matches's minutes (ie, length of the match). We can impute this - since matches length might depend on the tournament (ie, surface) and whether the tournament is best of 3 or 5 - we will use this inforamation to impute by using the mean of match minutes for that tournament

NOTE: height & weight let's do this later

In [6]:
## Some matches are misssing minutes

In [7]:
# we are missing some matches's minutes (ie, length of the match)
# we can impute this - since matches length might depend on the tournament (ie, surface) 
# and whether the tournament is best of 3 or 5 - we will use this inforamation 
# to impute by using the mean of match minutes for that tournament
tids = {id for id in matches[matches.minutes.isnull()].tourney_id}
for tid in tids:
    matches.loc[(matches.minutes.isnull()) & (matches.tourney_id == tid), "minutes"] = \
             matches[(matches.minutes.notnull()) & (matches.tourney_id == tid)].minutes.mean()

In [8]:
matches.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 59764 entries, 0 to 66347
Data columns (total 44 columns):
year_tourney_id    59764 non-null object
tourney_name       59764 non-null object
surface            59764 non-null object
tourney_level      59764 non-null object
tourney_date       59764 non-null datetime64[ns]
match_num          59764 non-null int64
winner_id          59764 non-null int64
winner_name        59764 non-null object
winner_hand        59764 non-null object
winner_ht          57500 non-null float64
winner_ioc         59764 non-null object
winner_age         59764 non-null float64
loser_id           59764 non-null int64
loser_name         59764 non-null object
loser_hand         59764 non-null object
loser_ht           55821 non-null float64
loser_ioc          59764 non-null object
loser_age          59764 non-null float64
score              59764 non-null object
best_of            59764 non-null int64
round              59764 non-null object
minutes            597

In [9]:
matches[matches.minutes.isnull()][["year_tourney_id", "minutes"]]

Unnamed: 0,year_tourney_id,minutes
3031,1998 604,
3032,1998 604,
3033,1998 604,
3034,1998 604,
3035,1998 604,
3036,1998 604,
3037,1998 604,
3038,1998 604,
3039,1998 604,
3040,1998 604,


Looks like there is still a subset of tournaments where we do not have any results. This is ok - minutes will be used as part of feature engineering. We will just have to skip over these records when we create features later.

Looks like '98 and '99 Grand Slam Cup did not record match minutes. This is a year end tournament: 
https://www.grandslamhistory.com/atp/grand-slam-cup-munich

In [10]:
matches[matches.year_tourney_id.isin(matches[matches.minutes.isnull()].year_tourney_id.tolist())]["tourney_name"].unique()

array(['grand slam cup'], dtype=object)

## Impute Height

We are missing some values for player height. This, we can impute by using the average height

In [11]:
# impute height with mean of players
matches.loc[matches.loser_ht.isnull(), 'loser_ht'] = matches.loser_ht.mean()
matches.loc[matches.winner_ht.isnull(), 'winner_ht'] = matches.winner_ht.mean()

In [12]:
matches.sample(5).T

Unnamed: 0,21371,13326,34514,53341,54051
year_tourney_id,2004 414,2001 375,2008 520,2014 422,2015 339
tourney_name,hamburg masters,lyon,roland garros,cincinnati masters,brisbane
surface,clay,carpet,clay,hard,hard
tourney_level,m,a,g,m,a
tourney_date,2004-05-10 00:00:00,2001-10-08 00:00:00,2008-05-25 00:00:00,2014-08-10 00:00:00,2015-01-04 00:00:00
match_num,30,6,69,12,14
winner_id,103151,103174,104527,105668,105902
winner_name,mariano zabaleta,noam okun,stanislas wawrinka,jerzy janowicz,james duckworth
winner_hand,r,r,r,r,r
winner_ht,183,185,183,203,183


In [13]:
# Lastly, let's drop any rows where we don't have scores for the matches
matches = matches.dropna(axis=0, subset=["score", "minutes"])

### Let's see what else we are missing data for

In [14]:
def print_columns_with_missing_data(m: pd.DataFrame):
    print(m.columns[m.isnull().any()].tolist())
    
    
print_columns_with_missing_data(matches)

['w_ace', 'w_df', 'w_svpt', 'w_1stin', 'w_1stwon', 'w_2ndwon', 'w_svgms', 'w_bpsaved', 'w_bpfaced', 'l_ace', 'l_df', 'l_svpt', 'l_1stin', 'l_1stwon', 'l_2ndwon', 'l_svgms', 'l_bpsaved', 'l_bpfaced', 'winner_rank', 'loser_rank']


## Missing Rank

At first I thought missing rank might mean that the player

When you look closer, I don't recognize all players, but Tim Henman was an English player that was at the top of his game in 1998 and was in the top 10 according to his wikipedia page: https://en.wikipedia.org/wiki/Tim_Henman

So I think we should somehow impute this

In [15]:
matches[(matches.winner_rank.isnull())][["tourney_date", "year_tourney_id", "round", "tourney_name", "winner_name"]].head(10)

Unnamed: 0,tourney_date,year_tourney_id,round,tourney_name,winner_name
1872,1998-06-22,1998 540,r128,wimbledon,mikael tillstrom
2251,1998-07-20,1998 418,r64,washington,jimmy arias
2351,1998-07-27,1998 423,r32,los angeles,tim henman
2359,1998-07-27,1998 423,r16,los angeles,tim henman
2363,1998-07-27,1998 423,qf,los angeles,tim henman
2365,1998-07-27,1998 423,sf,los angeles,tim henman
2580,1998-08-17,1998 419,r64,indianapolis,wayne black
2600,1998-08-17,1998 419,r32,indianapolis,wayne black
2692,1998-08-24,1998 80,r32,boston,jiri novak
3078,1998-09-28,1998 327,r32,toulouse,lionel barthez


I think a reasonable way to to impute the data for that tournament. We will take the mean of the player rank in that round and impute into our missing ranks

In [16]:
def impute_missing_rank(matches: pd.DataFrame, missing_list: pd.DataFrame, name_col: str) -> pd.DataFrame:
    for index, row in missing_list.iterrows():
        current_round_matches = matches[(matches.year_tourney_id == row.year_tourney_id) & 
                                        (matches["round"] == row["round"]) & (matches[name_col].notnull())]
        if len(current_round_matches) == 0:
            print(f'Unable to find other matches in this round index: {index} year_tourney_id: {str(row["year_tourney_id"])} round: {row["round"]} column: {name_col}')
        else:
#             print(f'len: {len(current_round_matches)} current mean: {int(current_round_matches[name_col].mean())}')
            matches.loc[index, name_col] = int(current_round_matches[name_col].mean())

In [17]:
losers_missing_rank = matches[matches.loser_rank.isnull()][["year_tourney_id", "round", "loser_name"]]
print(f'Missing loser rank before imputing: {len(losers_missing_rank)}')
print(losers_missing_rank.head(5))
impute_missing_rank(matches, losers_missing_rank, "loser_rank")    

losers_missing_rank = matches[matches.loser_rank.isnull()][["year_tourney_id", "round", "loser_name"]]

print(f'Missing loser rank after imputing: {len(losers_missing_rank)}')

Missing loser rank before imputing: 156
     year_tourney_id round        loser_name
1014        1998 336   r32    wing luen wong
1017        1998 336   r32      peter nyborg
1935        1998 540   r64  mikael tillstrom
2195        1998 321   r64  nicolas lapentti
2269        1998 418   r32       jimmy arias
Unable to find other matches in this round index: 2366 year_tourney_id: 1998 423 round: f column: loser_rank
Unable to find other matches in this round index: 3905 year_tourney_id: 1999 495 round: f column: loser_rank
Missing loser rank after imputing: 2


In [18]:
winner_missing_ranks = matches[matches.winner_rank.isnull()][["year_tourney_id", "round", "winner_name"]]
print(f'Missing winner rank before imputing: {len(winner_missing_ranks)}')

impute_missing_rank(matches, winner_missing_ranks, "winner_rank")    

winner_missing_ranks = matches[matches.winner_rank.isnull()][["year_tourney_id", "round", "winner_name"]]
print(f'Missing winner rank after imputing: {len(winner_missing_ranks)}')

Missing winner rank before imputing: 47
Missing winner rank after imputing: 0


### We were not able to imput everything but only have 2 left is no big deal - we will just drop this columns

In [19]:
print_columns_with_missing_data(matches)

['w_ace', 'w_df', 'w_svpt', 'w_1stin', 'w_1stwon', 'w_2ndwon', 'w_svgms', 'w_bpsaved', 'w_bpfaced', 'l_ace', 'l_df', 'l_svpt', 'l_1stin', 'l_1stwon', 'l_2ndwon', 'l_svgms', 'l_bpsaved', 'l_bpfaced', 'loser_rank']


# Done

OK. Looks like we have cleaned up and inputed as much data as we can as of this point

For this round, we will not be using any of the match statistics so we will not drop rows with empty data for these columns.

However, we will drop the two rows for loser_rank and then save to file so we can move on to feature engineering

In [20]:
matches = matches.dropna(axis=0, subset=["loser_rank"])
matches.to_csv(f'{DATASET_DIR}/atp_matches_preprocessed.csv', index=False)