# Pre-process our tennis data set so we can feed in the data to ML and DL model

* 

In [3]:
import pandas as pd
import datetime
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns


%matplotlib inline

DATASET_DIR = '../datasets'

In [4]:
# read in our ATP data set and parse tournament dates
matches_orig = pd.read_csv(f'{DATASET_DIR}/atpdata/ATP.csv', parse_dates=["tourney_date"])

# we will get tournament info from another dataset
tourney_orig= pd.read_csv(f'{DATASET_DIR}/a-large-tennis-dataset-for-atp-and-itf-betting/all_tournaments.csv')

# Data Cleaning

In [5]:
# make all column names lower case so it's easier to remember
matches = matches_orig
matches.columns = [col.strip().lower() for col in matches.columns]

# these columns don't have much data from our EDA so we can't impute. Let's drop them
# for rank points - these are used to determine a player's ranking in the ATP at the moment of the tournament so it's duplicate of rank - will drop this as well
drop_columns = ["draw_size","loser_entry", "winner_entry", "loser_seed", "winner_seed"]
# drop_columns = ["draw_size","loser_entry", "winner_entry", "loser_seed", "winner_seed", "loser_rank_points", "winner_rank_points"]
matches = matches.drop(drop_columns, axis=1)

# we only care about professional tournaments since we are prediction grand slams so let's filter out non-professional tournaments
matches = matches[~matches.tourney_level.isin(["C", "S", "D"])]

# ATP was formed in 1972. Federer turned Pro in 1998, we will only look at matches since January 1998
matches = matches[matches.tourney_date > datetime.datetime(1997, 12, 31)].copy()

# tourney_id is actually a little bit mis-leading - it has the format of {year}-{tourney_id}
# let's split this
matches = matches.rename({"tourney_id": "year_tourney_id"}, axis=1)
matches["tourney_year"] = matches.year_tourney_id.apply(lambda x: x.split("-")[0])
matches["tourney_id"] = matches.year_tourney_id.apply(lambda x: x.split("-")[1])


Let's standarize data that is non-numeric by stripping out leading/trailing spaces and converting to lowercase

In [6]:
# keys for the data are player names and tournament names - these are strings
# we also have come categorical columns - ie loser_hand, winner_hand, surface, tourney_level, 
# let's convert any non-numerical column data into lower case and strip

# first let's print one of the columns
print("Before lowering...")
print(matches[:5].loser_hand)

lower_columns = [col for col, dt in matches.dtypes.items() if dt == np.object]
for col in lower_columns:
    matches[col] = matches[col].str.strip()
    matches[col] = matches[col].str.lower()

# check to make sure we've done this correctly
print("After lowering...")
matches[:5].loser_hand

Before lowering...
103342    R
103343    R
103344    R
103345    R
103346    R
Name: loser_hand, dtype: object
After lowering...


103342    r
103343    r
103344    r
103345    r
103346    r
Name: loser_hand, dtype: object

In [7]:
matches.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 59764 entries, 103342 to 169689
Data columns (total 46 columns):
best_of               59764 non-null int64
l_1stin               59164 non-null float64
l_1stwon              59164 non-null float64
l_2ndwon              59164 non-null float64
l_svgms               59164 non-null float64
l_ace                 59164 non-null float64
l_bpfaced             59164 non-null float64
l_bpsaved             59164 non-null float64
l_df                  59164 non-null float64
l_svpt                59164 non-null float64
loser_age             59764 non-null float64
loser_hand            59764 non-null object
loser_ht              55821 non-null float64
loser_id              59764 non-null int64
loser_ioc             59764 non-null object
loser_name            59764 non-null object
loser_rank            59608 non-null float64
loser_rank_points     59608 non-null float64
match_num             59764 non-null int64
minutes               57858 non-null fl

# Impute Missing Data

We are missing some matches's minutes (ie, length of the match). We can impute this - since matches length might depend on the tournament (ie, surface) and whether the tournament is best of 3 or 5 - we will use this inforamation to impute by using the mean of match minutes for that tournament

NOTE: height & weight let's do this later

In [8]:
## Some matches are misssing minutes

In [9]:
# we are missing some matches's minutes (ie, length of the match)
# we can impute this - since matches length might depend on the tournament (ie, surface) 
# and whether the tournament is best of 3 or 5 - we will use this inforamation 
# to impute by using the mean of match minutes for that tournament
tids = {id for id in matches[matches.minutes.isnull()].tourney_id}
for tid in tids:
    matches.loc[(matches.minutes.isnull()) & (matches.tourney_id == tid), "minutes"] = \
             matches[(matches.minutes.notnull()) & (matches.tourney_id == tid)].minutes.mean()

In [10]:
matches.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 59764 entries, 103342 to 169689
Data columns (total 46 columns):
best_of               59764 non-null int64
l_1stin               59164 non-null float64
l_1stwon              59164 non-null float64
l_2ndwon              59164 non-null float64
l_svgms               59164 non-null float64
l_ace                 59164 non-null float64
l_bpfaced             59164 non-null float64
l_bpsaved             59164 non-null float64
l_df                  59164 non-null float64
l_svpt                59164 non-null float64
loser_age             59764 non-null float64
loser_hand            59764 non-null object
loser_ht              55821 non-null float64
loser_id              59764 non-null int64
loser_ioc             59764 non-null object
loser_name            59764 non-null object
loser_rank            59608 non-null float64
loser_rank_points     59608 non-null float64
match_num             59764 non-null int64
minutes               59742 non-null fl

In [11]:
matches[matches.minutes.isnull()][["year_tourney_id", "minutes"]]

Unnamed: 0,year_tourney_id,minutes
106373,1998-604,
106374,1998-604,
106375,1998-604,
106376,1998-604,
106377,1998-604,
106378,1998-604,
106379,1998-604,
106380,1998-604,
106381,1998-604,
106382,1998-604,


Looks like there is still a subset of tournaments where we do not have any results. This is ok - minutes will be used as part of feature engineering. We will just have to skip over these records when we create features later.

Looks like '98 and '99 Grand Slam Cup did not record match minutes. This is a year end tournament: 
https://www.grandslamhistory.com/atp/grand-slam-cup-munich

In [12]:
matches[matches.year_tourney_id.isin(matches[matches.minutes.isnull()].year_tourney_id.tolist())]["tourney_name"].unique()

array(['grand slam cup'], dtype=object)

## Impute Height

We are missing some values for player height. This, we can impute by using the average height

In [13]:
# impute height with mean of players
matches.loc[matches.loser_ht.isnull(), 'loser_ht'] = matches.loser_ht.mean()
matches.loc[matches.winner_ht.isnull(), 'winner_ht'] = matches.winner_ht.mean()

In [14]:
matches.sample(5).T

Unnamed: 0,130829,112604,164510,125028,146679
best_of,3,3,3,3,3
l_1stin,47,56,73,75,36
l_1stwon,20,40,49,52,15
l_2ndwon,12,21,24,12,10
l_svgms,7,16,17,16,9
l_ace,1,10,15,2,0
l_bpfaced,17,9,9,4,13
l_bpsaved,13,5,6,1,5
l_df,0,3,4,2,1
l_svpt,66,90,117,101,65


In [15]:
# Lastly, let's drop any rows where we don't have scores for the matches
matches = matches.dropna(axis=0, subset=["score", "minutes"])

In [16]:
matches.columns[matches.isnull().any()].tolist()

['l_1stin',
 'l_1stwon',
 'l_2ndwon',
 'l_svgms',
 'l_ace',
 'l_bpfaced',
 'l_bpsaved',
 'l_df',
 'l_svpt',
 'loser_rank',
 'loser_rank_points',
 'w_1stin',
 'w_1stwon',
 'w_2ndwon',
 'w_svgms',
 'w_ace',
 'w_bpfaced',
 'w_bpsaved',
 'w_df',
 'w_svpt',
 'winner_rank',
 'winner_rank_points']

## Missing Rank

When loser doesn't have a rank, loser doesn't have points either

This is the same with winners

In [37]:
# losers without rank but have points
print(len(matches[(matches.loser_rank.isnull()) & (matches.loser_rank_points.notnull())]))
# winners without rank but have points
print(len(matches[(matches.winner_rank.isnull()) & (matches.winner_rank_points.notnull())]))

0
0


When you look closer, I don't recognize all players, but Tim Henman was an English player that was at the top of his game in 1998 and was in the top 10 according to his wikipedia page: https://en.wikipedia.org/wiki/Tim_Henman

So I think we should somehow impute this

In [35]:
matches[(matches.winner_rank.isnull())][["tourney_date", "year_tourney_id", "round", "tourney_name", "winner_name"]].head(10)

0


Unnamed: 0,tourney_date,year_tourney_id,round,tourney_name,winner_name
105214,1998-06-22,1998-540,r128,wimbledon,mikael tillstrom
105593,1998-07-20,1998-418,r64,washington,jimmy arias
105693,1998-07-27,1998-423,r32,los angeles,tim henman
105701,1998-07-27,1998-423,r16,los angeles,tim henman
105705,1998-07-27,1998-423,qf,los angeles,tim henman
105707,1998-07-27,1998-423,sf,los angeles,tim henman
105922,1998-08-17,1998-419,r64,indianapolis,wayne black
105942,1998-08-17,1998-419,r32,indianapolis,wayne black
106034,1998-08-24,1998-80,r32,boston,jiri novak
106420,1998-09-28,1998-327,r32,toulouse,lionel barthez


I think a reasonable way to to impute the data for that tournament. We will take the mean of the player rank in that round and impute into our missing ranks

In [39]:
def impute_missing_rank(matches: pd.DataFrame, missing_list: pd.DataFrame, name_col: str) -> pd.DataFrame:
    for index, player in missing_list.iteritems():
        print(matches[(matches.year_tourney_id == player.year_tourney_id) & 
                (matches.round == player.round)])

In [41]:
loser_missing_ranks = matches[matches.loser_rank.isnull()][["year_tourney_id", "round", "loser_name"]]
impute_missing_rank(matches, loser_missing_ranks, "loser_name")    

AttributeError: 'Series' object has no attribute 'year_tourney_id'

In [None]:
winner_missing_ranks = matches[matches.winner_rank.isnull()][["year_tourney_id", "round", "winner_name"]]

In [19]:
matches_orig[matches_orig.loser_rank.isnull()].sample(10).T

Unnamed: 0,46731,428,7651,9915,44479,41825,11078,11195,11208,51373
best_of,5,3,5,5,3,3,3,3,5,5
draw_size,,64,,,,,,,,
l_1stin,,,,,,,,,,
l_1stwon,,,,,,,,,,
l_2ndwon,,,,,,,,,,
l_svgms,,,,,,,,,,
l_ace,,,,,,,,,,
l_bpfaced,,,,,,,,,,
l_bpsaved,,,,,,,,,,
l_df,,,,,,,,,,
