# ATP Tennis - Advanced Feature Engineering

In the previous notebook we created features based on data that was already provided for each match.

In this notebook, we will add to this dataset by creating features that represent player matchups and player history, as I think this should help our model.

## Player History

We will create features that represent a player's matches leading up to the match we are predicting. Not all of these columns will be used in our model - most likely I will only use the percentages - but the data is there and we can filter columns out later.

We are currently only looking back at the 5 matches leading up to the match we are trying to predict.

### Features Created
* wins - matches won
* losses - matches lost
* games won
* sets won
* games lost
* sets lost
* winning percentage = wins / (wins + losses)
* games won percentage = games won / (games won + games lost)
* sets won percentage = sets won / (sets won + sets lost)

## Player Matchup
We will create features that represent match-up history - i.e., whether p1 defeated p2, the score, etc. Currently, we are only looking back up to 5 matchups.

### Features Created
* matchups won - ie, out of last 5 matchups - how many matches did p1 win
* matchups lost
* matchups sets won
* matchups sets lost
* matchups games won
* matchups games lost
* matchups win percentage
* matchups sets won percentage
* matchups games won percentage



In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import LabelEncoder
import json
from datetime import datetime, date
from sys import path
from os.path import dirname as dir
import re
from pprint import pprint
import traceback
import sys

# `path` is sys.path and `dir` is os.path.dirname (see the imports above);
# dirname('../') evaluates to '..', so the parent directory is added to the
# module search path.
path.append(dir('../'))

# The util.score_util imports below are disabled in this notebook.
# from util import score_util

# from util.score_util import process_scores
# import importlib
# importlib.reload(util.score_util)

import logging

# IPython magic (not plain Python): show matplotlib figures inline in the notebook.
%matplotlib inline
# Apply seaborn's default plotting theme.
sns.set()

In [13]:
# date
# DATE is today's date rendered with DATE_FORMAT; it is embedded in MODEL_FILE below.
DATE_FORMAT = '%Y-%m-%d'
DATE = datetime.now().strftime(DATE_FORMAT)

# Constants
# First and last year, used to build the pre-processed file name.
START_YEAR = 1985
END_YEAR = 2019

# files
DATASET_DIR = '../datasets'
# this is the file we generated from our pre-processing notebook
PREPROCESSED_FILE = f'{DATASET_DIR}/atp_matches_{START_YEAR}-{END_YEAR}_preprocessed.csv'

# Feature files for the 1998-2019 and 1985-2019 ranges, in plain, "diff" and
# "diff-ohe" variants (the years are hard-coded in these names, not derived
# from START_YEAR / END_YEAR).
FEATURE_FILE_1998 = f'{DATASET_DIR}/atp_matches_1998-2019_features.csv'
FEATURE_FILE_1985 = f'{DATASET_DIR}/atp_matches_1985-2019_features.csv'
FEATURE_FILE_1998_DIFF = f'{DATASET_DIR}/atp_matches_1998-2019_features-diff.csv'
FEATURE_FILE_1985_DIFF = f'{DATASET_DIR}/atp_matches_1985-2019_features-diff.csv'
FEATURE_FILE_1998_DIFF_OHE = f'{DATASET_DIR}/atp_matches_1998-2019_features-diff-ohe.csv'
FEATURE_FILE_1985_DIFF_OHE = f'{DATASET_DIR}/atp_matches_1985-2019_features-diff-ohe.csv'

# output info
MODEL_DIR = '../models'
# NOTE(review): MODEL_FILE is assigned twice; the second assignment overwrites
# the first, so only the "-lr-diff" path is in effect.
MODEL_FILE = f'{MODEL_DIR}/{DATE}-lr.csv'
MODEL_FILE = f'{MODEL_DIR}/{DATE}-lr-diff.csv'

REPORT_DIR = '../reports'
# NOTE(review): the name looks like a typo of REPORT_FILE; left unchanged
# because other cells or notebooks may reference it.
REFPORT_FILE = f'{REPORT_DIR}/report.csv'

# Presumably the name of the target column used when modelling - TODO confirm.
LABEL_COL = 'p1_winner'

# random seed
RSTATE = 1
# NOTE(review): N_JOBS and MAX_ITER are not used in the cells visible here;
# presumably model-training settings - confirm before removing.
N_JOBS = 4
MAX_ITER = 1000
# How many of the most recent matches / match-ups are looked back at when
# building the history features below.
LEADING_MATCHES = 5



In [5]:
# Match rows from the pre-processing notebook; tourney_date is parsed into
# datetimes so it can be compared against a cutoff date.
preprocessed = pd.read_csv(PREPROCESSED_FILE, parse_dates=["tourney_date"])
# Existing 1985-2019 feature rows; history and match-up features are computed
# for each of these rows below.
feature1985 = pd.read_csv(FEATURE_FILE_1985)

In [7]:



logging.basicConfig(level=logging.INFO)
log = logging.getLogger(__name__)

# A parenthesised tiebreak annotation, e.g. the "(7)" in "7-6(7)".
_TIEBREAK_RE = re.compile(r"\([^)]*\)")
# Anything that is not a digit, a dash or a space (e.g. "RET", "W/O").
_NON_SCORE_RE = re.compile(r"[^0-9\- ]")


def process_scores(scores: str, idx: str = None):
    """
    Parse a match score string into set and game totals.

    Used for feature engineering as input to predict match results.

    Tiebreak annotations in parentheses are dropped before parsing, so
    "7-6(7)" counts as 7 games to 6 (not 7 games to 67). Any remaining
    non-score text (e.g. "RET") is removed as well. A set whose two game
    counts are equal is credited to the loser's set tally.

    :param scores: string with scores written from the winner's point of
        view - ie, 6-7 7-6(7) 6-3
    :param idx: optional row index of the score; accepted for debugging
        (some scores appear to have been converted to dates, ie, 3-jun)
        but not otherwise used

    :return: tuple of
        winner_sets_won
        winner_games_won
        loser_sets_won
        loser_games_won
    :raises ValueError: if the score cannot be parsed (not a string, or a
        set that is not of the form "<int>-<int>")
    """
    scores_orig = scores
    set_score = [0, 0]
    game_score = [0, 0]
    try:
        # Drop tiebreak annotations first; otherwise their digits would be
        # glued onto the loser's game count once the parentheses are stripped.
        scores = _TIEBREAK_RE.sub("", scores)
        scores = _NON_SCORE_RE.sub("", scores).strip()
        log.debug('scores %s cleaned scores %s', scores_orig, scores)
        for set_text in scores.split():
            games = set_text.split("-")
            winner_games = int(games[0])
            loser_games = int(games[1])
            game_score[0] += winner_games
            game_score[1] += loser_games
            if winner_games > loser_games:
                set_score[0] += 1
            else:
                set_score[1] += 1
    except (TypeError, ValueError, IndexError) as e:
        raise ValueError(f'orig score {scores_orig} scores {scores}') from e

    return set_score[0], game_score[0], set_score[1], game_score[1]

def get_player_matches(matches: pd.DataFrame, player_id: str, tourney_date: datetime, num_matches: int):
    """
    Return the last ``num_matches`` scored matches a player took part in
    before ``tourney_date``.

    "Last" means the final rows of ``matches`` after filtering, so the result
    is the most recent matches only if ``matches`` is in chronological order
    (assumed here - TODO confirm against the pre-processing notebook).

    :param matches: match rows with winner_id, loser_id, score and
        tourney_date columns
    :param player_id: id matched against winner_id and loser_id
    :param tourney_date: exclusive cutoff; only matches strictly before it
        are kept
    :param num_matches: maximum number of rows to return; 0 returns no rows

    :return: DataFrame with at most ``num_matches`` rows
    """
    involves_player = (matches.winner_id == player_id) | (matches.loser_id == player_id)
    has_score = matches.score.notnull()
    before_cutoff = matches.tourney_date < tourney_date
    # tail() rather than [-num_matches:]: a [-0:] slice would return every row.
    return matches[involves_player & has_score & before_cutoff].tail(num_matches)

def get_match_ups(matches: pd.DataFrame, p1_id: str, p2_id: str, tourney_date: datetime, num_matches: int):
    """
    Return the last ``num_matches`` scored matches between two players before
    ``tourney_date``.

    Matches count regardless of which of the two players won. "Last" means
    the final rows of ``matches`` after filtering, so the result is the most
    recent match-ups only if ``matches`` is in chronological order (assumed
    here - TODO confirm against the pre-processing notebook).

    :param matches: match rows with winner_id, loser_id, score and
        tourney_date columns
    :param p1_id: id of the first player
    :param p2_id: id of the second player
    :param tourney_date: exclusive cutoff; only matches strictly before it
        are kept
    :param num_matches: maximum number of rows to return; 0 returns no rows

    :return: DataFrame with at most ``num_matches`` rows
    """
    p1_beat_p2 = (matches.winner_id == p1_id) & (matches.loser_id == p2_id)
    p2_beat_p1 = (matches.winner_id == p2_id) & (matches.loser_id == p1_id)
    before_cutoff = matches.tourney_date < tourney_date
    has_score = matches.score.notnull()
    # tail() rather than [-num_matches:]: a [-0:] slice would return every row.
    matchups = matches[(p1_beat_p2 | p2_beat_p1) & before_cutoff & has_score].tail(num_matches)
    return matchups

def _ratio(won, lost):
    """Return won / (won + lost), or 0 when both counts are zero."""
    total = won + lost
    return won / total if total else 0


def _player_history(player_id, cutoff):
    """
    Tally one player's last LEADING_MATCHES matches before ``cutoff``.

    :return: wins, losses, sets_won, games_won, sets_lost, games_lost
    """
    wins = losses = sets_won = games_won = sets_lost = games_lost = 0
    recent = get_player_matches(preprocessed, player_id, cutoff, LEADING_MATCHES)
    for match_idx, match in recent.iterrows():
        log.debug('index %s player %s', match_idx, player_id)
        # process_scores reports totals from the match winner's point of view.
        ws, wg, ls, lg = process_scores(match.score, match_idx)
        if match.winner_id == player_id:
            wins += 1
            sets_won += ws
            games_won += wg
            sets_lost += ls
            games_lost += lg
        else:
            losses += 1
            sets_won += ls
            games_won += lg
            sets_lost += ws
            games_lost += wg
    return wins, losses, sets_won, games_won, sets_lost, games_lost


def _matchup_history(p1_id, p2_id, cutoff):
    """
    Tally the last LEADING_MATCHES meetings of p1 and p2 before ``cutoff``.

    :return: p1_wins, p1_sets_won, p1_games_won, p2_wins, p2_sets_won, p2_games_won
    """
    p1_wins = p1_sets = p1_games = 0
    p2_wins = p2_sets = p2_games = 0
    meetings = get_match_ups(preprocessed, p1_id, p2_id, cutoff, LEADING_MATCHES)
    for match_idx, match in meetings.iterrows():
        if match.winner_id == p1_id:
            log.debug('p1 matchup winner')
            ws, wg, ls, lg = process_scores(match.score, match_idx)
            p1_wins += 1
            p1_sets += ws
            p1_games += wg
            p2_sets += ls
            p2_games += lg
        elif match.winner_id == p2_id:
            log.debug('p2 matchup winner')
            ws, wg, ls, lg = process_scores(match.score, match_idx)
            p2_wins += 1
            p2_sets += ws
            p2_games += wg
            p1_sets += ls
            p1_games += lg
    return p1_wins, p1_sets, p1_games, p2_wins, p2_sets, p2_games


# Collect one dict per feature row and build the DataFrame once at the end:
# DataFrame.append in a loop copies the whole frame on every iteration and is
# no longer available in pandas 2.x.
score_rows = []
score_index = []
for index, row in feature1985.iterrows():
    try:
        # History is cut off at the first day of the tournament's month.
        cutoff = datetime(int(row.tourney_year), int(row.tourney_month), 1)
        log.debug('index %s tourney_year %s tourney_month %s', index, row.tourney_year, row.tourney_month)

        # get player record leading up to this match
        (p1_wins, p1_losses, p1_sets_won, p1_games_won,
         p1_sets_lost, p1_games_lost) = _player_history(row.p1, cutoff)
        (p2_wins, p2_losses, p2_sets_won, p2_games_won,
         p2_sets_lost, p2_games_lost) = _player_history(row.p2, cutoff)

        # get match-up info
        (p1_matchup_wins, p1_matchup_sets_won, p1_matchup_games_won,
         p2_matchup_wins, p2_matchup_sets_won, p2_matchup_games_won) = _matchup_history(row.p1, row.p2, cutoff)
        # Every match-up one player won is a match-up the other lost.
        p1_matchup_losses = p2_matchup_wins
        p2_matchup_losses = p1_matchup_wins

        score_rows.append({
            "p1_wins": p1_wins,
            "p1_losses": p1_losses,
            "p1_sets_won": p1_sets_won,
            "p1_games_won": p1_games_won,
            "p1_sets_lost": p1_sets_lost,
            "p1_games_lost": p1_games_lost,
            "p1_win_percentage": _ratio(p1_wins, p1_losses),
            "p1_games_won_percentage": _ratio(p1_games_won, p1_games_lost),
            "p1_sets_won_percentage": _ratio(p1_sets_won, p1_sets_lost),
            "p2_wins": p2_wins,
            "p2_losses": p2_losses,
            "p2_sets_won": p2_sets_won,
            "p2_games_won": p2_games_won,
            "p2_sets_lost": p2_sets_lost,
            "p2_games_lost": p2_games_lost,
            "p2_win_percentage": _ratio(p2_wins, p2_losses),
            "p2_games_won_percentage": _ratio(p2_games_won, p2_games_lost),
            "p2_sets_won_percentage": _ratio(p2_sets_won, p2_sets_lost),
            "p1_matchup_wins": p1_matchup_wins,
            "p1_matchup_losses": p1_matchup_losses,
            "p1_matchup_sets": p1_matchup_sets_won,
            "p1_matchup_games": p1_matchup_games_won,
            "p2_matchup_wins": p2_matchup_wins,
            "p2_matchup_losses": p2_matchup_losses,
            "p2_matchup_sets": p2_matchup_sets_won,
            "p2_matchup_games": p2_matchup_games_won,
            "p1_matchup_win_percentage": _ratio(p1_matchup_wins, p2_matchup_wins),
            "p1_matchup_sets_won_percentage": _ratio(p1_matchup_sets_won, p2_matchup_sets_won),
            "p1_matchup_games_won_percentage": _ratio(p1_matchup_games_won, p2_matchup_games_won),
        })
        # Remember which feature row this belongs to, so a skipped row leaves a
        # gap instead of shifting every later row up by one.
        score_index.append(index)
    except Exception:
        # Best effort: report the failing feature row and carry on with the rest.
        traceback.print_exc(limit=2, file=sys.stdout)
        print(f'index {index} row {row}')

# Alphabetical columns and float64 values keep the layout of the frame that
# the row-by-row append used to produce.
scores = pd.DataFrame(score_rows, index=score_index).sort_index(axis=1).astype('float64')

scores.head()

            
        
    

Unnamed: 0,p1_games_lost,p1_games_won,p1_games_won_percentage,p1_losses,p1_matchup_games,p1_matchup_games_won_percentage,p1_matchup_losses,p1_matchup_sets,p1_matchup_sets_won_percentage,p1_matchup_win_percentage,...,p2_losses,p2_matchup_games,p2_matchup_losses,p2_matchup_sets,p2_matchup_wins,p2_sets_lost,p2_sets_won,p2_sets_won_percentage,p2_win_percentage,p2_wins
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [8]:
scores[scores.p1_matchup_win_percentage != 0]

Unnamed: 0,p1_games_lost,p1_games_won,p1_games_won_percentage,p1_losses,p1_matchup_games,p1_matchup_games_won_percentage,p1_matchup_losses,p1_matchup_sets,p1_matchup_sets_won_percentage,p1_matchup_win_percentage,...,p2_losses,p2_matchup_games,p2_matchup_losses,p2_matchup_sets,p2_matchup_wins,p2_sets_lost,p2_sets_won,p2_sets_won_percentage,p2_win_percentage,p2_wins
137,53.0,59.0,0.526786,2.0,16.0,0.470588,1.0,2.0,0.500000,0.50,...,2.0,18.0,1.0,2.0,1.0,5.0,6.0,0.545455,0.6,3.0
282,29.0,53.0,0.646341,0.0,12.0,0.750000,0.0,2.0,1.000000,1.00,...,1.0,4.0,1.0,0.0,0.0,2.0,2.0,0.500000,0.5,1.0
371,43.0,50.0,0.537634,2.0,13.0,0.590909,0.0,2.0,1.000000,1.00,...,2.0,9.0,1.0,0.0,0.0,5.0,7.0,0.583333,0.6,3.0
399,31.0,67.0,0.683673,0.0,14.0,0.518519,0.0,2.0,0.666667,1.00,...,2.0,13.0,1.0,1.0,0.0,5.0,7.0,0.583333,0.6,3.0
414,56.0,86.0,0.605634,0.0,18.0,0.750000,0.0,3.0,1.000000,1.00,...,0.0,6.0,1.0,0.0,0.0,0.0,11.0,1.000000,1.0,5.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
99932,59.0,51.0,0.463636,4.0,18.0,0.514286,0.0,2.0,0.666667,1.00,...,3.0,17.0,1.0,1.0,0.0,9.0,5.0,0.357143,0.4,2.0
99937,65.0,62.0,0.488189,2.0,37.0,0.506849,1.0,4.0,0.500000,0.50,...,3.0,36.0,1.0,4.0,1.0,6.0,4.0,0.400000,0.4,2.0
99942,58.0,61.0,0.512605,2.0,84.0,0.494118,2.0,9.0,0.529412,0.60,...,2.0,86.0,3.0,8.0,2.0,6.0,6.0,0.500000,0.6,3.0
99946,61.0,48.0,0.440367,3.0,39.0,0.453488,3.0,3.0,0.333333,0.25,...,3.0,47.0,1.0,6.0,3.0,6.0,4.0,0.400000,0.4,2.0


In [9]:
scores.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 99955 entries, 0 to 99954
Data columns (total 29 columns):
p1_games_lost                      99955 non-null float64
p1_games_won                       99955 non-null float64
p1_games_won_percentage            99955 non-null float64
p1_losses                          99955 non-null float64
p1_matchup_games                   99955 non-null float64
p1_matchup_games_won_percentage    99955 non-null float64
p1_matchup_losses                  99955 non-null float64
p1_matchup_sets                    99955 non-null float64
p1_matchup_sets_won_percentage     99955 non-null float64
p1_matchup_win_percentage          99955 non-null float64
p1_matchup_wins                    99955 non-null float64
p1_sets_lost                       99955 non-null float64
p1_sets_won                        99955 non-null float64
p1_sets_won_percentage             99955 non-null float64
p1_win_percentage                  99955 non-null float64
p1_wins              

In [10]:
scores.describe().T

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
p1_games_lost,99955.0,63.19996,16.990096,0.0,55.0,64.0,74.0,159.0
p1_games_won,99955.0,60.258226,18.515497,0.0,51.0,60.0,70.0,151.0
p1_games_won_percentage,99955.0,0.475929,0.089823,0.0,0.444444,0.484375,0.522124,0.857143
p1_losses,99955.0,2.717843,1.199481,0.0,2.0,3.0,4.0,5.0
p1_matchup_games,99955.0,12.706448,19.573532,0.0,0.0,0.0,19.0,120.0
p1_matchup_games_won_percentage,99955.0,0.22349,0.261237,0.0,0.0,0.0,0.482759,1.0
p1_matchup_losses,99955.0,0.508189,0.953981,0.0,0.0,0.0,1.0,5.0
p1_matchup_sets,99955.0,1.304677,2.247482,0.0,0.0,0.0,2.0,14.0
p1_matchup_sets_won_percentage,99955.0,0.223511,0.334571,0.0,0.0,0.0,0.428571,1.0
p1_matchup_win_percentage,99955.0,0.223731,0.374127,0.0,0.0,0.0,0.4,1.0


# Save off our new feature files

In [17]:

def concat_dfs(filename: str, scores_df: pd.DataFrame,
               start_year: int = 1985, split_year: int = 1998) -> pd.DataFrame:
    """
    Append the score features to a feature file and save two new files.

    Reads ``filename``, concatenates ``scores_df`` column-wise (rows are
    aligned on the index) and writes the result next to the input with
    "-matchup5" added before the extension. A second file restricted to
    ``tourney_year >= split_year`` is written under the same name with
    ``start_year`` replaced by ``split_year``.

    :param filename: path of the feature CSV to extend; it must contain a
        tourney_year column
    :param scores_df: score features to add as extra columns
    :param start_year: year text in the file name to be replaced for the
        restricted file
    :param split_year: first tourney_year kept in the restricted file, and
        the replacement text in its name

    :return: the full (unrestricted) concatenated DataFrame
    """
    import os  # local import: only names from os.path are imported at the top of the notebook

    # read in data file
    print(f'Reading {filename}')
    left = pd.read_csv(filename)

    root, ext = os.path.splitext(filename)
    newfile = f'{root}-matchup5{ext}'

    # concat with scores
    combined = pd.concat([left, scores_df], axis=1)
    print(f'Saving {newfile}')
    combined.to_csv(newfile, index=False)

    # now create the year-restricted version; only the file name is rewritten,
    # never a directory that happens to contain the same digits
    basename = os.path.basename(newfile)
    subset_name = basename.replace(str(start_year), str(split_year))
    if subset_name == basename:
        # Writing now would overwrite the full file with the restricted rows.
        print(f'Skipping {split_year} file: {start_year} not found in {basename}')
        return combined
    subset_file = newfile[:len(newfile) - len(basename)] + subset_name
    subset = combined[combined.tourney_year >= split_year]
    print(f'Saving {subset_file}')
    subset.to_csv(subset_file, index=False)

    return combined

# Extend each of the three 1985-2019 feature files with the score features;
# concat_dfs writes a "-matchup5" file for 1985-2019 and another for 1998-2019.
files = [FEATURE_FILE_1985, FEATURE_FILE_1985_DIFF, FEATURE_FILE_1985_DIFF_OHE]
for file in files:
    concat_dfs(file, scores)



Saving ../datasets/atp_matches_1985-2019_features-matchup5.csv
Saving ../datasets/atp_matches_1998-2019_features-matchup5.csv
Saving ../datasets/atp_matches_1985-2019_features-diff-matchup5.csv
Saving ../datasets/atp_matches_1998-2019_features-diff-matchup5.csv
Saving ../datasets/atp_matches_1985-2019_features-diff-ohe-matchup5.csv
Saving ../datasets/atp_matches_1998-2019_features-diff-ohe-matchup5.csv
