## A tennis simulation model based on historic win probabilities of a player at different scores as server/receiver

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import sys, os
import csv
from IPython.display import Image
import random
csv.field_size_limit(sys.maxsize)
pd.set_option('display.max_columns', 500)
pd.set_option('display.max_rows', 500)

#### Read data, thanks to Jeff Sackmann's tennis match charting project @ https://github.com/JeffSackmann/tennis_MatchChartingProject

In [2]:
df = pd.read_csv('charting-m-points.csv', encoding = "ISO-8859-1", low_memory=False)
df.head()

Unnamed: 0,match_id,Pt,Set1,Set2,Gm1,Gm2,Pts,Gm#,TbSet,TB?,TBpt,Svr,Ret,Serving,1st,2nd,Notes,1stNoLet,2ndNoLet,1stSV,2ndSV,1stNoSV,2ndNoSV,1stIn,2ndIn,isRally1st,isRally2nd,Sv1,Sv2,Rally,isAce,isUnret,isRallyWinner,isForced,isUnforced,isDouble,rallyNoSpec,rallyNoError,rallyNoDirection,rallyLen,PtWinner,isSvrWinner,PtsAfter,GmW,Gm1.1,Gm2.1,SetW,Set1.1,Set2.1,RevTB,TBrev,rallyCount
0,20190327-M-Miami_Masters-R16-Daniil_Medvedev-R...,1.0,0.0,0.0,0.0,0.0,0-0,1 (1),1.0,0,,1.0,2.0,DM,6d,6s+3w@,,6d,6s+3w@,0,0.0,6d,6s+3w@,0,1.0,0,1,6d,6.0,s+3w@,False,False,False,False,True,False,s3w,s3,s,1,1.0,1.0,15-0,0.0,0.0,0.0,0.0,0.0,0.0,,,1
1,20190327-M-Miami_Masters-R16-Daniil_Medvedev-R...,2.0,0.0,0.0,0.0,0.0,15-0,1 (2),1.0,0,,1.0,2.0,DM,6*,,,6*,,0,,6*,,1,,0,0,6*,,,True,False,False,False,False,False,,,,0,1.0,1.0,30-0,0.0,0.0,0.0,0.0,0.0,0.0,,,1
2,20190327-M-Miami_Masters-R16-Daniil_Medvedev-R...,3.0,0.0,0.0,0.0,0.0,30-0,1 (3),1.0,0,,1.0,2.0,DM,4#,,,4#,,0,,4#,,1,,0,0,4#,,,False,True,False,False,False,False,,,,0,1.0,1.0,40-0,0.0,0.0,0.0,0.0,0.0,0.0,,,1
3,20190327-M-Miami_Masters-R16-Daniil_Medvedev-R...,4.0,0.0,0.0,0.0,0.0,40-0,1 (4),1.0,0,,1.0,2.0,DM,6w,4f18*,,6w,4f18*,0,0.0,6w,4f18*,0,1.0,0,1,6w,4.0,f18*,False,False,True,False,False,False,f18,f18,f,1,2.0,0.0,40-15,0.0,0.0,0.0,0.0,0.0,0.0,,,2
4,20190327-M-Miami_Masters-R16-Daniil_Medvedev-R...,5.0,0.0,0.0,0.0,0.0,40-15,1 (5),1.0,0,,1.0,2.0,DM,4f28b3b3b1f1w#,,,4f28b3b3b1f1w#,,0,,4f28b3b3b1f1w#,,1,,1,0,4,,f28b3b3b1f1w#,False,False,False,True,False,False,f28b3b3b1f1w,f28b3b3b1f1,fbbbf,5,1.0,1.0,GM,1.0,1.0,0.0,0.0,0.0,0.0,,,5


#### Get all players' average win probabilities at different scores (rows) as server/receiver (columns)
For example, under row [Pts=(0-0)], column [isSvrWinner] means that when a server faces a score of 0-0, his probability of win is 0.6397. Interesting to note that players has higher win probabilities when leading instead of trailing.

In [3]:
df['Pts'].value_counts()
df_transition = df[(df['Gm1']+df['Gm2'] <= 11) & (df['match_id'] != '20180916-M-Davis_Cup_World_Group_SF-RR-Marcel_Granollers-Nicolas_Mahut')]
df_transition['isRcvrWinner'] = 1 - df_transition['isSvrWinner']
avgSvr = df_transition.groupby(['Pts'])[['isSvrWinner']].mean()
avgRcvr = df_transition.groupby(['Pts'])[['isRcvrWinner']].mean()
pd.concat([avgSvr, avgRcvr], axis=1)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  This is separate from the ipykernel package so we can avoid doing imports until


Unnamed: 0_level_0,isSvrWinner,isRcvrWinner
Pts,Unnamed: 1_level_1,Unnamed: 2_level_1
0-0,0.639694,0.360306
0-15,0.62055,0.37945
0-30,0.609068,0.390932
0-40,0.579554,0.420446
15-0,0.649771,0.350229
15-15,0.642503,0.357497
15-30,0.612672,0.387328
15-40,0.613088,0.386912
30-0,0.676607,0.323393
30-15,0.633839,0.366161


#### All points in the dataframe will be converted into the following format for the ease of simulating a game.

In [4]:
ptsMapper = {'0-15': '0-1',
             '15-0': '1-0',
             '30-0': '2-0',
             '15-15': '1-1',
             '0-30': '0-2',
             '40-0': '3-0',
             '30-15': '2-1',
             '15-30': '1-2',
             '0-40': '0-3',
             '40-15': '3-1',
             '30-30': '2-2',
             '15-40': '1-3',
             '40-30': '3-2',
             '30-40': '2-3',
             '40-40': '3-3',
             'AD-40': '4-3',
             '40-AD': '3-4',
            }

In [5]:
def playerEval(df_transition, name, abbv):
    """
    input: historic game data, last name and initial of the player (e.g., for Roger Federer, name=Rederer, abbv=RF)
    output: a dict of historic win probabilities of the player at different scores as server/receiver
    """
    playerSvr = df_transition[(df_transition['Serving'] == abbv) & (df_transition['match_id'].str.contains(name))].groupby(['Pts'])['isSvrWinner'].agg(['mean','count'])
    playerSvr.columns = ['Svr Mean', 'Svr Cnt']
    playerRcvr = df_transition[(df_transition['Serving'] != abbv) & (df_transition['match_id'].str.contains(name))].groupby(['Pts'])['isRcvrWinner'].agg(['mean','count'])
    playerRcvr.columns = ['Rcvr Mean', 'Rcvr Cnt']
    playerValue = pd.concat([playerSvr, playerRcvr], axis=1).reset_index(level=0).replace({"Pts": ptsMapper})
    prob = dict(zip(playerValue['Pts'], zip(playerValue['Svr Mean'], playerValue['Rcvr Mean'])))
    prob['avg'] = (df_transition[(df_transition['Serving'] == abbv) & (df_transition['match_id'].str.contains(name))]['isSvrWinner'].mean(), df_transition[(df_transition['Serving'] != abbv) & (df_transition['match_id'].str.contains(name))]['isRcvrWinner'].mean())
    return prob

#### For example, Roger Federer's and Rafael Nadal's win probabilities dictionaries.
When having a score of 0-15, Federer has a win probability of 0.6860 as a server, and 0.4038 as a receiver. The 'avg', or average probabilities would be used in tie-breaks to simulate games. 

In [6]:
RF = playerEval(df_transition, 'Federer', 'RF')
RF

{'0-0': (0.6875471698113208, 0.3855467999231213),
 '0-1': (0.6859903381642513, 0.4037886340977069),
 '0-2': (0.6961538461538461, 0.39382716049382716),
 '0-3': (0.6772151898734177, 0.40752351097178685),
 '1-0': (0.6831275720164609, 0.38172715894868586),
 '1-1': (0.6883457005674378, 0.3679635761589404),
 '1-2': (0.6635687732342007, 0.39420289855072466),
 '1-3': (0.6780383795309168, 0.3819918144611187),
 '2-0': (0.7036144578313253, 0.37803643724696356),
 '2-1': (0.6622030237580994, 0.37906772207563766),
 '2-2': (0.69451871657754, 0.39010017678255743),
 '2-3': (0.6645161290322581, 0.38565022421524664),
 '3-0': (0.6992009132420092, 0.3661513425549227),
 '3-1': (0.6936893203883495, 0.359828141783029),
 '3-2': (0.6748502994011976, 0.38592375366568915),
 '3-3': (0.6672212978369384, 0.3939745075318656),
 '3-4': (0.615, 0.42549019607843136),
 '4-3': (0.6874480465502909, 0.42065009560229444),
 'avg': (0.6831100738514658, 0.38644672044472644)}

In [7]:
RN = playerEval(df_transition, 'Nadal', 'RN')
RN

{'0-0': (0.636594663278272, 0.3958860759493671),
 '0-1': (0.6704545454545454, 0.39728217426059154),
 '0-2': (0.6047745358090185, 0.4104627766599598),
 '0-3': (0.6510067114093959, 0.46078431372549017),
 '1-0': (0.6676646706586826, 0.39863803038239914),
 '1-1': (0.6615491974877878, 0.41056105610561056),
 '1-2': (0.6437587657784011, 0.4365426695842451),
 '1-3': (0.6125356125356125, 0.41453831041257366),
 '2-0': (0.680119581464873, 0.3632404181184669),
 '2-1': (0.6598837209302325, 0.41755725190839693),
 '2-2': (0.6612729234088457, 0.3973634651600753),
 '2-3': (0.6521739130434783, 0.42083333333333334),
 '3-0': (0.701098901098901, 0.320109439124487),
 '3-1': (0.6627118644067796, 0.36810431293881646),
 '3-2': (0.6597428288822947, 0.37835153922542203),
 '3-3': (0.6094117647058823, 0.39458030403172506),
 '3-4': (0.6405622489959839, 0.4338358458961474),
 '4-3': (0.6563706563706564, 0.4115720524017467),
 'avg': (0.6543887147335423, 0.39794589178356715)}

In [8]:
def normProb(playerA, playerB):
    """
    input: win probabilities dictionaries of player A and player B
    output: a dict of normalized probabilities of player A at different scores as server/receiver when facing player B
    """
    playerA_prob = dict((i,(playerA[i][0]/(playerA[i][0]+playerB[i][1]), playerA[i][1]/(playerB[i][0]+playerA[i][1]))) for i in playerA)
    return playerA_prob

#### To simulate a game between two players, we need to normalize the probabilities according to the following way. 
Continue on with the example of Federer playing Nadal, at a score of 0-15, Federer has a probability of 0.6860 to win as a server, while Nadal has a probability of 0.3973 to win as a receiver according to the dictionaries. The normalized probability would be that Federer has a probability of 0.6860/(0.6860+0.3973) = 0.6333 to win as a server. If Nadal serves, Federer has a probability of 0.4038/(0.4038+0.6704) = 0.3759 to win.

In [9]:
RF_RN_prob = normProb(RF, RN)
RF_RN_prob

{'0-0': (0.6346003987800725, 0.37719514744619725),
 '0-1': (0.6332574031890661, 0.37588196209540486),
 '0-2': (0.6290831276180618, 0.39437862157845893),
 '0-3': (0.5950926935659759, 0.3849899628326675),
 '1-0': (0.6314931538788044, 0.36376036879523793),
 '1-1': (0.6263913624950415, 0.3574152604785505),
 '1-2': (0.6031832298136646, 0.3797856049004595),
 '1-3': (0.6205865325261386, 0.3840937957987298),
 '2-0': (0.6595221840317468, 0.35725963899649926),
 '2-1': (0.6132870774018295, 0.3648560523472856),
 '2-2': (0.6360747782075596, 0.37103876512680367),
 '2-3': (0.6122600619195047, 0.3715949652451625),
 '3-0': (0.6859548827486148, 0.3430791838485525),
 '3-1': (0.6533184025738135, 0.3518963948646094),
 '3-2': (0.6407606544641108, 0.36906960603570765),
 '3-3': (0.6283860343238289, 0.3926449049908021),
 '3-4': (0.5863643985914031, 0.399126889154811),
 '4-3': (0.6255099858554014, 0.39056823634244975),
 'avg': (0.6318914982816768, 0.37128513056296736)}

In [10]:
def simulateGame(playerA_prob, aIsRcvr):
    """
    input: a dict of normalized probabilities of player A, and whether A is the receiver in this game (1 for receiver, 0 for server)
    output: Whether player A wins the game
    optional prints: Scores updated after each simulation of point (needs conversion, e.g., 0-40 is shown as 0-3)
    """
    aScore = 0
    bScore = 0
    while ((aScore <= 3 and bScore <= 3) or 
           ((aScore >= 4 or bScore >= 4) and (abs(aScore - bScore) <=1))):
        if ((aScore >= 4 or bScore >= 4) and (abs(aScore - bScore) <=1)):
            if aScore == bScore:
                scoreNow = '3-3'
            elif aScore > bScore:
                scoreNow = '4-3'
            else:
                scoreNow = '3-4'
        else:
            scoreNow = str(aScore) + '-' + str(bScore)
#         print(scoreNow)
        result = int(random.random() < playerA_prob[scoreNow][aIsRcvr])
        aScore +=  result
        bScore +=  (1 - result)
#     print(str(aScore) + '-' + str(bScore))
    return(aScore > bScore)

In [11]:
def simulateTieBreak(playerA_prob, aIsRcvr):
    """
    input: a dict of normalized probabilities of player A, and whether A is the receiver in this game (1 for receiver, 0 for server)
    output: Whether player A wins the tie-break
    optional prints: Scores updated after each simulation of point
    """
    serverScore = 0
    receiverScore = 0
    aIsRcvrCopy = aIsRcvr
    while ((serverScore <= 6 and receiverScore <= 6) or 
           ((serverScore >= 7 or receiverScore >= 7) and (abs(serverScore - receiverScore) <= 1))):
        if (serverScore + receiverScore) % 2 == 1 and (serverScore + receiverScore) > 0:
            aIsRcvrCopy = 1 - aIsRcvrCopy     # Player alternates to serve after the first, third, fifth... points
#         print(str(serverScore) + '-' + str(receiverScore))
#         print(playerA_prob['avg'][aIsRcvrCopy])
        result = int(random.random() < playerA_prob['avg'][aIsRcvrCopy])
        serverScore +=  result
        receiverScore +=  (1 - result)
#     print(str(serverScore) + '-' + str(receiverScore))
    return(serverScore > receiverScore)

In [17]:
def simulateMatch(playerA_prob):
    """
    input: dict of history data, player A's and player B's last names and initials
    output: Whether player A wins the match
    optional prints: Results of each set and final match
    """
    aIsRcvr = int(random.random() < 0.5)    # random coin toss to decide who serves
    setScoreA = 0
    setScoreB = 0
    while (setScoreA != 3 and setScoreB != 3):
        gameScoreA = 0
        gameScoreB = 0
        while (gameScoreA not in (6,7) and gameScoreB not in (6,7)) \
        or (gameScoreA in (5,6) and gameScoreB in (5,6) and abs(gameScoreA - gameScoreB) <= 1):
            aIsRcvr = 1 - aIsRcvr    # alternate to serve after each game
            if gameScoreA == 6 and gameScoreB == 6:
                gameResult = int(simulateTieBreak(playerA_prob, aIsRcvr))
            else:
                gameResult = int(simulateGame(playerA_prob, aIsRcvr))
            gameScoreA += gameResult
            gameScoreB += 1 - gameResult
        setResult = int(gameScoreA > gameScoreB)
        setScoreA += setResult
        setScoreB += 1 - setResult
#         print("Set {}: ".format(setScoreA + setScoreB) + str(gameScoreA) + "-" + str(gameScoreB))
#     print("Final Result: " + str(setScoreA) + "-" + str(setScoreB))
    return (setScoreA > setScoreB)

In [13]:
simulateMatch(RF_RN_prob)

Set 1: 4-6
Set 2: 6-3
Set 3: 6-7
Set 4: 4-6
Final Result: 1-3


False

#### Simulate 10,000 games between Federer and Agassi, Federer won 5784/10000 games. 

In [18]:
result = {}
RF = playerEval(df_transition, 'Federer', 'RF')
AA = playerEval(df_transition, 'Agassi', 'AA')
RF_AA_prob = normProb(RF, AA)
for i in range(10000):
    temp = simulateMatch(RF_AA_prob);
    if temp in result:
        result[temp] += 1
    else:
        result[temp] = 1
print(result)

{True: 5784, False: 4216}


#### Simulate 10,000 games between Nadal and Roddick, Nadal won 7939/10000 games. 

In [19]:
result = {}
RN = playerEval(df_transition, 'Nadal', 'RN')
AR = playerEval(df_transition, 'Roddick', 'AR')
RN_AR_prob = normProb(RN, AR)
for i in range(10000):
    temp = simulateMatch(RN_AR_prob);
    if temp in result:
        result[temp] += 1
    else:
        result[temp] = 1
print(result)

{True: 7939, False: 2061}
