# Functions

In [30]:
def compute_match_rates_and_sample_weights(match):
    a = max(0.001,min(match["p1 serves won"] / match["p1 serves total"],0.999))
    b = max(0.001,min(match["p2 serves won"] / match["p2 serves total"],0.999))
    c = max(0.001,min(match["p1 service games won"] / match["p1 service games total"],0.999))
    d = max(0.001,min(match["p2 service games won"] / match["p2 service games total"],0.999))
    e = max(0.001,min(match["p1 sets won"] / match["sets total"],0.999))
    f = float("nan")
    if match["p1 tiebreaks won"] != 0 or match["p2 tiebreaks won"] != 0:
        f = max(0.001,min(match["p1 tiebreaks won"] / (match["p1 tiebreaks won"] + match["p2 tiebreaks won"]),0.999))
        
    return a,b,c,d,e,f

def compute_sample_weights(match):
    a = match["p1 match serve win rate"] * (1-match["p1 match serve win rate"]) / match["p1 serves total"]
    b = match["p2 match serve win rate"] * (1-match["p2 match serve win rate"]) / match["p2 serves total"]
    c = match["p1 match service game win rate"] * (1-match["p1 match service game win rate"]) / match["p1 service games total"]
    d = match["p2 match service game win rate"] * (1-match["p2 match service game win rate"]) / match["p2 service games total"]
    e = match["p1 match set win rate"] * (1-match["p1 match set win rate"]) / match["sets total"]
    f = float("nan")
    if match["p1 tiebreaks won"] != 0 or match["p2 tiebreaks won"] != 0:
        f = match["p1 tiebreak win rate"] * (1-match["p1 tiebreak win rate"]) / (match["p1 tiebreaks won"] + match["p2 tiebreaks won"])
        
    return 1/a,1/b,1/c,1/d,1/e,1/f

In [31]:
def sigmoid(theta, x):
    return 1 / (1 + math.e**(-theta.dot(x)))

from sklearn.linear_model import LinearRegression


class LogitRegression(LinearRegression):
    def fit(self, x, p, sample_weight):
        # special care for 0 or 1 probabilities
        for i in range(len(p)):
            p[i] = max(0.001,min(p[i],0.999))
        p = np.asarray(p)
        y = np.log(p / (1 - p))
        return super().fit(x, y,sample_weight)

    def predict(self, x):
        y = super().predict(x)
        return 1 / (np.exp(-y) + 1)
#i = 0
def compute_economist_rates(match, matches):
    #global i
    # if either player has played less than two matches in the past year, return nan
    if match["p1 match count"] < 2 or match["p2 match count"] < 2:
        return float("nan"),float("nan")
    
    date = match["date"]
    one_year_prior = date - relativedelta(years=1)
    relevant_matches = matches[(matches["date"] < date) & (matches["date"] >= one_year_prior)]
    
    # find players in past year
    players = set()
    for index, match_ in relevant_matches.iterrows():
        players.add(match_["server1"])
        players.add(match_["server2"])
    players = list(players)
    players_dict = {}
    for i in range(len(players)):
        players_dict[players[i]] = i
    
    
    # construct features for p1 serves to p2
    features = []
    for index, match_ in relevant_matches.iterrows():
        # first are the serve abilities, next the return abilities
        feature = [0 for i in range(2*len(players))]
        feature[players_dict[match_["server1"]]] = 1
        feature[len(players) + players_dict[match_["server2"]]] = 1
        features.append(feature)
        
    # construct features for p2 serves to p1
    for index, match_ in relevant_matches.iterrows():
        # first are the serve abilities, next the return abilities
        feature = [0 for i in range(2*len(players))]
        feature[players_dict[match_["server2"]]] = 1
        feature[len(players) + players_dict[match_["server1"]]] = 1
        features.append(feature)
    
    # logistic regression
    features = np.array(features)
    w1 = np.array(relevant_matches["p1 match inv serve var"])
    w2 = np.array(relevant_matches["p2 match inv serve var"])
    weights = np.concatenate([w1,w2])
    outcome_col = np.append(relevant_matches["p1 match serve win rate"].to_numpy(), relevant_matches["p2 match serve win rate"].to_numpy())
    reg = LogitRegression()
    reg.fit(features,outcome_col,weights)
    
    # compute estimated serve win rate for match from p1 to p2
    p1_to_p2 = [0 for i in range(2*len(players))]
    p1_to_p2[players_dict[match["server1"]]] = 1
    p1_to_p2[len(players) + players_dict[match["server2"]]] = 1
    p1_to_p2 = np.array([p1_to_p2])
    a = reg.predict(p1_to_p2)
    
    # compute estimated serve win rate for match from p2 to p1
    p2_to_p1 = [0 for i in range(2*len(players))]
    p2_to_p1[players_dict[match["server2"]]] = 1
    p2_to_p1[len(players) + players_dict[match["server1"]]] = 1
    p2_to_p1 = np.array([p2_to_p1])
    b = reg.predict(p2_to_p1)
    #print(i)
    #i += 1
    return a[0],b[0]
    
    
    
    
    

In [32]:
count = [0,0,0,0]
def compute_economist_sg_rg_rates(match, matches):
    global count
    # if either player has played less than two matches in the past year, return nan
    if match["p1 match count"] < 2 or match["p2 match count"] < 2:
        return float("nan"),float("nan")
    
    date = match["date"]
    one_year_prior = date - relativedelta(years=1)
    relevant_matches = matches[(matches["date"] < date) & (matches["date"] >= one_year_prior)]
    
    # find players in past year
    players = set()
    for index, match_ in relevant_matches.iterrows():
        players.add(match_["server1"])
        players.add(match_["server2"])
    players = list(players)
    players_dict = {}
    for i in range(len(players)):
        players_dict[players[i]] = i
    
    
    # construct features for p1 serves to p2
    features = []
    for index, match_ in relevant_matches.iterrows():
        # first are the serve abilities, next the return abilities
        feature = [0 for i in range(2*len(players))]
        feature[players_dict[match_["server1"]]] = 1
        feature[len(players) + players_dict[match_["server2"]]] = 1
        features.append(feature)
        
    # construct features for p2 serves to p1
    for index, match_ in relevant_matches.iterrows():
        # first are the serve abilities, next the return abilities
        feature = [0 for i in range(2*len(players))]
        feature[players_dict[match_["server2"]]] = 1
        feature[len(players) + players_dict[match_["server1"]]] = 1
        features.append(feature)
    
    # logistic regression
    features = np.array(features)
    w1 = np.array(relevant_matches["p1 match inv service game var"])
    w2 = np.array(relevant_matches["p2 match inv service game var"])
    weights = np.concatenate([w1,w2])
    outcome_col = np.append(relevant_matches["p1 match service game win rate"].to_numpy(), relevant_matches["p2 match service game win rate"].to_numpy())
    reg = LogitRegression()
    reg.fit(features,outcome_col,weights)
    
    # compute estimated serve win rate for match from p1 to p2
    p1_to_p2 = [0 for i in range(2*len(players))]
    p1_to_p2[players_dict[match["server1"]]] = 1
    p1_to_p2[len(players) + players_dict[match["server2"]]] = 1
    p1_to_p2 = np.array([p1_to_p2])
    a = reg.predict(p1_to_p2)
    
    # compute estimated serve win rate for match from p2 to p1
    p2_to_p1 = [0 for i in range(2*len(players))]
    p2_to_p1[players_dict[match["server2"]]] = 1
    p2_to_p1[len(players) + players_dict[match["server1"]]] = 1
    p2_to_p1 = np.array([p2_to_p1])
    b = reg.predict(p2_to_p1)
    
    if match["pbp_id"] == 8634739:
        print(reg.coef_[players_dict[match["server1"]]])
        print(reg.coef_[players_dict[match["server2"]]])
        print(reg.coef_[len(players) + players_dict[match["server1"]]])
        print(reg.coef_[len(players) + players_dict[match["server2"]]])
        
    if reg.coef_[players_dict[match["server1"]]] > 0:
        count[0] += 1
    else:
        count[1] += 1
        
    if reg.coef_[players_dict[match["server2"]]] > 0:
        count[0] += 1
    else:
        count[1] += 1
        
    if reg.coef_[len(players) + players_dict[match["server1"]]] > 0:
        count[2] += 1
    else:
        count[3] += 1
        
    if reg.coef_[len(players) + players_dict[match["server1"]]] > 0:
        count[2] += 1
    else:
        count[3] += 1
    
    

    return a[0],b[0]
    
    
    
    
    

# Code

In [33]:
import pandas as pd
import numpy as np
from datetime import datetime
from dateutil.relativedelta import relativedelta
from scipy.stats import beta
from multiprocessing import Pool, freeze_support
import math
from sklearn import linear_model
pd.set_option("display.max_columns", None)

In [34]:
matches = pd.read_csv("./data/2011-2015_matches_cleaned_forShrinkage.csv")
matches["date"] = matches.apply(lambda x: datetime.strptime(x["date"], '%Y-%m-%d'), axis=1)
matches

Unnamed: 0,pbp_id,date,tny_name,tour,draw,server1,server2,winner,pbp,score,adf_flag,wh_minutes,p1 serves won,p1 serves total,p1 service games won,p1 service games total,p1 sets won,p1 tiebreaks won,p2 serves won,p2 serves total,p2 service games won,p2 service games total,p2 sets won,p2 tiebreaks won,sets total,tourney serves won,tourney serves total,tourney service games won,tourney service games total,p1 serve win rate,p1 return win rate,p1 service game win rate,p1 return game win rate,p1 set win rate,p1 tiebreak win rate,p2 serve win rate,p2 return win rate,p2 service game win rate,p2 return game win rate,p2 set win rate,p2 tiebreak win rate,p1 opp serve win rate,p1 opp return win rate,p1 opp service game win rate,p1 opp return game win rate,p1 opp set win rate,p1 opp tiebreak win rate,p2 opp serve win rate,p2 opp return win rate,p2 opp service game win rate,p2 opp return game win rate,p2 opp set win rate,p2 opp tiebreak win rate,p1 match count,p2 match count,all player year serve win rate,all player year service game win rate,p1 year serve total,p1 year return total,p1 year service game total,p1 year return game total,p1 year set total,p1 year tiebreak total,p2 year serve total,p2 year return total,p2 year service game total,p2 year return game total,p2 year set total,p2 year tiebreak total,p1 year match win rate,p2 year match rate,p1 year match results,p2 year match results,p1 opp match win rate,p2 opp match win rate,p1 relative serve,p2 relative serve,p1 relative return,p2 relative return,p1 relative service games,p2 relative service games,p1 relative return games,p2 relative return games,p1 relative set,p2 relative set,p1 relative tiebreak,p2 relative tiebreak,tourney serve win rate,tourney return win rate,tourney service game win rate,tourney return game win rate,p1 tourney serve adv,p1 tourney return adv,p2 tourney serve adv,p2 tourney return adv,p1 tourney service game adv,p1 tourney return game adv,p2 tourney service game adv,p2 tourney return game adv,p1 tourney set adv,p2 tourney set adv,p1 tourney tiebreak adv,p2 tourney tiebreak adv,tourney sets to win,p1 serve_tt,p1 service_game_tt,p1 set_tt,p1 tiebreak_tt,p2 serve_tt,p2 service_game_tt,p1 serve_tt no sos adjust,p2 serve_tt no sos adjust,p1 service_game_tt no sos adjust,p2 service_game_tt no sos adjust
0,2231275,2011-07-28,ATPStu,ATP,Main,Olivier Rochus,Fabio Fognini,2,SSSS;RRRR;SSRRSS;SSRRSS;RSRSRSRR;SSRSS;RSRRSR;...,6-4 6-1,0,66,27,58,3,9,0,0,32,56,6,8,2,0,2,,,,,,,,,,,,,,,,,,,,,,,,,,,,,0,0,,,0,0,2,2,2,2,0,0,2,2,2,2,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,2,,,,,,,,,,
1,2231276,2011-07-28,ATPStu,ATP,Main,Robin Haase,Marin Cilic,2,SSRSS;RRSSRSSS;SSSS;RSSSS;SRSRSS;RSRSRSSS;RSRS...,4-6 6-4 6-3,0,141,60,102,11,15,1,0,60,92,12,14,2,0,3,,,,,,,,,,,,,,,,,,,,,,,,,,,,,0,0,,,0,0,2,2,2,2,0,0,2,2,2,2,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,2,,,,,,,,,,
2,2231342,2011-07-28,Farmer,ATP,Main,Alejandro Falla,Thomaz Bellucci,2,SRRSSRSRRSRSRSRSRSRR;SSSRS;RRSRR;SRSSS;RRSSRR;...,6-0 6-1,0,42,21,55,1,7,0,0,26,37,6,6,2,0,2,,,,,,,,,,,,,,,,,,,,,,,,,,,,,0,0,,,0,0,2,2,2,2,0,0,2,2,2,2,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,2,,,,,,,,,,
3,2229262,2011-07-28,Credit,ATP,Main,Matthias Bachinger,Julien Benneteau,2,SSSS;SSSS;RRRR;SSSS;SSSS;SRRRSSRSSRRSSRSS;SSSS...,6-4 6-4,0,57,36,59,7,10,0,0,43,58,9,10,2,0,2,,,,,,,,,,,,,,,,,,,,,,,,,,,,,0,0,,,0,0,2,2,2,2,0,0,2,2,2,2,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,2,,,,,,,,,,
4,2228887,2011-07-28,Credit,ATP,Main,Stan Wawrinka,Peter Luczak,1,SSSS;RRSSSS;SRSSRS;RRSSSS;SSSRS;SSSRRS;RSSSS;R...,6-3 7-5,0,82,43,58,10,11,2,0,34,61,7,10,0,0,2,,,,,,,,,,,,,,,,,,,,,,,,,,,,,0,0,,,0,0,2,2,2,2,0,0,2,2,2,2,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,2,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
10782,8628374,2015-12-18,Men'sAustralianOpenWildCardPlayoff,ATP,Main,James Duckworth,Marinko Matosevic,1,SSSS;RSRSRSSS;SSSS;RSSRSS;SRSRRSRR;SRRRSSRR;SS...,4-6 6-3 7-6(7) 7-6(8),0,199,103,154,18,22,3,2,93,146,17,21,1,0,4,564.0,879.0,112.0,147.0,0.635690,0.335260,0.792208,0.169935,0.477612,0.500000,0.575717,0.348559,0.685345,0.188034,0.301887,0.142857,0.625203,0.364778,0.758255,0.178435,0.414046,0.454246,0.629896,0.374191,0.766704,0.191251,0.436469,0.379723,21,19,0.641567,0.797852,2023,1903,308,306,67,20,1499,1423,232,234,53,7,0.210526,0.575717,WWLWLWLWLWWLLWLWLLLLW,LLWLLWLWLLLLLLLLLLW,0.40433,0.450411,0.000467,-0.050092,-0.039537,-0.021544,-0.029357,-0.123404,-0.071810,-0.045262,-0.108342,-0.261644,-0.045754,-0.477420,0.641638,0.358362,0.761905,0.238095,0.642106,0.318825,0.591546,0.336817,0.732548,0.166285,0.638501,0.192833,0.391658,0.238356,0.454246,0.022580,3,0.779374,0.919775,0.672908,0.972994,0.755754,0.898535,0.765322,0.729030,0.942736,0.914083
10783,8625517,2015-12-18,Men'sS,ATP,Main,Jesse Huta Galung,Matwe Middelkoop,2,SSRSS;RSSSS;RRRSSR;RRRR;RRSRR;SSRRSRSS;SSRRSS;...,6-4 6-3,0,70,31,54,6,10,0,0,41,65,8,9,2,0,2,461.0,771.0,87.0,124.0,0.621951,0.313187,0.724138,0.100000,0.250000,,,,,,,,0.700937,0.346911,0.829079,0.221165,0.564946,,,,,,,,2,0,0.641567,0.797852,164,182,29,30,8,2,0,0,2,2,2,2,,,LL,,,,-0.031138,,0.014124,,-0.054697,,-0.070921,,-0.185054,,,,0.597925,0.402075,0.701613,0.298387,0.566787,0.416199,,,0.646916,0.227466,,,0.314946,,,,2,,,,,,,,,,
10784,8625520,2015-12-18,Men'sS,ATP,Main,Tallon Griekspoor,Robin Haase,2,SSSRS;SSSRRRSS;RSRSSRRR;RSSSS;SRSRRR;SSSRS;SRS...,6-2 6-3,0,63,38,71,5,9,0,0,35,47,8,8,2,0,2,461.0,771.0,87.0,124.0,0.666667,0.520000,0.800000,0.600000,0.750000,,0.622655,0.342847,0.771739,0.186296,0.444444,0.500000,0.480000,0.340000,0.320000,0.160000,0.125000,,0.640685,0.360862,0.791987,0.194391,0.488360,0.409366,1,34,0.641567,0.797852,51,50,10,10,4,2,2878,2838,460,467,99,16,0.411765,0.622655,W,LLLLLLWWLWLLWWLLWWWLWLLWWLLLWLWWLL,,0.484743,0.006667,-0.016483,0.000000,-0.016467,-0.040000,-0.033870,-0.080000,-0.021718,-0.125000,-0.067195,,-0.090634,0.597925,0.402075,0.701613,0.298387,0.604591,0.402075,0.581441,0.385608,0.661613,0.218387,0.667743,0.276669,0.375000,0.432805,,0.409366,2,0.708982,0.836380,0.440185,,0.673821,0.877942,0.793111,0.603672,0.945862,0.692683
10785,8632913,2015-12-19,Men'sS,ATP,Main,Jasper Smit,Matwe Middelkoop,2,SRRRR;SSSS;SRRSSS;RRRSSSSS;RSSSS;RRRSR;SRRSRSR...,6-4 6-7(6) 6-3,0,109,62,93,12,15,1,1,68,98,15,16,2,0,3,606.0,1008.0,114.0,160.0,0.675676,0.411111,0.733333,0.333333,0.600000,,0.630769,0.425926,0.818182,0.416667,0.750000,,0.627586,0.314379,0.624000,0.277333,0.342857,,0.610092,0.394737,0.566434,0.076923,0.100000,,1,1,0.641537,0.797829,74,90,15,15,5,2,65,54,11,12,4,2,1.000000,0.630769,W,W,,,-0.009945,0.025506,0.038697,0.036018,0.010667,-0.104895,-0.042667,-0.016900,-0.057143,-0.150000,,,0.601190,0.398810,0.712500,0.287500,0.591245,0.437507,0.626697,0.434827,0.723167,0.244833,0.607605,0.270600,0.442857,0.350000,,,2,0.652784,0.875643,0.596154,,0.683382,0.826872,0.737393,0.709899,0.793814,0.900000


In [35]:
match_rates = ["p1 match serve win rate", "p2 match serve win rate", "p1 match service game win rate", "p2 match service game win rate", "p1 match set win rate", "p1 match tiebreak win rate"]
matches[match_rates] = matches.apply(lambda match: compute_match_rates(match), axis=1, result_type="expand")

match_var_rates = ["p1 match inv serve var", "p2 match inv serve var", "p1 match inv service game var", "p2 match inv service game var", "p1 match inv set var", "p1 match inv tiebreak var"]
matches[match_var_rates] = matches.apply(lambda match: compute_match_var_rates(match), axis=1, result_type="expand")




In [36]:
economist_serve_rates = ["p1 serve_tt economist", "p2 serve_tt economist"]
matches[economist_serve_rates] = matches.apply(lambda match: compute_economist_rates(match, matches), axis=1, result_type="expand")


matches



Unnamed: 0,pbp_id,date,tny_name,tour,draw,server1,server2,winner,pbp,score,adf_flag,wh_minutes,p1 serves won,p1 serves total,p1 service games won,p1 service games total,p1 sets won,p1 tiebreaks won,p2 serves won,p2 serves total,p2 service games won,p2 service games total,p2 sets won,p2 tiebreaks won,sets total,tourney serves won,tourney serves total,tourney service games won,tourney service games total,p1 serve win rate,p1 return win rate,p1 service game win rate,p1 return game win rate,p1 set win rate,p1 tiebreak win rate,p2 serve win rate,p2 return win rate,p2 service game win rate,p2 return game win rate,p2 set win rate,p2 tiebreak win rate,p1 opp serve win rate,p1 opp return win rate,p1 opp service game win rate,p1 opp return game win rate,p1 opp set win rate,p1 opp tiebreak win rate,p2 opp serve win rate,p2 opp return win rate,p2 opp service game win rate,p2 opp return game win rate,p2 opp set win rate,p2 opp tiebreak win rate,p1 match count,p2 match count,all player year serve win rate,all player year service game win rate,p1 year serve total,p1 year return total,p1 year service game total,p1 year return game total,p1 year set total,p1 year tiebreak total,p2 year serve total,p2 year return total,p2 year service game total,p2 year return game total,p2 year set total,p2 year tiebreak total,p1 year match win rate,p2 year match rate,p1 year match results,p2 year match results,p1 opp match win rate,p2 opp match win rate,p1 relative serve,p2 relative serve,p1 relative return,p2 relative return,p1 relative service games,p2 relative service games,p1 relative return games,p2 relative return games,p1 relative set,p2 relative set,p1 relative tiebreak,p2 relative tiebreak,tourney serve win rate,tourney return win rate,tourney service game win rate,tourney return game win rate,p1 tourney serve adv,p1 tourney return adv,p2 tourney serve adv,p2 tourney return adv,p1 tourney service game adv,p1 tourney return game adv,p2 tourney service game adv,p2 tourney return game adv,p1 tourney set adv,p2 tourney set adv,p1 tourney tiebreak adv,p2 tourney tiebreak adv,tourney sets to win,p1 serve_tt,p1 service_game_tt,p1 set_tt,p1 tiebreak_tt,p2 serve_tt,p2 service_game_tt,p1 serve_tt no sos adjust,p2 serve_tt no sos adjust,p1 service_game_tt no sos adjust,p2 service_game_tt no sos adjust,p1 match serve win rate,p2 match serve win rate,p1 match service game win rate,p2 match service game win rate,p1 match set win rate,p1 match tiebreak win rate,p1 match inv serve var,p2 match inv serve var,p1 match inv service game var,p2 match inv service game var,p1 match inv set var,p1 match inv tiebreak var,p1 serve_tt economist,p2 serve_tt economist
0,2231275,2011-07-28,ATPStu,ATP,Main,Olivier Rochus,Fabio Fognini,2,SSSS;RRRR;SSRRSS;SSRRSS;RSRSRSRR;SSRSS;RSRRSR;...,6-4 6-1,0,66,27,58,3,9,0,0,32,56,6,8,2,0,2,,,,,,,,,,,,,,,,,,,,,,,,,,,,,0,0,,,0,0,2,2,2,2,0,0,2,2,2,2,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,2,,,,,,,,,,,0.465517,0.571429,0.333333,0.750000,0.000000,,0.004290,0.004373,0.024691,0.023438,0.000000,,,
1,2231276,2011-07-28,ATPStu,ATP,Main,Robin Haase,Marin Cilic,2,SSRSS;RRSSRSSS;SSSS;RSSSS;SRSRSS;RSRSRSSS;RSRS...,4-6 6-4 6-3,0,141,60,102,11,15,1,0,60,92,12,14,2,0,3,,,,,,,,,,,,,,,,,,,,,,,,,,,,,0,0,,,0,0,2,2,2,2,0,0,2,2,2,2,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,2,,,,,,,,,,,0.588235,0.652174,0.733333,0.857143,0.333333,,0.002375,0.002466,0.013037,0.008746,0.074074,,,
2,2231342,2011-07-28,Farmer,ATP,Main,Alejandro Falla,Thomaz Bellucci,2,SRRSSRSRRSRSRSRSRSRR;SSSRS;RRSRR;SRSSS;RRSSRR;...,6-0 6-1,0,42,21,55,1,7,0,0,26,37,6,6,2,0,2,,,,,,,,,,,,,,,,,,,,,,,,,,,,,0,0,,,0,0,2,2,2,2,0,0,2,2,2,2,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,2,,,,,,,,,,,0.381818,0.702703,0.142857,1.000000,0.000000,,0.004292,0.005646,0.017493,0.000000,0.000000,,,
3,2229262,2011-07-28,Credit,ATP,Main,Matthias Bachinger,Julien Benneteau,2,SSSS;SSSS;RRRR;SSSS;SSSS;SRRRSSRSSRRSSRSS;SSSS...,6-4 6-4,0,57,36,59,7,10,0,0,43,58,9,10,2,0,2,,,,,,,,,,,,,,,,,,,,,,,,,,,,,0,0,,,0,0,2,2,2,2,0,0,2,2,2,2,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,2,,,,,,,,,,,0.610169,0.741379,0.700000,0.900000,0.000000,,0.004032,0.003306,0.021000,0.009000,0.000000,,,
4,2228887,2011-07-28,Credit,ATP,Main,Stan Wawrinka,Peter Luczak,1,SSSS;RRSSSS;SRSSRS;RRSSSS;SSSRS;SSSRRS;RSSSS;R...,6-3 7-5,0,82,43,58,10,11,2,0,34,61,7,10,0,0,2,,,,,,,,,,,,,,,,,,,,,,,,,,,,,0,0,,,0,0,2,2,2,2,0,0,2,2,2,2,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,2,,,,,,,,,,,0.741379,0.557377,0.909091,0.700000,1.000000,,0.003306,0.004044,0.007513,0.021000,0.000000,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
10782,8628374,2015-12-18,Men'sAustralianOpenWildCardPlayoff,ATP,Main,James Duckworth,Marinko Matosevic,1,SSSS;RSRSRSSS;SSSS;RSSRSS;SRSRRSRR;SRRRSSRR;SS...,4-6 6-3 7-6(7) 7-6(8),0,199,103,154,18,22,3,2,93,146,17,21,1,0,4,564.0,879.0,112.0,147.0,0.635690,0.335260,0.792208,0.169935,0.477612,0.500000,0.575717,0.348559,0.685345,0.188034,0.301887,0.142857,0.625203,0.364778,0.758255,0.178435,0.414046,0.454246,0.629896,0.374191,0.766704,0.191251,0.436469,0.379723,21,19,0.641567,0.797852,2023,1903,308,306,67,20,1499,1423,232,234,53,7,0.210526,0.575717,WWLWLWLWLWWLLWLWLLLLW,LLWLLWLWLLLLLLLLLLW,0.40433,0.450411,0.000467,-0.050092,-0.039537,-0.021544,-0.029357,-0.123404,-0.071810,-0.045262,-0.108342,-0.261644,-0.045754,-0.477420,0.641638,0.358362,0.761905,0.238095,0.642106,0.318825,0.591546,0.336817,0.732548,0.166285,0.638501,0.192833,0.391658,0.238356,0.454246,0.022580,3,0.779374,0.919775,0.672908,0.972994,0.755754,0.898535,0.765322,0.729030,0.942736,0.914083,0.668831,0.636986,0.818182,0.809524,0.750000,1.0,0.001438,0.001584,0.006762,0.007343,0.046875,0.125000,0.641679,0.597230
10783,8625517,2015-12-18,Men'sS,ATP,Main,Jesse Huta Galung,Matwe Middelkoop,2,SSRSS;RSSSS;RRRSSR;RRRR;RRSRR;SSRRSRSS;SSRRSS;...,6-4 6-3,0,70,31,54,6,10,0,0,41,65,8,9,2,0,2,461.0,771.0,87.0,124.0,0.621951,0.313187,0.724138,0.100000,0.250000,,,,,,,,0.700937,0.346911,0.829079,0.221165,0.564946,,,,,,,,2,0,0.641567,0.797852,164,182,29,30,8,2,0,0,2,2,2,2,,,LL,,,,-0.031138,,0.014124,,-0.054697,,-0.070921,,-0.185054,,,,0.597925,0.402075,0.701613,0.298387,0.566787,0.416199,,,0.646916,0.227466,,,0.314946,,,,2,,,,,,,,,,,0.574074,0.630769,0.600000,0.888889,0.000000,,0.004528,0.003583,0.024000,0.010974,0.000000,,,
10784,8625520,2015-12-18,Men'sS,ATP,Main,Tallon Griekspoor,Robin Haase,2,SSSRS;SSSRRRSS;RSRSSRRR;RSSSS;SRSRRR;SSSRS;SRS...,6-2 6-3,0,63,38,71,5,9,0,0,35,47,8,8,2,0,2,461.0,771.0,87.0,124.0,0.666667,0.520000,0.800000,0.600000,0.750000,,0.622655,0.342847,0.771739,0.186296,0.444444,0.500000,0.480000,0.340000,0.320000,0.160000,0.125000,,0.640685,0.360862,0.791987,0.194391,0.488360,0.409366,1,34,0.641567,0.797852,51,50,10,10,4,2,2878,2838,460,467,99,16,0.411765,0.622655,W,LLLLLLWWLWLLWWLLWWWLWLLWWLLLWLWWLL,,0.484743,0.006667,-0.016483,0.000000,-0.016467,-0.040000,-0.033870,-0.080000,-0.021718,-0.125000,-0.067195,,-0.090634,0.597925,0.402075,0.701613,0.298387,0.604591,0.402075,0.581441,0.385608,0.661613,0.218387,0.667743,0.276669,0.375000,0.432805,,0.409366,2,0.708982,0.836380,0.440185,,0.673821,0.877942,0.793111,0.603672,0.945862,0.692683,0.535211,0.744681,0.555556,1.000000,0.000000,,0.003504,0.004045,0.027435,0.000000,0.000000,,,
10785,8632913,2015-12-19,Men'sS,ATP,Main,Jasper Smit,Matwe Middelkoop,2,SRRRR;SSSS;SRRSSS;RRRSSSSS;RSSSS;RRRSR;SRRSRSR...,6-4 6-7(6) 6-3,0,109,62,93,12,15,1,1,68,98,15,16,2,0,3,606.0,1008.0,114.0,160.0,0.675676,0.411111,0.733333,0.333333,0.600000,,0.630769,0.425926,0.818182,0.416667,0.750000,,0.627586,0.314379,0.624000,0.277333,0.342857,,0.610092,0.394737,0.566434,0.076923,0.100000,,1,1,0.641537,0.797829,74,90,15,15,5,2,65,54,11,12,4,2,1.000000,0.630769,W,W,,,-0.009945,0.025506,0.038697,0.036018,0.010667,-0.104895,-0.042667,-0.016900,-0.057143,-0.150000,,,0.601190,0.398810,0.712500,0.287500,0.591245,0.437507,0.626697,0.434827,0.723167,0.244833,0.607605,0.270600,0.442857,0.350000,,,2,0.652784,0.875643,0.596154,,0.683382,0.826872,0.737393,0.709899,0.793814,0.900000,0.666667,0.693878,0.800000,0.937500,0.333333,1.0,0.002389,0.002167,0.010667,0.003662,0.074074,,,


In [37]:
economist_serve_rates = ["p1 service_game_tt economist", "p2 service_game_tt economist"]
matches[economist_serve_rates] = matches.apply(lambda match: compute_economist_sg_rg_rates(match, matches), axis=1, result_type="expand")


matches



-12186782539485.066
-12186782539486.035
283147765926.1382
283147765927.6019


Unnamed: 0,pbp_id,date,tny_name,tour,draw,server1,server2,winner,pbp,score,adf_flag,wh_minutes,p1 serves won,p1 serves total,p1 service games won,p1 service games total,p1 sets won,p1 tiebreaks won,p2 serves won,p2 serves total,p2 service games won,p2 service games total,p2 sets won,p2 tiebreaks won,sets total,tourney serves won,tourney serves total,tourney service games won,tourney service games total,p1 serve win rate,p1 return win rate,p1 service game win rate,p1 return game win rate,p1 set win rate,p1 tiebreak win rate,p2 serve win rate,p2 return win rate,p2 service game win rate,p2 return game win rate,p2 set win rate,p2 tiebreak win rate,p1 opp serve win rate,p1 opp return win rate,p1 opp service game win rate,p1 opp return game win rate,p1 opp set win rate,p1 opp tiebreak win rate,p2 opp serve win rate,p2 opp return win rate,p2 opp service game win rate,p2 opp return game win rate,p2 opp set win rate,p2 opp tiebreak win rate,p1 match count,p2 match count,all player year serve win rate,all player year service game win rate,p1 year serve total,p1 year return total,p1 year service game total,p1 year return game total,p1 year set total,p1 year tiebreak total,p2 year serve total,p2 year return total,p2 year service game total,p2 year return game total,p2 year set total,p2 year tiebreak total,p1 year match win rate,p2 year match rate,p1 year match results,p2 year match results,p1 opp match win rate,p2 opp match win rate,p1 relative serve,p2 relative serve,p1 relative return,p2 relative return,p1 relative service games,p2 relative service games,p1 relative return games,p2 relative return games,p1 relative set,p2 relative set,p1 relative tiebreak,p2 relative tiebreak,tourney serve win rate,tourney return win rate,tourney service game win rate,tourney return game win rate,p1 tourney serve adv,p1 tourney return adv,p2 tourney serve adv,p2 tourney return adv,p1 tourney service game adv,p1 tourney return game adv,p2 tourney service game adv,p2 tourney return game adv,p1 tourney set adv,p2 tourney set adv,p1 tourney tiebreak adv,p2 tourney tiebreak adv,tourney sets to win,p1 serve_tt,p1 service_game_tt,p1 set_tt,p1 tiebreak_tt,p2 serve_tt,p2 service_game_tt,p1 serve_tt no sos adjust,p2 serve_tt no sos adjust,p1 service_game_tt no sos adjust,p2 service_game_tt no sos adjust,p1 match serve win rate,p2 match serve win rate,p1 match service game win rate,p2 match service game win rate,p1 match set win rate,p1 match tiebreak win rate,p1 match inv serve var,p2 match inv serve var,p1 match inv service game var,p2 match inv service game var,p1 match inv set var,p1 match inv tiebreak var,p1 serve_tt economist,p2 serve_tt economist,p1 service_game_tt economist,p2 service_game_tt economist
0,2231275,2011-07-28,ATPStu,ATP,Main,Olivier Rochus,Fabio Fognini,2,SSSS;RRRR;SSRRSS;SSRRSS;RSRSRSRR;SSRSS;RSRRSR;...,6-4 6-1,0,66,27,58,3,9,0,0,32,56,6,8,2,0,2,,,,,,,,,,,,,,,,,,,,,,,,,,,,,0,0,,,0,0,2,2,2,2,0,0,2,2,2,2,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,2,,,,,,,,,,,0.465517,0.571429,0.333333,0.750000,0.000000,,0.004290,0.004373,0.024691,0.023438,0.000000,,,,,
1,2231276,2011-07-28,ATPStu,ATP,Main,Robin Haase,Marin Cilic,2,SSRSS;RRSSRSSS;SSSS;RSSSS;SRSRSS;RSRSRSSS;RSRS...,4-6 6-4 6-3,0,141,60,102,11,15,1,0,60,92,12,14,2,0,3,,,,,,,,,,,,,,,,,,,,,,,,,,,,,0,0,,,0,0,2,2,2,2,0,0,2,2,2,2,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,2,,,,,,,,,,,0.588235,0.652174,0.733333,0.857143,0.333333,,0.002375,0.002466,0.013037,0.008746,0.074074,,,,,
2,2231342,2011-07-28,Farmer,ATP,Main,Alejandro Falla,Thomaz Bellucci,2,SRRSSRSRRSRSRSRSRSRR;SSSRS;RRSRR;SRSSS;RRSSRR;...,6-0 6-1,0,42,21,55,1,7,0,0,26,37,6,6,2,0,2,,,,,,,,,,,,,,,,,,,,,,,,,,,,,0,0,,,0,0,2,2,2,2,0,0,2,2,2,2,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,2,,,,,,,,,,,0.381818,0.702703,0.142857,1.000000,0.000000,,0.004292,0.005646,0.017493,0.000000,0.000000,,,,,
3,2229262,2011-07-28,Credit,ATP,Main,Matthias Bachinger,Julien Benneteau,2,SSSS;SSSS;RRRR;SSSS;SSSS;SRRRSSRSSRRSSRSS;SSSS...,6-4 6-4,0,57,36,59,7,10,0,0,43,58,9,10,2,0,2,,,,,,,,,,,,,,,,,,,,,,,,,,,,,0,0,,,0,0,2,2,2,2,0,0,2,2,2,2,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,2,,,,,,,,,,,0.610169,0.741379,0.700000,0.900000,0.000000,,0.004032,0.003306,0.021000,0.009000,0.000000,,,,,
4,2228887,2011-07-28,Credit,ATP,Main,Stan Wawrinka,Peter Luczak,1,SSSS;RRSSSS;SRSSRS;RRSSSS;SSSRS;SSSRRS;RSSSS;R...,6-3 7-5,0,82,43,58,10,11,2,0,34,61,7,10,0,0,2,,,,,,,,,,,,,,,,,,,,,,,,,,,,,0,0,,,0,0,2,2,2,2,0,0,2,2,2,2,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,2,,,,,,,,,,,0.741379,0.557377,0.909091,0.700000,1.000000,,0.003306,0.004044,0.007513,0.021000,0.000000,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
10782,8628374,2015-12-18,Men'sAustralianOpenWildCardPlayoff,ATP,Main,James Duckworth,Marinko Matosevic,1,SSSS;RSRSRSSS;SSSS;RSSRSS;SRSRRSRR;SRRRSSRR;SS...,4-6 6-3 7-6(7) 7-6(8),0,199,103,154,18,22,3,2,93,146,17,21,1,0,4,564.0,879.0,112.0,147.0,0.635690,0.335260,0.792208,0.169935,0.477612,0.500000,0.575717,0.348559,0.685345,0.188034,0.301887,0.142857,0.625203,0.364778,0.758255,0.178435,0.414046,0.454246,0.629896,0.374191,0.766704,0.191251,0.436469,0.379723,21,19,0.641567,0.797852,2023,1903,308,306,67,20,1499,1423,232,234,53,7,0.210526,0.575717,WWLWLWLWLWWLLWLWLLLLW,LLWLLWLWLLLLLLLLLLW,0.40433,0.450411,0.000467,-0.050092,-0.039537,-0.021544,-0.029357,-0.123404,-0.071810,-0.045262,-0.108342,-0.261644,-0.045754,-0.477420,0.641638,0.358362,0.761905,0.238095,0.642106,0.318825,0.591546,0.336817,0.732548,0.166285,0.638501,0.192833,0.391658,0.238356,0.454246,0.022580,3,0.779374,0.919775,0.672908,0.972994,0.755754,0.898535,0.765322,0.729030,0.942736,0.914083,0.668831,0.636986,0.818182,0.809524,0.750000,1.0,0.001438,0.001584,0.006762,0.007343,0.046875,0.125000,0.641679,0.597230,0.767799,0.598063
10783,8625517,2015-12-18,Men'sS,ATP,Main,Jesse Huta Galung,Matwe Middelkoop,2,SSRSS;RSSSS;RRRSSR;RRRR;RRSRR;SSRRSRSS;SSRRSS;...,6-4 6-3,0,70,31,54,6,10,0,0,41,65,8,9,2,0,2,461.0,771.0,87.0,124.0,0.621951,0.313187,0.724138,0.100000,0.250000,,,,,,,,0.700937,0.346911,0.829079,0.221165,0.564946,,,,,,,,2,0,0.641567,0.797852,164,182,29,30,8,2,0,0,2,2,2,2,,,LL,,,,-0.031138,,0.014124,,-0.054697,,-0.070921,,-0.185054,,,,0.597925,0.402075,0.701613,0.298387,0.566787,0.416199,,,0.646916,0.227466,,,0.314946,,,,2,,,,,,,,,,,0.574074,0.630769,0.600000,0.888889,0.000000,,0.004528,0.003583,0.024000,0.010974,0.000000,,,,,
10784,8625520,2015-12-18,Men'sS,ATP,Main,Tallon Griekspoor,Robin Haase,2,SSSRS;SSSRRRSS;RSRSSRRR;RSSSS;SRSRRR;SSSRS;SRS...,6-2 6-3,0,63,38,71,5,9,0,0,35,47,8,8,2,0,2,461.0,771.0,87.0,124.0,0.666667,0.520000,0.800000,0.600000,0.750000,,0.622655,0.342847,0.771739,0.186296,0.444444,0.500000,0.480000,0.340000,0.320000,0.160000,0.125000,,0.640685,0.360862,0.791987,0.194391,0.488360,0.409366,1,34,0.641567,0.797852,51,50,10,10,4,2,2878,2838,460,467,99,16,0.411765,0.622655,W,LLLLLLWWLWLLWWLLWWWLWLLWWLLLWLWWLL,,0.484743,0.006667,-0.016483,0.000000,-0.016467,-0.040000,-0.033870,-0.080000,-0.021718,-0.125000,-0.067195,,-0.090634,0.597925,0.402075,0.701613,0.298387,0.604591,0.402075,0.581441,0.385608,0.661613,0.218387,0.667743,0.276669,0.375000,0.432805,,0.409366,2,0.708982,0.836380,0.440185,,0.673821,0.877942,0.793111,0.603672,0.945862,0.692683,0.535211,0.744681,0.555556,1.000000,0.000000,,0.003504,0.004045,0.027435,0.000000,0.000000,,,,,
10785,8632913,2015-12-19,Men'sS,ATP,Main,Jasper Smit,Matwe Middelkoop,2,SRRRR;SSSS;SRRSSS;RRRSSSSS;RSSSS;RRRSR;SRRSRSR...,6-4 6-7(6) 6-3,0,109,62,93,12,15,1,1,68,98,15,16,2,0,3,606.0,1008.0,114.0,160.0,0.675676,0.411111,0.733333,0.333333,0.600000,,0.630769,0.425926,0.818182,0.416667,0.750000,,0.627586,0.314379,0.624000,0.277333,0.342857,,0.610092,0.394737,0.566434,0.076923,0.100000,,1,1,0.641537,0.797829,74,90,15,15,5,2,65,54,11,12,4,2,1.000000,0.630769,W,W,,,-0.009945,0.025506,0.038697,0.036018,0.010667,-0.104895,-0.042667,-0.016900,-0.057143,-0.150000,,,0.601190,0.398810,0.712500,0.287500,0.591245,0.437507,0.626697,0.434827,0.723167,0.244833,0.607605,0.270600,0.442857,0.350000,,,2,0.652784,0.875643,0.596154,,0.683382,0.826872,0.737393,0.709899,0.793814,0.900000,0.666667,0.693878,0.800000,0.937500,0.333333,1.0,0.002389,0.002167,0.010667,0.003662,0.074074,,,,,


In [14]:
x = np.array([1,2,3])
y = np.array([3,4,5])
np.append(x,y)

array([1, 2, 3, 3, 4, 5])

In [43]:
features

NameError: name 'features' is not defined

In [47]:
matches[match_rates].describe()

Unnamed: 0,p1 match serve win rate,p2 match serve win rate,p1 match service game win rate,p2 match service game win rate,p1 match set win rate,p1 match tiebreak win rate
count,10787.0,10787.0,10787.0,10787.0,10787.0,4047.0
mean,0.641515,0.632156,0.787912,0.771705,0.521623,0.514496
std,0.091863,0.091779,0.17294,0.177159,0.410931,0.477276
min,0.0,0.241379,0.0,0.0,0.0,0.0
25%,0.581633,0.574074,0.692308,0.666667,0.0,0.0
50%,0.644737,0.633333,0.8125,0.8,0.666667,0.5
75%,0.704762,0.693954,0.909091,0.9,1.0,1.0
max,0.941176,0.947368,1.0,1.0,1.0,1.0


# Try to predict on test matches

### predict using serve win rates

In [38]:
# Filter matches we want to predict on

# Only consider matches on or after August 1, 2012, because we've had one year of data at that point
games_after_2012 = matches[matches["date"] > datetime.strptime("31 Dec 14", '%d %b %y')]

# Only consider best of three set matches
games_after_2012 = games_after_2012[games_after_2012["tourney sets to win"] == 2]

# Don't consider Davis Cup matches because they don't have a final set tiebreak
games_after_2012 = games_after_2012[games_after_2012["tny_name"] != "DavisC"]

# Remove rows where either player hasn't played at least 10 matches in the past year
games_after_2012 = games_after_2012[(games_after_2012["p1 match count"] >= 2) & (games_after_2012["p2 match count"] >= 2)]

In [39]:
# Remove rows where we don't have s_ijtt values to predict the result of the match
games_after_2012 = games_after_2012[(games_after_2012['p1 serve_tt economist'].notna()) & (games_after_2012['p2 serve_tt economist'].notna())]





In [40]:
# import the markov model functions lazily
%run Modeling_tennis_match_iid_points.ipynb

0.69875
0.933699


In [41]:
# compute Pr[p1 wins] with opponent/ tournament adjusted shrunken serve values
games_after_2012["p1 pr win"] = games_after_2012.apply(lambda match: match_driver(2, 6, 4, 7, 1, match["p1 serve_tt economist"], match["p2 serve_tt economist"]), axis=1).clip(upper=0.9985, lower=0.0015)




In [42]:
# compute log loss
games_after_2012["winner"] = games_after_2012["winner"].replace(2,0)

compute_lloss = games_after_2012[["winner", "p1 pr win"]]

compute_lloss["lloss"] = compute_lloss.apply(lambda m: (m["winner"]*np.log(m["p1 pr win"]) + (1-m["winner"])*np.log(1 - m["p1 pr win"])), axis=1)
N = compute_lloss.shape[0]

lloss = (compute_lloss["lloss"].sum()) * (-1/N)

print(f"log loss: {lloss}")
print(f"Random guessing log loss: {-np.log(0.5)}")

log loss: 0.6775073990470023
Random guessing log loss: 0.6931471805599453


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  


### predict using sg,rg and serve for tiebreak

In [43]:
# Filter matches we want to predict on

# Only consider matches on or after August 1, 2012, because we've had one year of data at that point
games_after_2012 = matches[matches["date"] > datetime.strptime("31 Dec 14", '%d %b %y')]

# Only consider best of three set matches
games_after_2012 = games_after_2012[games_after_2012["tourney sets to win"] == 2]

# Don't consider Davis Cup matches because they don't have a final set tiebreak
games_after_2012 = games_after_2012[games_after_2012["tny_name"] != "DavisC"]

# Remove rows where either player hasn't played at least 10 matches in the past year
games_after_2012 = games_after_2012[(games_after_2012["p1 match count"] >= 2) & (games_after_2012["p2 match count"] >= 2)]



In [44]:
# Remove rows where we don't have sg_ijtt and s_ijtt values to predict the result of the match
games_after_2012 = games_after_2012[(games_after_2012['p1 service_game_tt economist'].notna()) & (games_after_2012['p2 service_game_tt economist'].notna()) & (games_after_2012['p1 serve_tt economist'].notna()) & (games_after_2012['p2 serve_tt economist'].notna())]





In [45]:
# compute Pr[p1 wins] with non opponenent/ tournament adjusted service game values
games_after_2012["p1 pr win"] = games_after_2012.apply(lambda match: match_driver_game_based_tiebreak_serve_based(2, 6, 4, 7, 1, match["p1 service_game_tt economist"], match["p2 service_game_tt economist"], match["p1 serve_tt economist"], match["p2 serve_tt economist"]), axis=1).clip(upper=0.9985, lower=0.0015)






In [46]:
games_after_2012["winner"] = games_after_2012["winner"].replace(2,0)

compute_lloss = games_after_2012[["winner", "p1 pr win"]]

compute_lloss["lloss"] = compute_lloss.apply(lambda m: (m["winner"]*np.log(m["p1 pr win"]) + (1-m["winner"])*np.log(1 - m["p1 pr win"])), axis=1)
N = compute_lloss.shape[0]

lloss = (compute_lloss["lloss"].sum()) * (-1/N)

print(f"log loss: {lloss}")
print(f"Random guessing log loss: {-np.log(0.5)}")

log loss: 0.6525481363919573
Random guessing log loss: 0.6931471805599453


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """


In [98]:
count

[9624, 9462, 9468, 9618]