In [None]:
from google.colab import drive
import pandas as pd
import numpy as np
import math
from sklearn.model_selection import train_test_split
from sklearn.inspection import permutation_importance
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout
from tensorflow.keras.optimizers import Adam
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import GridSearchCV

In [None]:
# Mount Google Drive at /content/drive; the data paths used below live under this mount point.
drive.mount('/content/drive')

In [None]:
# Folder on the mounted Drive that holds the yearly atp_matches_<year>.csv files.
path_to_match_data = '/content/drive/My Drive/tennis'

In [None]:
# Load in csv files
# Every yearly match file is read once and kept in three groupings:
#   * all years together,
#   * years strictly before `non_training_year`,
#   * `non_training_year` and later.
non_training_year = 2010 # decide year before which matches won't be trained or tested
atp_csv_files_all = []
atp_csv_files_before = []
atp_csv_files_after = []
for i in range(1995, 2024):
  df = pd.read_csv(f'{path_to_match_data}/atp_matches_{i}.csv')
  atp_csv_files_all.append(df)
  # Each year lands on exactly one side of the split.
  year_bucket = atp_csv_files_before if i < non_training_year else atp_csv_files_after
  year_bucket.append(df)
  print(f"Year {i} loaded")

atp_matches_all = pd.concat(atp_csv_files_all, ignore_index=True)
atp_matches_before_df = pd.concat(atp_csv_files_before, ignore_index=True)
atp_matches_after_df = pd.concat(atp_csv_files_after, ignore_index=True)
# Reuse path_to_match_data instead of repeating the Drive path literally, so
# relocating the data folder only requires changing one variable.
atp_players_df = pd.read_csv(f'{path_to_match_data}/atp_players.csv')

In [None]:
def clean_dataframe(df):
    """Return a cleaned copy of a raw matches frame; `df` itself is not modified.

    Cleaning steps, in order:
      * drop rows whose ``score`` is exactly ``"W/O"``;
      * set ``winner_hand`` / ``loser_hand`` to ``'R'`` unless the value is
        ``'L'`` or ``'R'`` (missing values therefore become ``'R'``);
      * parse ``tourney_date`` with the ``%Y%m%d`` format;
      * fill missing serve and break-point counts with 0;
      * fill missing ``winner_ht`` / ``loser_ht`` with that column's mean,
        computed after the ``"W/O"`` rows were removed;
      * fill missing ``surface`` with ``"Hard"``.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the columns named above.

    Returns
    -------
    pandas.DataFrame
        A new frame with the rows and columns cleaned as described.
    """
    # Explicit copy: writing into a filtered frame without it triggers
    # SettingWithCopyWarning and is not guaranteed to stick.
    cleaned = df[df['score'] != 'W/O'].copy()

    # Replace values in winner_hand and loser_hand columns
    for col in ['winner_hand', 'loser_hand']:
        cleaned[col] = cleaned[col].where(cleaned[col].isin(['L', 'R']), 'R')
    cleaned['tourney_date'] = pd.to_datetime(cleaned['tourney_date'], format='%Y%m%d')

    # Replace NaN values in specific columns with 0
    cols_with_zeros = ['w_1stWon', 'w_1stIn', 'w_2ndWon', 'w_svpt', 'w_bpSaved', 'w_bpFaced',
                       'l_1stWon', 'l_1stIn', 'l_2ndWon', 'l_svpt', 'l_bpSaved', 'l_bpFaced']
    # Assign the filled values back. The chained `df[col].fillna(..., inplace=True)`
    # form operates on a temporary Series: it warns on pandas 2.2 and leaves
    # the frame untouched under Copy-on-Write.
    cleaned[cols_with_zeros] = cleaned[cols_with_zeros].fillna(0)

    # Replace NaN values in "winner_ht" and "loser_ht" with the column average
    for col in ['winner_ht', 'loser_ht']:
        cleaned[col] = cleaned[col].fillna(cleaned[col].mean())

    # just call NA surface Hard
    cleaned['surface'] = cleaned['surface'].fillna("Hard")

    return cleaned

# Run the combined, pre-split and post-split match frames through the same cleaner.
atp_matches_all, atp_matches_before_df, atp_matches_after_df = [
    clean_dataframe(frame)
    for frame in (atp_matches_all, atp_matches_before_df, atp_matches_after_df)
]

In [None]:
# Draw size, seeding, entry type and ranking fields are removed from every match frame.
unused_match_columns = [
    'draw_size',
    'winner_seed', 'loser_seed',
    'winner_entry', 'loser_entry',
    'winner_rank', 'winner_rank_points',
    'loser_rank', 'loser_rank_points',
]
atp_matches_before_df = atp_matches_before_df.drop(columns=unused_match_columns)
atp_matches_after_df = atp_matches_after_df.drop(columns=unused_match_columns)
atp_matches_all = atp_matches_all.drop(columns=unused_match_columns)

In [None]:
# Start the per-player feature table with one row per id seen as a winner or a
# loser anywhere in the combined match frame (winners first, then any loser
# ids not already listed).
all_player_ids = pd.concat(
    [atp_matches_all['winner_id'], atp_matches_all['loser_id']],
    ignore_index=True,
).unique()
atp_player_features = pd.DataFrame({'player_id': all_player_ids})

In [None]:
def _add_match_count(features, id_series, feature_name):
    """Left-join a per-player occurrence count onto `features`.

    `id_series` holds one player id per match row; the number of times each id
    occurs becomes the column `feature_name`. Players that never occur get 0.
    Returns a new frame; `features` is not modified.
    """
    counts = (
        id_series.value_counts()
        .rename_axis('player_id')
        .reset_index(name=feature_name)
    )
    merged = features.merge(counts, on='player_id', how='left')
    # Assign the filled column back. `merged[col].fillna(0, inplace=True)` works
    # on a temporary Series: it warns on pandas 2.2 and does not update the
    # frame under Copy-on-Write.
    merged[feature_name] = merged[feature_name].fillna(0)
    return merged


# make total_wins feature
atp_player_features = _add_match_count(
    atp_player_features, atp_matches_before_df['winner_id'], 'total_wins')

# make total_losses feature
atp_player_features = _add_match_count(
    atp_player_features, atp_matches_before_df['loser_id'], 'total_losses')

# make total_wins/losses_(surface) features
for surface in ['Hard', 'Carpet', 'Clay', 'Grass']:
  matches_on_surface = atp_matches_before_df[atp_matches_before_df['surface'] == surface]
  atp_player_features = _add_match_count(
      atp_player_features, matches_on_surface['winner_id'], f'total_wins_on_{surface}')
  atp_player_features = _add_match_count(
      atp_player_features, matches_on_surface['loser_id'], f'total_losses_on_{surface}')

In [None]:
# Calculate wins/losses in best of 3/5 sets
for best_of in [3, 5]:
    matches_best_of = atp_matches_before_df[atp_matches_before_df['best_of'] == best_of]
    for id_col, outcome in [('winner_id', 'wins'), ('loser_id', 'losses')]:
        feature = f'total_{outcome}_best_of_{best_of}'
        counts = (
            matches_best_of[id_col].value_counts()
            .rename_axis('player_id')
            .reset_index(name=feature)
        )
        atp_player_features = atp_player_features.merge(counts, on='player_id', how='left')
        # Assign back rather than chained in-place fillna, which does not
        # reliably write through to the frame.
        atp_player_features[feature] = atp_player_features[feature].fillna(0)

# Wins/losses split by the OPPONENT's playing hand.
# Fixed: the filter used to be on the player's own hand (winner_hand for wins,
# loser_hand for losses), so e.g. a right-hander's total_wins_vs_R equalled all
# of their wins. The running updates applied later per match key on the
# opponent's hand (wins -> loser_hand, losses -> winner_hand); the initial
# counts now use the same definition.
for hand in ['L', 'R']:
    for id_col, opponent_hand_col, outcome in [
        ('winner_id', 'loser_hand', 'wins'),
        ('loser_id', 'winner_hand', 'losses'),
    ]:
        feature = f'total_{outcome}_vs_{hand}'
        matches_vs_hand = atp_matches_before_df[atp_matches_before_df[opponent_hand_col] == hand]
        counts = (
            matches_vs_hand[id_col].value_counts()
            .rename_axis('player_id')
            .reset_index(name=feature)
        )
        atp_player_features = atp_player_features.merge(counts, on='player_id', how='left')
        atp_player_features[feature] = atp_player_features[feature].fillna(0)

In [None]:
# Serve, return and break-point totals per player from the pre-split matches.
#
# Raw columns are summed twice: once grouped by winner (prefix total_w_) and
# once grouped by loser (prefix total_l_). From a player's point of view the
# opponent's serve columns are that player's return columns, and the
# opponent's break points faced/saved are that player's break chances.
def _side_totals(matches, id_col, rename_map):
    """Sum the raw columns listed in `rename_map` per `id_col`, then rename them."""
    return (
        matches.groupby(id_col)[list(rename_map)]
        .sum()
        .rename(columns=rename_map)
        .rename_axis('player_id')
        .reset_index()
    )


def _both_sides(stats, suffix):
    """Winner-side total plus loser-side total for one renamed statistic."""
    return stats[f'total_w_{suffix}'] + stats[f'total_l_{suffix}']


winner_side_columns = {
    'w_svpt': 'total_w_svpt',
    'w_1stIn': 'total_w_1stIn',
    'w_1stWon': 'total_w_1stWon',
    'w_2ndWon': 'total_w_2ndWon',
    'l_svpt': 'total_w_retpt',
    'l_1stIn': 'total_w_1stRet',
    'l_1stWon': 'total_w_1stRetLost',
    'l_2ndWon': 'total_w_2ndRetLost',
    'w_bpSaved': 'total_w_bpAgainst_Won',
    'w_bpFaced': 'total_w_bpAgainst',
    'l_bpSaved': 'total_w_bpFor_Lost',
    'l_bpFaced': 'total_w_bpFor',
}
loser_side_columns = {
    'l_svpt': 'total_l_svpt',
    'l_1stIn': 'total_l_1stIn',
    'l_1stWon': 'total_l_1stWon',
    'l_2ndWon': 'total_l_2ndWon',
    'w_svpt': 'total_l_retpt',
    'w_1stIn': 'total_l_1stRet',
    'w_1stWon': 'total_l_1stRetLost',
    'w_2ndWon': 'total_l_2ndRetLost',
    'l_bpSaved': 'total_l_bpAgainst_Won',
    'l_bpFaced': 'total_l_bpAgainst',
    'w_bpSaved': 'total_l_bpFor_Lost',
    'w_bpFaced': 'total_l_bpFor',
}

serve_return_stats = _side_totals(atp_matches_before_df, 'winner_id', winner_side_columns)
serve_return_stats_loser = _side_totals(atp_matches_before_df, 'loser_id', loser_side_columns)

# One row per player; a player who only ever won (or only ever lost) gets 0
# for the side they never appeared on.
serve_return_stats_all = serve_return_stats.merge(
    serve_return_stats_loser, on='player_id', how='outer'
).fillna(0)

# Serve features
serve_return_stats_all['total_serves'] = _both_sides(serve_return_stats_all, 'svpt')
serve_return_stats_all['total_first_serves'] = _both_sides(serve_return_stats_all, '1stIn')
serve_return_stats_all['total_second_serves'] = (
    serve_return_stats_all['total_serves'] - serve_return_stats_all['total_first_serves']
)
serve_return_stats_all['first_serves_wins'] = _both_sides(serve_return_stats_all, '1stWon')
serve_return_stats_all['second_serves_wins'] = _both_sides(serve_return_stats_all, '2ndWon')

# Return features: points won on return = points returned minus points the server won
serve_return_stats_all['total_returns'] = _both_sides(serve_return_stats_all, 'retpt')
serve_return_stats_all['total_first_returns'] = _both_sides(serve_return_stats_all, '1stRet')
serve_return_stats_all['total_second_returns'] = (
    serve_return_stats_all['total_returns'] - serve_return_stats_all['total_first_returns']
)
serve_return_stats_all['first_return_wins'] = (
    serve_return_stats_all['total_first_returns'] - _both_sides(serve_return_stats_all, '1stRetLost')
)
serve_return_stats_all['second_return_wins'] = (
    serve_return_stats_all['total_second_returns'] - _both_sides(serve_return_stats_all, '2ndRetLost')
)

# Break-point features ("against" = faced on own serve, "for" = earned on return)
serve_return_stats_all['break_points_against_won'] = _both_sides(serve_return_stats_all, 'bpAgainst_Won')
serve_return_stats_all['break_points_against'] = _both_sides(serve_return_stats_all, 'bpAgainst')
serve_return_stats_all['break_points_against_lost'] = (
    serve_return_stats_all['break_points_against'] - serve_return_stats_all['break_points_against_won']
)
serve_return_stats_all['break_points_for_lost'] = _both_sides(serve_return_stats_all, 'bpFor_Lost')
serve_return_stats_all['break_points_for'] = _both_sides(serve_return_stats_all, 'bpFor')
serve_return_stats_all['break_points_for_won'] = (
    serve_return_stats_all['break_points_for'] - serve_return_stats_all['break_points_for_lost']
)

# Derived columns that stay on the feature table; players with no pre-split
# matches have no stats row after the left merge, so these are filled with 0.
columns_to_replace = [
    'total_serves', 'total_first_serves', 'total_second_serves',
    'first_serves_wins', 'second_serves_wins',
    'total_returns', 'total_first_returns', 'total_second_returns',
    'first_return_wins', 'second_return_wins',
    'break_points_against_won', 'break_points_against', 'break_points_against_lost',
    'break_points_for_lost', 'break_points_for', 'break_points_for_won',
]
# The per-side totals were only intermediate values and are removed again.
columns_to_drop = list(winner_side_columns.values()) + list(loser_side_columns.values())

atp_player_features = (
    atp_player_features
    .merge(serve_return_stats_all, on='player_id', how='left')
    .fillna(dict.fromkeys(columns_to_replace, 0))
    .drop(columns=columns_to_drop)
)

In [None]:
# Making the Glicko-2 System

# Match outcomes, passed as the `actual_score` of a rated game.
WIN = 1.
DRAW = 0.5
LOSS = 0.


# Defaults for a new Rating and for the Glicko2 environment.
MU = 1500  # default rating value (mu)
PHI = 350  # default rating deviation (phi)
SIGMA = 0.06  # default volatility (sigma)
TAU = 1.0  # default `tau` used when solving for a new sigma
EPSILON = 0.000001  # stop tolerance of the iterative sigma solver


class Rating(object):
    """One player's rating state: rating value `mu`, deviation `phi`, volatility `sigma`."""

    def __init__(self, mu=MU, phi=PHI, sigma=SIGMA):
        self.mu, self.phi, self.sigma = mu, phi, sigma

    def __repr__(self):
        """Render as ``module.Class(mu=…, phi=…, sigma=…)`` with three decimals."""
        cls = type(self)
        return '%s.%s(mu=%.3f, phi=%.3f, sigma=%.3f)' % (
            cls.__module__, cls.__name__, self.mu, self.phi, self.sigma)


class Glicko2(object):
    """Glicko-2 rating environment.

    Stores the default rating values (`mu`, `phi`, `sigma`), the constant
    `tau` and the solver tolerance `epsilon`, and turns game results into
    updated `Rating` objects. No method mutates the ratings it is given;
    updates are returned as new `Rating` instances.
    """

    def __init__(self, mu=MU, phi=PHI, sigma=SIGMA, tau=TAU, epsilon=EPSILON):
        self.mu, self.phi, self.sigma = mu, phi, sigma
        self.tau, self.epsilon = tau, epsilon

    def create_rating(self, mu=None, phi=None, sigma=None):
        """Build a `Rating`, using this environment's default for every None argument."""
        return Rating(
            self.mu if mu is None else mu,
            self.phi if phi is None else phi,
            self.sigma if sigma is None else sigma,
        )

    def scale_down(self, rating, ratio=173.7178):
        """Map a rating from the original scale to the internal one (centre on self.mu, divide by `ratio`)."""
        return self.create_rating(
            (rating.mu - self.mu) / ratio,
            rating.phi / ratio,
            rating.sigma,
        )

    def scale_up(self, rating, ratio=173.7178):
        """Inverse of `scale_down`: multiply by `ratio` and shift mu back by self.mu."""
        return self.create_rating(
            rating.mu * ratio + self.mu,
            rating.phi * ratio,
            rating.sigma,
        )

    def reduce_impact(self, rating):
        """The `g(RD)` weight: shrinks towards 0 as the given rating's phi grows."""
        return 1. / math.sqrt(1 + (3 * rating.phi ** 2) / (math.pi ** 2))

    def expect_score(self, rating, other_rating, impact):
        """Logistic expected score of `rating` against `other_rating`, scaled by `impact`."""
        return 1. / (1 + math.exp(-impact * (rating.mu - other_rating.mu)))

    def determine_sigma(self, rating, difference, variance):
        """Solve iteratively for the new volatility.

        `rating` is expected on the internal scale; `difference` and
        `variance` are the quantities computed in `rate`. The comments below
        follow the step numbering of the Glicko-2 procedure.
        """
        phi_squared = rating.phi ** 2
        difference_squared = difference ** 2
        # 1. a = ln(sigma^2); `objective` is the function whose root is sought.
        log_sigma_squared = math.log(rating.sigma ** 2)

        def objective(x):
            """Optimality criterion f(x); the solver looks for f(x) = 0."""
            spread = phi_squared + variance + math.exp(x)
            gain = math.exp(x) * (difference_squared - spread) / (2 * spread ** 2)
            penalty = (x - log_sigma_squared) / (self.tau ** 2)
            return gain - penalty

        # 2. Bracket the root: A starts at a, B is chosen so that f(B) lies on the other side.
        lower = log_sigma_squared
        if difference_squared > phi_squared + variance:
            upper = math.log(difference_squared - phi_squared - variance)
        else:
            steps = 1
            while objective(log_sigma_squared - steps * math.sqrt(self.tau ** 2)) < 0:
                steps += 1
            upper = log_sigma_squared - steps * math.sqrt(self.tau ** 2)
        # 3. Evaluate f at both ends of the bracket.
        f_lower, f_upper = objective(lower), objective(upper)
        # 4. Shrink the bracket until it is at most epsilon wide:
        #    C is the secant estimate; if f(C) and f(B) differ in sign, A takes
        #    over B, otherwise f(A) is halved; B always moves to C.
        while abs(upper - lower) > self.epsilon:
            candidate = lower + (lower - upper) * f_lower / (f_upper - f_lower)
            f_candidate = objective(candidate)
            if f_candidate * f_upper < 0:
                lower, f_lower = upper, f_upper
            else:
                f_lower /= 2
            upper, f_upper = candidate, f_candidate
        # 5. New sigma is e^(A/2).
        return math.exp(1) ** (lower / 2)

    def rate(self, rating, series):
        """Return the updated rating after a series of games.

        `series` is a sequence of ``(actual_score, opponent_rating)`` pairs on
        the original scale. When it is empty only the deviation is widened.
        """
        # Step 2: work on the internal scale.
        rating = self.scale_down(rating)
        if not series:
            # No games played: only Step 6, phi grows by the current sigma.
            phi_star = math.sqrt(rating.phi ** 2 + rating.sigma ** 2)
            return self.scale_up(self.create_rating(rating.mu, phi_star, rating.sigma))
        # Steps 3 and 4: accumulate 1/v and the weighted outcome surprise.
        inverse_variance = 0
        outcome_surprise = 0
        for actual_score, opponent in series:
            opponent = self.scale_down(opponent)
            impact = self.reduce_impact(opponent)
            expected_score = self.expect_score(rating, opponent, impact)
            inverse_variance += impact ** 2 * expected_score * (1 - expected_score)
            outcome_surprise += impact * (actual_score - expected_score)
        difference = outcome_surprise / inverse_variance
        variance = 1. / inverse_variance
        # Step 5: new volatility (iterative).
        sigma = self.determine_sigma(rating, difference, variance)
        # Step 6: pre-period deviation phi*.
        phi_star = math.sqrt(rating.phi ** 2 + sigma ** 2)
        # Step 7: new deviation and rating.
        phi = 1. / math.sqrt(1 / phi_star ** 2 + 1 / variance)
        mu = rating.mu + phi ** 2 * (difference / variance)
        # Step 8: back to the original scale.
        return self.scale_up(self.create_rating(mu, phi, sigma))

    def rate_1vs1(self, rating1, rating2, drawn=False):
        """Rate a single game; `rating1` is the winner unless `drawn`. Returns (new1, new2)."""
        score1, score2 = (DRAW, DRAW) if drawn else (WIN, LOSS)
        return (self.rate(rating1, [(score1, rating2)]),
                self.rate(rating2, [(score2, rating1)]))

    def quality_1vs1(self, rating1, rating2):
        """Score in [0, 1] that is highest when the averaged expected score is 0.5."""
        first_view = self.expect_score(rating1, rating2, self.reduce_impact(rating1))
        second_view = self.expect_score(rating2, rating1, self.reduce_impact(rating2))
        expected_score = (first_view + second_view) / 2
        return 2 * (0.5 - abs(0.5 - expected_score))

In [None]:
# Rating environment with the module-level defaults
glicko2 = Glicko2()

# player_id -> current Rating; unseen players start from `initial_rating`
ratings = {}
initial_rating = Rating(mu=MU, phi=PHI, sigma=SIGMA)

# Replay the pre-split matches in frame order, rating each one as a single game
for _, match in atp_matches_before_df.iterrows():
    winner, loser = match['winner_id'], match['loser_id']

    # setdefault registers a first-time player with the shared starting rating
    winner_rating = ratings.setdefault(winner, initial_rating)
    loser_rating = ratings.setdefault(loser, initial_rating)

    new_winner_rating, new_loser_rating = glicko2.rate_1vs1(winner_rating, loser_rating)
    ratings[winner], ratings[loser] = new_winner_rating, new_loser_rating

In [None]:
# Key the feature table by player_id so rows can be addressed with .loc[player_id]
atp_player_features = atp_player_features.set_index('player_id')

# Copy each rated player's mu / phi / sigma into the table; players without a
# rating are skipped and keep missing values in these columns.
for player_id in atp_player_features.index:
    player_rating = ratings.get(player_id)
    if player_rating is None:
        continue
    for column, attribute in (('rating', 'mu'), ('RD', 'phi'), ('volatility', 'sigma')):
        atp_player_features.loc[player_id, column] = getattr(player_rating, attribute)

In [None]:
# Display the feature columns built so far (notebook cell output).
atp_player_features.columns

In [None]:
# Most recent tourney_date per player across the pre-split matches.
# calculate_days_since_last_match() reads this dict and keeps it up to date
# as later matches are processed.
#
# Built with a single groupby over a long (player_id, tourney_date) table.
# The previous version re-filtered the whole match frame twice for every
# player, which is O(players x matches).
player_match_dates = pd.concat(
    [
        atp_matches_before_df[['winner_id', 'tourney_date']].rename(columns={'winner_id': 'player_id'}),
        atp_matches_before_df[['loser_id', 'tourney_date']].rename(columns={'loser_id': 'player_id'}),
    ],
    ignore_index=True,
)
last_match_dates = player_match_dates.groupby('player_id')['tourney_date'].max().to_dict()

def calculate_days_since_last_match(row, player_id):
    """Days between this match and the player's previously recorded match.

    Reads `row['tourney_date']` and the module-level `last_match_dates` dict.
    Returns 365 when the player has no recorded date yet. In every case the
    player's entry in `last_match_dates` is then overwritten with this match's
    date, so calling the function has a side effect.
    """
    current_date = row['tourney_date']

    try:
        previous_date = last_match_dates[player_id]
    except KeyError:
        gap_in_days = 365  # No previous matches found
    else:
        gap_in_days = (current_date - previous_date).days

    # Remember this match as the player's latest one
    last_match_dates[player_id] = current_date
    return gap_in_days

In [None]:
# make the matches DataFrame while updating the atp_features DataFrame
count = 0
new_rows = []
previous_year = 2005
print(atp_matches_after_df.shape)
for index, row in atp_matches_after_df.iterrows():
  count += 1
  if count % 1000 == 0:
    print(count)
  if row['tourney_date'].year != previous_year:
    previous_year = row['tourney_date'].year
    print(previous_year, " at count = ", count)
  rand_num = np.random.rand()
  if rand_num > 0.5:
    player_1 = row['winner_id']
    player_2 = row['loser_id']
    result = 1
    player_1_height = row['winner_ht']
    player_1_hand = 1 if row['winner_hand'] == 'R' else 0
    player_2_height = row['loser_ht']
    player_2_hand = 1 if row['loser_hand'] == 'R' else 0
    player_1_age= row['winner_age']
    player_2_age = row['loser_age']
  else:
    player_1 = row['loser_id']
    player_2 = row['winner_id']
    player_1_height = row['loser_ht']
    player_1_hand = 1 if row['loser_hand'] == 'R' else 0
    player_2_height = row['winner_ht']
    player_2_hand = 1 if row['winner_hand'] == 'R' else 0
    player_1_age= row['loser_age']
    player_2_age = row['winner_age']
    result = 0

  # Deal with player not in ratings yet
  # A first-time player starts from `initial_rating`, and the feature table
  # must show those same starting values. Fixed: the table used to be filled
  # from `new_winner_rating`, a leftover from the previously rated match, so a
  # new player's feature row carried some other player's post-match rating
  # while `ratings` held the default.
  for unrated_player in (player_1, player_2):
    if unrated_player not in ratings:
      ratings[unrated_player] = initial_rating
      atp_player_features.loc[unrated_player, 'rating'] = initial_rating.mu
      atp_player_features.loc[unrated_player, 'RD'] = initial_rating.phi
      atp_player_features.loc[unrated_player, 'volatility'] = initial_rating.sigma

  previous_matches = atp_matches_all[atp_matches_all['tourney_date'] < row['tourney_date']]
  head_to_head_w_1 = previous_matches[(previous_matches['winner_id'] == player_1) & (previous_matches['loser_id'] == player_2)].shape[0]
  head_to_head_w_2 = previous_matches[(previous_matches['winner_id'] == player_2) & (previous_matches['loser_id'] == player_1)].shape[0]
  days_since_match_1 = calculate_days_since_last_match(row, player_1)
  days_since_match_2 = calculate_days_since_last_match(row, player_2)

  # create new row
  new_row = {
    'date' : row['tourney_date'], # going to drop these features before training
    'tournament' : row['tourney_name'],
    'winner_name' : row['winner_name'],
    'loser_name' : row['loser_name'], # just need them for combining with betting data

    'total_wins_1' : atp_player_features.loc[player_1]['total_wins'],
    'total_losses_1' : atp_player_features.loc[player_1]['total_losses'],
    'total_wins_Hard_1' : atp_player_features.loc[player_1]['total_wins_on_Hard'],
    'total_wins_Carpet_1' : atp_player_features.loc[player_1]['total_wins_on_Carpet'],
    'total_wins_Clay_1' : atp_player_features.loc[player_1]['total_wins_on_Clay'],
    'total_wins_Grass_1' : atp_player_features.loc[player_1]['total_wins_on_Grass'],
    'total_losses_Hard_1' : atp_player_features.loc[player_1]['total_losses_on_Hard'],
    'total_losses_Carpet_1' : atp_player_features.loc[player_1]['total_losses_on_Carpet'],
    'total_losses_Clay_1' : atp_player_features.loc[player_1]['total_losses_on_Clay'],
    'total_losses_Grass_1' : atp_player_features.loc[player_1]['total_losses_on_Grass'],
    'total_wins_bo3_1' : atp_player_features.loc[player_1]['total_wins_best_of_3'],
    'total_wins_bo5_1' : atp_player_features.loc[player_1]['total_wins_best_of_5'],
    'total_losses_bo3_1' : atp_player_features.loc[player_1]['total_losses_best_of_3'],
    'total_losses_bo5_1' : atp_player_features.loc[player_1]['total_losses_best_of_5'],
    'total_wins_against_L_1' : atp_player_features.loc[player_1]['total_wins_vs_L'],
    'total_losses_against_L_1' : atp_player_features.loc[player_1]['total_losses_vs_L'],
    'total_wins_against_R_1' : atp_player_features.loc[player_1]['total_wins_vs_R'],
    'total_losses_against_R_1' : atp_player_features.loc[player_1]['total_losses_vs_R'],
    'rating_1' : atp_player_features.loc[player_1]['rating'],
    'RD_1' : atp_player_features.loc[player_1]['RD'],
    'volatility_1' : atp_player_features.loc[player_1]['volatility'],
    'first_serve_win_perc_1': (atp_player_features.loc[player_1]['first_serves_wins'] / atp_player_features.loc[player_1]['total_first_serves']
                               if atp_player_features.loc[player_1]['total_first_serves'] != 0 else 0),
    'second_serve_win_perc_1': (atp_player_features.loc[player_1]['second_serves_wins'] / atp_player_features.loc[player_1]['total_second_serves']
                                if atp_player_features.loc[player_1]['total_second_serves'] != 0 else 0),
    'first_return_win_perc_1': (atp_player_features.loc[player_1]['first_return_wins'] / atp_player_features.loc[player_1]['total_first_returns']
                                if atp_player_features.loc[player_1]['total_first_returns'] != 0 else 0),
    'second_return_win_perc_1': (atp_player_features.loc[player_1]['second_return_wins'] / atp_player_features.loc[player_1]['total_second_returns']
                                 if atp_player_features.loc[player_1]['total_second_returns'] != 0 else 0),
    'break_point_break_perc_1': (atp_player_features.loc[player_1]['break_points_for_won'] / atp_player_features.loc[player_1]['break_points_for']
                                 if atp_player_features.loc[player_1]['break_points_for'] != 0 else 0),
    'break_point_save_perc_1': (atp_player_features.loc[player_1]['break_points_against_won'] / atp_player_features.loc[player_1]['break_points_against']
                                if atp_player_features.loc[player_1]['break_points_against'] != 0 else 0),
    'head_to_head_wins_1' : head_to_head_w_1,
    'height_1' : player_1_height,
    'age_1' : player_1_age,
    'player_1_hand' : player_1_hand,
    'days_since_1' : days_since_match_1,

    'total_wins_2' : atp_player_features.loc[player_2]['total_wins'],
    'total_losses_2' : atp_player_features.loc[player_2]['total_losses'],
    'total_wins_Hard_2' : atp_player_features.loc[player_2]['total_wins_on_Hard'],
    'total_wins_Carpet_2' : atp_player_features.loc[player_2]['total_wins_on_Carpet'],
    'total_wins_Clay_2' : atp_player_features.loc[player_2]['total_wins_on_Clay'],
    'total_wins_Grass_2' : atp_player_features.loc[player_2]['total_wins_on_Grass'],
    'total_losses_Hard_2' : atp_player_features.loc[player_2]['total_losses_on_Hard'],
    'total_losses_Carpet_2' : atp_player_features.loc[player_2]['total_losses_on_Carpet'],
    'total_losses_Clay_2' : atp_player_features.loc[player_2]['total_losses_on_Clay'],
    'total_losses_Grass_2' : atp_player_features.loc[player_2]['total_losses_on_Grass'],
    'total_wins_bo3_2' : atp_player_features.loc[player_2]['total_wins_best_of_3'],
    'total_wins_bo5_2' : atp_player_features.loc[player_2]['total_wins_best_of_5'],
    'total_losses_bo3_2' : atp_player_features.loc[player_2]['total_losses_best_of_3'],
    'total_losses_bo5_2' : atp_player_features.loc[player_2]['total_losses_best_of_5'],
    'total_wins_against_L_2' : atp_player_features.loc[player_2]['total_wins_vs_L'],
    'total_losses_against_L_2' : atp_player_features.loc[player_2]['total_losses_vs_L'],
    'total_wins_against_R_2' : atp_player_features.loc[player_2]['total_wins_vs_R'],
    'total_losses_against_R_2' : atp_player_features.loc[player_2]['total_losses_vs_R'],
    'rating_2' : atp_player_features.loc[player_2]['rating'],
    'RD_2' : atp_player_features.loc[player_2]['RD'],
    'volatility_2' : atp_player_features.loc[player_2]['volatility'],
    'first_serve_win_perc_2': (atp_player_features.loc[player_2]['first_serves_wins'] / atp_player_features.loc[player_2]['total_first_serves']
                               if atp_player_features.loc[player_2]['total_first_serves'] != 0 else 0),
    'second_serve_win_perc_2': (atp_player_features.loc[player_2]['second_serves_wins'] / atp_player_features.loc[player_2]['total_second_serves']
                                if atp_player_features.loc[player_2]['total_second_serves'] != 0 else 0),
    'first_return_win_perc_2': (atp_player_features.loc[player_2]['first_return_wins'] / atp_player_features.loc[player_2]['total_first_returns']
                                if atp_player_features.loc[player_2]['total_first_returns'] != 0 else 0),
    'second_return_win_perc_2': (atp_player_features.loc[player_2]['second_return_wins'] / atp_player_features.loc[player_2]['total_second_returns']
                                 if atp_player_features.loc[player_2]['total_second_returns'] != 0 else 0),
    'break_point_break_perc_2': (atp_player_features.loc[player_2]['break_points_for_won'] / atp_player_features.loc[player_2]['break_points_for']
                                 if atp_player_features.loc[player_2]['break_points_for'] != 0 else 0),
    'break_point_save_perc_2': (atp_player_features.loc[player_2]['break_points_against_won'] / atp_player_features.loc[player_2]['break_points_against']
                                if atp_player_features.loc[player_2]['break_points_against'] != 0 else 0),
    'head_to_head_wins_2' : head_to_head_w_2,
    'height_2' : player_2_height,
    'age_2' : player_2_age,
    'player_2_hand' : player_2_hand,
    'days_since_2' : days_since_match_2,

    'Hard' : 1 if row['surface'] == 'Hard' else 0, # surface features
    'Carpet' : 1 if row['surface'] == 'Carpet' else 0,
    'Clay' : 1 if row['surface'] == 'Clay' else 0,
    'Grass' : 1 if row['surface'] == 'Grass' else 0,
    'bo3' : 1 if row['best_of'] == 3 else 0, # best of 3 = 1

    'result' : result # = 1 if player_1 wins
  }

  if rand_num > 0.5:
    atp_player_features.loc[player_1, 'total_wins'] += 1
    atp_player_features.loc[player_2, 'total_losses'] += 1
    atp_player_features.loc[player_1, f'total_wins_on_{row["surface"]}'] += 1
    atp_player_features.loc[player_2, f'total_losses_on_{row["surface"]}'] += 1
    atp_player_features.loc[player_1, f'total_wins_best_of_{row["best_of"]}'] += 1
    atp_player_features.loc[player_2, f'total_losses_best_of_{row["best_of"]}'] += 1
    atp_player_features.loc[player_1, f'total_wins_vs_{row["loser_hand"]}'] += 1
    atp_player_features.loc[player_2, f'total_losses_vs_{row["winner_hand"]}'] += 1

    winner_rating = ratings[player_1]
    loser_rating = ratings[player_2]

    new_winner_rating, new_loser_rating = glicko2.rate_1vs1(winner_rating, loser_rating)

    ratings[player_1] = new_winner_rating
    ratings[player_2] = new_loser_rating

    atp_player_features.loc[player_1, 'rating'] = new_winner_rating.mu
    atp_player_features.loc[player_1, 'RD'] = new_winner_rating.phi
    atp_player_features.loc[player_1, 'volatility'] = new_winner_rating.sigma

    atp_player_features.loc[player_2, 'rating'] = new_loser_rating.mu
    atp_player_features.loc[player_2, 'RD'] = new_loser_rating.phi
    atp_player_features.loc[player_2, 'volatility'] = new_loser_rating.sigma

    atp_player_features.loc[player_1, 'first_serves_wins'] += row['w_1stWon']
    atp_player_features.loc[player_1, 'total_first_serves'] += row['w_1stIn']
    atp_player_features.loc[player_1, 'second_serves_wins'] += row['w_2ndWon']
    atp_player_features.loc[player_1, 'total_second_serves'] += (row['w_svpt'] - row['w_1stIn'])
    atp_player_features.loc[player_1, 'first_return_wins'] += (row['l_1stIn'] - row['l_1stWon'])
    atp_player_features.loc[player_1, 'total_first_returns'] += row['l_1stIn']
    atp_player_features.loc[player_1, 'second_return_wins'] += (row['l_svpt'] - row['l_1stIn'] - row['l_2ndWon'])
    atp_player_features.loc[player_1, 'total_second_returns'] += (row['l_svpt'] - row['l_1stIn'])
    atp_player_features.loc[player_1, 'break_points_for_won'] += (row['l_bpFaced'] - row['l_bpSaved'])
    atp_player_features.loc[player_1, 'break_points_for'] += row['l_bpFaced']
    atp_player_features.loc[player_1, 'break_points_against_won'] += row['w_bpSaved']
    atp_player_features.loc[player_1, 'break_points_against'] += row['w_bpFaced']

    atp_player_features.loc[player_2, 'first_serves_wins'] += row['l_1stWon']
    atp_player_features.loc[player_2, 'total_first_serves'] += row['l_1stIn']
    atp_player_features.loc[player_2, 'second_serves_wins'] += row['l_2ndWon']
    atp_player_features.loc[player_2, 'total_second_serves'] += (row['l_svpt'] - row['l_1stIn'])
    atp_player_features.loc[player_2, 'first_return_wins'] += (row['w_1stIn'] - row['w_1stWon'])
    atp_player_features.loc[player_2, 'total_first_returns'] += row['w_1stIn']
    atp_player_features.loc[player_2, 'second_return_wins'] += (row['w_svpt'] - row['w_1stIn'] - row['w_2ndWon'])
    atp_player_features.loc[player_2, 'total_second_returns'] += (row['w_svpt'] - row['w_1stIn'])
    atp_player_features.loc[player_2, 'break_points_for_won'] += (row['w_bpFaced'] - row['w_bpSaved'])
    atp_player_features.loc[player_2, 'break_points_for'] += row['w_bpFaced']
    atp_player_features.loc[player_2, 'break_points_against_won'] += row['l_bpSaved']
    atp_player_features.loc[player_2, 'break_points_against'] += row['l_bpFaced']

  else:
    atp_player_features.loc[player_2, 'total_wins'] += 1
    atp_player_features.loc[player_1, 'total_losses'] += 1
    atp_player_features.loc[player_2, f'total_wins_on_{row["surface"]}'] += 1
    atp_player_features.loc[player_1, f'total_losses_on_{row["surface"]}'] += 1
    atp_player_features.loc[player_2, f'total_wins_best_of_{row["best_of"]}'] += 1
    atp_player_features.loc[player_1, f'total_losses_best_of_{row["best_of"]}'] += 1
    atp_player_features.loc[player_2, f'total_wins_vs_{row["loser_hand"]}'] += 1
    atp_player_features.loc[player_1, f'total_losses_vs_{row["winner_hand"]}'] += 1

    winner_rating = ratings[player_2]
    loser_rating = ratings[player_1]

    new_winner_rating, new_loser_rating = glicko2.rate_1vs1(winner_rating, loser_rating)

    ratings[player_2] = new_winner_rating
    ratings[player_1] = new_loser_rating

    atp_player_features.loc[player_2, 'rating'] = new_winner_rating.mu
    atp_player_features.loc[player_2, 'RD'] = new_winner_rating.phi
    atp_player_features.loc[player_2, 'volatility'] = new_winner_rating.sigma

    atp_player_features.loc[player_1, 'rating'] = new_loser_rating.mu
    atp_player_features.loc[player_1, 'RD'] = new_loser_rating.phi
    atp_player_features.loc[player_1, 'volatility'] = new_loser_rating.sigma

    atp_player_features.loc[player_2, 'first_serves_wins'] += row['w_1stWon']
    atp_player_features.loc[player_2, 'total_first_serves'] += row['w_1stIn']
    atp_player_features.loc[player_2, 'second_serves_wins'] += row['w_2ndWon']
    atp_player_features.loc[player_2, 'total_second_serves'] += (row['w_svpt'] - row['w_1stIn'])
    atp_player_features.loc[player_2, 'first_return_wins'] += (row['l_1stIn'] - row['l_1stWon'])
    atp_player_features.loc[player_2, 'total_first_returns'] += row['l_1stIn']
    atp_player_features.loc[player_2, 'second_return_wins'] += (row['l_svpt'] - row['l_1stIn'] - row['l_2ndWon'])
    atp_player_features.loc[player_2, 'total_second_returns'] += (row['l_svpt'] - row['l_1stIn'])
    atp_player_features.loc[player_2, 'break_points_for_won'] += (row['l_bpFaced'] - row['l_bpSaved'])
    atp_player_features.loc[player_2, 'break_points_for'] += row['l_bpFaced']
    atp_player_features.loc[player_2, 'break_points_against_won'] += row['w_bpSaved']
    atp_player_features.loc[player_2, 'break_points_against'] += row['w_bpFaced']

    atp_player_features.loc[player_1, 'first_serves_wins'] += row['l_1stWon']
    atp_player_features.loc[player_1, 'total_first_serves'] += row['l_1stIn']
    atp_player_features.loc[player_1, 'second_serves_wins'] += row['l_2ndWon']
    atp_player_features.loc[player_1, 'total_second_serves'] += (row['l_svpt'] - row['l_1stIn'])
    atp_player_features.loc[player_1, 'first_return_wins'] += (row['w_1stIn'] - row['w_1stWon'])
    atp_player_features.loc[player_1, 'total_first_returns'] += row['w_1stIn']
    atp_player_features.loc[player_1, 'second_return_wins'] += (row['w_svpt'] - row['w_1stIn'] - row['w_2ndWon'])
    atp_player_features.loc[player_1, 'total_second_returns'] += (row['w_svpt'] - row['w_1stIn'])
    atp_player_features.loc[player_1, 'break_points_for_won'] += (row['w_bpFaced'] - row['w_bpSaved'])
    atp_player_features.loc[player_1, 'break_points_for'] += row['w_bpFaced']
    atp_player_features.loc[player_1, 'break_points_against_won'] += row['l_bpSaved']
    atp_player_features.loc[player_1, 'break_points_against'] += row['l_bpFaced']

  new_rows.append(new_row)

test_matches_df = pd.DataFrame(new_rows)

In [None]:
# Drop the few feature rows that contain NaN in any column, then remove the
# identifier columns (player names, tournament, date) that are not model inputs.
# NaN filtering deliberately runs first, while the identifier columns are still
# present. The steps are chained rather than using `inplace=True`, so the
# intermediate frame returned by `dropna` is never mutated (a common source of
# SettingWithCopyWarning).
print(test_matches_df.shape)

test_matches_clean = (
    test_matches_df
    .dropna(axis=0)
    .drop(columns=['winner_name', 'loser_name', 'tournament', 'date'])
)

print(test_matches_clean.shape)

In [None]:
# Fixed seed so the train/test partition (and every metric computed from it)
# is identical on each Restart & Run All.
SPLIT_SEED = 42

X = test_matches_clean.drop(columns=['result'])  # feature matrix: everything except the target
y = test_matches_clean['result']  # target variable

# Hold out 20% of the rows for testing
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=SPLIT_SEED
)

# Fit the scaler on the training split only, then apply that same transform to
# the test split, so no test-set statistics influence the scaling.
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

In [None]:
# Baseline classifier: logistic regression on the standardized features.
# `fit` returns the estimator itself, so construction and training are chained.
model = LogisticRegression(max_iter=10000).fit(X_train_scaled, y_train)

# Predict labels for the held-out split
y_pred = model.predict(X_test_scaled)

# Report overall accuracy followed by the per-class precision/recall table
accuracy = accuracy_score(y_true=y_test, y_pred=y_pred)
print(f'Accuracy: {accuracy:.3f}')
print(classification_report(y_true=y_test, y_pred=y_pred))

In [None]:
# Feed-forward binary classifier: one hidden layer of 256 ReLU units, 50%
# dropout, and a single sigmoid output unit.
optimizer = Adam(learning_rate=0.0005)

model = Sequential([
    # Declare the input width with an explicit Input object; passing
    # `input_shape` to the first Dense layer is deprecated in recent Keras
    # releases and triggers a warning there.
    tf.keras.Input(shape=(X.shape[1],)),
    Dense(256, activation='relu'),
    Dropout(0.5),
    Dense(1, activation='sigmoid')
])

# Compile the model
model.compile(optimizer=optimizer,
              loss='binary_crossentropy',  # Use 'categorical_crossentropy' for multi-class classification
              metrics=['accuracy'])

In [None]:
# Train for 25 epochs in mini-batches of 128; `validation_split` sets aside
# 20% of the training data for the per-epoch validation metrics.
history = model.fit(
    x=X_train_scaled,
    y=y_train,
    batch_size=128,
    epochs=25,
    verbose=2,
    validation_split=0.2,
)

In [None]:
# Score the trained network on the held-out test split; the result is
# unpacked into the loss and the accuracy metric, then printed.
loss, accuracy = model.evaluate(x=X_test_scaled, y=y_test)
print(f'Test Loss: {loss}')
print(f'Test Accuracy: {accuracy}')

In [None]:
# --- Disabled experiment: hyperparameter grid search for the Keras model ---
# Every line below is commented out, so this cell does nothing when run.
# It sketches a scikit-learn-compatible wrapper around a Keras model and a
# GridSearchCV over learning rate, dropout rate, hidden units, batch size and
# epochs.
# NOTE(review): create_model hard-codes input_shape=(67,), whereas the active
# model takes its input width from X.shape[1] — confirm they match before
# re-enabling this cell.

# def create_model(learning_rate=0.01, dropout_rate=0.0, units=64):
#     model = Sequential([
#         Dense(units=units, activation='relu', input_shape=(67,)),
#         Dropout(dropout_rate),
#         Dense(1, activation='sigmoid')
#     ])
#     optimizer = Adam(learning_rate=learning_rate)
#     model.compile(optimizer=optimizer, loss='binary_crossentropy', metrics=['accuracy'])
#     return model

# from sklearn.base import BaseEstimator, ClassifierMixin

# class KerasClassifierWrapper(BaseEstimator, ClassifierMixin):
#     def __init__(self, learning_rate=0.01, dropout_rate=0.0, units=64, epochs=10, batch_size=32):
#         self.learning_rate = learning_rate
#         self.dropout_rate = dropout_rate
#         self.units = units
#         self.epochs = epochs
#         self.batch_size = batch_size

#     def fit(self, X, y):
#         self.model = create_model(
#             learning_rate=self.learning_rate,
#             dropout_rate=self.dropout_rate,
#             units=self.units
#         )

#         self.model.fit(X, y, epochs=self.epochs, batch_size=self.batch_size, verbose=0)
#         return self

#     def score(self, X, y):
#         loss, accuracy = self.model.evaluate(X, y, verbose=0)
#         return accuracy
# param_grid = {
#     'learning_rate': [0.00001, 0.0001, 0.001],
#     'dropout_rate': [0.2, 0.5],
#     'units': [75, 100, 125, 150],
#     'batch_size': [64, 128, 196],
#     'epochs': [20, 30, 40]
# }

# # Create the model wrapper
# model_wrapper = KerasClassifierWrapper()

# # Setup the GridSearchCV
# grid = GridSearchCV(estimator=model_wrapper, param_grid=param_grid, n_jobs=-1, cv=3)

# # Perform the grid search
# grid_result = grid.fit(X_train_scaled, y_train)

# # Print best parameters and score
# print("Best parameters:", grid_result.best_params_)
# print("Best score:", grid_result.best_score_)