In [1]:
import os
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
# Suppress every warning for the rest of the kernel session.
# NOTE(review): with no category filter this also hides deprecation warnings
# raised by the libraries used below — consider narrowing it.
warnings.filterwarnings('ignore')
%matplotlib inline
# Plot styling: seaborn's white grid theme for every figure.
sns.set_style("whitegrid")

# pandas display settings, applied in one pass: show up to 500 rows/columns,
# a 200-character wide console, centred column headers and floats rendered
# with two decimals.
display_options = {
    'display.max_columns': 500,
    'display.max_rows': 500,
    'display.width': 200,
    'display.colheader_justify': 'center',
    'display.float_format': lambda x: '%.2f' % x,
}
for option_name, option_value in display_options.items():
    pd.set_option(option_name, option_value)

In [2]:
# Read the match list workbook into the working DataFrame `df`.
matches_path = '_data/_matches_list_v2.xlsx'
df = pd.read_excel(matches_path)

In [3]:
##### SANITY CHECKS REGARDING THE BETTING ODDS #####

# Suffixes of the odds columns: six bookmakers plus the 'Avg' and 'Max' columns
companies = ['B365', 'EX', 'LB', 'PS', 'SJ', 'UB', 'Avg', 'Max']

# Odds column names per player, e.g. 'p1_B365' / 'p2_B365'
cols_p1 = ['p1_' + company for company in companies]
cols_p2 = ['p2_' + company for company in companies]
odds_cols = cols_p1 + cols_p2

# Force every odds column to a numeric dtype; unparseable entries become NaN
for odds_col in odds_cols:
    df[odds_col] = pd.to_numeric(df[odds_col], errors='coerce')

# Show the matches that carry at least one quote below 1 (for either player)
below_one = df[odds_cols].lt(1).any(axis=1)
display(df[below_one])

# Raise every quote below 1 up to exactly 1 (NaN entries are left untouched)
for odds_col in odds_cols:
    df[odds_col] = df[odds_col].clip(lower=1)

# Re-run the same check: the table is expected to be empty now
below_one = df[odds_cols].lt(1).any(axis=1)
display(df[below_one])

Unnamed: 0,match_id,match_date,year,tourney_name,tourney_level,court,surface,draw_size,round,best_of,minutes,total_points,total_games,score,p1_won,p1_points,p2_points,p1_games,p2_games,p1_sets,p2_sets,p1_id,p2_id,p1_name,p2_name,p1_rank,p2_rank,p1_rankpt,p2_rankpt,p1_hand,p2_hand,p1_ht,p2_ht,p1_ioc,p2_ioc,p1_age,p2_age,p1_seed,p2_seed,p1_entry,p2_entry,p1_ace,p2_ace,p1_df,p2_df,p1_svpt,p2_svpt,p1_1stIn,p2_1stIn,p1_1stWon,p2_1stWon,p1_2ndWon,p2_2ndWon,p1_SvGms,p2_SvGms,p1_bpSaved,p2_bpSaved,p1_bpFaced,p2_bpFaced,p1_B365,p2_B365,p1_EX,p2_EX,p1_LB,p2_LB,p1_PS,p2_PS,p1_SJ,p2_SJ,p1_UB,p2_UB,p1_Max,p2_Max,p1_Avg,p2_Avg
28088,2020-580-0164,2020-01-23,2020,Australian Open,G,Outdoor,Hard,128,R64,5,150.0,199.0,29,6-3 7-6(4) 6-1,1,116.0,83.0,19,10,3,0,104745,105643,Rafael Nadal,Federico Delbonis,1.0,76.0,10235.0,711.0,L,L,185.0,190.0,ESP,ARG,33.6,29.2,1.0,,,,8.0,2.0,1.0,8.0,89.0,110.0,61.0,69.0,52.0,45.0,17.0,18.0,14.0,14.0,0.0,17.0,0.0,20.0,29.0,0.97,,,,,1.01,29.89,,,,,1.02,35.0,1.01,21.52
28800,2020-520-1164,2020-09-28,2020,Roland Garros,G,Outdoor,Clay,128,R128,5,125.0,148.0,28,6-4 6-4 6-2,1,88.0,60.0,18,10,3,0,104745,106078,Rafael Nadal,Egor Gerasimov,2.0,83.0,9850.0,750.0,L,R,185.0,196.0,ESP,BLR,34.3,27.8,2.0,,,,3.0,8.0,2.0,1.0,71.0,77.0,45.0,49.0,37.0,34.0,17.0,9.0,14.0,14.0,1.0,0.0,2.0,5.0,0.97,34.0,,,,,1.01,30.08,,,,,1.02,35.0,1.01,22.68
28836,2020-520-1232,2020-09-30,2020,Roland Garros,G,Outdoor,Clay,128,R64,5,100.0,132.0,22,6-1 6-0 6-3,1,85.0,47.0,18,4,3,0,104745,111456,Rafael Nadal,Mackenzie Mcdonald,2.0,236.0,9850.0,217.0,L,R,185.0,178.0,ESP,USA,34.3,25.4,2.0,,,,1.0,1.0,4.0,2.0,72.0,60.0,49.0,40.0,37.0,18.0,14.0,8.0,11.0,11.0,0.0,1.0,0.0,8.0,1.01,26.0,,,,,0.97,36.86,,,,,1.02,36.86,1.01,22.51
28837,2020-520-1201,2020-10-01,2020,Roland Garros,G,Outdoor,Clay,128,R64,5,83.0,119.0,23,6-1 6-2 6-2,0,43.0,76.0,5,18,0,3,105575,104925,Ricardas Berankis,Novak Djokovic,66.0,1.0,854.0,11260.0,R,R,175.0,188.0,LTU,SRB,30.2,33.3,,1.0,,,0.0,10.0,1.0,1.0,53.0,66.0,40.0,38.0,20.0,36.0,6.0,13.0,11.0,12.0,1.0,2.0,7.0,2.0,0.97,34.0,,,,,28.0,1.01,,,,,41.0,1.02,21.54,1.01
29388,2021-580-0164,2021-02-10,2021,Australian Open,G,Outdoor,Hard,128,R64,5,210.0,268.0,44,6-3 6-7(3) 7-6(2) 6-3,0,123.0,145.0,19,25,1,3,126207,104925,Frances Tiafoe,Novak Djokovic,64.0,1.0,1005.0,12030.0,R,R,188.0,188.0,USA,SRB,23.0,33.7,,1.0,,,23.0,26.0,8.0,5.0,148.0,120.0,78.0,74.0,58.0,62.0,34.0,27.0,21.0,21.0,9.0,1.0,14.0,3.0,0.97,29.0,,,,,24.5,1.02,,,,,29.0,1.02,19.55,1.01
30309,2021-520-0100,2021-06-01,2021,Roland Garros,G,Outdoor,Clay,128,R128,5,118.0,167.0,26,6-2 6-4 6-2,0,68.0,99.0,8,18,0,3,105815,104925,Tennys Sandgren,Novak Djokovic,66.0,1.0,1033.0,11313.0,R,R,188.0,188.0,USA,SRB,29.8,34.0,,1.0,,,5.0,4.0,1.0,2.0,94.0,73.0,56.0,49.0,30.0,42.0,20.0,13.0,13.0,13.0,6.0,6.0,11.0,6.0,0.97,29.0,,,,,31.38,1.01,,,,,35.0,1.02,23.01,1.01
30368,2021-520-0196,2021-06-05,2021,Roland Garros,G,Outdoor,Clay,128,R32,5,92.0,142.0,24,6-1 6-4 6-1,0,55.0,87.0,6,18,0,3,105575,104925,Ricardas Berankis,Novak Djokovic,93.0,1.0,839.0,11313.0,R,R,175.0,188.0,LTU,SRB,30.9,34.0,,1.0,,,1.0,5.0,3.0,3.0,75.0,67.0,53.0,42.0,31.0,37.0,7.0,13.0,12.0,12.0,3.0,0.0,9.0,0.0,0.97,29.0,,,,,32.24,1.01,,,,,34.0,1.01,21.85,1.01
32820,2022-520-0187,2022-05-26,2022,Roland Garros,G,Outdoor,Clay,128,R64,5,,328.0,48,6-3 7-6(8) 6-7(3) 7-6(7),1,167.0,161.0,26,22,3,1,126774,144645,Stefanos Tsitsipas,Zdenek Kolar,4.0,134.0,5965.0,468.0,R,R,193.0,185.0,GRE,CZE,23.7,25.6,4.0,,,,25.0,9.0,5.0,7.0,163.0,165.0,96.0,117.0,70.0,82.0,38.0,24.0,23.0,22.0,6.0,9.0,8.0,12.0,1.02,17.0,,,,,0.97,35.02,,,,,1.04,35.02,1.01,19.24


Unnamed: 0,match_id,match_date,year,tourney_name,tourney_level,court,surface,draw_size,round,best_of,minutes,total_points,total_games,score,p1_won,p1_points,p2_points,p1_games,p2_games,p1_sets,p2_sets,p1_id,p2_id,p1_name,p2_name,p1_rank,p2_rank,p1_rankpt,p2_rankpt,p1_hand,p2_hand,p1_ht,p2_ht,p1_ioc,p2_ioc,p1_age,p2_age,p1_seed,p2_seed,p1_entry,p2_entry,p1_ace,p2_ace,p1_df,p2_df,p1_svpt,p2_svpt,p1_1stIn,p2_1stIn,p1_1stWon,p2_1stWon,p1_2ndWon,p2_2ndWon,p1_SvGms,p2_SvGms,p1_bpSaved,p2_bpSaved,p1_bpFaced,p2_bpFaced,p1_B365,p2_B365,p1_EX,p2_EX,p1_LB,p2_LB,p1_PS,p2_PS,p1_SJ,p2_SJ,p1_UB,p2_UB,p1_Max,p2_Max,p1_Avg,p2_Avg


In [4]:
##### REMOVE DEVIATED BETTING ODDS USING COEFFICIENT OF VARIATION TEST #####

# Maximum tolerated coefficient of variation (std / mean) among one player's
# odds within a single match
threshold_cv = 0.4

# Betting companies; note that the 'Avg' and 'Max' columns take part in the
# CV computation as well
companies = ['B365', 'EX', 'LB', 'PS', 'SJ', 'UB', 'Avg', 'Max']

# Columns for each player for each company
cols_p1 = [f'p1_{company}' for company in companies]
cols_p2 = [f'p2_{company}' for company in companies]

# Convert betting columns to numeric (unparseable entries become NaN)
for col in cols_p1 + cols_p2:
    df[col] = pd.to_numeric(df[col], errors='coerce')

# Column layout of the audit table that records every blanked cell
updated_fields_columns = ['match_id', 'updated_field', 'original_value', 'cv_odds', 'p1_rank', 'p2_rank', 'p1_won'] + cols_p1 + cols_p2


def append_updated_row(row, outlier_col, original_value, cv_odds):
    """Build the audit record for one blanked odds cell.

    Parameters
    ----------
    row : pandas.Series
        The match row, read after the outlier cell was set to NaN.
    outlier_col : str
        Name of the odds column that was blanked.
    original_value : float
        Value the cell held before it was blanked.
    cv_odds : float
        Coefficient of variation that triggered the removal.

    Returns
    -------
    dict
        Match identifiers, the removal details and a snapshot of all odds
        columns of the row.
    """
    updated_row = {
        'match_id': row['match_id'],
        'updated_field': outlier_col,
        'original_value': original_value,
        'cv_odds': cv_odds,
        'p1_rank': row['p1_rank'],
        'p2_rank': row['p2_rank'],
        'p1_won': row['p1_won']
    }
    updated_row.update(row[cols_p1 + cols_p2].to_dict())
    return updated_row


# Audit records are collected in a plain list and turned into a DataFrame once
# at the end: DataFrame.append was removed in pandas 2.0, and growing a frame
# row by row copies it on every iteration.
updated_rows = []

# Iterate through the rows
for idx, row in df.iterrows():
    for cols in [cols_p1, cols_p2]:
        # Keep blanking the value farthest from the mean until the spread of
        # the remaining odds is at or below the threshold. The loop ends on
        # its own once fewer than two values remain, because std (and hence
        # the CV) is then NaN and the comparison below is False.
        while True:
            # Cast to float: a row from iterrows() over a mixed-type frame
            # has object dtype
            odds = row[cols].astype(float)
            mean_odds = odds.mean()
            std_odds = odds.std()
            cv_odds = std_odds / mean_odds

            if not cv_odds > threshold_cv:
                break

            # The column whose value deviates most from the mean is treated
            # as the cause of the high CV
            outlier_col = (odds - mean_odds).abs().idxmax()
            # Save the original value
            original_value = row[outlier_col]
            # Blank the cell in the dataset ...
            df.at[idx, outlier_col] = np.nan
            # ... and in the local row so the next pass sees the change
            row[outlier_col] = np.nan

            # The snapshot is taken after blanking, so the removed value only
            # survives in 'original_value'
            updated_rows.append(append_updated_row(row, outlier_col, original_value, cv_odds))

# Build the audit table in one go; passing `columns` keeps the expected layout
# even when nothing was updated
updated_fields_df = pd.DataFrame(updated_rows, columns=updated_fields_columns)

# Sort the DataFrame by 'cv_odds' in descending order
updated_fields_df = updated_fields_df.sort_values(by='cv_odds', ascending=False)

# Keep only the updates that hit a real company column (not 'Avg' / 'Max')
aggregate_fields = ['p1_Avg', 'p2_Avg', 'p1_Max', 'p2_Max']
company_updates = updated_fields_df[~updated_fields_df['updated_field'].isin(aggregate_fields)]

print(f"Number of updated betting odds (only company odds): {len(company_updates)}")

# Display the DataFrame with updated fields information
company_updates


Number of updated betting odds (only company odds): 209


Unnamed: 0,match_id,updated_field,original_value,cv_odds,p1_rank,p2_rank,p1_won,p1_B365,p1_EX,p1_LB,p1_PS,p1_SJ,p1_UB,p1_Avg,p1_Max,p2_B365,p2_EX,p2_LB,p2_PS,p2_SJ,p2_UB,p2_Avg,p2_Max
428,2020-520-1201,p2_B365,34.0,1.78,66.0,1.0,0,,,,28.0,,,21.54,41.0,,,,1.01,,,1.01,1.02
439,2021-520-0196,p2_B365,29.0,1.75,93.0,1.0,0,,,,32.24,,,21.85,34.0,,,,1.01,,,1.01,1.01
423,2020-580-0164,p1_B365,29.0,1.75,1.0,76.0,1,,,,1.01,,,1.01,1.02,1.0,,,29.89,,,21.52,35.0
436,2021-520-0100,p2_B365,29.0,1.75,66.0,1.0,0,,,,31.38,,,23.01,35.0,,,,1.01,,,1.01,1.02
431,2021-580-0164,p2_B365,29.0,1.75,64.0,1.0,0,,,,24.5,,,19.55,29.0,,,,1.02,,,1.01,1.02
432,2021-0451-0284,p1_B365,51.0,1.55,,45.0,0,,,,3.26,,,3.4,3.75,1.0,,,1.38,,,1.31,1.42
301,2013-404-0089,p1_EX,8.0,1.24,1.0,8.0,1,1.08,,1.11,1.11,1.13,,1.1,1.14,8.0,6.71,6.5,8.5,6.5,,6.88,8.7
14,2009-520-0038,p1_SJ,9.0,1.03,41.0,65.0,0,1.66,1.7,1.8,,,1.7,,,2.1,2.16,1.91,,1.04,2.1,,
37,2010-1536-0040,p2_PS,86.0,0.93,3.0,62.0,1,,1.01,1.01,1.01,1.01,,1.01,1.02,21.0,15.0,21.0,,15.0,,14.89,21.0
6,2009-451-0030,p1_SJ,8.0,0.91,2.0,4.0,0,1.72,1.8,1.83,,,1.86,,,2.0,2.0,1.83,,1.07,1.9,,


In [5]:
##### FILL MISSING BETTING ODDS WITH MEAN AND MAX OF THE OTHER ODDS #####

# Individual bookmakers only ('Avg' and 'Max' are the targets here)
companies = ['B365', 'EX', 'LB', 'PS', 'SJ', 'UB']

# Bookmaker odds columns per player
cols_p1 = ['p1_' + company for company in companies]
cols_p2 = ['p2_' + company for company in companies]

# Rows that have at least one bookmaker quote for the given player
mask_p1 = ~df[cols_p1].isna().all(axis=1)
mask_p2 = ~df[cols_p2].isna().all(axis=1)

# Derive '<player>_Avg' and '<player>_Max' from the bookmaker quotes of each
# row selected by the mask, skipping missing quotes.
# NOTE(review): this overwrites Avg/Max on every row that has at least one
# bookmaker quote, not only on rows where Avg/Max are missing — confirm that
# this is intended given the section title.
for player, mask, cols in (('p1', mask_p1, cols_p1), ('p2', mask_p2, cols_p2)):
    quoted = df.loc[mask, cols]
    df.loc[mask, f'{player}_Avg'] = quoted.mean(axis=1, skipna=True)
    df.loc[mask, f'{player}_Max'] = quoted.max(axis=1, skipna=True)

# Count the values still missing in each of the four aggregate columns
summary_cols = ['p1_Avg', 'p2_Avg', 'p1_Max', 'p2_Max']
print(df[summary_cols].isna().sum())

# List the matches where any aggregate column is still missing
missing_df = df[df[summary_cols].isna().any(axis=1)]
report_cols = [
    'match_id', 'match_date', 'tourney_name', 'tourney_level', 'round',
    'p1_name', 'p2_name', 'p1_won', 'score', 'p1_rank', 'p2_rank',
    'p1_Avg', 'p2_Avg', 'p1_Max', 'p2_Max',
]
missing_df[report_cols]


p1_Avg    51
p2_Avg    51
p1_Max    51
p2_Max    51
dtype: int64


Unnamed: 0,match_id,match_date,tourney_name,tourney_level,round,p1_name,p2_name,p1_won,score,p1_rank,p2_rank,p1_Avg,p2_Avg,p1_Max,p2_Max
392,2009-424-0014,2009-02-11,San Jose,A,R32,John Isner,Ramon Delgado,1,7-6(9) 6-4,139.0,277.0,,,,
828,2009-717-0009,2009-04-07,Houston,A,R32,Amer Delic,Paul Capdeville,0,6-7(5) 7-6(5) 6-1,148.0,87.0,,,,
1061,2009-5053-0008,2009-05-05,Belgrade,A,R32,Dominik Hrbaty,Kristof Vliegen,0,2-6 6-2 6-2,160.0,105.0,,,,
1062,2009-5053-0009,2009-05-05,Belgrade,A,R32,Arsenije Zlatanovic,Lukasz Kubot,0,6-3 7-5,1421.0,179.0,,,,
1063,2009-5053-0010,2009-05-05,Belgrade,A,R32,Santiago Ventura,Arnaud Clement,0,6-1 6-4,117.0,57.0,,,,
1067,2009-5053-0011,2009-05-06,Belgrade,A,R32,Marcos Baghdatis,Flavio Cipolla,0,1-6 6-3 6-1,87.0,122.0,,,,
1125,2009-1536-0024,2009-05-11,Madrid Masters,M,R64,Robin Soderling,Ivan Navarro,1,6-4 6-2,23.0,84.0,,,,
1173,2009-306-0007,2009-05-19,Kitzbuhel,A,R32,Mario Ancic,Ruben Ramirez Hidalgo,0,6-4 6-3,38.0,131.0,,,,
1227,2009-520-0059,2009-05-25,Roland Garros,G,R128,Jeremy Chardy,Thiago Alves,1,6-2 7-6(10) 6-3,39.0,117.0,,,,
1360,2009-311-0032,2009-06-10,Queen's Club,A,R32,Xavier Malisse,Rik De Voest,1,7-6(6) 7-6(8),172.0,206.0,,,,


In [6]:
 # Remove the individual bookmaker columns currently listed in cols_p1 / cols_p2
company_odds_cols = [*cols_p1, *cols_p2]
df = df.drop(columns=company_odds_cols)

In [7]:
# Write the cleaned match list to a new workbook, leaving out the index column
export_path = '_data/_matches_list_v3.xlsx'
df.to_excel(export_path, index=False)