In [1]:
import os
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
warnings.filterwarnings('ignore')
%matplotlib inline
sns.set_style("whitegrid")
pd.set_option('display.max_columns', 500)
pd.set_option('display.max_rows', 500)
pd.set_option('display.width', 200)
pd.set_option('display.colheader_justify', 'center')
pd.set_option('display.float_format', lambda x: '%.2f' % x)

In [2]:
# Read the v1 match list workbook into `df`; the later cells derive their
# data from this frame.
df = pd.read_excel(io='_data/_matches_list_v1.xlsx')

In [3]:
# Relabel every winner ('w_') / loser ('l_') column pair as player 1 ('p1_') /
# player 2 ('p2_'), choosing at random per match which of the two is player 1.
# Result: `df_new` holds the non-w_/l_ columns of `df`, then 'p1_won', then the
# interleaved 'p1_<stat>' / 'p2_<stat>' columns.

# Seed NumPy's global RNG so the random assignment below is reproducible
np.random.seed(42)

# Columns that start with 'w_' and 'l_'
w_cols = [col for col in df.columns if col.startswith('w_')]
l_cols = [col for col in df.columns if col.startswith('l_')]

# Pair the columns by their shared suffix instead of by position: positional
# zip() would silently mis-pair (or truncate) if one side were missing a
# column or the two sides were ordered differently.
suffixes = [col[2:] for col in w_cols]
unpaired = set(suffixes).symmetric_difference(col[2:] for col in l_cols)
if unpaired:
    raise ValueError(
        "w_/l_ columns without a matching counterpart: %s" % sorted(unpaired)
    )

# 'p1_won' is drawn at random for each match:
# 1 -> player 1 is the match winner, 0 -> player 1 is the match loser
p1_won = np.random.randint(0, 2, df.shape[0])
p1_is_winner = p1_won == 1

# Column-wise selection instead of a per-cell iterrows()/.at loop: much faster,
# and each column keeps its dtype (creating columns cell by cell starts them as
# NaN-filled, which turns integer columns such as ids into floats).
swapped = {}
for suffix in suffixes:
    winner_vals = df['w_' + suffix]
    loser_vals = df['l_' + suffix]
    # Series.where keeps the value where the mask is True, else takes `other`
    swapped['p1_' + suffix] = winner_vals.where(p1_is_winner, loser_vals)
    swapped['p2_' + suffix] = loser_vals.where(p1_is_winner, winner_vals)

# Assemble the result without mutating `df`, so re-running the cell is safe
df_new = pd.concat(
    [
        df.drop(columns=w_cols + l_cols).assign(p1_won=p1_won),
        pd.DataFrame(swapped, index=df.index),
    ],
    axis=1,
)

df_new.head()

Unnamed: 0,match_id,match_date,year,tourney_name,tourney_level,court,surface,draw_size,round,best_of,minutes,total_points,total_games,score,p1_won,p1_points,p2_points,p1_games,p2_games,p1_sets,p2_sets,p1_id,p2_id,p1_name,p2_name,p1_rank,p2_rank,p1_rankpt,p2_rankpt,p1_hand,p2_hand,p1_ht,p2_ht,p1_ioc,p2_ioc,p1_age,p2_age,p1_seed,p2_seed,p1_entry,p2_entry,p1_ace,p2_ace,p1_df,p2_df,p1_svpt,p2_svpt,p1_1stIn,p2_1stIn,p1_1stWon,p2_1stWon,p1_2ndWon,p2_2ndWon,p1_SvGms,p2_SvGms,p1_bpSaved,p2_bpSaved,p1_bpFaced,p2_bpFaced,p1_B365,p2_B365,p1_EX,p2_EX,p1_LB,p2_LB,p1_PS,p2_PS,p1_SJ,p2_SJ,p1_UB,p2_UB,p1_Max,p2_Max,p1_Avg,p2_Avg
0,2009-339-0011,2009-01-04,2009,Brisbane,A,Outdoor,Hard,32,R32,3,127.0,187.0,30,3-6 6-3 7-5,0,94.0,93.0,14.0,16.0,1.0,2.0,104068.0,103898.0,Robby Ginepri,Julien Benneteau,49.0,40.0,1480.0,1608.0,R,R,183.0,185.0,USA,FRA,26.2,27.0,,,,,8.0,10.0,2.0,9.0,88.0,99.0,53.0,62.0,32.0,41.0,21.0,17.0,15.0,15.0,4.0,2.0,9.0,6.0,1.66,2.1,1.65,2.25,1.66,2.1,,,1.73,2.1,1.8,1.96,,,,
1,2009-339-0012,2009-01-04,2009,Brisbane,A,Outdoor,Hard,32,R32,3,55.0,101.0,18,6-3 6-3,1,59.0,42.0,12.0,6.0,2.0,0.0,104417.0,105023.0,Robin Soderling,Sam Querrey,17.0,36.0,2650.0,1740.0,R,R,193.0,198.0,SWE,USA,24.3,21.2,4.0,,,,7.0,4.0,2.0,3.0,52.0,49.0,30.0,27.0,26.0,20.0,13.0,9.0,9.0,9.0,0.0,2.0,0.0,5.0,1.44,2.62,1.49,2.65,1.53,2.37,,,1.5,2.62,1.4,2.85,,,,
2,2009-339-0013,2009-01-04,2009,Brisbane,A,Outdoor,Hard,32,R32,3,133.0,191.0,26,4-6 6-2 6-2,0,89.0,102.0,10.0,16.0,1.0,2.0,102967.0,104755.0,Marc Gicquel,Richard Gasquet,50.0,23.0,1428.0,2320.0,R,R,188.0,185.0,FRA,FRA,31.7,22.5,,7.0,,,7.0,6.0,6.0,5.0,100.0,91.0,45.0,42.0,30.0,29.0,23.0,26.0,13.0,13.0,13.0,3.0,18.0,5.0,3.75,1.25,3.81,1.27,3.5,1.28,,,3.5,1.25,3.6,1.27,,,,
3,2009-339-0014,2009-01-04,2009,Brisbane,A,Outdoor,Hard,32,R32,3,86.0,138.0,21,7-6(0) 6-2,0,62.0,76.0,8.0,13.0,0.0,2.0,104327.0,103758.0,Steve Darcis,Taylor Dent,59.0,865.0,1362.0,28.0,R,R,178.0,188.0,BEL,USA,24.8,27.7,,,,,4.0,8.0,2.0,5.0,70.0,68.0,46.0,37.0,24.0,31.0,14.0,13.0,10.0,10.0,1.0,6.0,5.0,8.0,1.4,2.75,,,,,,,,,1.37,3.0,,,,
4,2009-339-0005,2009-01-05,2009,Brisbane,A,Outdoor,Hard,32,R32,3,65.0,106.0,18,6-4 6-2,0,42.0,64.0,6.0,12.0,0.0,2.0,106071.0,104269.0,Bernard Tomic,Fernando Verdasco,764.0,15.0,42.0,2830.0,R,L,193.0,188.0,AUS,ESP,16.2,25.1,,3.0,WC,,3.0,1.0,1.0,2.0,56.0,50.0,34.0,39.0,19.0,31.0,9.0,5.0,9.0,9.0,9.0,3.0,13.0,4.0,8.0,1.06,9.52,1.06,9.0,1.04,,,11.0,1.03,11.0,1.04,,,,


In [4]:
# Count how many matches have p1_won == 1 versus p1_won == 0
df_new.loc[:, 'p1_won'].value_counts()

1    17107
0    17014
Name: p1_won, dtype: int64

In [5]:
# Write the relabelled p1/p2 frame to the v2 workbook; the DataFrame index is
# not written to the file.
df_new.to_excel(excel_writer='_data/_matches_list_v2.xlsx', index=False)