In [44]:
import numpy as np
import pandas as pd
import datetime

In [5]:
df_top100 = pd.read_excel('top_female_poker_players_and_events.xlsx', sheet_name='top_100')
df_top100_event = pd.read_excel('top_female_poker_players_and_events.xlsx', sheet_name='top_100_poker_events')

In [23]:
# Add the player names to their poker events
df = df_top100_event.merge(df_top100, how='left', on='player_id')

In [22]:
display(df.head(2))
print(df.dtypes)

Unnamed: 0,event_date,event_country,event_name,player_place,prize_usd,player_id,source_x,last_updated_x,position,country,name,all_time_money_usd,player_url,source_y,last_updated_y
0,2020-02-21,Canada,"C$ 4,700 + 300 No Limit Hold'em - WPT Main Eve...",22nd,14915.0,68149,https://pokerdb.thehendonmob.com/player.php?a=...,2021-10-19,1st,United States,Vanessa Selbst,11906247,https://pokerdb.thehendonmob.com/player.php?a=...,https://pokerdb.thehendonmob.com/ranking/137/,2021-10-19
1,2019-09-15,United States,"$ 3,300 + 200 No Limit Hold'em - WPT Borgata P...",14th,39950.0,68149,https://pokerdb.thehendonmob.com/player.php?a=...,2021-10-19,1st,United States,Vanessa Selbst,11906247,https://pokerdb.thehendonmob.com/player.php?a=...,https://pokerdb.thehendonmob.com/ranking/137/,2021-10-19


event_date            datetime64[ns]
event_country                 object
event_name                    object
player_place                  object
prize_usd                    float64
player_id                      int64
source_x                      object
last_updated_x        datetime64[ns]
position                      object
country                       object
name                          object
all_time_money_usd             int64
player_url                    object
source_y                      object
last_updated_y        datetime64[ns]
dtype: object


In [26]:
# Replace any nulls in prize_usd with zero
df['prize_usd'] = df['prize_usd'].fillna(0)

In [55]:
# Find the dates of the players first and last events
df_player = pd.DataFrame()
df_player['first_event'] = df.groupby('player_id')['event_date'].min()
df_player['last_event'] = df.groupby('player_id')['last_updated_x'].max()

# Use these dates to calculate the length of poker career in years (with decimals)
df_player['pocker_career'] = df_player['last_event'] - df_player['first_event']
df_player['pocker_career'] = df_player['pocker_career'].astype('timedelta64[Y]')
df_player.dtypes

first_event      datetime64[ns]
last_event       datetime64[ns]
pocker_career           float64
dtype: object

In [127]:
# Create an aggregated view to find the following player stats:
# Number of events they've taken part in
#  Total prize money
df_agg = pd.DataFrame()
df_agg['total_prize_money'] = df.groupby('player_id')['prize_usd'].sum()

#  Their biggest win
df['position_int'] = df['position'].apply(lambda x:x[:-2])
df_agg['biggest_win'] = df.groupby('player_id')['position_int'].min() 

#  The percentage of events they've won
player_nume = df[df['player_place'] == '1st'].groupby('player_id')['player_place'].count()
player_denom = df.groupby('player_id')['event_name'].count()
df_agg['percentage_of_won'] = player_nume / player_denom

#  The distinct count of the country played in
df_agg['count_of_country_played_in'] = pd.pivot_table(df, index='player_id', aggfunc='nunique')['country']

#  Their length of career
df_agg['length_of_career'] = df_player['pocker_career']
df_agg.head()

Unnamed: 0_level_0,total_prize_money,biggest_win,percentage_of_won,count_of_country_played_in,length_of_career
player_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
74,4270548.0,4,0.088608,1,27.0
78,1739867.0,20,0.121107,1,35.0
109,2768839.0,12,0.064516,1,27.0
113,890104.0,49,0.060976,1,26.0
154,6465318.0,2,0.070529,1,27.0


In [134]:
df_agg = df_agg.merge(df_top100[['player_id', 'name']], how='left', on='player_id')

In [173]:
final = pd.melt(df_agg, id_vars=['name', 'player_id'], var_name='metric', value_name='raw_value')
final

Unnamed: 0,name,player_id,metric,raw_value
0,Annie Duke,74,total_prize_money,4.27055e+06
1,Barbara Enright,78,total_prize_money,1.73987e+06
2,Jennifer Harman,109,total_prize_money,2.76884e+06
3,Melissa Hayden,113,total_prize_money,890104
4,Kathy Liebert,154,total_prize_money,6.46532e+06
...,...,...,...,...
495,Linglin Zeng,428461,length_of_career,6
496,Cate Hall,434850,length_of_career,6
497,Ness Reilly,435766,length_of_career,12
498,Lisa Meredith,479528,length_of_career,5
