In [None]:
%matplotlib inline
# data analysis and wrangling
import pandas as pd
import numpy as np
import random as rnd

# visualization
import seaborn as sns
import matplotlib.pyplot as plt

# Show floats in plain fixed-point notation (six decimals) rather than
# scientific notation when frames are displayed.
pd.set_option('display.float_format', '{:f}'.format)

def plotPlayer(id):
    """Plot one player's `wrating` and `rating` against `age` and return their history.

    Reads the notebook-level DataFrame ``df`` (it is not passed in).
    NOTE(review): ``df`` is not defined in the cells visible here (they only
    create ``df2``) -- confirm which frame this is meant to use.

    Parameters
    ----------
    id : value compared against ``df['id']`` to select the player's rows.

    Returns
    -------
    DataFrame with columns ``wrating``, ``tour``, ``team`` and ``start``,
    indexed by ``age`` and sorted by it.
    """
    fig = plt.figure(figsize=(10, 6))
    ax = fig.gca()
    history = df[df['id'] == id]
    # Both curves are drawn on the same axes, each sorted by age.
    for column in ('wrating', 'rating'):
        history[[column, 'age']].set_index('age').sort_index().plot(ax=ax)
    table = history[['wrating', 'tour', 'age', 'team', 'start']]
    return table.set_index('age').sort_index()

In [2]:
def load_csv(system):
    """Load ``<system>.csv`` and ``dob.csv`` from the working directory.

    Parameters
    ----------
    system : str
        File name without the ``.csv`` extension.

    Returns
    -------
    (df, dob) : tuple of DataFrame
        ``df`` has its ``start`` and ``end`` columns parsed as day/month/year
        dates. ``dob`` has its ``id`` column stripped of every run of
        non-word characters and lower-cased.
    """
    df = pd.read_csv(system + '.csv', encoding="utf-8")
    df['start'] = pd.to_datetime(df['start'], format='%d/%m/%Y')
    df['end'] = pd.to_datetime(df['end'], format='%d/%m/%Y')
    dob = pd.read_csv('dob.csv', encoding="utf-8")
    # regex=True must be explicit: since pandas 2.0 the default is a literal
    # match, which would leave the ids untouched.
    dob['id'] = dob['id'].str.replace(r'\W+', '', regex=True).str.lower()
    return df, dob

def cumulativeRating(diff, ratingName, df):
    """Rebuild a rating series as initial rating plus running sum of changes.

    Parameters
    ----------
    diff : str
        Name of the column holding the per-row rating change.
    ratingName : str
        Name of the output column to create.
    df : DataFrame
        Must contain ``id``, ``start``, ``startglicko2rating`` and ``diff``.

    Returns
    -------
    DataFrame
        A new frame with ``id`` and ``start`` moved to the front and three
        added columns: ``cumur`` (running sum of ``diff`` per ``id``, ordered
        by ``start``), ``init`` (the earliest non-null ``startglicko2rating``
        of that ``id``) and ``ratingName`` (``cumur + init``).
    """
    # Changes sharing the same (id, start) are summed first, then accumulated
    # per id; the groupby sorts its keys so the running sum is chronological.
    running = df.groupby(['id', 'start'])[diff].sum().groupby(level=0).cumsum()

    indexed = df.set_index(['id', 'start'])
    indexed['cumur'] = running  # aligned on the (id, start) index
    flat = indexed.reset_index()

    known = flat[flat['startglicko2rating'].notnull()].sort_values('start')
    init = known.groupby('id')['startglicko2rating'].first().rename('init').reset_index()

    merged = pd.merge(flat, init, how='left', on='id')
    merged[ratingName] = merged['cumur'] + merged['init']
    return merged

def teamRating(rating, df):
    """Attach a ``teamrating`` column: the teammates' share of the team rating.

    For every ``(team, start)`` group the mean of ``rating`` is scaled by 5,
    the row's own rating is subtracted, and the remainder is divided by 4.
    When exactly five rows share a ``(team, start)`` key this equals the mean
    rating of the other four rows.
    NOTE(review): the constants 5 and 4 presumably encode a five-player
    roster; groups of another size are not averaged over their real size.

    Parameters
    ----------
    rating : str
        Name of the rating column to use.
    df : DataFrame
        Must contain ``team``, ``start`` and ``rating``.

    Returns
    -------
    DataFrame
        ``df`` inner-merged with the per-group value, plus ``teamrating``.
    """
    # Select the rating column *before* reducing: averaging the whole frame
    # raises on non-numeric columns under pandas >= 2.0.
    team_mean = df.groupby(['team', 'start'])[rating].mean()
    team_total = (team_mean * 5).rename('teamrating').reset_index()
    teameff = df.merge(team_total, on=['team', 'start'])
    teameff['teamrating'] = (teameff['teamrating'] - teameff[rating]) / 4
    return teameff

def experience(df):
    """Add ``exp``: how many distinct tournaments the player had entered before.

    For each ``id`` the distinct ``tour`` values are ranked by first
    appearance when rows are ordered by ``start``; the first tournament gets
    0, the next 1, and so on. Every row of that ``(id, tour)`` pair receives
    the same value. Rows whose ``id`` is missing get NaN.

    Parameters
    ----------
    df : DataFrame
        Must contain ``id``, ``tour`` and ``start``.

    Returns
    -------
    DataFrame
        ``df`` left-merged with the ``exp`` column (appended last).
    """
    # A stable sort keeps ties on `start` in their original row order, so the
    # ranking is deterministic. One pass replaces the per-player concat loop,
    # which was quadratic and failed on an empty frame.
    first_seen = (
        df.dropna(subset=['id'])
          .sort_values('start', kind='mergesort')[['id', 'tour']]
          .drop_duplicates()
    )
    first_seen = first_seen.assign(exp=first_seen.groupby('id').cumcount())
    return pd.merge(df, first_seen, how='left', on=['id', 'tour'])
    
def dobMerge(df, dob):
    """Join dates of birth onto ``df`` and compute each row's age in years.

    Parameters
    ----------
    df : DataFrame
        Must contain ``id`` and a datetime ``start`` column.
    dob : DataFrame
        Must contain ``id`` and ``dob`` (formatted year/month/day). It is not
        modified; a normalised copy is used for the join.

    Returns
    -------
    DataFrame
        ``df`` left-merged with ``dob`` on ``id``, with ``dob`` parsed to
        datetime and ``age`` = ``(start - dob)`` expressed in 365.25-day years.
    """
    # Normalise ids on a copy so the caller's frame is left untouched.
    # regex=True must be explicit: pandas >= 2.0 defaults to a literal match.
    dob = dob.assign(id=dob['id'].str.replace(r'\W+', '', regex=True).str.lower())
    merged = df.merge(dob, how='left', on=['id'])
    merged['dob'] = pd.to_datetime(merged['dob'], format='%Y/%m/%d')
    merged['age'] = (merged['start'] - merged['dob']) / pd.Timedelta(days=365.25)
    return merged

In [43]:
# Load the rating/prize table and parse its day/month/year date columns.
df2 = pd.read_csv('glickoandprize.csv', encoding="utf-8")
df2['start'] = pd.to_datetime(df2['start'], format='%d/%m/%Y')
df2['end'] = pd.to_datetime(df2['end'], format='%d/%m/%Y')

# Per-row rating change, then the derived columns added by the helpers:
# normalRating (cumulativeRating), teamrating (teamRating), exp (experience).
df2['diffr'] = df2['endglicko2rating'] - df2['startglicko2rating']
df2 = cumulativeRating('diffr', 'normalRating', df2)
df2 = teamRating('normalRating', df2)
df2 = experience(df2)
#df2.drop_duplicates(subset=['id', 'start'], inplace=True)
#df2.dropna(inplace=True)

# Coerce the money columns to numbers; unparseable values become NaN.
df2['prizepool'] = pd.to_numeric(df2['prizepool'], errors='coerce')
# regex=True must be explicit: pandas >= 2.0 defaults to a literal match, which
# would leave the strings unchanged. The pattern removes EVERY non-digit.
# NOTE(review): that includes decimal points and minus signs -- confirm the
# raw prizeusd strings never carry a fractional part.
df2['prizeusd'] = pd.to_numeric(df2['prizeusd'].str.replace(r'\D', '', regex=True), errors='coerce')

In [44]:
# --- Season label -----------------------------------------------------------
# Every row starts as 'ti3'; rows whose start falls in (after, until] are then
# relabelled.
# NOTE(review): rows starting after 2017-08-02 (or with a missing start) match
# no window and therefore keep the 'ti3' default -- confirm that is intended.
_SEASON_WINDOWS = (
    ('ti4', '2013-08-02', '2014-07-08'),
    ('ti5', '2014-07-08', '2015-07-27'),
    ('ti6', '2015-07-27', '2016-08-02'),
    ('ti7', '2016-08-02', '2017-08-02'),
)
df2['season'] = 'ti3'
for _season, _after, _until in _SEASON_WINDOWS:
    _in_window = (df2.start > pd.to_datetime(_after)) & (df2.start <= pd.to_datetime(_until))
    df2.loc[_in_window, 'season'] = _season

# --- Season reference prize -------------------------------------------------
# tiprize = log of the (integer-truncated) prize pool found on the first row
# of that season's "The International" tournament.
_TI_TOUR_BY_SEASON = (
    ('ti4', 'The International 2014'),
    ('ti5', 'The International 2015'),
    ('ti6', 'The International 2016'),
    ('ti7', 'The International 2017'),
)


def _log_ti_prizepool(tour_name):
    """Return log(int(prizepool)) of the first df2 row whose tour is `tour_name`."""
    return np.log(int(df2[df2['tour'] == tour_name].prizepool.iloc[0]))


df2['tiprize'] = _log_ti_prizepool('The International 2013')
for _season, _tour_name in _TI_TOUR_BY_SEASON:
    df2.loc[(df2.season == _season), 'tiprize'] = _log_ti_prizepool(_tour_name)

# --- Prize-weighted rating change -------------------------------------------
# Each rating change is scaled by log(prizepool) relative to the season's
# log(TI prizepool).
df2['wdiffr'] = (np.log(df2['prizepool']) / df2['tiprize']) * df2['diffr']

# Running sum of wdiffr per player, ordered by start; changes sharing the same
# (id, start) are summed first.
cumsums = (
    df2[['id', 'start', 'wdiffr']]
    .groupby(['id', 'start']).sum()
    .groupby(level=0).cumsum()
)
df2 = df2.set_index(['id', 'start'])
df2['cumuwr'] = cumsums  # aligned on the (id, start) index
df2 = df2.reset_index()

# perf = initial rating (the `init` column created by cumulativeRating) plus
# the weighted running sum.
df2['perf'] = df2['cumuwr'] + df2['init']

In [45]:
# One group per (team, tournament) pair; iterated by the next cell to build
# one summary row per group.
grouped = df2.groupby(['team', 'tour'])

In [46]:
def _first_perf(group, pos):
    """Return the first `perf` value in `group` whose `pos` equals `pos`, or NaN if there is none."""
    matches = group.loc[group.pos == pos, 'perf']
    return matches.iloc[0] if len(matches) else np.nan


# Build one row per (team, tour) group: tournament metadata taken from the
# group's first row, plus the perf value of positions 1-5 as perf1..perf5.
# Rows are collected in a list and the frame is created once, because
# DataFrame.append was removed in pandas 2.0 and growing a frame row by row
# is quadratic.
_records = []
for (_team, _tour), _group in grouped:
    _record = {
        'tour': _tour,
        'team': _team,
        'prizepool': _group.prizepool.iloc[0],
        'prizeusd': _group.prizeusd.iloc[0],
        'season': _group.season.iloc[0],
        'start': _group.start.iloc[0],
    }
    # A missing position yields NaN (those rows are dropped by the next cell);
    # any other error now surfaces instead of being swallowed by a bare except.
    for _pos in range(1, 6):
        _record['perf%d' % _pos] = _first_perf(_group, _pos)
    _records.append(_record)

pf = pd.DataFrame(
    _records,
    columns=['tour', 'team', 'prizepool', 'prizeusd',
             'perf1', 'perf2', 'perf3', 'perf4', 'perf5',
             'season', 'start'],
)

In [47]:
# Keep only complete rows, make prizeusd numeric, and keep the first row for
# each (team, start) pair.
pf = (
    pf.dropna()
      .assign(prizeusd=lambda frame: pd.to_numeric(frame.prizeusd))
      .drop_duplicates(['team', 'start'])
)

In [31]:
# Preview of the cleaned per-(team, tournament) table.
pf.head()

Unnamed: 0,tour,team,prizepool,prizeusd,perf1,perf2,perf3,perf4,perf5,season,start
7,StarLadder StarSeries Season 5,3dmax,15000.0,0,1333.138496,1333.138496,1333.138496,1333.138496,1333.138496,ti3,2013-02-11
20,Corsair Gaming Summer Tournament 2013,4friendschrillee,10000.0,0,1431.514235,1415.703233,1391.409425,1391.409425,1391.409425,ti3,2013-06-28
21,DreamHack Summer 2013,4friendschrillee,46422.0,0,1427.237319,1427.237319,1427.237319,1427.237319,1427.237319,ti3,2013-06-15
23,Electronic Sports World Cup 2013,4friendschrillee,25000.0,0,1449.456046,1439.095493,1449.456046,1449.456046,1479.450199,ti4,2013-10-31
24,Esportal Dota 2 League Invitational Tournament 1,4friendschrillee,18691.472328,623,1426.782769,1481.501966,1426.782769,1497.32047,1497.32047,ti5,2014-12-10


In [17]:
# Collapse pf to one row per (season, team): first start and prizepool, total
# prizeusd, and the mean of each perf column. The `tour` column is not
# aggregated and is therefore dropped.
# NOTE(review): the output shown under In [48] still has a `tour` column and
# per-tournament start dates, so it looks like this cell was not part of the
# run that produced the final regression -- confirm whether it should be kept,
# because Restart & Run All would apply it.
_season_agg = {'start': 'first', 'prizepool': 'first', 'prizeusd': 'sum'}
_season_agg.update(dict.fromkeys(('perf1', 'perf2', 'perf3', 'perf4', 'perf5'), 'mean'))
pf = pf.groupby(['season', 'team']).agg(_season_agg).reset_index()

In [21]:
# Overwrite `start` with 1 January of the season's year: 2013 by default,
# then the matching year for ti4..ti7.
pf['start'] = pd.to_datetime('2013')
for _season, _year in (('ti4', '2014'), ('ti5', '2015'), ('ti6', '2016'), ('ti7', '2017')):
    pf.loc[(pf.season == _season), 'start'] = pd.to_datetime(_year)

In [20]:
# Preview of the per-(season, team) table after the start dates were replaced.
pf.head()

Unnamed: 0,season,team,start,prizepool,prizeusd,perf1,perf2,perf3,perf4,perf5
0,ti3,3dmax,2013-01-01,15000.0,0,1333.138496,1333.138496,1333.138496,1333.138496,1333.138496
1,ti3,4friendschrillee,2013-01-01,10000.0,0,1425.961501,1430.820263,1417.940539,1417.940539,1417.940539
2,ti3,absolutelegends,2013-01-01,15000.0,0,1417.488704,1417.488704,1417.488704,1417.488704,1417.488704
3,ti3,alliance,2013-01-01,6000.0,7702300,1813.107431,1813.107431,1813.107431,1813.107431,1813.107431
4,ti3,darer,2013-01-01,5000.0,0,1316.524591,1316.524591,1316.524591,1316.524591,1316.524591


In [None]:
import statsmodels.api as sm
# Pooled OLS of prizeusd on the five perf columns, using pf as it stands when
# this cell runs.
Y = pf[['prizeusd']]
X = pf[['perf1', 'perf2', 'perf3', 'perf4', 'perf5']]
# NOTE(review): no constant column is added to X and sm.OLS does not add an
# intercept by itself, so this is a regression through the origin -- confirm
# that is intended (sm.add_constant(X) would add one).
model = sm.OLS(Y,X)
results = model.fit()
results.summary()

In [48]:
# Give every team an integer `entity` id (0, 1, 2, ... in order of first
# appearance in pf) and put it in front of pf's columns via a left merge.
_teams = pf.team.unique()
ent = pd.DataFrame({'entity': np.arange(len(_teams), dtype='int64'), 'team': _teams})
pf = ent.merge(pf, on='team', how='left')
pf.head()

Unnamed: 0,entity,team,tour,prizepool,prizeusd,perf1,perf2,perf3,perf4,perf5,season,start
0,0,3dmax,StarLadder StarSeries Season 5,15000.0,0,1333.138496,1333.138496,1333.138496,1333.138496,1333.138496,ti3,2013-02-11
1,1,4friendschrillee,Corsair Gaming Summer Tournament 2013,10000.0,0,1431.514235,1415.703233,1391.409425,1391.409425,1391.409425,ti3,2013-06-28
2,1,4friendschrillee,DreamHack Summer 2013,46422.0,0,1427.237319,1427.237319,1427.237319,1427.237319,1427.237319,ti3,2013-06-15
3,1,4friendschrillee,Electronic Sports World Cup 2013,25000.0,0,1449.456046,1439.095493,1449.456046,1449.456046,1479.450199,ti4,2013-10-31
4,1,4friendschrillee,Esportal Dota 2 League Invitational Tournament 1,18691.472328,623,1426.782769,1481.501966,1426.782769,1497.32047,1497.32047,ti5,2014-12-10


In [58]:
# Preview of the log-transformed regressors, indexed by (entity, start).
# The indexed view is built here from pf because `panel_data` is only defined
# in a later cell, so referring to it at this point fails on a fresh kernel.
pf.set_index(['entity', 'start']).sort_index()[['perf1', 'perf2', 'perf3', 'perf4', 'perf5']].apply(np.log)

Unnamed: 0_level_0,Unnamed: 1_level_0,perf1,perf2,perf3,perf4,perf5
entity,start,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
0,2013-02-11,7.195291,7.195291,7.195291,7.195291,7.195291
1,2013-05-19,7.263655,7.263655,7.263655,7.263655,7.263655
1,2013-05-21,7.272848,7.272848,7.272848,7.272848,7.272848
1,2013-06-15,7.263496,7.263496,7.263496,7.263496,7.263496
1,2013-06-19,7.246328,7.274513,7.246328,7.246328,7.246328
1,2013-06-28,7.266488,7.255382,7.238072,7.238072,7.238072
1,2013-08-31,7.256073,7.283988,7.256073,7.256073,7.273076
1,2013-09-11,7.287898,7.314951,7.287898,7.287898,7.300696
1,2013-10-21,7.280240,7.280240,7.300880,7.280240,7.300880
1,2013-10-31,7.278944,7.271770,7.278944,7.278944,7.299426


In [60]:
from linearmodels import PanelOLS

# Panel indexed by (entity, start), sorted on that index.
panel_data = pf.set_index(['entity', 'start']).sort_index()

# Dependent variable: log(1 + prizeusd), so zero winnings stay finite.
Y = np.log(1 + panel_data['prizeusd'])
# Regressors: natural log of each perf column.
X = np.log(panel_data[['perf1', 'perf2', 'perf3', 'perf4', 'perf5']])

# Time effects only; no entity effects and no explicit constant are requested.
mod = PanelOLS(Y, X, time_effects=True)
mod.fit()

0,1,2,3
Dep. Variable:,prizeusd,R-squared:,0.1361
Estimator:,PanelOLS,R-squared (Between):,-348.47
No. Observations:,1329,R-squared (Within):,0.0508
Date:,"Fri, Mar 23 2018",R-squared (Overall):,-196.54
Time:,23:04:26,Log-likelihood,-3538.1
Cov. Estimator:,Unadjusted,,
,,F-statistic:,34.951
Entities:,103,P-value,0.0000
Avg Obs:,12.903,Distribution:,"F(5,1109)"
Min Obs:,1.0000,,

0,1,2,3,4,5,6
,Parameter,Std. Err.,T-stat,P-value,Lower CI,Upper CI
perf1,1.8059,1.3671,1.3210,0.1868,-0.8765,4.4884
perf2,4.9543,1.5812,3.1333,0.0018,1.8519,8.0567
perf3,3.9524,1.6198,2.4401,0.0148,0.7742,7.1306
perf4,4.9989,1.4982,3.3366,0.0009,2.0593,7.9385
perf5,3.9238,1.4715,2.6666,0.0078,1.0366,6.8110
