In [1]:
import h5py
import pandas as pd
import numpy as np
from scipy import stats
import os
import zipfile
from operator import itemgetter
from pga.return_pid import return_pid
from pga.parse_dk_contest import return_contest_array
from datetime import datetime
import numpy_indexed as npi
from tqdm import tqdm

In [2]:
def parse_salary_csv(df, top_players):
    pid_ls = []
    for player in sal_df['player_name'].unique():
        pid = return_pid(player)

        if pid is not np.nan:
            pid_ls.append(pid)
        else:
            print(player, 'player id not found, skipped!!')
            
    df['pid'] = pid_ls
    df['pid'] = df['pid'].apply(lambda x: int(x))
    df.sort_values(by = 'pid', inplace = True)
    df['temp_pid'] = df['pid'].rank(ascending = True)
    df['temp_pid'] = df['temp_pid'].apply(lambda x: int(x - 1))
    df['sal/100'] = df['salary'] / 100
    df['sal/100'] = df['sal/100'].apply(lambda x: int(x))
    df['sal_rank'] = df['salary'].rank(ascending = False)
    df['sal_rank'] = df['sal_rank'].apply(lambda x: int(x))
    
    salary_dict = dict(zip(np.array(df['temp_pid'].values).astype(np.int16), 
                    np.array(df['sal/100'].values).astype(np.int16)))
    
    pid_array = np.array(df['pid'].values).astype(np.int32)
   
    pid_mapped_dict = dict(zip(np.array(df['temp_pid'].values).astype(np.int16),
                    pid_array))
    
    top_df = df.loc[df['sal_rank'] <= top_players]
    top_players_dict = dict(zip(np.array(top_df['temp_pid'].values).astype(np.int16),
                     np.array(top_df['sal/100'].values).astype(np.int16)))
    
    return df, pid_mapped_dict
    #return salary_dict, pid_mapped_dict, top_players_dict

In [3]:
os.chdir('/home/valesco/Documents/')

sal_df = pd.read_csv('DKSalaries (42).csv', skiprows = 7, usecols = range(7,14))
sal_df.reset_index(inplace = True, drop = True)
sal_df.columns = ['pos', 'name_id', 'player_name', 'id', 'salary', 'tournament', 'team']
sal_df = sal_df.loc[sal_df['player_name'] != 'Stewart Hagestad']
#sal_df = sal_df.loc[sal_df['player_name'] != 'Toto Gana']
sal_df.head()

Unnamed: 0,pos,name_id,player_name,id,salary,tournament,team
0,G,Dustin Johnson (8852012),Dustin Johnson,8852012,12000,the Memorial Tournament presented by Nationwide,Golf
1,G,Jon Rahm (8852013),Jon Rahm,8852013,11100,the Memorial Tournament presented by Nationwide,Golf
2,G,Jordan Spieth (8852014),Jordan Spieth,8852014,11000,the Memorial Tournament presented by Nationwide,Golf
3,G,Jason Day (8852015),Jason Day,8852015,10300,the Memorial Tournament presented by Nationwide,Golf
4,G,Hideki Matsuyama (8852016),Hideki Matsuyama,8852016,10000,the Memorial Tournament presented by Nationwide,Golf


In [4]:
sal_df, pid_dict = parse_salary_csv(sal_df, 20)
sal_df.reset_index(inplace = True, drop = True)
sal_df.tail()

Unnamed: 0,pos,name_id,player_name,id,salary,tournament,team,pid,temp_pid,sal/100,sal_rank
115,G,Brett Coletta (8852102),Brett Coletta,8852102,6500,the Memorial Tournament presented by Nationwide,Golf,45988,115,65,92
116,G,Smylie Kaufman (8852095),Smylie Kaufman,8852095,6600,the Memorial Tournament presented by Nationwide,Golf,46440,116,66,84
117,G,Ollie Schniederjans (8852025),Ollie Schniederjans,8852025,8300,the Memorial Tournament presented by Nationwide,Golf,46501,117,83,14
118,G,Jon Rahm (8852013),Jon Rahm,8852013,11100,the Memorial Tournament presented by Nationwide,Golf,46970,118,111,2
119,G,Scott Gregory (8852131),Scott Gregory,8852131,6200,the Memorial Tournament presented by Nationwide,Golf,50735,119,62,116


In [144]:
def create_points_array(df, num_sims):
    proj_array = np.zeros((len(df), num_sims + 1)).astype(np.int16)

    for index, row in df.iterrows():
        temp_pid = np.array(row['temp_pid']).astype(np.int16).reshape((1,1))
        proj_points = row['proj_points']
        std = 20

        point_sims = np.random.normal(proj_points, std, num_sims)
        flat_sim = np.round(point_sims.reshape((1, num_sims)), decimals = 0)

        flat_sim = np.hstack((temp_pid, flat_sim))

        proj_array[index] = flat_sim


    return proj_array

proj_array = create_points_array(sal_df, 100000)
proj_array

array([[  0,  57,  89, ...,  68,  32,  58],
       [  1,  50, 105, ...,  45,  61,  89],
       [  2,  49,  18, ...,   7,  25,  31],
       ..., 
       [ 91,  29,  32, ...,  42,  20,  23],
       [ 92,  55,  44, ...,  52,  49,  52],
       [ 93,  53,  23, ...,  38,  39,  18]], dtype=int16)

In [145]:
contest_array = return_contest_array('/home/valesco/Datasets/dk_downloads/5_5_2017/contest-standings-39085500.csv')
contest_array

array([[ 1810, 12716, 20396, 22405, 24924, 28237],
       [ 1810, 12716, 24924, 25396, 27649, 34046],
       [12716, 21209, 21528, 24924, 28237, 33204],
       ..., 
       [21209, 26329, 27649, 27896, 28237, 33293],
       [26851, 27649, 29454, 33293, 34046, 46970],
       [26331, 26851, 27649, 28237, 33293, 46970]], dtype=int32)

In [146]:
paid = np.full((4000,1), 100)
lost = np.full((5195,1), -100)

payout = np.vstack((paid, lost)).astype(np.int16)

In [147]:
pid_dict_inverse = dict((value, key) for key, value in pid_dict.items())
mapped_array = np.vectorize(pid_dict_inverse.__getitem__)(contest_array).astype(np.int16)

In [148]:
indices = np.arange(len(mapped_array)).reshape(len(mapped_array), 1)
mapped_array = np.hstack((indices, mapped_array))

In [149]:
start = datetime.now()

payout_array = indices

for i in range(1, proj_array.shape[1]):
    point_dict = dict(zip(proj_array[:,0], proj_array[:,i]))
    
    sim_array = np.vectorize(point_dict.__getitem__)(mapped_array[:,1:])
    
    total_points = sim_array.sum(axis = 1).reshape((len(sim_array),1))
    
    result_array = np.hstack((indices, total_points))
    
    result_array = result_array[result_array[:,1].argsort()[::-1]]
    
    result_array = np.hstack((result_array, payout))
    
    result_array = result_array[result_array[:,0].argsort()]
    
    payout_array = np.hstack((payout_array, result_array[:,2].reshape(len(result_array),1)))
    
print(datetime.now() - start)
    
payout_array

1 day, 3:38:56.862354


array([[   0,  100, -100, ..., -100,  100,  100],
       [   1, -100, -100, ..., -100, -100, -100],
       [   2,  100, -100, ..., -100, -100,  100],
       ..., 
       [9192,  100, -100, ..., -100, -100,  100],
       [9193,  100, -100, ..., -100, -100, -100],
       [9194,  100, -100, ..., -100,  100, -100]])

In [150]:
total_profit = payout_array[:,1:].sum(axis = 1).reshape((len(total_profit), 1))
lineup_array = np.hstack((contest_array, total_profit))
lineup_array = lineup_array[lineup_array[:,-1].argsort()[::-1]]
lineup_array

array([[   21209,    22405,    24502, ...,    40026,    46970,  2337800],
       [   21209,    22405,    24502, ...,    40026,    46970,  2337200],
       [   21209,    22405,    24502, ...,    40026,    46970,  2337000],
       ..., 
       [    1717,    10423,    24502, ...,    28237,    34046, -9518200],
       [    1823,    10423,    28237, ...,    36689,    46970, -9607800],
       [    1810,     1823,    10423, ...,    30925,    34046, -9634800]])

In [155]:
name_dict = dict(zip(sal_df['pid'].values, sal_df['player_name'].values))

result_df = pd.DataFrame(columns = ['team', 'profit'])

for i in range(len(lineup_array)):
    row = lineup_array[i, :]
    team = ''
    for p, player in enumerate(row):
        if p == 0:
            team = name_dict[player] + ','
        elif p < 6:
            team += name_dict[player] + ','
        else:
            profit = row[-1]
        
    result_df.loc[i] = [team[:-1], profit]
            

In [156]:
result_df = result_df.groupby(['team']).mean()
result_df.reset_index(inplace = True)
result_df.sort_values(by = 'profit', ascending = False, inplace = True)
result_df['profit'] = result_df['profit'] / 100000
result_df

Unnamed: 0,team,profit
1675,"Sergio Garcia,Justin Rose,Adam Scott,Paul Case...",23.286635
666,"Justin Rose,Matt Kuchar,Paul Casey,Rickie Fowl...",22.896733
1734,"Sergio Garcia,Justin Rose,Paul Casey,Patrick R...",22.395778
1677,"Sergio Garcia,Justin Rose,Adam Scott,Paul Case...",22.004000
1728,"Sergio Garcia,Justin Rose,Paul Casey,Louis Oos...",21.318889
1671,"Sergio Garcia,Justin Rose,Adam Scott,Bill Haas...",20.557320
1731,"Sergio Garcia,Justin Rose,Paul Casey,Louis Oos...",20.536000
630,"Justin Rose,Matt Kuchar,Adam Scott,Paul Casey,...",19.592500
1720,"Sergio Garcia,Justin Rose,Matt Kuchar,Paul Cas...",18.755200
1674,"Sergio Garcia,Justin Rose,Adam Scott,Paul Case...",18.544000


In [153]:
len(result_df.loc[result_df['profit'] > 0])

145