In [1]:
import h5py
import pandas as pd
import numpy as np
from scipy import stats
import os
import zipfile
from operator import itemgetter
from pga.return_pid import return_pid
from datetime import datetime
import numpy_indexed as npi
from tqdm import tqdm



In [2]:
def parse_salary_csv(df, top_players, pid_lookup=None, return_dicts=False):
    """Attach player ids and salary-derived columns to a salary table.

    Parameters
    ----------
    df : pandas.DataFrame
        Salary table with at least ``player_name`` and ``salary`` columns.
        The caller's frame is not modified; a new frame is returned.
    top_players : int
        Rows whose ``sal_rank`` is <= this value make up ``top_players_dict``
        (only relevant when ``return_dicts`` is True).
    pid_lookup : callable, optional
        Maps a player name to a player id, returning a missing value
        (NaN / None) when the player is unknown. Defaults to ``return_pid``.
    return_dicts : bool, optional
        When False (default) return the augmented frame. When True return
        ``(salary_dict, pid_mapped_dict, top_players_dict)`` instead.

    Returns
    -------
    pandas.DataFrame
        ``df`` without the players that have no id, sorted by ``pid``, with
        the added columns ``pid``, ``temp_pid`` (0-based position in pid
        order), ``sal/100`` (salary / 100 truncated to int) and ``sal_rank``
        (1 = highest salary; ties share the truncated average rank).
    tuple of dict
        Only when ``return_dicts`` is True:
        ``salary_dict`` (temp_pid -> sal/100), ``pid_mapped_dict``
        (temp_pid -> pid) and ``top_players_dict`` (temp_pid -> sal/100 for
        the top-ranked rows). Keys and values are plain Python ints, so
        player ids above 32767 are preserved.
    """
    if pid_lookup is None:
        pid_lookup = return_pid

    # Resolve every distinct name once, then broadcast the ids to the rows.
    pid_by_name = {name: pid_lookup(name) for name in df['player_name'].unique()}
    missing = [name for name, pid in pid_by_name.items() if pd.isna(pid)]
    for name in missing:
        print(name, 'player id not found, skipped!!')

    # Drop unresolved players so the id column always lines up with the rows.
    out = df.loc[~df['player_name'].isin(missing)].copy()
    out['pid'] = out['player_name'].map(pid_by_name).astype('int64')

    # A stable sort keeps the input order for rows that share a pid.
    out = out.sort_values(by='pid', kind='mergesort')
    out['temp_pid'] = np.arange(len(out), dtype=np.int64)
    out['sal/100'] = (out['salary'] / 100).astype('int64')
    out['sal_rank'] = out['salary'].rank(ascending=False).astype('int64')

    if not return_dicts:
        return out

    temp_pids = out['temp_pid'].tolist()
    salary_dict = dict(zip(temp_pids, out['sal/100'].tolist()))
    pid_mapped_dict = dict(zip(temp_pids, out['pid'].tolist()))

    top_df = out.loc[out['sal_rank'] <= top_players]
    top_players_dict = dict(zip(top_df['temp_pid'].tolist(), top_df['sal/100'].tolist()))

    return salary_dict, pid_mapped_dict, top_players_dict

def map_pids_to_array(salary_dict, pid_mapped_dict, top_players_dict, hdf5_filepath, hdf5_savepath,
                      data_dir='/home/valesco/Datasets/PGA_Data/lineup_combos/',
                      salary_bounds=(450, 501)):
    """Extend stored player combos with one top player and keep valid lineups.

    For every dataset in the input HDF5 file, each key of
    ``top_players_dict`` is prepended as an extra player to every stored
    combo. Lineups containing the same player twice, or whose summed salary
    falls outside ``salary_bounds``, are discarded. The surviving lineups
    are written to the output file as dataset ``'batch<k>'``, one row per
    lineup: the player ids followed by the matching salaries.

    Parameters
    ----------
    salary_dict : dict
        Player index -> salary. Its length is used as the exclusive upper
        bound for player indices allowed in a combo.
    pid_mapped_dict : dict
        Accepted for interface compatibility; not used by this function.
    top_players_dict : dict
        Its keys are the player indices added to each combo (values unused).
    hdf5_filepath : str
        Input HDF5 file holding the stored combos, one 2-D dataset per key.
    hdf5_savepath : str
        Output HDF5 file; overwritten if it exists.
    data_dir : str or None, optional
        Directory that relative file paths are resolved against. The process
        working directory is left untouched. Pass None to use the paths
        exactly as given.
    salary_bounds : tuple of (low, high), optional
        Exclusive bounds on a lineup's summed salary (low < total < high).

    Raises
    ------
    KeyError
        If a combo contains a player index that is not in ``salary_dict``.
    """
    if data_dir is None:
        in_path, out_path = hdf5_filepath, hdf5_savepath
    else:
        # os.path.join leaves absolute paths as they are.
        in_path = os.path.join(data_dir, hdf5_filepath)
        out_path = os.path.join(data_dir, hdf5_savepath)

    # set player cutoff for combos
    player_cutoff = len(salary_dict)
    low, high = salary_bounds

    # otypes lets the lookup run on an empty array instead of raising.
    lookup_salary = np.vectorize(salary_dict.__getitem__, otypes=[np.int64])

    # Context managers guarantee both files are flushed and closed, even on error.
    with h5py.File(in_path, 'r') as h5_file, h5py.File(out_path, 'w') as new_h5_file:
        for k, key in enumerate(list(h5_file.keys())):
            print('batch {} starting'.format(k))

            # Read and filter once per batch: neither step depends on the top player.
            base = np.array(h5_file[key]).astype(np.int16)
            # limit players to max players in field
            base = base[(base < player_cutoff).all(axis=1)]
            out_width = 2 * (base.shape[1] + 1)

            lineups = []
            for top_pid in tqdm(top_players_dict.keys()):
                # add the extra player id as the first column
                pid_col = np.full((len(base), 1), top_pid, np.int16)
                combos = np.hstack((pid_col, base))

                # drop teams with duplicate players: after sorting each row,
                # a repeated id shows up as two equal neighbours
                ordered = np.sort(combos, axis=1)
                has_duplicate = (ordered[:, 1:] == ordered[:, :-1]).any(axis=1)
                combos = combos[~has_duplicate]

                # map player indices to salaries
                salary_array = lookup_salary(combos).astype(np.int16)

                # keep lineups that meet the salary constraints
                totals = salary_array.sum(axis=1)
                keep = (totals > low) & (totals < high)
                lineups.append(np.hstack((combos[keep], salary_array[keep])))

            if lineups:
                complete_array = np.vstack(lineups)
            else:
                complete_array = np.zeros((0, out_width), dtype=np.int16)
            print('complete_array', complete_array.shape)

            # write complete array to h5
            new_h5_file.create_dataset('batch' + str(k), data=complete_array)

    print('Process complete hdf5 file {} saved'.format(hdf5_savepath))

In [8]:
# Work from the lineup-combos data directory; the file name below is relative to it.
os.chdir('D:/data/lineup_combos/')

# Skip the first 7 rows and read only columns 7-14 of the export
# (presumably everything before that is preamble; confirm against the file).
sal_df = pd.read_csv(
    'DKSalaries (36).csv',
    skiprows=7,
    usecols=range(7, 15),
).reset_index(drop=True)
sal_df.columns = ['pos', 'name_id', 'player_name', 'id', 'salary', 'tournament', 'team', 'proj_points']

# Exclude these two players from the field.
sal_df = sal_df.loc[~sal_df['player_name'].isin(['Stewart Hagestad', 'Toto Gana'])]
sal_df.head()

Unnamed: 0,pos,name_id,player_name,id,salary,tournament,team,proj_points
0,G,Jordan Spieth (8422900),Jordan Spieth,8422900,11500,Masters Tournament,Golf,105
1,G,Dustin Johnson (8422944),Dustin Johnson,8422944,11300,Masters Tournament,Golf,108
2,G,Rory McIlroy (8422908),Rory McIlroy,8422908,10600,Masters Tournament,Golf,103
3,G,Jason Day (8422926),Jason Day,8422926,10200,Masters Tournament,Golf,97
4,G,Hideki Matsuyama (8422978),Hideki Matsuyama,8422978,9900,Masters Tournament,Golf,96


In [25]:
# Add the id and salary-derived columns (top 20 salaries count as top players),
# then renumber the rows 0..n-1.
sal_df = parse_salary_csv(sal_df, 20).reset_index(drop=True)
sal_df.tail()

Unnamed: 0,pos,name_id,player_name,id,salary,tournament,team,proj_points,pid,temp_pid,sal/100,sal_rank
88,G,Daniel Berger (8422974),Daniel Berger,8422974,7300,Masters Tournament,Golf,80,40026,88,73,33
89,G,Matthew Fitzpatrick (8422939),Matthew Fitzpatrick,8422939,7400,Masters Tournament,Golf,80,40098,89,74,30
90,G,Jon Rahm (8422920),Jon Rahm,8422920,8600,Masters Tournament,Golf,95,46970,90,86,12
91,G,Brad Dalke (8422980),Brad Dalke,8422980,6200,Masters Tournament,Golf,47,49826,91,62,86
92,G,Scott Gregory (8422987),Scott Gregory,8422987,6200,Masters Tournament,Golf,47,50735,92,62,86


In [24]:
# Simulate fantasy points per player: one row per player, column 0 holds the
# player's temp_pid and the remaining columns hold rounded normal draws
# centred on the player's projected points.
N_SIMS = 1000      # simulated scores per player
POINTS_STD = 20    # standard deviation applied to every player's projection
SIM_SEED = 42      # fixed seed so a re-run reproduces the same array

sim_rng = np.random.RandomState(SIM_SEED)

# Draw all players at once; each row is centred on that player's projection.
proj_means = sal_df['proj_points'].values.astype(float)
point_sims = sim_rng.normal(loc=proj_means[:, None], scale=POINTS_STD,
                            size=(len(sal_df), N_SIMS))

# Rows are filled by position, so the result does not depend on sal_df's index labels.
proj_array = np.zeros((len(sal_df), N_SIMS + 1)).astype(np.int16)
proj_array[:, 0] = sal_df['temp_pid'].values
proj_array[:, 1:] = np.round(point_sims, decimals=0)

proj_array

93it [00:00, 4428.89it/s]


array([[  0,  76,  78, ...,  33, 104,  68],
       [  1,  56,  35, ...,  85,  84,  64],
       [  2,  31,  28, ...,  10,  42,  62],
       ..., 
       [ 90,  91, 110, ..., 114,  98,  66],
       [ 91,  64,  52, ...,  43,  47,  26],
       [ 92,  -1,  38, ...,  79,  79,  24]], dtype=int16)