In [10]:
import pandas as pd
import numpy as np
from datetime import datetime
import json
import time
import os
from tqdm import tqdm
from scipy import stats
import networkx as nx
from itertools import combinations
from collections import Counter
from fuzzywuzzy import process
from glob import glob
import zipfile
from pga.return_pid import return_pid

In [11]:
player_ids = pd.read_csv('/home/valesco/Datasets/PGA_Data/pga_master_files/player_ids.csv', sep = ';')
player_ids_name_ls = player_ids['player_name'].values

def fuzz_match(x):
    matched = process.extractOne(x, player_ids_name_ls)
    if matched[1] > 89:
        return player_ids['player_id'].loc[player_ids['player_name'] == matched[0]].values[0]
    else:
        return 'NULL'
    
def create_pid_ls(ls):
    pid_ls = []
    for player in ls:
        pid_ls.append(pid_dict[player])
    return pid_ls

def return_player_ls(csv_file):
    contest = pd.read_csv(csv_file)
    
    own_df = contest[['Player', '%Drafted', 'FPTS']]
    own_df.columns = ['player_name', 'own_percent', 'actual_points']
    contest_df = contest.drop(['Player', '%Drafted', 'Unnamed: 6',
                            'FPTS'], axis = 1)

    contest_df.columns = ['rank', 'entry_id', 'entry_name', 'time_remaining', 
                    'team_points', 'lineup']

    contest_df.dropna(inplace = True)
    contest_df['lineup'] = contest_df['lineup'].apply(lambda x: x.replace(' G ',','))
    contest_df['lineup'] = contest_df['lineup'].apply(lambda x:\
                    x.replace('G ','').split(','))
    contest_df[['player_1', 'player_2', 'player_3', 'player_4', 
               'player_5', 'player_6']] = contest_df['lineup'].apply(pd.Series)

    for col in ['player_1', 'player_2', 'player_3', 'player_4', 
               'player_5', 'player_6']:
        contest_df = contest_df.merge(own_df[['player_name', 'own_percent']], left_on = col, 
                    right_on = 'player_name')

    for col in ['player_1', 'player_2', 'player_3', 'player_4', 
               'player_5', 'player_6']:
        contest_df = contest_df.merge(own_df[['player_name', 'actual_points']], left_on = col, 
                    right_on = 'player_name')

    contest_df.columns = ['team_rank', 'entry_id', 'dfs_player', 'time_remaining', 
            'team_points', 'lineup', 'player_1', 'player_2', 'player_3', 
            'player_4', 'player_5', 'player_6', 'p1', 'p1_own', 'p2', 'p2_own',
            'p3', 'p3_own', 'p4', 'p4_own', 'p5', 'p5_own', 'p6', 'p6_own',
            'p1a', 'p1_points', 'p2a', 'p2_points', 'p3', 'p3_points', 'p4', 
            'p4_points', 'p5', 'p5_points', 'p6', 'p6_points']

    contest_df = contest_df[['team_rank', 'entry_id', 'dfs_player', 'time_remaining',
            'team_points', 'lineup', 'player_1', 'p1_own', 'p1_points', 'player_2', 'p2_own',
            'p2_points', 'player_3', 'p3_own', 'p3_points', 'player_4', 'p4_own', 'p4_points',
            'player_5', 'p5_own', 'p5_points', 'player_6', 'p6_own', 'p6_points']]

    player_ids_name_ls = player_ids['player_name'].values   

    player_ls = []

    for col in ['player_1', 'player_2', 'player_3', 'player_4', 'player_5', 'player_6']:
        temp_unique = contest_df[col].unique()

        for player in temp_unique:
            if player not in player_ls:
                player_ls.extend([player])

    pid_dict = {}
    for player in player_ls:
        pid = return_pid(player)
        if pid == 'NULL':
            print(player)
        pid_dict[player] = int(pid)

    return sorted(player_ls)

In [14]:
masters_ls = ['Fred Couples', 'Bernhard Langer', 'Sandy Lyle', 'Phil Mickelson',
       'Larry Mize', "Mark O'Meara", 'Ian Woosnam', 'Jose Maria Olazabal',
       'Ernie Els', 'Steve Stricker', 'Vijay Singh', 'Tiger Woods',
       'Mike Weir', 'Jim Furyk', 'Charley Hoffman', 'Lee Westwood',
       'Rod Pampling', 'Angel Cabrera', 'Sergio Garcia', 'Hideto Tanihara',
       'Henrik Stenson', 'Chris Wood', 'Trevor Immelman', 'Justin Rose',
       'Soren Kjeldsen', 'Matt Kuchar', 'Zach Johnson', "Sean O'Hair",
       'Pat Perez', 'Adam Scott', 'Bill Haas', 'Francesco Molinari',
       'Paul Casey', 'Kevin Na', 'Jimmy Walker', 'Jason Dufner',
       'Bubba Watson', 'Scott Piercy', 'Daniel Summerhays',
       'Louis Oosthuizen', 'Charl Schwartzel', 'Rafael Cabrera-Bello',
       'Ryan Moore', 'Marc Leishman', 'Yuta Ikeda', 'Jhonattan Vegas',
       'J.B. Holmes', 'Alexander Noren', 'Martin Kaymer',
       'Brandt Snedeker', 'Ross Fisher', 'Jason Day', 'Rory McIlroy',
       'Webb Simpson', 'Billy Hurley', 'Bernd Wiesberger', 'Kevin Kisner',
       'Brendan Steele', 'Branden Grace', 'Tommy Fleetwood',
       'Dustin Johnson', 'William McGirt', 'Gary Woodland', 'Brian Stuard',
       'Emiliano Grillo', 'Rickie Fowler', 'Danny Willett',
       'Roberto Castro', 'Kevin Chappell', 'James Hahn',
       'Hideki Matsuyama', 'Russell Knox', 'Shane Lowry', 'Thomas Pieters',
       'Adam Hadwin', 'Justin Thomas', 'Byeong-Hun An', 'Jordan Spieth',
       'Russell Henley', 'Hudson Swafford', 'Patrick Reed',
       'Tyrrell Hatton', 'Andy Sullivan', 'Mackenzie Hughes',
       'Brooks Koepka', 'Curtis Luck', 'Si Woo Kim', 'Jeunghun Wang',
       'Daniel Berger', 'Matthew Fitzpatrick', 'Jon Rahm', 'Brad Dalke',
       'Scott Gregory']


os.chdir('/home/valesco/Datasets/dk_downloads/5_5_2017/')

files = glob('*')


for file in files:
    if '.zip' in file and 'contest-standings' in file:
        try:
            csv_file = zipfile.ZipFile(file, 'r')
            new_file_name = file.replace('.zip', '.csv')
            csv_file = csv_file.open(new_file_name)
            player_ls = return_player_ls(csv_file)

            in_masters = 0
            out_masters = 0

            for player in player_ls:
                if player in masters_ls:
                    in_masters += 1
                else:
                    out_masters += 1
            
            ratio = in_masters/ out_masters
            
            if ratio > 1:
                print(file, ratio)
        except:
            pass
            
        

  if self.run_code(code, result):


contest-standings-35244077.zip 4.428571428571429
contest-standings-37906113.zip 4.5
contest-standings-37170728.zip 1.5789473684210527
contest-standings-38055316.zip 9.166666666666666
contest-standings-36022101.zip 4.428571428571429
contest-standings-37906109.zip 4.846153846153846
contest-standings-38055320.zip 5.9
contest-standings-39085500.zip 90.0
contest-standings-37839155.zip 5.545454545454546
contest-standings-37906110.zip 4.5
contest-standings-38055318.zip 5.166666666666667
contest-standings-37906111.zip 4.5


In [5]:
masters_contest = [
    'contest-standings-38055316.zip',
    ''
]

['contest-standings-37030921.zip',
 'contest-standings-35855775.csv',
 'DKSalaries (19).csv',
 'contest-standings-37173652.csv',
 'contest-standings-37173652.zip',
 'DKSalaries (21).csv',
 'DKSalaries_temp_week4.csv',
 'contest-standings-37419195.zip',
 'contest-standings-37293933.zip',
 'contest-standings-26510926 (1).zip',
 'contest-standings-37170731.csv',
 'contest-standings-34439732.zip',
 'contest-standings-36058804.zip',
 'contest-standings-26511649.zip',
 'DKSalaries (30).csv',
 'contest-standings-37419195.csv',
 'DKSalaries (18).csv',
 'DKSalaries (29).csv',
 'DKSalaries (9).csv',
 'contest-standings-37169783.csv',
 'contest-standings-37170734.zip',
 'contest-standings-38327100 (1).zip',
 'contest-standings-38327101.zip',
 'contest-standings-27752976.zip',
 'contest-standings-35244077.zip',
 'contest-standings-38327101.csv',
 'DKSalaries (36).csv',
 'contest-standings-29630243.csv',
 'contest-standings-37293932.csv',
 'contest-standings-36249479 (1).csv',
 'contest-standings-3