# Collect

This notebook replicates the data collection and the generation of the feature matrices.
These functions do not need to be run again for Replicate.ipynb to run correctly.
This file was created so that the code is accessible in a notebook, but it takes a while to run.

# Data Collection

In [None]:
#Imports
import requests
import json
import pickle
import os
from time import sleep
import sys
import shutil
import glob
import numpy as np

In [None]:
#cookies and headers passed along with every request made to stats.nba.com below

#hard-coded cookie values
#NOTE(review): these look like values captured from a browser session - confirm they are still needed/valid
cookies = {
    'ug': '564bc5cb06dd690a3c852e7da205ef8b',
    'ugs': '1',
    '_gat': '1',
    '_ga': 'GA1.2.1028222052.1454632808',
    'crtg_trnr': '',
    's_cc': 'true',
    's_vi': '[CS]v1|2B1C901605010772-40000146E0028408[CE]',
    's_sq': 'nbag-n-league%3D%2526pid%253Dstats.nba.com%25253A%25252Fplayer%25252F%2526pidt%253D1%2526oid%253Dhttp%25253A%25252F%25252Fstats.nba.com%25252Fplayer%25252F%2526ot%253DA',
    's_fid': '611F6CAB17F03E6A-0F7BD99682C4658E',
}

#browser-like request headers (desktop Chromium user agent, JSON accepted, stats.nba.com referer)
headers = {
    'Accept-Encoding': 'gzip, deflate, sdch',
    'Accept-Language': 'en-US,en;q=0.8',
    'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Ubuntu Chromium/45.0.2454.85 Chrome/45.0.2454.85 Safari/537.36',
    'Accept': 'application/json, text/plain, */*',
    'Referer': 'http://stats.nba.com/league/player/',
    'Connection': 'keep-alive',
    'Cache-Control': 'max-age=0',
}

In [None]:
def get_player_list(season):
    """Download the list of players of one season and pickle it.

    The decoded JSON answer of the ``leaguedashplayerstats`` endpoint is
    written to ``data/<season>/player_list.pkl``. Any existing
    ``data/<season>`` directory is deleted and recreated (together with an
    empty ``data/<season>/averages`` sub-directory), so everything collected
    earlier for that season is lost.

    Parameters
    ----------
    season : str
        Season in the form used by stats.nba.com, e.g. ``'2013-14'``.

    Raises
    ------
    requests.exceptions.HTTPError
        If the server answers with an error status.
    """
    season_dir = os.path.join('data', season)

    # Download first, so that a failed request does not destroy data that was already collected
    req = requests.request('GET', 'http://stats.nba.com/stats/leaguedashplayerstats?College=&Conference=&Country=&DateFrom=&DateTo=&Division=&DraftPick=&DraftYear=&GameScope=&GameSegment=&Height=&LastNGames=0&LeagueID=00&Location=&MeasureType=Base&Month=0&OpponentTeamID=0&Outcome=&PORound=0&PaceAdjust=N&PerMode=PerGame&Period=0&PlayerExperience=&PlayerPosition=&PlusMinus=N&Rank=N&Season=' + season + '&SeasonSegment=&SeasonType=Regular+Season&ShotClockRange=&StarterBench=&TeamID=0&VsConference=&VsDivision=&Weight=', headers=headers, cookies=cookies)
    req.raise_for_status()
    player_list = json.loads(req.content)

    if os.path.exists(season_dir):
        shutil.rmtree(season_dir)

    # makedirs also creates the intermediate data/<season> directory
    os.makedirs(os.path.join(season_dir, 'averages'))

    # the with-block guarantees the pickle is flushed and closed
    with open(os.path.join(season_dir, 'player_list.pkl'), 'wb') as handle:
        pickle.dump(player_list, handle)

In [None]:
def get_data(season):
    """Download info and game log of every player of a season, one pickle per player.

    Requires ``data/<season>/player_list.pkl`` (written by ``get_player_list``).
    ``data/<season>/player_stats`` is deleted and recreated, then for each row
    of the player list two requests are made (``commonplayerinfo`` and
    ``playergamelog``), each followed by a one second pause. The result is
    stored in ``data/<season>/player_stats/<PlayerID>.pkl`` as a dict with keys:

    * ``'name'``       - column 3 of the player info row
    * ``'experience'`` - first year of the season minus column 22 of the info row
    * ``'age'``        - first year of the season minus the birth year (approximation)
    * ``'stats'``      - the game log rows in reversed order, each one without
      its first two columns and its last column

    Parameters
    ----------
    season : str
        Season in the form used by stats.nba.com, e.g. ``'2013-14'``.

    Raises
    ------
    requests.exceptions.HTTPError
        If the server answers one of the requests with an error status.
    """
    season_dir = os.path.join('data', season)
    stats_dir = os.path.join(season_dir, 'player_stats')

    if os.path.exists(stats_dir):
        shutil.rmtree(stats_dir)

    os.makedirs(stats_dir)

    with open(os.path.join(season_dir, 'player_list.pkl'), 'rb') as handle:
        player_list = pickle.load(handle)

    season_start = int(season.split('-')[0])

    for i, player in enumerate(player_list['resultSets'][0]['rowSet']):
        player_id = str(player[0])

        print("fetching stats of player %d" % (i + 1))

        name_req = requests.request('GET', 'http://stats.nba.com/stats/commonplayerinfo?LeagueID=00&PlayerID=' + player_id + '&SeasonType=Regular+Season', headers=headers, cookies=cookies)
        name_req.raise_for_status()
        sleep(1)

        # decode the answer once and reuse the single info row
        info_row = json.loads(name_req.content)['resultSets'][0]['rowSet'][0]
        birthdate = info_row[6]
        start_year = info_row[22]

        # columns 10, 11 and 14 (height, weight, position) may be interesting
        # but are incomplete for previous seasons, so they are not stored
        player_info = {
            'name': info_row[3],
            'experience': season_start - start_year,
            'age': season_start - int(birthdate.split('-')[0]),  # approximation
        }

        stats_req = requests.request('GET', 'http://stats.nba.com/stats/playergamelog?LeagueID=00&PlayerID=' + player_id + '&Season=' + season + '&SeasonType=Regular+Season', headers=headers, cookies=cookies)
        stats_req.raise_for_status()
        sleep(1)

        game_rows = json.loads(stats_req.content)['resultSets'][0]['rowSet']

        # reverse the row order (presumably newest-first -> chronological, confirm against the API)
        # and remove useless data: the first two columns and the last one
        player_info['stats'] = [match[2:-1] for match in game_rows[::-1]]

        with open(os.path.join(stats_dir, player_id + '.pkl'), 'wb') as handle:
            pickle.dump(player_info, handle)

In [None]:
#Example to fetch data for season 2004-05 (~15 minutes)
#get_player_list has to run first: it wipes data/2004-05 and writes the player_list.pkl that get_data reads
get_player_list("2004-05")
get_data("2004-05")

# Averaging methods

In [None]:
def _average_rows(matches, experience, age):
    """Column-wise mean of ``match[4:]`` followed by win rate, experience and age ([] without matches)."""
    if not matches:
        return []

    summary = [float(sum(column)) / float(len(column))
               for column in zip(*[match[4:] for match in matches])]
    wins = [match[3] for match in matches].count('W')
    summary.append(float(wins) / len(matches))
    summary.append(experience)
    summary.append(age)
    return summary


def average(season, playerID, number_games = -1):
    """Average the stats of a player over his first ``number_games`` games.

    The player is read from ``data/<season>/player_stats/<playerID>.pkl``.
    Every returned vector holds the mean of each column ``match[4:]``, then the
    win rate (share of games whose ``match[3]`` is ``'W'``), the player's
    experience and his age.

    Parameters
    ----------
    season : str
        Season directory name, e.g. ``'2011-12'``.
    playerID : str
        Player id, i.e. the pickle file name without extension.
    number_games : int, optional
        Number of games (from the start of the season) to average.
        ``-1`` (default) means all games but the last one. A value larger than
        the number of stored games falls back to all stored games.

    Returns
    -------
    tuple of three lists ``(averaged, home_avg, away_avg)``
        ``averaged`` covers all selected games, ``home_avg`` the selected games
        whose ``match[2][4]`` is ``'@'`` and ``away_avg`` the other ones. A
        location without any game yields ``[]``. When no game is selected at
        all, the three lists are 23 values long and all zero except for
        experience (index 21) and age (index 22).

    Raises
    ------
    ValueError
        If ``number_games`` is smaller than -1.
    """
    with open(os.path.join('data', season, 'player_stats', playerID + '.pkl'), 'rb') as handle:
        player = pickle.load(handle)
    games_num = len(player['stats'])
    experience = player['experience']
    age = player['age']

    if number_games < -1:
        raise ValueError("number_games must be -1 or a non-negative number of games")

    if number_games == -1:
        # all games but the last; clamped so that a player without any game
        # gets the zero vector instead of recursing forever
        number_games = max(games_num - 1, 0)

    elif number_games > games_num:
        print("not enough games, returned average of all available games (%d)" % games_num)
        number_games = games_num

    if number_games == 0:
        zeros = [0.]*23
        zeros[21] = experience
        zeros[22] = age
        # independent copies, so that a caller mutating one vector does not alter the others
        return zeros, list(zeros), list(zeros)

    selected = player['stats'][:number_games]

    # NOTE(review): match[2] looks like the matchup string; on stats.nba.com '@'
    # usually marks a road game, so these two names may be swapped. The labelling
    # is kept because the sliding_* functions test the next game with the same
    # '@' check and therefore stay consistent with it.
    home = [match for match in selected if match[2][4] == '@']
    away = [match for match in selected if match[2][4] != '@']

    averaged = _average_rows(selected, experience, age)
    home_avg = _average_rows(home, experience, age)
    away_avg = _average_rows(away, experience, age)

    return averaged, home_avg, away_avg

In [None]:
#example (reads data/2011-12/player_stats/201149.pkl, so season 2011-12 must have been fetched beforehand)
print(average('2011-12', '201149'))

In [None]:
def compute_fantasy(season, playerID, game_number = -1,
                    PTS = 1, BLK = 1, STL = 1, AST = 1, REB = 1, FGM = 1, FTM = 1, FGA = -1, FTA = -1, TOV = -1):
    """Compute the fantasy score of one game of a player.

    The player is read from ``data/<season>/player_stats/<playerID>.pkl`` and
    the score is the weighted sum of ten columns of the selected game row.

    Parameters
    ----------
    season : str
        Season directory name, e.g. ``'2011-12'``.
    playerID : str
        Player id, i.e. the pickle file name without extension.
    game_number : int, optional
        1-based number of the game. ``-1`` (default) selects the last stored
        game; a number beyond the last game also falls back to the last game.
    PTS, BLK, STL, AST, REB, FGM, FTM, FGA, FTA, TOV : number, optional
        Weight of the corresponding stat in the score.

    Returns
    -------
    number
        The weighted sum for the selected game.

    Raises
    ------
    IndexError
        If the player has no stored game.
    """
    with open(os.path.join('data', season, 'player_stats', playerID + '.pkl'), 'rb') as handle:
        player = pickle.load(handle)
    games_num = len(player['stats'])

    if game_number == -1:
        game_number = games_num

    elif game_number > games_num:
        print("This game does not exist, returned last game played instead")
        game_number = games_num

    game = player['stats'][game_number - 1]

    # (weight, column) pairs. Columns are positions in a stored game row.
    # Points are read from column 22: in the playergamelog layout PTS comes two
    # columns after TOV (personal fouls sit in between), while column 20, which
    # was previously used for the points as well, is the turnover column.
    # NOTE(review): double check against the 'headers' of a playergamelog answer.
    weighted_columns = [
        (PTS, 22), (BLK, 19), (STL, 18), (AST, 17), (REB, 16),
        (FGM, 5), (FTM, 11), (FGA, 6), (FTA, 12), (TOV, 20),
    ]

    # a single expression: all ten terms contribute, including FTM, FGA, FTA and TOV
    score = sum(weight*game[column] for weight, column in weighted_columns)

    return score

In [None]:
#example (reads data/2011-12/player_stats/201149.pkl, so season 2011-12 must have been fetched beforehand)
print(compute_fantasy('2011-12', '201149'))

In [None]:
def weighted_average(avg1, avg2, weight = 2):
    """Element-wise weighted average in which avg1 counts ``weight`` times as much as avg2.

    Parameters
    ----------
    avg1, avg2 : sequence of numbers
        The two vectors to combine. If one of them is empty the other one is
        returned unchanged (same object).
    weight : number, optional
        Importance of ``avg1`` relative to ``avg2`` (default 2).

    Returns
    -------
    list
        ``(a*weight + b)/(weight + 1)`` for every pair ``(a, b)``; the result is
        as long as the shorter of the two inputs.
    """
    if len(avg1) == 0:
        return avg2

    if len(avg2) == 0:
        return avg1

    # float divisor: avoids floor division under Python 2 when all operands are ints
    total_weight = float(weight + 1)

    return [(a*weight + b)/total_weight for a, b in zip(avg1, avg2)]

# Generating Feature Matrix

In [None]:
def raw_averages(season):
    """Build the raw-average feature matrix of a season and pickle it.

    For every ``data/<season>/player_stats/*.pkl`` one sample is created: the
    overall average of all games but the last one (first vector returned by
    ``average``) as features and the fantasy score of the last game (default of
    ``compute_fantasy``) as target. The arrays are written to
    ``data/<season>/averages/raw_X.pkl`` and ``raw_y.pkl``, replacing older files.

    Parameters
    ----------
    season : str
        Season directory name, e.g. ``'2004-05'``.

    Returns
    -------
    tuple ``(X, y)`` of numpy arrays
    """
    averages = []
    next_match_points = []
    pattern = os.path.join('data', season, 'player_stats', "*.pkl")

    for path in glob.glob(pattern):
        # file name without extension; independent of the length of the season name
        playerID = os.path.splitext(os.path.basename(path))[0]
        averages.append(average(season, playerID)[0])
        next_match_points.append(compute_fantasy(season, playerID))

    X = np.array(averages)
    y = np.array(next_match_points)

    out_dir = os.path.join('data', season, 'averages')
    if not os.path.isdir(out_dir):
        os.makedirs(out_dir)

    # opening with 'wb' truncates an existing file, no prior removal needed
    with open(os.path.join(out_dir, 'raw_X' + '.pkl'), 'wb') as handle:
        pickle.dump(X, handle)
    with open(os.path.join(out_dir, 'raw_y' + '.pkl'), 'wb') as handle:
        pickle.dump(y, handle)

    return X, y

In [None]:
def sliding_loc_weight_average(season, playerID, weight = 2):
    """Sliding, location-weighted averages of one player with the next game's score as target.

    For ``i = 1 .. games_num - 2`` the averages of the first ``i`` games are
    combined with ``weighted_average``: the vector matching the location of the
    next game (tested with ``matchup[4] == '@'``, like in ``average``) gets
    ``weight`` times the importance of the other one. The target is the
    fantasy score of game number ``i + 1``. The last game of the season is never
    used as a target.

    Parameters
    ----------
    season : str
        Season directory name.
    playerID : str
        Player id, i.e. the pickle file name without extension.
    weight : number, optional
        Importance of the matching location (default 2).

    Returns
    -------
    tuple ``(X, y)`` of numpy arrays; both are empty for players with fewer
    than three games.
    """
    averages = []
    next_match_points = []
    with open(os.path.join('data', season, 'player_stats', playerID + '.pkl'), 'rb') as handle:
        player = pickle.load(handle)
    games_num = len(player['stats'])

    for i in range(1, games_num - 1):
        _, home, away = average(season, playerID, i)
        next_match_points.append(compute_fantasy(season, playerID, i + 1))

        # game number i + 1 (1-based, as used by compute_fantasy) is stats[i];
        # its location must be read from that very row, not from stats[i + 1]
        next_game = player['stats'][i]

        if next_game[2][4] == '@':
            averages.append(weighted_average(home, away, weight))

        else:
            averages.append(weighted_average(away, home, weight))

    X = np.array(averages)
    y = np.array(next_match_points)

    return X, y

In [None]:
def sliding_averages(season, weight = 2):
    """Location-weighted sliding averages of all players of a season, pickled.

    Calls ``sliding_loc_weight_average`` for every
    ``data/<season>/player_stats/*.pkl``, skips players that yield an empty
    matrix and concatenates the rest. The arrays are written to
    ``data/<season>/averages/sliding_X<weight>.pkl`` and
    ``sliding_y<weight>.pkl``, replacing older files.

    Parameters
    ----------
    season : str
        Season directory name, e.g. ``'2004-05'``.
    weight : number, optional
        Weight handed to ``sliding_loc_weight_average`` (default 2); it is also
        part of the output file names.

    Returns
    -------
    tuple ``(Xf, yf)`` of numpy arrays

    Raises
    ------
    ValueError
        Raised by ``np.concatenate`` when no player contributes any sample.
    """
    Xs = []
    ys = []
    pattern = os.path.join('data', season, 'player_stats', "*.pkl")

    for path in glob.glob(pattern):
        # file name without extension; independent of the length of the season name
        playerID = os.path.splitext(os.path.basename(path))[0]
        X, y = sliding_loc_weight_average(season, playerID, weight)

        if X.shape != (0,):
            Xs.append(X)
            ys.append(y)

    Xf = np.concatenate(Xs)
    yf = np.concatenate(ys)

    out_dir = os.path.join('data', season, 'averages')
    if not os.path.isdir(out_dir):
        os.makedirs(out_dir)

    # opening with 'wb' truncates an existing file, no prior removal needed
    with open(os.path.join(out_dir, 'sliding_X' + str(weight) + '.pkl'), 'wb') as handle:
        pickle.dump(Xf, handle)
    with open(os.path.join(out_dir, 'sliding_y' + str(weight) + '.pkl'), 'wb') as handle:
        pickle.dump(yf, handle)

    return Xf, yf

In [None]:
def sliding_loc_average(season, playerID):
    """Sliding averages of one player: one general sample plus one location sample per game.

    For ``i = 1 .. games_num - 2`` the averages of the first ``i`` games are
    computed with ``average`` and the fantasy score of game number ``i + 1`` is
    the target. The general average is always added as a sample; a second
    sample with the same target is added from one of the two location vectors
    (see the comments in the loop). The last game of the season is never used
    as a target.

    Parameters
    ----------
    season : str
        Season directory name.
    playerID : str
        Player id, i.e. the pickle file name without extension.

    Returns
    -------
    tuple ``(X, y)`` of numpy arrays; both are empty for players with fewer
    than three games.
    """
    averages = []
    next_match_points = []
    with open(os.path.join('data', season, 'player_stats', playerID + '.pkl'), 'rb') as handle:
        player = pickle.load(handle)
    games_num = len(player['stats'])

    for i in range(1, games_num - 1):
        overall, home, away = average(season, playerID, i)
        points = compute_fantasy(season, playerID, i + 1)
        next_match_points.append(points)
        averages.append(overall)

        # game number i + 1 (1-based, as used by compute_fantasy) is stats[i];
        # its location must be read from that very row, not from stats[i + 1]
        next_game = player['stats'][i]

        # NOTE(review): the fallback is asymmetric - an '@' game without any
        # `home` average falls back to `away`, whereas a non-'@' game without
        # `away` average adds nothing. Kept as is; confirm the intent.
        if next_game[2][4] == '@' and home != []:
            next_match_points.append(points)
            averages.append(home)

        elif away != []:
            next_match_points.append(points)
            averages.append(away)

    X = np.array(averages)
    y = np.array(next_match_points)

    return X, y

In [None]:
def sliding_loc_averages(season):
    """General + location sliding averages of all players of a season, pickled.

    Calls ``sliding_loc_average`` for every
    ``data/<season>/player_stats/*.pkl``, skips players that yield an empty
    matrix and concatenates the rest. The arrays are written to
    ``data/<season>/averages/sliding_loc_X.pkl`` and ``sliding_loc_y.pkl``,
    replacing older files.

    Parameters
    ----------
    season : str
        Season directory name, e.g. ``'2004-05'``.

    Returns
    -------
    tuple ``(Xf, yf)`` of numpy arrays

    Raises
    ------
    ValueError
        Raised by ``np.concatenate`` when no player contributes any sample.
    """
    Xs = []
    ys = []
    pattern = os.path.join('data', season, 'player_stats', "*.pkl")

    for path in glob.glob(pattern):
        # file name without extension; independent of the length of the season name
        playerID = os.path.splitext(os.path.basename(path))[0]
        X, y = sliding_loc_average(season, playerID)

        if X.shape != (0,):
            Xs.append(X)
            ys.append(y)

    Xf = np.concatenate(Xs)
    yf = np.concatenate(ys)

    out_dir = os.path.join('data', season, 'averages')
    if not os.path.isdir(out_dir):
        os.makedirs(out_dir)

    # Opening with 'wb' truncates an existing file, so older results are simply
    # overwritten. (Removing them first used a misspelled 'sliding_loc_yc' name,
    # which made every re-run fail.)
    with open(os.path.join(out_dir, 'sliding_loc_X' + '.pkl'), 'wb') as handle:
        pickle.dump(Xf, handle)
    with open(os.path.join(out_dir, 'sliding_loc_y' + '.pkl'), 'wb') as handle:
        pickle.dump(yf, handle)

    return Xf, yf

In [None]:
#Generates all feature models on an example season. Make sure data is fetched for this season
#Takes very long to run!

raw_averages("2004-05")
#basic sliding average: weight 1 gives both locations the same importance
#(the default weight is 2, which would only repeat the call below)
sliding_averages("2004-05", 1)
#with a weight of 2
sliding_averages("2004-05", 2)
sliding_loc_averages("2004-05")

# Baselines

In [None]:
def baseline(season):
    """Error of the naive predictor "next game scores like the current game" for a season.

    For every ``data/<season>/player_stats/*.pkl`` the absolute difference
    between the fantasy scores of consecutive games is collected (games
    ``1 .. games_num - 1``; the last game is not used). The mean and the
    maximum of these differences are printed, written to
    ``data/<season>/averages/baseline.txt`` and returned.

    Parameters
    ----------
    season : str
        Season directory name, e.g. ``'2004-05'``.

    Returns
    -------
    tuple ``(mean_error, max_error)``

    Raises
    ------
    ValueError
        If no pair of consecutive games is found (e.g. the season was not fetched).
    """
    errors = []
    pattern = os.path.join('data', season, 'player_stats', "*.pkl")

    for path in glob.glob(pattern):
        # file name without extension; independent of the length of the season name
        playerID = os.path.splitext(os.path.basename(path))[0]
        with open(path, 'rb') as handle:
            player = pickle.load(handle)
        games_num = len(player['stats'])

        # score every game only once (each call reloads the player's pickle),
        # then compare neighbours
        scores = [compute_fantasy(season, playerID, game) for game in range(1, games_num)]
        errors.extend(abs(next_points - curr_points)
                      for curr_points, next_points in zip(scores, scores[1:]))

    if not errors:
        raise ValueError("no consecutive games found for season {}".format(season))

    error = np.mean(errors), np.max(errors)

    out_dir = os.path.join('data', season, 'averages')
    if not os.path.isdir(out_dir):
        os.makedirs(out_dir)

    with open(os.path.join(out_dir, 'baseline.txt'), "w") as report:
        report.write("{}".format(error))

    print("Average error and max error for season {} is {}".format(season, error))

    return error

In [None]:
#example that takes a while to run
#prints and returns the (mean, max) error and also writes it to data/2004-05/averages/baseline.txt
baseline("2004-05")

In [None]:
def baselines(seasons):
    """Average the baseline errors over several seasons.

    Runs ``baseline`` on every season and averages the mean errors and the
    maximum errors separately. Progress and the final result are printed.

    Parameters
    ----------
    seasons : list of str
        Season directory names, e.g. ``['2005-06', '2006-07']``.

    Returns
    -------
    tuple ``(averaged mean error, averaged max error)``

    Raises
    ------
    ZeroDivisionError
        If ``seasons`` is empty.
    """
    avg_error = 0.
    avg_max = 0.

    for season in seasons:
        print("computing for season {}".format(season))
        mean_error, max_error = baseline(season)
        avg_error += mean_error
        avg_max += max_error

    result = avg_error/len(seasons), avg_max/len(seasons)

    # (result,) keeps the tuple as one value for the single %s placeholder
    print("Average error and Averaged max error over all seasons is %s" % (result,))

    return result

In [None]:
#actual result that can be seen in the README
#takes FOREVER to run
#every season listed here must have been fetched beforehand (get_player_list / get_data):
#baseline() reads data/<season>/player_stats/*.pkl and writes data/<season>/averages/baseline.txt
seasons = ['2005-06', '2006-07', '2007-08', '2008-09', '2009-10', '2010-11', '2011-12', '2012-13', '2013-14']
baselines(seasons)