In [1]:
import pandas as pd
import numpy as np
import json
pd.set_option("display.max_columns", None)
from IPython.display import display
from joblib import Parallel, delayed

In [None]:
# Team metadata parsed from teams.json. Despite the name, convert_team iterates
# this as a sequence of records, reading 'teamName' and 'abbreviation' from each.
teams_dict = {}
with open("teams.json", 'r') as _f:
    teams_dict = json.loads(_f.read())

def convert_team(name, year):
    """Translate a full team name into its abbreviation.

    Lookup order:
      1. A small table of hard-coded overrides (Charlotte / New Orleans).
      2. 'Brooklyn Nets', whose abbreviation depends on ``year``.
      3. The 'teamName' -> 'abbreviation' pairs found in ``teams_dict``.

    Args:
        name: Full team name, e.g. 'Brooklyn Nets'.
        year: Year used only for the Brooklyn Nets special case.

    Returns:
        The abbreviation string for ``name``.

    Raises:
        KeyError: if ``name`` is not an override and is absent from ``teams_dict``.
    """
    # Abbreviations pinned here take precedence over whatever teams.json says.
    name_changes = {
        'Charlotte Hornets': 'CHA',
        'Charlotte Bobcats': 'CHN',
        'New Orleans Hornets': 'NOK',
        'New Orleans Pelicans': 'NOP'
    }
    if name in name_changes:
        return name_changes[name]

    if name == 'Brooklyn Nets':
        # NOTE(review): this keys on whatever year the caller passes (the
        # visible caller passes the game's calendar year) - confirm that is
        # the intended switch-over point between 'BRK' and 'BKN'.
        return 'BRK' if year == 2012 else 'BKN'

    # Only build the teams.json lookup when no override answered the question;
    # previously it was rebuilt on every call, including the early-return paths.
    abbreviation_by_name = {
        team['teamName']: team['abbreviation'] for team in teams_dict
    }
    try:
        return abbreviation_by_name[name]
    except KeyError:
        raise KeyError('Unknown team name: %r' % (name,))

In [None]:
# Raw schedule table; the first CSV column is used as the index.
schedules = pd.read_csv('2012_2017_schedules.csv', index_col=0)

# Keep only rows with no missing values (presumably the games that have been
# played - TODO confirm which columns are empty for future games).
# .copy() makes this an independent frame, so the column assignment below does
# not write into a slice of `schedules` (avoids SettingWithCopyWarning).
played_schedule = schedules.dropna().copy()

# Parse the start times, declare them to be UTC, then express them in US/Eastern.
played_schedule['start_time'] = (
    pd.to_datetime(played_schedule['start_time'])
    .dt.tz_localize('UTC')
    .dt.tz_convert('US/Eastern')
)

# Box-score table; 'date' is parsed to datetime64 so it can be filtered by day.
box_scores = pd.read_csv('2012_2017_box_scores.csv', index_col=0)
box_scores['date'] = pd.to_datetime(box_scores['date'])

# Season-level player stats (not referenced by the functions visible in this notebook).
season_stats = pd.read_csv('2012_2017_season_player_stats.csv', index_col=0)

In [None]:
def get_starters(home_team, visitor_team, date):
    """Pick the five most-used players of each team for one game day.

    "Starters" here is a heuristic: the (up to) five players with the largest
    'seconds_played' in ``box_scores`` for that team on that date.

    Args:
        home_team: Team abbreviation as stored in ``box_scores['team']``.
        visitor_team: Team abbreviation as stored in ``box_scores['team']``.
        date: Day of the game; a ``datetime.date``, ``datetime.datetime`` or
            ``pd.Timestamp``.

    Returns:
        ``(home_starters, visitor_starters)``: two lists of
        ``(first_name, last_name)`` tuples, ordered by seconds played
        (descending). A list is shorter than five (possibly empty) when fewer
        rows exist for that team and date.
    """
    # box_scores['date'] is datetime64; comparing it with a plain datetime.date
    # is not reliable across pandas versions, so normalise to a Timestamp first.
    game_day = pd.Timestamp(date)
    on_game_day = box_scores[box_scores['date'] == game_day]

    def _top_five(team):
        # Boolean mask instead of a string-built query(): no quoting pitfalls.
        team_rows = on_game_day[on_game_day['team'] == team]
        most_played = team_rows.sort_values('seconds_played', ascending=False).head(5)
        return list(zip(most_played['first_name'], most_played['last_name']))

    return _top_five(home_team), _top_five(visitor_team)

In [None]:
def _to_schedule_time(moment, tz):
    """Turn a date/datetime/string into a Timestamp comparable with a column in timezone ``tz``."""
    stamp = pd.Timestamp(moment)
    if tz is None:
        return stamp
    if stamp.tzinfo is None:
        return stamp.tz_localize(tz)
    return stamp.tz_convert(tz)


def _leading_run(flags):
    """Count how many consecutive truthy values open the sequence ``flags``."""
    run = 0
    for flag in flags:
        if not flag:
            break
        run += 1
    return run


def get_previous_wins(team, date):
    """Summarise a team's results in the current season before a given day.

    The season window runs from September 1st (exclusive, at midnight) of the
    season's starting year up to, but not including, midnight of ``date``.
    Dates in January-June belong to the season that started the previous
    calendar year. Bounds are interpreted in the timezone of
    ``played_schedule['start_time']`` when that column is tz-aware.

    Args:
        team: Team name as stored in the 'home_team_name' /
            'visiting_team_name' columns of ``played_schedule``.
        date: Day of the upcoming game (``datetime.date`` or ``pd.Timestamp``).

    Returns:
        dict with integer values for the keys 'wins_as_home_team',
        'wins_as_visiting_team', 'total_current_wins', 'current_win_streak'
        and 'current_losing_streak'. All are 0 when the team has no earlier
        game in the window.
    """
    start_times = played_schedule['start_time']
    tz = start_times.dt.tz

    season_start_year = date.year if date.month >= 7 else date.year - 1
    # Explicit Timestamps (localised to the column's timezone) instead of bare
    # datetime.date objects, which cannot be compared reliably with a tz-aware column.
    lower_bound = _to_schedule_time('%d-09-01' % season_start_year, tz)
    upper_bound = _to_schedule_time(date, tz)

    in_window = (start_times > lower_bound) & (start_times < upper_bound)
    involves_team = ((played_schedule['home_team_name'] == team)
                     | (played_schedule['visiting_team_name'] == team))
    # Most recent game first, so streaks can be read off the top of the frame.
    team_games = played_schedule[in_window & involves_team] \
        .sort_values('start_time', ascending=False)

    at_home = team_games['home_team_name'] == team
    away = team_games['visiting_team_name'] == team
    home_ahead = team_games['home_team_score'] > team_games['visiting_team_score']
    visitor_ahead = team_games['home_team_score'] < team_games['visiting_team_score']

    # Vectorised masks: unlike a row-wise apply(...).sum(), these sum to a plain
    # 0 when the slice is empty (e.g. a team's first game of the season).
    won_at_home = at_home & home_ahead
    won_away = away & visitor_ahead
    won = won_at_home | won_away
    lost = (at_home & visitor_ahead) | (away & home_ahead)

    home_wins = int(won_at_home.sum())
    visiting_wins = int(won_away.sum())

    # At most one of the two streaks is non-zero: the latest game is either a
    # win, a loss, or (equal scores) neither.
    win_streak = _leading_run(won.tolist())
    losing_streak = _leading_run(lost.tolist())

    return {
        'wins_as_home_team': home_wins,
        'wins_as_visiting_team': visiting_wins,
        'total_current_wins': home_wins + visiting_wins,
        'current_win_streak': win_streak,
        'current_losing_streak': losing_streak
    }

In [None]:
# Empty placeholders for model inputs (X), labels (Y) and per-game records;
# they are not referenced again in the visible cells.
X = []
Y = []
df_regr_list = []

def feature_generation(row):
    """Build the feature dict for one scheduled game.

    Args:
        row: One record of ``played_schedule``. Reads 'start_time' (must have a
            ``.date()`` method), 'home_team_name', 'visiting_team_name',
            'home_team_score', 'visiting_team_score' and 'is_home'.

    Returns:
        dict containing:
          * 'starter_<i>' - full name of the i-th home starter, and
            'starter_<i>_CAREER_<stat>' - that player's mean of <stat> over all
            ``box_scores`` rows dated strictly before the game day (NaN when
            there are none);
          * 'is_home' - copied from the row;
          * 'home_won' - whether the home score exceeds the visiting score;
          * '<home_team|visiting_team>_...' - season-to-date win totals and
            streaks from ``get_previous_wins``.
    """
    game_dict = {}
    # Get the teams and starting players for this game
    game_date = row['start_time'].date()
    # box_scores['date'] is datetime64; compare it against a Timestamp rather
    # than a bare datetime.date, which newer pandas refuses to order.
    game_day = pd.Timestamp(game_date)
    home = convert_team(row['home_team_name'], game_date.year)
    visitors = convert_team(row['visiting_team_name'], game_date.year)
    # NOTE(review): visitor_starters is fetched but no features are derived
    # from it - only the home starters are described below.
    home_starters, visitor_starters = get_starters(home, visitors, game_date)

    features = ['assists','blocks','defensive_rebounds','field_goal_attempts','field_goals',
                'free_throw_attempts','free_throws','offensive_rebounds','personal_fouls','points',
                'steals','three_point_field_goal_attempts','three_point_field_goals','total_rebounds',
                'turnovers']

    # enumerate() replaces the Python-2-only xrange(len(...)) index loop.
    for player_i, (first_name, last_name) in enumerate(home_starters):
        is_player = ((box_scores['first_name'] == first_name)
                     & (box_scores['last_name'] == last_name))
        # Strictly earlier games only, so the game being predicted never leaks in.
        earlier_games = box_scores[is_player & (box_scores['date'] < game_day)]

        prefix = 'starter_' + str(player_i)
        game_dict[prefix] = first_name + " " + last_name
        for f in features:
            game_dict[prefix + "_CAREER_" + f] = earlier_games[f].mean()

    game_dict['is_home'] = row['is_home']
    game_dict['home_won'] = row['home_team_score'] > row['visiting_team_score']

    # Same five season-to-date stats for each side, under a side-specific prefix.
    for side, team_name in (('home_team', row['home_team_name']),
                            ('visiting_team', row['visiting_team_name'])):
        win_stats = get_previous_wins(team_name, game_date)
        game_dict[side + '_total_wins'] = win_stats['total_current_wins']
        game_dict[side + '_wins_as_home'] = win_stats['wins_as_home_team']
        game_dict[side + '_wins_as_visitor'] = win_stats['wins_as_visiting_team']
        game_dict[side + '_current_win_streak'] = win_stats['current_win_streak']
        game_dict[side + '_current_losing_streak'] = win_stats['current_losing_streak']

    return game_dict

# Run feature_generation over every played game using four joblib workers and
# gather the returned dicts into a single DataFrame (one row per game).
schedule_rows = list(played_schedule.iterrows())
result = pd.DataFrame(
    Parallel(n_jobs=4)(
        delayed(feature_generation)(game) for _index, game in schedule_rows
    )
)

In [None]:
# Notes for regression:
# x_train = list of lists (each inner list holds all the features for one game) - features should be numbers (typically aggregates) - x_train holds the predictor variables
# y_train = list of results (0 = loss, 1 = win)
# feature generation = creating x_train and y_train
# after that it is plug and chug - almost just 5 lines of code
# Make sure x_train and y_train are the same length

# Tasks:
# (1) create an initial list of variables thought to be important (8-25 variables - choose the easiest first)
# (2) create the inner lists of x_train - one for each game; which games are chosen depends on the current perspective
# (3) create the corresponding y_train (0 or 1) - in the same order as x_train, recording whether each chosen game was a win or a loss
# (4) plug in and chug once x_train and y_train are complete

# Methods/Approaches - (1) PCA/SVM (2) Random Forest (3) Linear Regression - within each, we can tweak parameters
# Test the model on a subset of the data that was not put into x_train - the train_test_split() function (scikit-learn) randomly
# holds out values for x_test and y_test; train the model (model.fit()) on the training portion and
# then use the remaining, held-out values for model.predict()

# Predict point spread potentially

# start out with generic features - team stats first
# individual stats would require going into the box scores and checking/iterating through who played in those games
# historical team performance away vs. at home
# account for collinearity

# have 3 working models with about 60-70% accuracy or higher (have at least one)
# cannot use variables that are not available before the game starts

# some good ones: previous game W/L, is_home/is_away, etc. 