In [1]:
import pandas as pd
import numpy as np
import json
pd.set_option("display.max_columns", None)
from IPython.display import display
from joblib import Parallel, delayed

In [2]:
teams_dict = {}
with open("teams.json", 'r') as _f:
    teams_dict = json.load(_f)

def convert_team(name, year):
    convert_dict = {}
    for team in teams_dict:
        convert_dict[team['teamName']] = team['abbreviation']
    
    name_changes = {
        'Charlotte Hornets': 'CHA',
        'Charlotte Bobcats': 'CHA',
        'New Orleans Hornets': 'NOH',
        'New Orleans Pelicans': 'NOP',
        'Brooklyn Nets': 'BRK'
    }
    if name in name_changes:
        return name_changes[name]
    
    return convert_dict[name]

In [3]:
schedules = pd.read_csv('2012_2017_schedules.csv', index_col=0)
played_schedule = schedules.dropna()
played_schedule['start_time'] = pd.to_datetime(played_schedule['start_time'])\
                                .dt.tz_localize('UTC').dt.tz_convert('US/Eastern')

box_scores = pd.read_csv('2012_2017_box_scores.csv', index_col=0)
box_scores['date'] = pd.to_datetime(box_scores['date'])

season_stats = pd.read_csv('2012_2017_season_player_stats.csv', index_col=0)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  app.launch_new_instance()


In [4]:
def get_starters(home_team, visitor_team, date):
    df_slice = box_scores.query('team=="'+ home_team +'" or team=="'+ visitor_team +'"')
    game_players = df_slice[df_slice['date'] == date] 
    visitor_players = game_players[game_players['team']==visitor_team]
    home_players = game_players[game_players['team']==home_team]
    home_starters_df = home_players.sort_values('seconds_played', ascending=False).head(5)
    home_starters = [(p['first_name'], p['last_name']) for _,p in home_starters_df.iterrows()]
    visitor_starters_df = visitor_players.sort_values('seconds_played', ascending=False).head(5)
    visitor_starters = [(p['first_name'], p['last_name']) for _,p in visitor_starters_df.iterrows()]
    return home_starters, visitor_starters

In [5]:
def get_previous_wins(team, opponent, date):
    # TODO: Get past record this season against opponent
    lower_year = date.year
    if date.month < 7:
        lower_year -= 1
    lower_bound = pd.to_datetime('09-01-' + str(lower_year)).date()
    df_slice = played_schedule[(played_schedule['start_time'] < date) \
                               & (played_schedule['start_time'] > lower_bound)]
    our_team_slice = df_slice.query('home_team_name=="' + team + '" or visiting_team_name=="' + team + '"')
    
    home_wins = our_team_slice.apply(lambda row: row['home_team_name'] == team and\
                                     row['home_team_score'] > row['visiting_team_score'], axis=1)
    home_wins = home_wins.sum() if not home_wins.empty else 0
    visiting_wins = our_team_slice.apply(lambda row: row['visiting_team_name'] == team and\
                                     row['home_team_score'] < row['visiting_team_score'], axis=1)
    visiting_wins = visiting_wins.sum() if not visiting_wins.empty else 0
    
    win_streak = 0
    for _,row in our_team_slice.sort_values('start_time', ascending=False).iterrows():
        if row['home_team_name'] == team and row['home_team_score'] > row['visiting_team_score']:
            win_streak += 1
        elif row['visiting_team_name'] == team and row['home_team_score'] < row['visiting_team_score']:
            win_streak += 1
        else:
            break
    
    losing_streak = 0
    if win_streak > 0:
        pass
    else:
        for _,row in our_team_slice.sort_values('start_time', ascending=False).iterrows():
            if row['home_team_name'] == team and row['home_team_score'] < row['visiting_team_score']:
                losing_streak += 1
            elif row['visiting_team_name'] == team and row['home_team_score'] > row['visiting_team_score']:
                losing_streak += 1
            else:
                break
                
    
    season_series = our_team_slice.query('home_team_name=="' + opponent + '" or visiting_team_name=="' + opponent + '"')
    season_series_wins = 0
    season_series_losses = 0
    for _,row in season_series.iterrows():
        if row['home_team_name'] == team:
            if row['home_team_score'] > row['visiting_team_score']:
                season_series_wins += 1
            else:
                season_series_losses +=1
        elif row['visiting_team_name'] == team:
            if row['home_team_score'] < row['visiting_team_score']:
                season_series_wins += 1
            else:
                season_series_losses += 1
    
    return {
        'wins_as_home_team': home_wins,
        'wins_as_visiting_team': visiting_wins,
        'total_current_wins': (home_wins + visiting_wins),
        'current_win_streak': win_streak,
        'current_losing_streak': losing_streak,
        'season_series_wins': season_series_wins,
        'season_series_losses': season_series_losses
    }

In [6]:
X, Y = [], []
df_regr_list = []

for _, row in played_schedule.iterrows():    
    game_dict = {}
    # Get the teams and starting players for this game
    game_date = row['start_time'].date()
    home = convert_team(row['home_team_name'], game_date.year)
    visitors = convert_team(row['visiting_team_name'], game_date.year)
    home_starters, visitor_starters = get_starters(home, visitors, game_date)
    game_dict['home_team_name'] = row['home_team_name']
    game_dict['home_team_abbr'] = home
    game_dict['visiting_team_name'] = row['visiting_team_name']
    game_dict['visiting_team_abbr'] = visitors
    game_dict['game_date'] = game_date
    
    for player_i in xrange(len(home_starters)):
        first_name = home_starters[player_i][0]
        last_name = home_starters[player_i][1]
        df_player_slice = box_scores.query('first_name=="' + first_name + '" and last_name=="' + last_name +'"')
        current_player = df_player_slice[df_player_slice['date'] < game_date]
        features = ['assists','blocks','defensive_rebounds','field_goal_attempts','field_goals',
                   'free_throw_attempts','free_throws','offensive_rebounds','personal_fouls','points',
                   'steals','three_point_field_goal_attempts','three_point_field_goals','total_rebounds',
                    'turnovers']
        game_dict['visitor_starter_' + str(player_i)] = first_name + " " + last_name
        
        for f in features:
            desired_feat = current_player[f]
            career_mean_feat = np.mean(desired_feat)
            career_mean_feat = 0 if career_mean_feat != career_mean_feat else career_mean_feat
            game_dict['home_starter_' + str(player_i) + "_CAREER_" + f] = career_mean_feat
    
    for player_i in xrange(len(visitor_starters)):
        first_name = visitor_starters[player_i][0]
        last_name = visitor_starters[player_i][1]
        df_player_slice = box_scores.query('first_name=="' + first_name + '" and last_name=="' + last_name +'"')
        current_player = df_player_slice[df_player_slice['date'] < game_date]
        features = ['assists','blocks','defensive_rebounds','field_goal_attempts','field_goals',
                   'free_throw_attempts','free_throws','offensive_rebounds','personal_fouls','points',
                   'steals','three_point_field_goal_attempts','three_point_field_goals','total_rebounds',
                    'turnovers']
        game_dict['visitor_starter_' + str(player_i)] = first_name + " " + last_name
        
        for f in features:
            desired_feat = current_player[f]
            career_mean_feat = np.mean(desired_feat) 
            career_mean_feat = 0 if career_mean_feat != career_mean_feat else career_mean_feat
            game_dict['visitor_starter_' + str(player_i) + "_CAREER_" + f] = career_mean_feat
    
    game_dict['home_won'] = row['home_team_score'] > row['visiting_team_score']
    home_win_stats = get_previous_wins(row['home_team_name'], row['visiting_team_name'], game_date)
    game_dict['home_team_total_wins'] = home_win_stats['total_current_wins']
    game_dict['home_team_wins_as_home'] = home_win_stats['wins_as_home_team']
    game_dict['home_team_wins_as_visitor'] = home_win_stats['wins_as_visiting_team']
    game_dict['home_team_current_win_streak'] = home_win_stats['current_win_streak']
    game_dict['home_team_current_losing_streak'] = home_win_stats['current_losing_streak']
    game_dict['home_team_season_series_wins'] = home_win_stats['season_series_wins']
    game_dict['home_team_season_series_losses'] = home_win_stats['season_series_losses']
    visiting_win_stats = get_previous_wins(row['visiting_team_name'], row['home_team_name'], game_date)
    game_dict['visiting_team_total_wins'] = visiting_win_stats['total_current_wins']
    game_dict['visiting_team_wins_as_home'] = visiting_win_stats['wins_as_home_team']
    game_dict['visiting_team_wins_as_visitor'] = visiting_win_stats['wins_as_visiting_team']
    game_dict['visiting_team_current_win_streak'] = visiting_win_stats['current_win_streak']
    game_dict['visiting_team_current_losing_streak'] = visiting_win_stats['current_losing_streak']
    game_dict['visiting_team_season_series_wins'] = visiting_win_stats['season_series_wins']
    game_dict['visiting_team_season_series_losses'] = visiting_win_stats['season_series_losses']
    
    df_regr_list.append(game_dict)

In [7]:
df_regr = pd.DataFrame(df_regr_list)

In [8]:
df_regr.to_csv('df_regr.csv')

In [9]:
#Notes for regression:
# x_train = list of lists (interior list is all of features) - features should be (aggregate typically) numbers - x-train are vars to predict
# y_train = list of results (0-loss or 1-win)
# feature generation - creating x_train and y_train
# plug in and chug 5 lines code almost
# Make sure x_train and y_train are same length

# Tasks: 
# (1) create initial list of variables that thought to be important (8-25 variables - choose easiest first)
# (2) create the interior lists of x-train - one for each game and which games chosen dependent on current perspective
# (3) create the corresponding y-train (0,1) - basically in same order whether game chosen in x-train won or loss
# (4) plug in and chug once x-train and y-train completed

# Methods/Approaches - (1) PCA/SVM (2) Random Forest (3) Linear Regression - within each, can tweak parameters
# test model on subset of data that was not put into x-train - testtrainsplit() function arbitrarily picks values 
# for x-test and y-test and for train model (model.fit()) and 
# then use remaining values not used are used for model.predict()

# Predict point spread potentially

# start out with generic features - team stats first 
# individual stats would have to go into box scores and check/iterate through who played in those game
# historical team performances vs away/home 
# account for collinearity 

# have 3 models that work that have about 60-70% or higher (have at least one to) 
# cannot use variables that are not accessible before game starts 

# some good ones: previous game W/L, is_home/is_away, etc. 

In [13]:
x = df_regr[df_regr.isnull().any(axis=1)]

In [23]:
y = x[['home_team_abbr', 'visiting_team_abbr', 'game_date']]

In [24]:
from collections import Counter
y

Unnamed: 0,home_team_abbr,visiting_team_abbr,game_date
12,SAS,OKC,2012-11-01
40,BRK,MIN,2012-11-05
41,DAL,POR,2012-11-05
42,LAC,CLE,2012-11-05
43,MEM,UTA,2012-11-05
44,MIA,PHO,2012-11-05
45,PHI,NYK,2012-11-05
46,SAC,GSW,2012-11-05
47,SAS,IND,2012-11-05
93,CHI,BOS,2012-11-12


In [25]:
box_scores[box_scores['date'] == pd.to_datetime("2012-11-01")]

Unnamed: 0,Unnamed: 0.1,assists,blocks,date,defensive_rebounds,field_goal_attempts,field_goals,first_name,free_throw_attempts,free_throws,is_home,last_name,offensive_rebounds,opponent,personal_fouls,points,seconds_played,steals,team,three_point_field_goal_attempts,three_point_field_goals,total_rebounds,turnovers


In [31]:
box_scores[(box_scores['team'] == 'SAS') & (box_scores['date'] < pd.to_datetime('2012-11-03'))]

Unnamed: 0,Unnamed: 0.1,assists,blocks,date,defensive_rebounds,field_goal_attempts,field_goals,first_name,free_throw_attempts,free_throws,is_home,last_name,offensive_rebounds,opponent,personal_fouls,points,seconds_played,steals,team,three_point_field_goal_attempts,three_point_field_goals,total_rebounds,turnovers
68,68,3,3,2012-10-31,9,15,10,Tim,5,4,False,Duncan,2,NOH,3,24,2062.0,0,SAS,0,0,11,5
75,75,6,0,2012-10-31,3,19,9,Tony,6,4,False,Parker,0,NOH,2,23,2125.0,0,SAS,1,1,3,0
81,81,1,0,2012-10-31,6,13,6,Kawhi,4,4,False,Leonard,1,NOH,5,19,2057.0,5,SAS,6,3,7,2
113,113,6,1,2012-10-31,2,3,2,Boris,0,0,False,Diaw,2,NOH,1,4,1789.0,2,SAS,1,0,4,2
135,135,1,0,2012-10-31,3,5,3,Tiago,2,1,False,Splitter,3,NOH,1,7,950.0,0,SAS,0,0,6,1
162,162,0,2,2012-10-31,3,11,4,Danny,0,0,False,Green,0,NOH,1,9,1957.0,1,SAS,5,1,3,1
169,169,1,0,2012-10-31,4,6,2,Stephen,6,3,False,Jackson,1,NOH,5,7,1485.0,1,SAS,3,0,5,0
209,209,3,0,2012-10-31,2,5,1,Gary,1,0,False,Neal,1,NOH,1,2,1040.0,0,SAS,2,0,3,1
212,212,1,1,2012-10-31,1,0,0,Patty,2,1,False,Mills,0,NOH,0,1,363.0,0,SAS,0,0,1,2
214,214,0,1,2012-10-31,0,3,1,Matt,0,0,False,Bonner,0,NOH,2,3,572.0,0,SAS,2,1,0,1
