In [2]:
import pandas as pd
import numpy as np
import datetime

# Plotting libraries
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.graph_objects as go
import chart_studio.tools as tls
import chart_studio
import chart_studio.plotly as py
from plotly.offline import download_plotlyjs, init_notebook_mode, plot, iplot
from plotly_api_key import plotly_api_key

# Scikit libraries
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import PowerTransformer
from sklearn.ensemble import RandomForestClassifier
from sklearn.externals import joblib

In [3]:
# Load the model from the file 
RF_from_joblib = joblib.load('final_RF_model.pkl')

In [4]:
RF_from_joblib

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
                       max_depth=None, max_features='auto', max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=300,
                       n_jobs=None, oob_score=False, random_state=None,
                       verbose=0, warm_start=False)

In [5]:
# Load the scaler
pt_scaler = joblib.load('pt_scaler.pkl')

In [6]:
pt_scaler

PowerTransformer(copy=True, method='yeo-johnson', standardize=True)

In [7]:
# Getting raw data
keep_columns = ['posteam','game_date','play_type', 'yardline_100', 'half_seconds_remaining', 'game_seconds_remaining','ydstogo', 'score_differential', 'down']




In [8]:
raw_data = pd.read_csv('resources/NFL_Play_by_Play_2009_2018.csv', low_memory=False)
raw_data.head()

Unnamed: 0,play_id,game_id,home_team,away_team,posteam,posteam_type,defteam,side_of_field,yardline_100,game_date,...,penalty_player_id,penalty_player_name,penalty_yards,replay_or_challenge,replay_or_challenge_result,penalty_type,defensive_two_point_attempt,defensive_two_point_conv,defensive_extra_point_attempt,defensive_extra_point_conv
0,46,2009091000,PIT,TEN,PIT,home,TEN,TEN,30.0,2009-09-10,...,,,,0,,,0.0,0.0,0.0,0.0
1,68,2009091000,PIT,TEN,PIT,home,TEN,PIT,58.0,2009-09-10,...,,,,0,,,0.0,0.0,0.0,0.0
2,92,2009091000,PIT,TEN,PIT,home,TEN,PIT,53.0,2009-09-10,...,,,,0,,,0.0,0.0,0.0,0.0
3,113,2009091000,PIT,TEN,PIT,home,TEN,PIT,56.0,2009-09-10,...,,,,0,,,0.0,0.0,0.0,0.0
4,139,2009091000,PIT,TEN,PIT,home,TEN,PIT,56.0,2009-09-10,...,,,,0,,,0.0,0.0,0.0,0.0


In [9]:
raw_data.columns

Index(['play_id', 'game_id', 'home_team', 'away_team', 'posteam',
       'posteam_type', 'defteam', 'side_of_field', 'yardline_100', 'game_date',
       ...
       'penalty_player_id', 'penalty_player_name', 'penalty_yards',
       'replay_or_challenge', 'replay_or_challenge_result', 'penalty_type',
       'defensive_two_point_attempt', 'defensive_two_point_conv',
       'defensive_extra_point_attempt', 'defensive_extra_point_conv'],
      dtype='object', length=255)

In [10]:
raw_data.shape

(449371, 255)

In [11]:
raw_data.columns

Index(['play_id', 'game_id', 'home_team', 'away_team', 'posteam',
       'posteam_type', 'defteam', 'side_of_field', 'yardline_100', 'game_date',
       ...
       'penalty_player_id', 'penalty_player_name', 'penalty_yards',
       'replay_or_challenge', 'replay_or_challenge_result', 'penalty_type',
       'defensive_two_point_attempt', 'defensive_two_point_conv',
       'defensive_extra_point_attempt', 'defensive_extra_point_conv'],
      dtype='object', length=255)

In [12]:
selected = raw_data[keep_columns].copy()
selected['game_date']= pd.to_datetime(selected['game_date'])

In [13]:
selected.head()

Unnamed: 0,posteam,game_date,play_type,yardline_100,half_seconds_remaining,game_seconds_remaining,ydstogo,score_differential,down
0,PIT,2009-09-10,kickoff,30.0,1800.0,3600.0,0,,
1,PIT,2009-09-10,pass,58.0,1793.0,3593.0,10,0.0,1.0
2,PIT,2009-09-10,run,53.0,1756.0,3556.0,5,0.0,2.0
3,PIT,2009-09-10,pass,56.0,1715.0,3515.0,8,0.0,3.0
4,PIT,2009-09-10,punt,56.0,1707.0,3507.0,8,0.0,4.0


In [14]:
prkp = selected[(selected.play_type == 'pass') | 
                (selected.play_type == 'run') | 
                (selected.play_type == 'field_goal') | 
                (selected.play_type == 'punt')]

In [15]:
prkp.head()

Unnamed: 0,posteam,game_date,play_type,yardline_100,half_seconds_remaining,game_seconds_remaining,ydstogo,score_differential,down
1,PIT,2009-09-10,pass,58.0,1793.0,3593.0,10,0.0,1.0
2,PIT,2009-09-10,run,53.0,1756.0,3556.0,5,0.0,2.0
3,PIT,2009-09-10,pass,56.0,1715.0,3515.0,8,0.0,3.0
4,PIT,2009-09-10,punt,56.0,1707.0,3507.0,8,0.0,4.0
5,TEN,2009-09-10,run,98.0,1696.0,3496.0,10,0.0,1.0


In [16]:
prkp.shape

(353060, 9)

In [17]:
weather = pd.read_csv('resources/weather_data/weather_final_full.csv')

In [18]:
weather.head()

Unnamed: 0.1,Unnamed: 0,zip_code,date,desc,temperature,snow,windspeed,visibility,humidity,precipitation
0,0,19148,2009-09-01,Sunny,71,0.0,9,6,58,0.0
1,1,19148,2009-09-02,Sunny,73,0.0,9,6,63,0.0
2,2,19148,2009-09-03,Sunny,74,0.0,10,6,64,0.0
3,3,19148,2009-09-04,Partly cloudy,71,0.0,6,6,69,0.0
4,4,19148,2009-09-05,Sunny,81,0.0,5,6,52,0.0


In [19]:
weather.shape

(41491, 10)

In [20]:
humidity = weather[['date', 'humidity']].copy()

In [21]:
humidity.head()

Unnamed: 0,date,humidity
0,2009-09-01,58
1,2009-09-02,63
2,2009-09-03,64
3,2009-09-04,69
4,2009-09-05,52


In [22]:
humidity.rename(columns={'date': 'game_date'}, inplace=True)

In [23]:
humidity['game_date'] = pd.to_datetime(humidity['game_date'])

In [24]:
humidity.head()

Unnamed: 0,game_date,humidity
0,2009-09-01,58
1,2009-09-02,63
2,2009-09-03,64
3,2009-09-04,69
4,2009-09-05,52


In [25]:
prkp['humidity'] = humidity.loc[humidity.game_date.isin(prkp.game_date)]['humidity']



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



In [26]:
prkp['humidity']

1        NaN
2        NaN
3        NaN
4        NaN
5        NaN
          ..
449363   NaN
449364   NaN
449366   NaN
449367   NaN
449368   NaN
Name: humidity, Length: 353060, dtype: float64

In [35]:
prkp_w_weather = prkp.groupby('game_date').bfill()

In [37]:
prkp_w_weather.head(20)

Unnamed: 0,posteam,play_type,yardline_100,half_seconds_remaining,game_seconds_remaining,ydstogo,score_differential,down,humidity
1,PIT,pass,58.0,1793.0,3593.0,10,0.0,1.0,93.0
2,PIT,run,53.0,1756.0,3556.0,5,0.0,2.0,93.0
3,PIT,pass,56.0,1715.0,3515.0,8,0.0,3.0,93.0
4,PIT,punt,56.0,1707.0,3507.0,8,0.0,4.0,93.0
5,TEN,run,98.0,1696.0,3496.0,10,0.0,1.0,93.0
6,TEN,pass,98.0,1660.0,3460.0,10,0.0,2.0,93.0
7,TEN,run,94.0,1631.0,3431.0,6,0.0,3.0,93.0
8,TEN,punt,96.0,1594.0,3394.0,8,0.0,4.0,93.0
9,PIT,pass,43.0,1584.0,3384.0,10,0.0,1.0,93.0
10,PIT,pass,40.0,1548.0,3348.0,7,0.0,2.0,64.0


In [None]:
humidity.loc[humidity.game_date.isin(prkp.game_date)]

In [None]:
merged_data = pd.merge(prkp, humidity, on='game_date', how='right')

In [None]:
merged_data

In [None]:
merged_data.isna().sum()

In [None]:
merged_no_na = merged_data.dropna()

In [None]:
merged_no_na.head()

In [None]:
all_teams = list(merged_no_na.posteam.unique())
all_teams

In [None]:
DET = merged_no_na[merged_no_na.posteam == 'DET'][['play_type', 'yardline_100', 
                                                  'half_seconds_remaining', 
                                                  'game_seconds_remaining', 
                                                  'down', 'ydstogo', 
                                                  'score_differential', 
                                                  'humidity']]

In [None]:
DET.shape

In [None]:
X = DET.drop(columns=['play_type'])
y = DET.play_type
X_transformed = pt_scaler.transform(X)
DET_pred = RF_from_joblib.score(X_transformed, y)

In [None]:
team_accuracies = pd.DataFrame()
team_accuracies['team'] = ['PIT', 'TEN', 'CLE', 'MIN', 'NO', 'DET']
team_accuracies['score'] = [pit_pred, ten_pred, CLE_pred, MIN_pred, NO_pred, DET_pred]

In [None]:
team_accuracies

In [None]:
merged1['humidity'] = 

In [None]:
prkp['humidity'] = humidity.loc[humidity.game_date.isin(prkp.game_date)]['humidity']

In [None]:
prkp.humidity.isna().sum()

In [None]:
prkp.shape

In [None]:
prkp.loc[prkp.game_date.isin(humidity.game_date), 'humidity'] = humidity.humidity

In [None]:
prkp.humidity

In [None]:
 humidity.loc[humidity.game_date.isin(prkp.game_date)]['humidity']

In [None]:
humidity.game_date

In [None]:
prkp.dtypes

In [None]:
merged_no_na.shape

In [None]:
merged_no_na.drop_duplicates()