In [35]:
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestClassifier

In [36]:
# Columns to load from the play-by-play CSV.
# 'posteam' and 'PlayType' are needed for filtering and labelling; the rest are model inputs.
features = [
    'qtr',
    'down',
    'TimeSecs',
    'yrdline100',
    'ydstogo',
    'posteam',
    'PosTeamScore',
    'DefTeamScore',
    'PlayType',
]

In [37]:
# Read the play-by-play file, keeping only the columns named in `features`
all_plays = pd.read_csv(
    'NFL Play by Play 2009-2016 (v3).csv',
    usecols=features,
)

In [38]:
def plays_for_team(plays,team_name):
    """Return the run and pass plays of one team, without the team column.

    A row is kept when its 'posteam' equals ``team_name``, its 'PlayType' is
    'Pass' or 'Run' (every other PlayType value is filtered out), and its
    'down' is not null. The 'posteam' column is dropped from the result;
    ``plays`` itself is left unmodified.
    """
    is_team = plays['posteam'] == team_name
    is_run_or_pass = plays['PlayType'].isin(['Pass', 'Run'])
    has_down = plays['down'].notnull()
    selected = plays.loc[is_team & is_run_or_pass & has_down]
    return selected.drop('posteam', axis=1)

In [39]:
# Select the run/pass plays where 'NE' is the team in the 'posteam' column
team_plays = plays_for_team(plays=all_plays, team_name='NE')

In [40]:
# Split up into features and labels, and convert to numpy arrays.
# The feature frame gets its own name instead of overwriting `team_plays`:
# `team_plays` keeps its 'PlayType' column, so this cell can be re-run
# without a KeyError on the label lookup below.
labels = np.array(team_plays['PlayType'])
team_features = team_plays.drop('PlayType', axis=1)
feature_list = list(team_features.columns)
team_plays_np = np.array(team_features)

In [41]:
# Chronological train/test split: the rows are in time order, so the first
# 80% of plays are used for training and the remaining 20% for testing.
train_test_split = int(.8 * len(team_plays_np))

# Everything before the cut-off row
train_features = team_plays_np[:train_test_split]
test_features = team_plays_np[train_test_split:]

# Labels are cut at the same row so they stay aligned with the features
train_labels = labels[:train_test_split]
test_labels = labels[train_test_split:]

In [42]:
# Show the shape of each piece of the split, one per line: test first, then train
print("\n".join(
    str(part.shape)
    for part in (test_features, test_labels, train_features, train_labels)
))

(1654, 7)
(1654,)
(6616, 7)
(6616,)


In [43]:
# Baseline accuracy: the share of the test set taken by the more common of the
# two play types, i.e. the score from always guessing that play type.
num_pass = int(np.count_nonzero(test_labels == "Pass"))
num_run = int(np.count_nonzero(test_labels == "Run"))
baseline = float(max(num_pass, num_run)) / (num_pass + num_run)
print(baseline)

0.569528415961


In [55]:
# Create the model: a 10000-tree random forest with random_state pinned so the
# fitted forest is reproducible from run to run.
# n_jobs=-1 lets scikit-learn build the trees on all available cores instead of
# one; with this many trees a single-core fit is needlessly slow.
rf = RandomForestClassifier(n_estimators=10000, random_state=42, n_jobs=-1)
# Train the model on training data (the trailing semicolon suppresses the
# estimator repr that the notebook would otherwise echo)
rf.fit(train_features, train_labels);

In [56]:
# Predict a play type for every row of the held-out test features
predictions = rf.predict(test_features)

In [57]:
# Test accuracy: number of predictions matching the true label, over the test-set size
float(np.count_nonzero(predictions == test_labels)) / len(test_labels)

0.652962515114873