In [1]:
# Imports: data handling, preprocessing, models, and evaluation utilities
import pandas as pd
import numpy as np
import sklearn.metrics
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.svm import SVC
from sklearn import cross_validation
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import cross_val_score
from sklearn.metrics import classification_report
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import RandomForestClassifier



In [2]:
# Starter columns to leave out of the feature matrix.
# Only the five visitor_starter_* columns are listed here; the home-starter
# list is left empty (its comprehension is kept below, commented out), so no
# home_starter_* names end up in `starters`.
# home_starters = ['home_starter_{}'.format(i) for i in range(5)]
home_starters = []
visitor_starters = ['visitor_starter_{}'.format(i) for i in range(5)]
starters = list(home_starters)
starters.extend(visitor_starters)

In [3]:
# Load the game table; the first CSV column is used as the row index.
dataset = pd.read_csv(
    'df_regr.csv',
    index_col=0,
)

In [4]:
# Feature matrix: everything except the date, team name/abbreviation columns,
# the target column itself, and the starter columns collected in `starters`.
non_feature_columns = [
    'game_date',
    'home_won',
    'home_team_abbr',
    'visiting_team_abbr',
    'home_team_name',
    'visiting_team_name',
] + starters
X = dataset.drop(non_feature_columns, axis=1)

# Target: the home_won column.
y = dataset['home_won']

In [5]:
# Model 1 - PCA (2 components) + SVC

In [6]:
# Hold out 20% of the rows for testing.
# random_state is fixed so the split -- and every score derived from it -- is
# the same on each Restart & Run All instead of changing from run to run.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42)

In [7]:
# Standardise the features: the scaler is fitted on the training rows only,
# and that same fitted transform is then applied to both splits.
# Note: X_train / X_test are rebound to the scaled arrays.
sc = StandardScaler()
sc.fit(X_train)
X_train = sc.transform(X_train)
X_test = sc.transform(X_test)

In [8]:
# Project the scaled features onto the first two principal components.
# The projection is fitted on the training split and reused for the test split.
pca = PCA(n_components=2).fit(X_train)
X_t_train, X_t_test = pca.transform(X_train), pca.transform(X_test)

# Fraction of total variance captured by each retained component.
explained_variance = pca.explained_variance_ratio_

In [9]:
# Train a support-vector classifier (library defaults) on the 2-D PCA projection.
# The fit call is left as the cell's last expression so the fitted estimator
# is still displayed as the cell output.
classifier = SVC()
classifier.fit(X=X_t_train, y=y_train)

SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape='ovr', degree=3, gamma='auto', kernel='rbf',
  max_iter=-1, probability=False, random_state=None, shrinking=True,
  tol=0.001, verbose=False)

In [10]:
# Report accuracy on the held-out PCA-projected rows, then show the predicted
# labels for those rows.
# Single-argument print(...) with str.format is valid as both the Python 2
# print statement and the Python 3 print function; the bare
# `print 'a', b` form is a SyntaxError on Python 3.
print('score {}'.format(classifier.score(X_t_test, y_test)))
prediction = classifier.predict(X_t_test)
print('pred label {}'.format(prediction))

score 0.61600537996
pred label [ True  True  True ...,  True  True  True]


In [11]:
# Model 2 - Decision Trees
# Hand-picked streak / series / win-count columns, seven per side.
# Each column is listed exactly once: selecting the same label twice would
# put a duplicated column into X_features_only.
features = ['home_team_current_losing_streak', 'home_team_current_win_streak', 'home_team_season_series_losses',
           'home_team_season_series_wins', 'home_team_total_wins', 'home_team_wins_as_home',
           'home_team_wins_as_visitor','visiting_team_current_losing_streak', 'visiting_team_current_win_streak', 'visiting_team_season_series_losses',
           'visiting_team_season_series_wins', 'visiting_team_total_wins', 'visiting_team_wins_as_home', 'visiting_team_wins_as_visitor']
X_features_only = dataset[features]

In [12]:
# Cross-validated accuracy of a decision tree on the hand-picked features.
# random_state pins the tree's internal randomness so the scores are
# repeatable. cv=3 is stated explicitly because the library default fold
# count differs between scikit-learn releases; three folds matches the three
# scores recorded below this cell.
clf = DecisionTreeClassifier(random_state=42)
scores = cross_val_score(clf, X_features_only, dataset['home_won'], scoring='accuracy', cv=3)
print(scores)
print("Accuracy: {0:.1f}%".format(np.mean(scores) * 100))

[ 0.5699879   0.57223567  0.56439241]
Accuracy: 56.9%


In [13]:
# Single 80/20 hold-out evaluation of a decision tree on the hand-picked features.
# Note: this rebinds X_train / X_test / y_train / y_test and `classifier`,
# replacing the values left by the PCA + SVC cells.
# random_state is fixed on both the split and the tree so the confusion matrix
# and accuracy are repeatable across runs.
X_train, X_test, y_train, y_test = train_test_split(
    X_features_only, dataset['home_won'], test_size=.2, random_state=42)
classifier = DecisionTreeClassifier(random_state=42)
classifier = classifier.fit(X_train, y_train)
predictions = classifier.predict(X_test)
# print(...) with one argument works on Python 2 and 3; the bare print
# statement is a SyntaxError on Python 3.
print(sklearn.metrics.confusion_matrix(y_test, predictions))
print("Accuracy: {0:.1f}%".format(sklearn.metrics.accuracy_score(y_test, predictions) * 100))

[[292 317]
 [312 566]]
Accuracy: 57.7%


In [14]:
# Model 3 - Random Forests/Grid Search

In [15]:
# Grid-search a random forest over feature-subset size, split criterion and
# minimum leaf size, with the number of trees fixed at 100.
parameter_space = {
    # 'sqrt' is what 'auto' meant for forest classifiers; the 'auto' spelling
    # is deprecated and rejected by newer scikit-learn releases.
    "max_features": [2, 10, 'sqrt'],
    "n_estimators": [100,],
    "criterion": ["gini", "entropy"],
    "min_samples_leaf": [2,4,6],
}

# Fixed seed so the forest's bootstrap samples and feature draws -- and hence
# the selected parameters and reported score -- are repeatable.
clf = RandomForestClassifier(random_state=42)
grid = GridSearchCV(clf, parameter_space)
# X_train / y_train here are the training split of the hand-picked feature
# subset created in the decision-tree cell, not the PCA data.
grid.fit(X_train, y_train)
# best_score_ is the mean cross-validated score of the best parameter set on
# the training data; it is not accuracy on the held-out X_test rows.
print("Accuracy: {0:.1f}%".format(grid.best_score_ * 100))

Accuracy: 64.7%
