In [2]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.preprocessing import LabelEncoder
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
import pickle

In [3]:
games = pd.read_csv("data/sorted_merged_table.csv")

In [4]:
games['Home'] = games['Home'].apply(lambda x: 1 if x == 'H' else 0)  # Assuming 'H' represents home games


In [5]:
games.fillna(0, inplace=True)

In [6]:
games[['Wins', 'Losses', 'Ties']] = games['Overall'].str.split('-', expand=True).astype(int)


In [7]:
games['WL'] = (games['Wins'] > games['Losses']).astype(int)


In [8]:
games.rename(columns={'<1': 'less_than_1_goal', '>3': 'more_than_3_goals'}, inplace=True)


In [9]:

# One-hot encode categorical columns
categorical_columns = [
    'EAS', 'WES', 'ATL', 'MET', 'CEN', 'PAC', 
    'Oct', 'Nov', 'Dec', 'Jan', 'Feb', 'Mar', 'Apr'
]
games = pd.get_dummies(games, columns=categorical_columns, drop_first=True)

# Map 'Yes' to 1 and 'No' to 0 for specific columns
games['Home'] = games['Home'].map({'Yes': 1, 'No': 0})
games['Road'] = games['Road'].map({'Yes': 1, 'No': 0})
games['Shootout'] = games['Shootout'].map({'Yes': 1, 'No': 0})
games['Overtime'] = games['Overtime'].map({'Yes': 1, 'No': 0})

In [10]:
games = games.drop(columns=['Overall'])

In [11]:

# One-hot encode team columns
team_columns = ['ANA', 'ARI', 'BOS', 'BUF', 'CAR', 'CBJ', 'CGY', 'CHI', 'COL', 
                'DAL', 'DET', 'EDM', 'FLA', 'LAK', 'MIN', 'MTL', 'NJD', 'NSH', 
                'NYI', 'NYR', 'OTT', 'PHI', 'PIT', 'SEA', 'SJS', 'STL', 'TBL', 
                'TOR', 'VAN', 'VEG', 'WPG', 'WSH']
games = pd.get_dummies(games, columns=team_columns, drop_first=True)


In [12]:
le = LabelEncoder()

In [13]:
for col in games.select_dtypes(include=['object']).columns:
    games[col] = le.fit_transform(games[col])


In [14]:
# Define the target and feature columns
target = 'WL'
features = [col for col in games.columns if col != target]


In [15]:

# Split the data into training and test sets
X = games[features]
y = games[target]

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import RandomForestClassifier


In [16]:

# Define the parameter grid for RandomForest
param_grid = {
    'n_estimators': [100, 200, 300],  # Number of trees
    'max_depth': [10, 20, 30, None],  # Depth of each tree
    'min_samples_split': [2, 5, 10],  # Minimum samples to split a node
    'min_samples_leaf': [1, 2, 4],  # Minimum samples to be a leaf node
    'max_features': ['sqrt', 'log2', None],  # Number of features to consider for splitting
    'class_weight': ['balanced', None]  # Handle class imbalance
}

# Initialize the RandomForest model
rf_model = RandomForestClassifier(random_state=42)


In [17]:
# Perform grid search with cross-validation
grid_search = GridSearchCV(estimator=rf_model, param_grid=param_grid, cv=5, n_jobs=-1, verbose=2, scoring='accuracy')

# Fit the grid search
grid_search.fit(X_train, y_train)

# Print the best hyperparameters found by GridSearchCV
print("Best Hyperparameters found by GridSearchCV:", grid_search.best_params_)


Fitting 5 folds for each of 648 candidates, totalling 3240 fits
Best Hyperparameters found by GridSearchCV: {'class_weight': 'balanced', 'max_depth': 10, 'max_features': 'log2', 'min_samples_leaf': 4, 'min_samples_split': 2, 'n_estimators': 300}


In [18]:
# Use the best model from GridSearchCV
best_rf_model = grid_search.best_estimator_

# Evaluate the model on the test set
y_pred = best_rf_model.predict(X_test)

In [19]:
# Print the accuracy of the optimized model
print("Accuracy of the tuned RandomForest model:", accuracy_score(y_test, y_pred))

# Save the optimized model for later use
with open('best_rf_model.pkl', 'wb') as f:
    pickle.dump(best_rf_model, f)

Accuracy of the tuned RandomForest model: 0.8571428571428571
