Imports

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from matplotlib import rcParams
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
import joblib

Load imputed dataset with all columns

In [2]:
df = pd.read_csv(r'Data/seasonstats_imputed.csv')

Linear Regression for imputed dataset with all columns

In [3]:
# Features (X) and target (y)
X = df.drop(columns=['Unnamed: 0', 'Season', 'Squad', 'Pts'])
y = df['Pts']

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Create the logistic regression model
lin_reg_model = LinearRegression()

# Train the model
lin_reg_model.fit(X_train, y_train)

# Make predictions on the test set
y_pred = lin_reg_model.predict(X_test)

# Evaluate the model
mae = mean_absolute_error(y_test, y_pred)
mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

print(f'Mean Absolute Error: {mae}')
print(f'Mean Squared Error: {mse}')
print(f'R^2 Score: {r2}')

Mean Absolute Error: 5.8772057480773965
Mean Squared Error: 65.61840471669557
R^2 Score: 0.627417986965703


Random Forest Model for imputed dataset with all columns

In [4]:
# Features (X) and target (y)
X = df.drop(columns=['Unnamed: 0', 'Season', 'Squad', 'Pts'])
y = df['Pts']

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Create the logistic regression model
RF_model = RandomForestRegressor()

# Train the model
RF_model.fit(X_train, y_train)

# Make predictions on the test set
y_pred = RF_model.predict(X_test)

# Evaluate the model
mae = mean_absolute_error(y_test, y_pred)
mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

print(f'Mean Absolute Error: {mae}')
print(f'Mean Squared Error: {mse}')
print(f'R^2 Score: {r2}')

# Random Forest Model reduces error and increases r^2

Mean Absolute Error: 4.316510721247563
Mean Squared Error: 50.62412261208578
R^2 Score: 0.7125556832364297


In [5]:
# Save the model to a file
joblib.dump(RF_model, 'random_forest_model.pkl')

['random_forest_model.pkl']

Random Forest Model on imputed dataset with only key features

In [6]:
# Features (X) and target (y)
X = df.drop(columns=['Unnamed: 0', 'Season', 'Squad', 'Pts'])
y = df['Pts']

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Create the logistic regression model
RF_model_2 = RandomForestRegressor()

# Train the model
RF_model_2.fit(X_train, y_train)

# Make predictions on the test set
y_pred = RF_model_2.predict(X_test)

# Evaluate the model
mae = mean_absolute_error(y_test, y_pred)
mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

print(f'Mean Absolute Error: {mae}')
print(f'Mean Squared Error: {mse}')
print(f'R^2 Score: {r2}')

# Dropping features decreased model accuracy

# TODO Try regularization, PCA, and/or cross validation 

Mean Absolute Error: 4.370604288499025
Mean Squared Error: 51.14643430799221
R^2 Score: 0.709589991767209
