In [9]:
import numpy as np 
import pandas as pd 
import matplotlib.pyplot as plt 


from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.linear_model import LinearRegression
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.ensemble import RandomForestRegressor
from datetime import datetime
from pathlib import Path


In [4]:
current_datetime =  datetime.now()
current_date_str = str(current_datetime.date()).replace('-','_')
output_directory = '../data/processed/player_data/'
output_filename = current_date_str + '_clean_player_data.csv'
file_path = output_directory + output_filename


df = pd.read_csv(file_path)

duplicate_players = df[df.duplicated(subset=['Player'], keep=False)]

duplicate_players_same_team = duplicate_players[duplicate_players.duplicated(subset=['Player', 'Team'], keep=False)]

df = df.drop_duplicates(subset=['Player', 'Team'], keep='first')



defense_df = df[df['Pos'] == 'D']

In [5]:
def prep_atoi(atoi_string):
    list = atoi_string.split(':')
    minutes = int(list[0])
    seconds = int(list[1])

    return minutes + (seconds/60)

In [6]:
columns_to_drop = ['Unnamed: 0', 'Pos', 'EV.1', 'PP.1', 'SH.1', 'TOI','FOW', 'FOL', 'FO%',  'S/C','Team', 'specific_pos', 'Cap%', 'ATOI', 'Ht', 'Wt' ]

defense_df['icetime_per_game'] = defense_df['ATOI'].apply(prep_atoi)

df = defense_df.drop(columns=columns_to_drop)
df = df.dropna()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  defense_df['icetime_per_game'] = defense_df['ATOI'].apply(prep_atoi)


In [7]:
df['points_per_game'] = (df['G'] + df['A']) / df['GP']
df['penalty_mins_per_game'] = df['PIM'] / df['GP']
df['shots_per_game'] = df['S'] / df['GP']
df['shot%_per_game'] = df['S%'] / df['GP']
df['hits_per_game'] = df['HIT'] / df['GP']
df['blocks_per_game'] = df['BLK'] / df['GP']
df['years_of_experience'] = df['Exp']

 
df = df.drop(columns=['G', 'A', 'PIM', 'S', 'S%', 'HIT', 'BLK', 'GP', 'PTS', 'Exp'])

In [8]:
player_names = df['Player']


# Features (X) - Exclude the target column
X = df.drop(['Salary', 'Player'], axis=1)

# Target (y) - The column you want to predict
y = df['Salary']


X_encoded = pd.get_dummies(X)


# Split the data into training and testing sets
# Adjust the test_size parameter as needed
X_train, X_test, y_train, y_test, player_names_train, player_names_test = train_test_split(
    X_encoded, y, player_names, test_size=0.2, random_state=42
)


In [13]:

# Initialize the Random Forest Regressor
rf_model = RandomForestRegressor()

scores = cross_val_score(rf_model, X_train, y_train, cv=6, scoring='neg_mean_squared_error')
cv_mse = -scores.mean()  # Calculate mean squared error

# 5. Evaluate model performance during cross-validation
print("Cross-Validation Mean Squared Error:", cv_mse)




# 6. Train model on full data
rf_model.fit(X_encoded, y)

# 7. Make predictions
all_predictions = rf_model.predict(X)

# 8. Evaluate predictions (optional)
mse = mean_squared_error(y, all_predictions)
print("Mean Squared Error on Full Dataset:", mse)

Cross-Validation Mean Squared Error: 2763235356139.483
Mean Squared Error on Full Dataset: 368889689093.96967


In [19]:
print(f'{368889689093.96967:e}')
print(f'{2763235356139.483:e}')

3.688897e+11
2.763235e+12
