# Fit the model

In [1]:
import pandas as pd
import numpy as np

In [2]:
# Load the player game-history CSV into a DataFrame and preview it.
player_game_history = pd.read_csv(
    filepath_or_buffer='data/player_game_history_clean.csv'
)
player_game_history

Unnamed: 0,round,player_name,team_name,opponent_team_name,was_home,total_points
0,1,Johan Dahlin,Malmö FF,Hammarby,True,0
1,2,Johan Dahlin,Malmö FF,BK Häcken,False,0
2,3,Johan Dahlin,Malmö FF,Östersunds FK,True,0
3,4,Johan Dahlin,Malmö FF,Djurgården,False,0
4,5,Johan Dahlin,Malmö FF,Varbergs BoIS,True,0
...,...,...,...,...,...,...
7324,16,Karl Serrano,Degerfors IF,Malmö FF,False,0
7325,16,Nikola Djurdjic,Degerfors IF,Malmö FF,False,0
7326,16,Ahmed Yasin Ghani Mousa,Örebro SK,IK Sirius,True,0
7327,16,Rasmus Bonde,AIK,BK Häcken,True,0


In [4]:
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer

# Separate the regression target (y) from the feature columns (X).
y = player_game_history["total_points"]
X = player_game_history.drop(columns=["total_points"])

# One-hot encode the categorical columns; every remaining column is passed
# through to the output unchanged.
categorical_features = ["round", "player_name", "team_name", "opponent_team_name"]
one_hot = OneHotEncoder()
transformer = ColumnTransformer(
    transformers=[("one_hot", one_hot, categorical_features)],
    remainder="passthrough",
)

# Fit the encoder on X and produce the encoded feature matrix in one step.
transformed_X = transformer.fit_transform(X)

In [5]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split

# Fixed seed shared by the split and the forest so that the train/test
# partition, the fitted model and every metric computed from them are
# reproducible under Restart Kernel -> Run All.
RANDOM_STATE = 42

# Split into training and test (20% of the rows are held out for testing)
X_train, X_test, y_train, y_test = train_test_split(
    transformed_X, y, test_size=0.2, random_state=RANDOM_STATE
)

# Build model
model = RandomForestRegressor(random_state=RANDOM_STATE)
model.fit(X_train, y_train)

RandomForestRegressor()

In [6]:
# Score the fitted model on the held-out test split
# (for scikit-learn regressors, `score` returns R^2).
model.score(X=X_test, y=y_test)

0.20329815200138712

In [7]:
# Predict the target for every row of the test split and show the predictions.
y_preds = model.predict(X=X_test)
y_preds

array([0.77, 0.  , 1.11, ..., 0.53, 0.  , 0.48])

In [8]:
# Fraction of test rows where the prediction is exactly equal to the actual value.
# NOTE(review): exact equality is a classification-style accuracy; for a
# regressor with continuous predictions it is of limited meaning - confirm
# this is the intended check.
np.equal(y_preds, y_test).mean()

0.24351978171896316

### Calculate R^2

In [9]:
from sklearn.metrics import r2_score

# Baseline "prediction": an array as long as y_test in which every entry is
# the mean of y_test.
y_test_mean = np.full(shape=len(y_test), fill_value=y_test.mean())
y_test.mean()

1.655525238744884

In [10]:
# R^2 of the constant mean baseline against the actual test values.
r2_score(y_true=y_test, y_pred=y_test_mean)

0.0

In [11]:
# R^2 when the "predictions" are the actual test values themselves.
r2_score(y_true=y_test, y_pred=y_test)

1.0

### Mean Absolute Error (MAE)

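MAE is the average of the absolute differences between the predictions and the actual values. It shows how far off the model's predictions are on average, expressed in the same units as the target (points).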

In [12]:
from sklearn.metrics import mean_absolute_error

# Predict on the test split again and compute the mean absolute error
# between the actual values and those predictions.
y_preds = model.predict(X=X_test)
mae = mean_absolute_error(y_true=y_test, y_pred=y_preds)
mae

1.2365484311050476

In [13]:
# Put actual and predicted values side by side, then add the absolute
# difference between them as a third column.
diffs = pd.DataFrame({"actual": y_test, "predicted": y_preds}).assign(
    diff=lambda frame: (frame["actual"] - frame["predicted"]).abs()
)
diffs

Unnamed: 0,actual,predicted,diff
4686,0,0.77,0.77
1367,1,0.00,1.00
4967,10,1.11,8.89
6603,2,3.31,1.31
4302,1,0.54,0.46
...,...,...,...
4591,0,0.43,0.43
5205,1,1.29,0.29
2333,1,0.53,0.47
2214,0,0.00,0.00


In [14]:
# Mean of the absolute differences, computed by hand from the "diff" column.
diffs.loc[:, "diff"].mean()

1.2365484311050456

### Mean Squared Error (MSE)

In [15]:
from sklearn.metrics import mean_squared_error

# Mean squared error between the actual test values and the predictions.
mean_squared_error(y_true=y_test, y_pred=y_preds)

5.068806412005457

In [16]:
# Add a column holding the square of each absolute difference, then show the frame.
diffs["diff^2"] = diffs["diff"].pow(2)
diffs

Unnamed: 0,actual,predicted,diff,diff^2
4686,0,0.77,0.77,0.5929
1367,1,0.00,1.00,1.0000
4967,10,1.11,8.89,79.0321
6603,2,3.31,1.31,1.7161
4302,1,0.54,0.46,0.2116
...,...,...,...,...
4591,0,0.43,0.43,0.1849
5205,1,1.29,0.29,0.0841
2333,1,0.53,0.47,0.2209
2214,0,0.00,0.00,0.0000


In [17]:
# Mean of the squared differences, computed by hand from the "diff^2" column.
diffs.loc[:, "diff^2"].mean()

5.068806412005465