# PART 2

#### – Input: 
prepared data
#### – Output:
machine learning model, expected generalisation RMSE
#### – Features:
This system takes the prepared dataframe and builds a machine learning model
for predicting scores. Model selection, feature selection and handling missing data are
important parts of this system. You should evaluate at least 3 fundamentally different
modelling approaches before selecting the final model. We evaluate the performance of the
system by comparing the predicted scores with the known scores on a validation/test data
set. Specifically, the system should be evaluated with the root mean squared error (RMSE)
of predictions.

### imports
imports all the models and libraries that i intend to use, aswell as the data that i am going to use

In [1]:
#imports
from sklearn.preprocessing import PolynomialFeatures
from sklearn.preprocessing import OneHotEncoder
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import mean_squared_error
from sklearn.neural_network import MLPRegressor
from sklearn.naive_bayes import GaussianNB
from sklearn.dummy import DummyRegressor
from sklearn.linear_model import ElasticNet
from sklearn.ensemble import RandomForestRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestClassifier

from sklearn.preprocessing import LabelBinarizer


from sklearn import linear_model
from sklearn import tree
from sklearn import svm

import pickle

import numpy as np
import pandas as pd

#gets data from last part of the task
prepared_data = pd.read_csv("prepared_data.csv", encoding = "UTF-8")
y2020 = pd.read_csv("prosjekt//2020//testwithfeat.csv", encoding = "UTF-8")
prepared_data

Unnamed: 0,ID,Date,Season,Team,Opponent,Venue,Goals,Rk_x,G-PK_x,Age_x,...,Rk_y,G-PK_y,Age_y,W_y,D_y,L_y,GD_y,Pts_y,GF_y,WR_y
0,0,2017-04-17,2017,Medkila,Sandviken,Home,1,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000
1,0,2017-04-17,2017,Sandviken,Medkila,Away,1,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000
2,1,2017-04-17,2017,Avaldsnes,Vålerenga,Home,2,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000
3,1,2017-04-17,2017,Vålerenga,Avaldsnes,Away,1,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000
4,2,2017-04-17,2017,Grand Bodø,Arna-Bjørnar,Home,2,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
791,395,2019-11-16,2019,Klepp,Fart,Away,6,2.0,37.0,25.1,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000
792,396,2019-11-24,2019,IF Fløya,Lyn,Home,0,0.0,0.0,0.0,...,11.0,24.0,19.8,3.0,3.0,16.0,-29.0,12.0,27.0,0.136364
793,396,2019-11-24,2019,Lyn,IF Fløya,Away,5,11.0,24.0,19.8,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000
794,397,2019-12-01,2019,Lyn,IF Fløya,Home,2,11.0,24.0,19.8,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000


### transforms data
most regression models only accept integers or doubles, so we need to transform the data so that the models can read it
<ul>
    <li> i split the dataframes into two different ones where X is the input data and y is the expected output</li>
    <li> i changed home to 1 and away to 0 in Venue</li>
    <li> since text isnt easily transformed into numbers i add a new collumn for each team which is eighter 1 or 0 depending on if that team is playing or not. the first 14 numbers are for the home team and the second 14 numbers are for the away team</li>
<ul>

In [2]:
# splits the data into whats needed to guess and the answer
y = prepared_data["Goals"]
X = prepared_data
x2020 = y2020

#splits the "team" and "opponent" rows into 14 different rows as it makes it easier for the models to understand
label = LabelBinarizer().fit(X["Team"])

X = pd.concat([X, pd.DataFrame(label.transform(X["Team"]))], axis=1)
X = pd.concat([X, pd.DataFrame(label.transform(X["Opponent"]))], axis=1)
x2020 = pd.concat([x2020, pd.DataFrame(label.transform(y2020["Team"]))], axis=1)
x2020 = pd.concat([x2020, pd.DataFrame(label.transform(y2020["Opponent"]))], axis=1)
X = X.drop(["ID", "Date", "Season", "Team", "Opponent", "Goals"], axis = 1)
x2020 = x2020.drop(["ID", "Date", "Season", "Team", "Opponent"], axis = 1)

# changes venue to a 0 or 1 depending on if it is home or away
venue_dict = {"Home" : 1,  "Away" : 0}
X["Venue"] = X["Venue"].map(venue_dict)
x2020["Venue"] = x2020["Venue"].map(venue_dict)

X

Unnamed: 0,Venue,Rk_x,G-PK_x,Age_x,W_x,D_x,L_x,GD_x,Pts_x,GF_x,...,5,6,7,8,9,10,11,12,13,14
0,1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0,0,0,0,0,0,1,0,0,0
1,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0,0,0,0,1,0,0,0,0,0
2,1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0,0,0,0,0,0,0,0,0,1
3,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0,0,0,0,0,0,0,0,0,0
4,1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
791,0,2.0,37.0,25.1,15.0,3.0,4.0,18.0,48.0,39.0,...,0,0,0,0,0,0,0,0,0,0
792,1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0,0,0,1,0,0,0,0,0,0
793,0,11.0,24.0,19.8,3.0,3.0,16.0,-29.0,12.0,27.0,...,0,0,0,0,0,0,0,0,0,0
794,1,11.0,24.0,19.8,3.0,3.0,16.0,-29.0,12.0,27.0,...,0,0,0,0,0,0,0,0,0,0


### splits the data
i am splitting the data such that i have 70% of the data to train the models, 15% to validate and find the best model and 15% to test

In [3]:
X_train, X_testval, y_train, y_testval = train_test_split(X, y, test_size=0.30, shuffle = False)
X_val, X_test, y_val, y_test = train_test_split(X_testval, y_testval, test_size=0.50, shuffle = False)

X_train

Unnamed: 0,Venue,Rk_x,G-PK_x,Age_x,W_x,D_x,L_x,GD_x,Pts_x,GF_x,...,5,6,7,8,9,10,11,12,13,14
0,1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0,0,0,0,0,0,1,0,0,0
1,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0,0,0,0,1,0,0,0,0,0
2,1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0,0,0,0,0,0,0,0,0,1
3,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0,0,0,0,0,0,0,0,0,0
4,1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
552,1,2.0,37.0,25.1,15.0,3.0,4.0,18.0,48.0,39.0,...,0,0,0,0,0,1,0,0,0,0
553,0,7.0,38.0,22.7,10.0,2.0,10.0,0.0,32.0,42.0,...,1,0,0,0,0,0,0,0,0,0
554,1,11.0,24.0,19.8,3.0,3.0,16.0,-29.0,12.0,27.0,...,0,1,0,0,0,0,0,0,0,0
555,0,5.0,32.0,21.2,11.0,6.0,5.0,-1.0,39.0,34.0,...,0,0,0,1,0,0,0,0,0,0


### adding models 
we need to check a lot of different models to find which is the best one to use, i am also putting in a lot of different parameters so that we can check what is going to give the best result

In [4]:
models = {
    "lasso Regression" : [linear_model.Lasso(), {
        "alpha": [1],
        "fit_intercept" : [True, False],
        "precompute": [True, False],
        "positive" : [True, False]
    }],
    "Support Vector Machines" : [svm.SVC(random_state=42), {
        
    }],
    "BaysianRidge" : [linear_model.BayesianRidge(), {
        "compute_score": [True, False], 
        "fit_intercept" : [True, False], 
        "n_iter" : [300, 400]
    }],
    "Decition Tree" : [tree.DecisionTreeClassifier(random_state=42), {
        "min_samples_leaf" : [2, 3],
        "min_samples_split" : [2, 3]
    }],
    "Support Vector Machines" : [svm.SVC(random_state=42), {
        "C" : [1, 2, 3], 
        "break_ties": [True, False], 
        "probability": [True, False]
    }],
    "Base line" : [DummyRegressor(strategy = "mean"), {
        
    }],
    "Multi-layer Perception" : [MLPRegressor(random_state=42, max_iter = 1000), {
        "hidden_layer_sizes" : [10, 25, 50], 
    }],
    "K Nearest Neighbors" : [KNeighborsClassifier(), {
        "n_neighbors" : [5, 10, 20, 40], 
        "p": [1, 2, 3]
    }],
    "Linear Regression" : [LinearRegression(), {
        "fit_intercept": [True, False], 
        "copy_X" : [True, False], 
        "positive" : [True, False]
    }],
    "Elastic Net" : [ElasticNet(), {
        "alpha" : [0.1, 1, 10], 
        "copy_X" : [True, False], 
        "fit_intercept": [True, False]
    }],
    "Random Forrest Regressor" : [RandomForestRegressor(random_state=42), {
        "max_depth": [2, 3], 
        "n_estimators" : [50, 100, 300]
    }],
    "Decision Tree Regressor" : [DecisionTreeRegressor(random_state=42), {
        "min_samples_split" : [2, 3, 4],
        "min_samples_leaf" : [1, 2, 3]
    }],
}

### using models to make a prediction
we need to use our model to find what is going to give the best result. gridsearch allows us to check the different parameters aswell. i add the data and the name to a list so that we can find what is the best algorythm later

In [5]:
datalist = []

#goes throught the keys
for i in models.keys():
    #does the gridsearch on the models
    model = GridSearchCV(models[i][0], models[i][1])
    #makes a prediction
    prediction = np.clip(np.round(model.fit(X_train, y_train).predict(X_val)), 0, 6)
    #finds out how good that prediction is
    error = np.sqrt(mean_squared_error(y_val, prediction)).round(4)
    datalist.append([i , error])
    
    #models[i][1] = model.get_params()

  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(


### finding the best model
i sort by RMSE to find the best possible model. the best model currently is Elastic Net with 1.4634. i am not sure why this is so small of an improvement compared to others who has done the same as me.

In [6]:
model_df = pd.DataFrame(datalist).rename(columns={0: "Model", 1: "RMSE"}).sort_values(by=['RMSE'])
model_df

Unnamed: 0,Model,RMSE
0,lasso Regression,1.3253
8,Elastic Net,1.341
5,Multi-layer Perception,1.3963
7,Linear Regression,1.4112
2,BaysianRidge,1.4172
9,Random Forrest Regressor,1.4407
4,Base line,1.5035
1,Support Vector Machines,1.5146
6,K Nearest Neighbors,1.6903
3,Decition Tree,1.7002


### do the same with test data
now we need to do the same thing with the test data to see if the models hold up

In [7]:
X_fin = X_train.append(X_val)
y_fin = y_train.append(y_val)

datalist = []

for i in models.keys():
    model = GridSearchCV(models[i][0], models[i][1])
    prediction = np.clip(np.round(model.fit(X_fin, y_fin).predict(X_test)),0, 6)
    error = np.sqrt(mean_squared_error(y_test, prediction)).round(4)
    datalist.append([i , error])

  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(


In [8]:
model_df = pd.DataFrame(datalist).rename(columns={0: "Model", 1: "RMSE"}).sort_values(by=['RMSE'])
model_df

Unnamed: 0,Model,RMSE
8,Elastic Net,1.3964
2,BaysianRidge,1.4172
0,lasso Regression,1.4434
4,Base line,1.4663
7,Linear Regression,1.4832
5,Multi-layer Perception,1.5652
9,Random Forrest Regressor,1.589
1,Support Vector Machines,1.5969
10,Decision Tree Regressor,1.7488
3,Decition Tree,1.763


### then i export the best model and its RSME
after trying out the models on both the validation and test data, i find that elastic net can do very vell, but that baysian ridge is more consistent. therefore i am going to choose to use Baysian Ridge as my prediction algorythm. i export the algorythm to a joblib file with dump from joblib

In [9]:
model = models[model_df["Model"].iloc[0]]
moddel = GridSearchCV(model[0], model[1]).fit(X, y)
predict = pd.DataFrame(np.clip(np.round(moddel.predict(x2020)),0, 6)).rename(columns = {0:"Score"}).rename_axis("ID")

model_df.to_csv("best_models_df.csv", index = False, encoding = "UTF-8")
predict.to_csv("kagglepred.csv", encoding = "UTF-8")
x2020.to_csv("footballApp//data//y2020x.csv", index = False, encoding = "UTF-8")
pickle.dump(moddel, open("footballApp//data//model.pkl", "wb"))
pickle.dump(label, open("footballApp//data//labelBin.pkl", "wb"))

  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
