# PART 2

#### – Input: 
prepared data
#### – Output:
machine learning model, expected generalisation RMSE
#### – Features:
This system takes the prepared dataframe and builds a machine learning model
for predicting scores. Model selection, feature selection and handling missing data are
important parts of this system. You should evaluate at least 3 fundamentally different
modelling approaches before selecting the final model. We evaluate the performance of the
system by comparing the predicted scores with the known scores on a validation/test data
set. Specifically, the system should be evaluated with the root mean squared error (RMSE)
of predictions.

In [19]:
from plotly.subplots import make_subplots

from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.preprocessing import PolynomialFeatures
from sklearn.metrics import mean_squared_error
from sklearn.neural_network import MLPRegressor
from sklearn.naive_bayes import GaussianNB

from sklearn import linear_model
from sklearn import tree
from sklearn import svm

import numpy as np
import pandas as pd


prepared_data = pd.read_csv("prepared_data.csv", encoding = "UTF-8")
prepared_data

Unnamed: 0,ID,Date,Season,Team,Opponent,Venue,Goals,Winner
0,0,2017-04-17,2017,Medkila,Sandviken,Home,1,Draw
1,0,2017-04-17,2017,Sandviken,Medkila,Away,1,Draw
2,1,2017-04-17,2017,Avaldsnes,Vålerenga,Home,2,Home
3,1,2017-04-17,2017,Vålerenga,Avaldsnes,Away,1,Home
4,2,2017-04-17,2017,Grand Bodø,Arna-Bjørnar,Home,2,Draw
...,...,...,...,...,...,...,...,...
791,156,2019-11-16,2019,Klepp,Fart,Away,6,Away
792,158,2019-11-24,2019,IF Fløya,Lyn,Home,0,Away
793,158,2019-11-24,2019,Lyn,IF Fløya,Away,5,Away
794,159,2019-12-01,2019,Lyn,IF Fløya,Home,2,Home


In [20]:
X = prepared_data.iloc[:, :6]
y = prepared_data["Goals"]

team_dict = {}
count = 0
for i in X["Team"].unique():
    team_dict[i] = count
    count += 1
print(team_dict)

venue_dict = {"Home" : 1,  "Away" : -1}
#winner_dict = {"Home" : 1 , "Draw" : 0, "Away" : -1 }


X["Team"] = X["Team"].map(team_dict)
X["Opponent"] = X["Opponent"].map(team_dict)
X["Venue"] = X["Venue"].map(venue_dict)

#y = y.map(winner_dict)

X = X[["Team", "Opponent", "Venue"]]
y

{'Medkila': 0, 'Sandviken': 1, 'Avaldsnes': 2, 'Vålerenga': 3, 'Grand Bodø': 4, 'Arna-Bjørnar': 5, 'Røa': 6, 'Stabæk': 7, 'LSK Kvinner': 8, 'Klepp': 9, 'Trondheims-Ørn': 10, 'Kolbotn': 11, 'Lyn': 12, 'Fart': 13, 'IF Fløya': 14}


0      1
1      1
2      2
3      1
4      2
      ..
791    6
792    0
793    5
794    2
795    1
Name: Goals, Length: 796, dtype: int64

In [21]:
X_train, X_testval, y_train, y_testval = train_test_split(X, y, test_size=0.30, shuffle = False)
X_test, X_val, y_test, y_val = train_test_split(X_testval, y_testval, test_size=0.50, shuffle = False)

X_train

Unnamed: 0,Team,Opponent,Venue
0,0,1,1
1,1,0,-1
2,2,3,1
3,3,2,-1
4,4,5,1
...,...,...,...
552,9,6,1
553,6,9,-1
554,12,11,1
555,11,12,-1


In [22]:
def linear_reg():
    lin_reg = LinearRegression()
    
    lin_reg.fit(X_train, y_train)
    
    return ["lin reg", lin_reg.predict(X_val)]

In [23]:
def poly_regression(x):
    poly_reg = PolynomialFeatures(x)
    poly_reg.fit(np.array(X_train))
    X_train_transform = poly_reg.transform(np.array(X_train))
    X_val_trans = poly_reg.transform(np.array(X_val))

    lin_reg = LinearRegression()
    lin_reg.fit(X_train_transform, y_train)
    
    return [f"poly reg with {x}", lin_reg.predict(X_val_trans)]

In [24]:
def KNeigh(x):
    neigh = KNeighborsClassifier(n_neighbors=x).fit(X_train, y_train)
    return [f"K nearest neigbours with {x}", neigh.predict(X_val)]

In [59]:
def MLP():
    regr = MLPRegressor(random_state=42, max_iter=500).fit(X_train, y_train)
    return ["Multi-layer Perceptron", regr.predict(X_val)]

In [60]:
def average():
    return ["Base line (avegage of train)", [y_train.mean()]*len(y_val)]

In [61]:
def SVM():
    clf = svm.SVC().fit(X_train, y_train)
    return ["Support Vector Machines", clf.predict(X_val)]

In [62]:
def NB():
    gnb = GaussianNB().fit(X_train, y_train)
    return["Naive Bayes", gnb.predict(X_val)]

In [63]:
def Decition_Tree():
    clf = tree.DecisionTreeClassifier().fit(X, y)
    return["Decition_Tree", clf.predict(X_val)]

In [64]:
def Bayesian():
    reg = linear_model.BayesianRidge().fit(X, y)
    return["Bayesian Ridge Regression", reg.predict(X_val)]

In [65]:
datalist = []
datalist.append(average())
datalist.append(SVM())
datalist.append(linear_reg())
datalist.append(MLP())
datalist.append(NB())
datalist.append(Decition_Tree())
datalist.append(Bayesian())
for i in range(2, 5):
    datalist.append(poly_regression(i))

for i in [2, 5, 10, 20, 50, 100]:
    datalist.append(KNeigh(i))

In [66]:
for i in datalist:
    print(f"{i[0]}:", np.sqrt(mean_squared_error(y_val, np.round(i[1]))))

Base line (avegage of train): 1.466287829861518
Support Vector Machines: 1.7368553960150703
lin reg: 1.4972196454317137
Multi-layer Perceptron: 1.3753787357185172
Naive Bayes: 1.6431676725154984
Decition_Tree: 1.399404635312222
Bayesian Ridge Regression: 1.4520101009749669
poly reg with 2: 1.420093893609386
poly reg with 3: 1.4053469322555197
poly reg with 4: 1.4375905768565218
K nearest neigbours with 2: 1.7818529681205462
K nearest neigbours with 5: 1.9472202409246537
K nearest neigbours with 10: 1.9578900207451218
K nearest neigbours with 20: 1.7818529681205462
K nearest neigbours with 50: 1.746424919657298
K nearest neigbours with 100: 1.8211717839530315


# read about GridSeachCV