In [1]:
import pandas as pd
import numpy as np
import plotly.express as px
import seaborn as sns
import matplotlib.pyplot as plt
import warnings
warnings.filterwarnings('ignore')

from sklearn.preprocessing import OneHotEncoder , StandardScaler , MinMaxScaler, RobustScaler
from category_encoders import BinaryEncoder
from sklearn.compose import ColumnTransformer
from sklearn.linear_model import LinearRegression
from sklearn.neighbors import KNeighborsRegressor
from sklearn.svm import SVR
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from xgboost import XGBRegressor
from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV , cross_validate
import joblib

In [2]:
df = pd.read_csv('data/Data_for_the_ML_model.csv')

In [3]:
df

Unnamed: 0,web_name,team,position,form,now_cost,selected_by_percent,event_points,total_points,ict_index,influence,...,chance_of_playing_next_round,chance_of_playing_this_round,status,in_dreamteam,dreamteam_count,ep_this,ep_next,strength,is_GKP,goal_involvement
0,Balogun,ARS,FWD,0.0,4.4,0.2,0,0,0.0,0.0,...,0.0,0.0,Unavailable,0,0,0.0,0.0,5,0,0
1,Cédric,ARS,DEF,0.0,3.9,0.4,0,0,0.0,0.0,...,0.0,0.0,Available,0,0,0.0,1.0,5,0,0
2,M.Elneny,ARS,MID,1.3,4.4,0.1,0,4,0.4,1.8,...,100.0,100.0,Available,0,0,1.3,2.3,5,0,1
3,Fábio Vieira,ARS,MID,2.3,5.4,0.1,1,23,21.9,79.4,...,100.0,100.0,Available,0,0,2.3,3.3,5,0,4
4,Gabriel,ARS,DEF,1.0,4.8,14.3,2,27,16.9,127.2,...,100.0,100.0,Available,0,0,1.0,2.0,5,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
728,Doherty,WOL,DEF,0.7,4.3,0.2,0,5,11.6,42.8,...,0.0,0.0,Available,0,0,1.2,0.7,3,0,0
729,S.Bueno,WOL,DEF,0.0,4.5,0.0,0,0,0.0,0.0,...,100.0,100.0,Available,0,0,0.5,0.0,3,0,0
730,González,WOL,MID,0.0,5.0,0.0,0,0,0.0,0.0,...,0.0,0.0,Available,0,0,0.5,0.0,3,0,0
731,Fraser,WOL,FWD,0.0,4.5,0.1,0,0,0.0,0.0,...,0.0,0.0,Available,0,0,0.5,0.0,3,0,0


In [4]:
df.drop(['web_name', 'team'], axis=1, inplace=True)

In [5]:
df

Unnamed: 0,position,form,now_cost,selected_by_percent,event_points,total_points,ict_index,influence,creativity,threat,...,chance_of_playing_next_round,chance_of_playing_this_round,status,in_dreamteam,dreamteam_count,ep_this,ep_next,strength,is_GKP,goal_involvement
0,FWD,0.0,4.4,0.2,0,0,0.0,0.0,0.0,0.0,...,0.0,0.0,Unavailable,0,0,0.0,0.0,5,0,0
1,DEF,0.0,3.9,0.4,0,0,0.0,0.0,0.0,0.0,...,0.0,0.0,Available,0,0,0.0,1.0,5,0,0
2,MID,1.3,4.4,0.1,0,4,0.4,1.8,0.5,2.0,...,100.0,100.0,Available,0,0,1.3,2.3,5,0,1
3,MID,2.3,5.4,0.1,1,23,21.9,79.4,63.3,77.0,...,100.0,100.0,Available,0,0,2.3,3.3,5,0,4
4,DEF,1.0,4.8,14.3,2,27,16.9,127.2,13.8,27.0,...,100.0,100.0,Available,0,0,1.0,2.0,5,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
728,DEF,0.7,4.3,0.2,0,5,11.6,42.8,52.0,21.0,...,0.0,0.0,Available,0,0,1.2,0.7,3,0,0
729,DEF,0.0,4.5,0.0,0,0,0.0,0.0,0.0,0.0,...,100.0,100.0,Available,0,0,0.5,0.0,3,0,0
730,MID,0.0,5.0,0.0,0,0,0.0,0.0,0.0,0.0,...,0.0,0.0,Available,0,0,0.5,0.0,3,0,0
731,FWD,0.0,4.5,0.1,0,0,0.0,0.0,0.0,0.0,...,0.0,0.0,Available,0,0,0.5,0.0,3,0,0


In [6]:
x = df.drop('ep_next' , axis = 1 )
y = df['ep_next']

In [7]:
Encoder = ColumnTransformer(transformers=[("BE" , BinaryEncoder() , ['position' , 'status'])] , remainder='passthrough')

In [8]:
models = list()
models.append(("LR" , LinearRegression()))
models.append(("SVR" , SVR()))
models.append(("CART" , DecisionTreeRegressor()))
models.append(("RF" , RandomForestRegressor()))
models.append(("XG" , XGBRegressor()))

In [9]:
for model in models:
    steps = list()
    steps.append(("Encoder" , Encoder))
    steps.append(("Scaler" , StandardScaler() ))
    steps.append(model)
    pipeline = Pipeline(steps = steps)
    scores = cross_validate(pipeline , x , y , cv = 5 , scoring="r2" , return_train_score=True, error_score='raise')
    print(model[0])
    print("Train_r2" , scores["train_score"].mean() )
    print("-" * 10)
    print("Test_r2" , scores["test_score"].mean())
    print("-" * 20)
    print("\n")

LR
Train_r2 0.9521257528706821
----------
Test_r2 0.9085598962357162
--------------------


SVR
Train_r2 0.8685501601484027
----------
Test_r2 0.7519753013350083
--------------------


CART
Train_r2 1.0
----------
Test_r2 0.8273167030424059
--------------------


RF
Train_r2 0.9913848280073114
----------
Test_r2 0.8951884044373841
--------------------


XG
Train_r2 0.9999894553318684
----------
Test_r2 0.8997606417102648
--------------------




In [10]:
steps = list()
steps.append(("Encoder" , Encoder))
steps.append(("Scaler" , StandardScaler() ))
steps.append(('Model', XGBRegressor()))
pipeline = Pipeline(steps = steps)

In [11]:
pipeline

In [30]:
params = [
    {'Model__n_estimators': [20, 30, 40], 'Model__max_depth': [2, 3, 4]}
]

In [31]:
grid = GridSearchCV(estimator = pipeline , param_grid = params , cv = 5  ,return_train_score=True ,  scoring = "r2" )

In [32]:
grid.fit(x,y)

In [33]:
grid.best_params_

{'Model__max_depth': 3, 'Model__n_estimators': 30}

In [34]:
grid.cv_results_["mean_train_score"].max()

0.9936899165075206

In [35]:
grid.cv_results_["mean_test_score"].max()

0.9121008860382093

In [36]:
steps = list()
steps.append(("Encoder" , Encoder))
steps.append(("Scaler" , StandardScaler() ))
steps.append(('Model', XGBRegressor(max_depth =  3, n_estimators = 30)))
pipeline = Pipeline(steps = steps)

In [37]:
pipeline.fit(x, y)

In [41]:
org_df = pd.read_csv('data/Data_for_the_ML_model.csv')

In [42]:
org_df.drop('ep_next', axis=1, inplace=True)

In [43]:
org_df

Unnamed: 0,web_name,team,position,form,now_cost,selected_by_percent,event_points,total_points,ict_index,influence,...,expected_goals_conceded,chance_of_playing_next_round,chance_of_playing_this_round,status,in_dreamteam,dreamteam_count,ep_this,strength,is_GKP,goal_involvement
0,Balogun,ARS,FWD,0.0,4.4,0.2,0,0,0.0,0.0,...,0.00,0.0,0.0,Unavailable,0,0,0.0,5,0,0
1,Cédric,ARS,DEF,0.0,3.9,0.4,0,0,0.0,0.0,...,0.00,0.0,0.0,Available,0,0,0.0,5,0,0
2,M.Elneny,ARS,MID,1.3,4.4,0.1,0,4,0.4,1.8,...,0.00,100.0,100.0,Available,0,0,1.3,5,0,1
3,Fábio Vieira,ARS,MID,2.3,5.4,0.1,1,23,21.9,79.4,...,1.41,100.0,100.0,Available,0,0,2.3,5,0,4
4,Gabriel,ARS,DEF,1.0,4.8,14.3,2,27,16.9,127.2,...,7.20,100.0,100.0,Available,0,0,1.0,5,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
728,Doherty,WOL,DEF,0.7,4.3,0.2,0,5,11.6,42.8,...,4.38,0.0,0.0,Available,0,0,1.2,3,0,0
729,S.Bueno,WOL,DEF,0.0,4.5,0.0,0,0,0.0,0.0,...,0.00,100.0,100.0,Available,0,0,0.5,3,0,0
730,González,WOL,MID,0.0,5.0,0.0,0,0,0.0,0.0,...,0.00,0.0,0.0,Available,0,0,0.5,3,0,0
731,Fraser,WOL,FWD,0.0,4.5,0.1,0,0,0.0,0.0,...,0.00,0.0,0.0,Available,0,0,0.5,3,0,0


In [44]:
joblib.dump(org_df.columns, 'inputs.pkl')

['inputs.pkl']

In [45]:
joblib.dump(pipeline, 'model.pkl')

['model.pkl']