In [37]:
# Importing Required Libraries
import numpy as np
import copy
import matplotlib.pyplot as plt
from sklearn.preprocessing import MinMaxScaler
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.linear_model import Ridge, Lasso, LinearRegression
from sklearn.tree import DecisionTreeRegressor  
from sklearn.model_selection import KFold, cross_val_score
from sklearn.metrics import r2_score, accuracy_score, mean_squared_error
from sklearn.neighbors import KNeighborsRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.svm import SVR
import seaborn as sns
import warnings
import pandas as pd
%matplotlib inline
warnings.filterwarnings('ignore')

In [21]:
df = pd.read_csv(r"Dataset.csv")
df.head()

Unnamed: 0,name,club,age,position,position_cat,market_value,page_views,fpl_value,fpl_sel,fpl_points,region,nationality,new_foreign,age_cat,club_id,big_club,new_signing
0,Alexis Sanchez,Arsenal,28,LW,1,65.0,4329,12.0,17.10%,264,3.0,Chile,0,4,1,1,0
1,Mesut Ozil,Arsenal,28,AM,1,50.0,4395,9.5,5.60%,167,2.0,Germany,0,4,1,1,0
2,Petr Cech,Arsenal,35,GK,4,7.0,1529,5.5,5.90%,134,2.0,Czech Republic,0,6,1,1,0
3,Theo Walcott,Arsenal,28,RW,1,20.0,2393,7.5,1.50%,122,1.0,England,0,4,1,1,0
4,Laurent Koscielny,Arsenal,31,CB,3,22.0,912,6.0,0.70%,121,2.0,France,0,4,1,1,0


In [22]:
class DataPreprocessing:
    def __init__(self,data):
        
        self.data = df
        # self.cols_to_be_dropped = ['name','club_id','age_cat','nationality']
        self.cols_to_be_encoded = ['club','position','position_cat','region']
        
    def addPosCat(self,inputCat):

        if inputCat == 1:
            return('Attackers')
        elif inputCat == 2:
            return('Midfielders')
        elif inputCat == 3:
            return('Defenders')
        else:
            return('Goalkeeper')
        
    def addRegion(self, inpregion):
    
        if inpregion == 1:
            return('England')
        elif inpregion == 2:
            return('EU')
        elif inpregion == 3:
            return('Americans')
        else:
            return('Rest of World')
        
    def columnTypeConversion(self):
        
        # Converting fpl selection into numeric variable 
        self.data['fpl_sel'] = self.data['fpl_sel'].map(lambda x: str(x)[:-1]).astype('float')
        
    def logTransformation(self):
        
        # log transformation on page views variable as it has a high skew
        self.data['page_views'] = self.data['page_views'].apply(np.log)
        
    # Encoding the categorical variables using pandas dummies 
    def dataEncoding(self):

        self.data = pd.get_dummies(self.data, columns = self.cols_to_be_encoded, drop_first = True)
        
    def getProcessedData(self):
        # self.data = self.data.drop(self.cols_to_be_dropped, inplace = False, axis = 1)
        self.data['position_cat'] = self.data['position_cat'].apply(self.addPosCat)
        self.data['region'] = self.data['region'].apply(self.addRegion)
        self.columnTypeConversion()
        self.dataEncoding()
        self.logTransformation()
        return self.data

In [23]:
data_obj = DataPreprocessing(df) 
encoded_data = data_obj.getProcessedData()
encoded_data.head()

Unnamed: 0,name,age,market_value,page_views,fpl_value,fpl_sel,fpl_points,nationality,new_foreign,age_cat,...,position_RB,position_RM,position_RW,position_SS,position_cat_Defenders,position_cat_Goalkeeper,position_cat_Midfielders,region_EU,region_England,region_Rest of World
0,Alexis Sanchez,28,65.0,8.373092,12.0,17.1,264,Chile,0,4,...,0,0,0,0,0,0,0,0,0,0
1,Mesut Ozil,28,50.0,8.388223,9.5,5.6,167,Germany,0,4,...,0,0,0,0,0,0,0,1,0,0
2,Petr Cech,35,7.0,7.332369,5.5,5.9,134,Czech Republic,0,6,...,0,0,0,0,0,1,0,1,0,0
3,Theo Walcott,28,20.0,7.780303,7.5,1.5,122,England,0,4,...,0,0,1,0,0,0,0,0,1,0
4,Laurent Koscielny,31,22.0,6.81564,6.0,0.7,121,France,0,4,...,0,0,0,0,1,0,0,1,0,0


In [46]:
output_var = 'market_value'
X = encoded_data[encoded_data.columns[~encoded_data.columns.isin([output_var])]]
X = X[["page_views","fpl_value","fpl_sel","fpl_points","big_club"]]
y = encoded_data[[output_var]]
x_train, x_test, y_train, y_test = train_test_split(X, y, test_size = 0.20)
x_train = x_train.reset_index(drop = True)
x_test = x_test.reset_index(drop = True)
y_train = y_train.reset_index(drop = True)
y_test = y_test.reset_index(drop = True)

In [47]:
print(x_train.shape)
print(x_test.shape)
print(y_train.shape)
print(y_test.shape)

(368, 5)
(93, 5)
(368, 1)
(93, 1)


In [35]:
# performin min max scaling on input data
scaler = MinMaxScaler()
x_train = scaler.fit_transform(x_train[x_train.columns[~x_train.columns.isin(["big_club"])]])
x_test = scaler.transform(x_test[x_test.columns[~x_test.columns.isin(["big_club"])]])

In [36]:
# For Random Forest Regression
params = {
    'bootstrap': [True],
    'max_depth': [8, 10, 12, 14, 16],
    'min_samples_leaf': [1, 2, 3],
    'n_estimators': [40, 50, 60, 70],
    'max_features': [0.3, 0.4, 0.5]
}

forest_reg = GridSearchCV(RandomForestRegressor(), param_grid = params, scoring = 'r2', cv = 5)
forest_reg.fit(x_train,y_train)
acc_train = forest_reg.score(x_train, y_train)

print("Cross Validation score - " + str(forest_reg.best_score_))
print()

print("R^2 score of training data - " + str(acc_train))
print()
print(forest_reg.best_estimator_)
forest_best_params = forest_reg.best_params_

Cross Validation score - 0.6831707601896057

R^2 score of training data - 0.956934168711396

RandomForestRegressor(max_depth=16, max_features=0.4, n_estimators=40)


In [48]:
# fitting a random forest regressor
Forest_regr = RandomForestRegressor(max_depth=16, max_features=0.4, n_estimators=40)
Forest_regr.fit(x_train, y_train)

In [49]:
import pickle

In [50]:
pickle.dump(Forest_regr,open("predictPrice.pkl","wb"))

In [52]:
train_pred = Forest_regr.predict(x_train)

In [54]:
x_train

Unnamed: 0,page_views,fpl_value,fpl_sel,fpl_points,big_club
0,5.129899,4.5,0.8,67,0
1,7.208600,6.5,11.4,0,0
2,6.856462,5.5,17.0,0,1
3,4.753590,4.5,0.4,63,0
4,6.122493,5.5,1.4,91,0
...,...,...,...,...,...
363,7.100027,6.0,0.4,69,0
364,7.508787,6.5,10.4,139,1
365,5.693732,4.0,3.3,4,0
366,5.820083,6.0,0.2,63,0


In [65]:
def predict(page_views,	fpl_value,	fpl_sel,	fpl_points,	big_club):
    x = pd.DataFrame([])
    x[["page_views",	"fpl_value",	"fpl_sel",	"fpl_points",	"big_club"]] = [[page_views,	fpl_value,	fpl_sel,	fpl_points,	big_club]]
    model = pickle.load(open("predictPrice.pkl", "rb"))
    price = model.predict(x)
    return price[0]


In [66]:
predict(5.129899,	4.5	,0.8,	67	,0)

3.7046875