In [1]:
import sqlite3
from math import sqrt

import pandas as pd
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeRegressor

# CONNECT TO THE SQLITE DATABASE AND READ IT

In [2]:
# Load the whole Player_Attributes table into a DataFrame.
# The connection is only needed while the query runs, so it is closed in a
# `finally` clause: the database file handle is released even if the read fails.
cnx = sqlite3.connect('./database.sqlite')
try:
    data = pd.read_sql_query('SELECT* FROM Player_Attributes', cnx)
finally:
    cnx.close()

In [3]:
# Preview the first five rows of the raw table.
data.head(n=5)

Unnamed: 0,id,player_fifa_api_id,player_api_id,date,overall_rating,potential,preferred_foot,attacking_work_rate,defensive_work_rate,crossing,...,vision,penalties,marking,standing_tackle,sliding_tackle,gk_diving,gk_handling,gk_kicking,gk_positioning,gk_reflexes
0,1,218353,505942,2016-02-18 00:00:00,67.0,71.0,right,medium,medium,49.0,...,54.0,48.0,65.0,69.0,69.0,6.0,11.0,10.0,8.0,8.0
1,2,218353,505942,2015-11-19 00:00:00,67.0,71.0,right,medium,medium,49.0,...,54.0,48.0,65.0,69.0,69.0,6.0,11.0,10.0,8.0,8.0
2,3,218353,505942,2015-09-21 00:00:00,62.0,66.0,right,medium,medium,49.0,...,54.0,48.0,65.0,66.0,69.0,6.0,11.0,10.0,8.0,8.0
3,4,218353,505942,2015-03-20 00:00:00,61.0,65.0,right,medium,medium,48.0,...,53.0,47.0,62.0,63.0,66.0,5.0,10.0,9.0,7.0,7.0
4,5,218353,505942,2007-02-22 00:00:00,61.0,65.0,right,medium,medium,48.0,...,53.0,47.0,62.0,63.0,66.0,5.0,10.0,9.0,7.0,7.0


In [4]:
# Per-column overview: dtype, number of distinct values and percentage of
# missing entries.
df_info = pd.DataFrame(
    {
        "Dtype": data.dtypes,
        "Unique": data.nunique(),
        "Missing%": (data.isna().sum() / len(data)) * 100,
    }
)
df_info

Unnamed: 0,Dtype,Unique,Missing%
id,int64,183978,0.0
player_fifa_api_id,int64,11062,0.0
player_api_id,int64,11060,0.0
date,object,197,0.0
overall_rating,float64,61,0.454402
potential,float64,56,0.454402
preferred_foot,object,2,0.454402
attacking_work_rate,object,8,1.755645
defensive_work_rate,object,19,0.454402
crossing,float64,95,0.454402


# DATA PROCESSING

In [5]:
# List every column of the raw table so the feature columns can be picked below.
data.columns

Index(['id', 'player_fifa_api_id', 'player_api_id', 'date', 'overall_rating',
       'potential', 'preferred_foot', 'attacking_work_rate',
       'defensive_work_rate', 'crossing', 'finishing', 'heading_accuracy',
       'short_passing', 'volleys', 'dribbling', 'curve', 'free_kick_accuracy',
       'long_passing', 'ball_control', 'acceleration', 'sprint_speed',
       'agility', 'reactions', 'balance', 'shot_power', 'jumping', 'stamina',
       'strength', 'long_shots', 'aggression', 'interceptions', 'positioning',
       'vision', 'penalties', 'marking', 'standing_tackle', 'sliding_tackle',
       'gk_diving', 'gk_handling', 'gk_kicking', 'gk_positioning',
       'gk_reflexes'],
      dtype='object')

In [6]:
# Discard every row that has a missing value in any column.
clean_data = data.dropna()

# Target: the overall rating, kept as a single-column DataFrame (2-D).
y = clean_data[['overall_rating']]

# Predictors: `potential` plus the skill-attribute columns. The id columns,
# `date`, `preferred_foot` and the two work-rate columns are left out.
feature_columns = [
    'potential',
    'crossing', 'finishing', 'heading_accuracy', 'short_passing', 'volleys',
    'dribbling', 'curve', 'free_kick_accuracy', 'long_passing', 'ball_control',
    'acceleration', 'sprint_speed', 'agility', 'reactions', 'balance',
    'shot_power', 'jumping', 'stamina', 'strength', 'long_shots',
    'aggression', 'interceptions', 'positioning', 'vision', 'penalties',
    'marking', 'standing_tackle', 'sliding_tackle',
    'gk_diving', 'gk_handling', 'gk_kicking', 'gk_positioning', 'gk_reflexes',
]
x = clean_data[feature_columns]


# SPLIT THE DATA INTO TRAIN AND TEST

In [7]:
# Hold out 33% of the rows for evaluation; the fixed seed makes the split
# repeatable across runs.
x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.33,
    random_state=324,
)

# TRAIN MODEL USING LINEAR REGRESSION

In [8]:
# Ordinary least-squares model fitted on the training split.
# `fit` stays the last expression so the cell still displays the estimator.
regressor = LinearRegression()
regressor.fit(X=x_train, y=y_train)

LinearRegression()

# PREDICTION

In [9]:
# Predict the overall rating for every row of the held-out test split.
predictions = regressor.predict(X=x_test)

In [10]:
# Show the first few predictions next to the corresponding true ratings.
preview_size = 15
print(
    'The predicted values:{}\n Y_test = {}'.format(
        predictions[:preview_size],
        y_test.iloc[:preview_size],
    )
)

The predicted values:[[66.51284879]
 [79.77234615]
 [66.57371825]
 [74.99042163]
 [66.20353346]
 [62.79820836]
 [76.21901664]
 [69.36647267]
 [66.93368892]
 [73.98655725]
 [62.67862312]
 [74.71147439]
 [68.3737291 ]
 [67.8427546 ]
 [81.01055706]]
 Y_test =         overall_rating
35543             66.0
28735             83.0
168029            65.0
138117            74.0
123316            65.0
8946              63.0
17365             78.0
94431             67.0
55237             64.0
130295            67.0
110857            65.0
178068            76.0
60797             69.0
98489             69.0
30686             80.0


# RMSE

In [11]:
# Root-mean-square error of the linear model on the held-out test split.
rmse = sqrt(mean_squared_error(y_true=y_test, y_pred=predictions))
# The label needs an apostrophe ("l'erreur"). The previous '\e' was an invalid
# escape sequence: it printed a literal backslash and triggers a SyntaxWarning
# on recent Python versions.
print("l'erreur quadratique :" + str(rmse))

l\erreur quadratique :2.8053030468552094


# DECISION TREE REGRESSOR

In [12]:
# Fit a decision tree (depth capped at 20) on the same training split and
# score it with the same RMSE metric as the linear model.
# random_state is fixed because scikit-learn randomly permutes the features at
# every split, even with the default "best" splitter; without a seed the
# reported RMSE can change from run to run.
tree_regressor = DecisionTreeRegressor(max_depth=20, random_state=324)
tree_regressor.fit(x_train, y_train)
tree_predictions = tree_regressor.predict(x_test)
tree_rmse = sqrt(mean_squared_error(y_true=y_test, y_pred=tree_predictions))
# Apostrophe instead of the invalid '\e' escape, which printed a stray backslash.
print("l'erreur quadratique :" + str(tree_rmse))

l\erreur quadratique :1.4608824406710466


# On this dataset, DecisionTreeRegressor achieved a lower test RMSE than LinearRegression.