## Predicting players rating
The dataset you are going to use is from European Soccer Database (https://www.kaggle.com/hugomathien/soccer) has more than 25,000 matches and more than 10,000 players for European professional soccer seasons from 2008 to 2016.

In [1]:
import sqlite3
import numpy as np
import pandas as pd
%matplotlib notebook
import matplotlib.pyplot as plt
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from math import sqrt

In [2]:
cnx = sqlite3.connect('database.sqlite')
df = pd.read_sql_query("SELECT * FROM Player_Attributes", cnx)

In [3]:
df.head()

Unnamed: 0,id,player_fifa_api_id,player_api_id,date,overall_rating,potential,preferred_foot,attacking_work_rate,defensive_work_rate,crossing,...,vision,penalties,marking,standing_tackle,sliding_tackle,gk_diving,gk_handling,gk_kicking,gk_positioning,gk_reflexes
0,1,218353,505942,2016-02-18 00:00:00,67.0,71.0,right,medium,medium,49.0,...,54.0,48.0,65.0,69.0,69.0,6.0,11.0,10.0,8.0,8.0
1,2,218353,505942,2015-11-19 00:00:00,67.0,71.0,right,medium,medium,49.0,...,54.0,48.0,65.0,69.0,69.0,6.0,11.0,10.0,8.0,8.0
2,3,218353,505942,2015-09-21 00:00:00,62.0,66.0,right,medium,medium,49.0,...,54.0,48.0,65.0,66.0,69.0,6.0,11.0,10.0,8.0,8.0
3,4,218353,505942,2015-03-20 00:00:00,61.0,65.0,right,medium,medium,48.0,...,53.0,47.0,62.0,63.0,66.0,5.0,10.0,9.0,7.0,7.0
4,5,218353,505942,2007-02-22 00:00:00,61.0,65.0,right,medium,medium,48.0,...,53.0,47.0,62.0,63.0,66.0,5.0,10.0,9.0,7.0,7.0


In [4]:
df.shape

(183978, 42)

In [5]:
df.columns

Index(['id', 'player_fifa_api_id', 'player_api_id', 'date', 'overall_rating',
       'potential', 'preferred_foot', 'attacking_work_rate',
       'defensive_work_rate', 'crossing', 'finishing', 'heading_accuracy',
       'short_passing', 'volleys', 'dribbling', 'curve', 'free_kick_accuracy',
       'long_passing', 'ball_control', 'acceleration', 'sprint_speed',
       'agility', 'reactions', 'balance', 'shot_power', 'jumping', 'stamina',
       'strength', 'long_shots', 'aggression', 'interceptions', 'positioning',
       'vision', 'penalties', 'marking', 'standing_tackle', 'sliding_tackle',
       'gk_diving', 'gk_handling', 'gk_kicking', 'gk_positioning',
       'gk_reflexes'],
      dtype='object')

In [6]:
features = [
       'potential', 'crossing', 'finishing', 'heading_accuracy',
       'short_passing', 'volleys', 'dribbling', 'curve', 'free_kick_accuracy',
       'long_passing', 'ball_control', 'acceleration', 'sprint_speed',
       'agility', 'reactions', 'balance', 'shot_power', 'jumping', 'stamina',
       'strength', 'long_shots', 'aggression', 'interceptions', 'positioning',
       'vision', 'penalties', 'marking', 'standing_tackle', 'sliding_tackle',
       'gk_diving', 'gk_handling', 'gk_kicking', 'gk_positioning',
       'gk_reflexes']

In [7]:
target = ['overall_rating']

In [8]:
df = df.dropna()

In [9]:
X = df[features]

In [10]:
y = df[target]

In [11]:
y.isnull().sum()

overall_rating    0
dtype: int64

In [12]:
X.head()

Unnamed: 0,potential,crossing,finishing,heading_accuracy,short_passing,volleys,dribbling,curve,free_kick_accuracy,long_passing,...,vision,penalties,marking,standing_tackle,sliding_tackle,gk_diving,gk_handling,gk_kicking,gk_positioning,gk_reflexes
0,71.0,49.0,44.0,71.0,61.0,44.0,51.0,45.0,39.0,64.0,...,54.0,48.0,65.0,69.0,69.0,6.0,11.0,10.0,8.0,8.0
1,71.0,49.0,44.0,71.0,61.0,44.0,51.0,45.0,39.0,64.0,...,54.0,48.0,65.0,69.0,69.0,6.0,11.0,10.0,8.0,8.0
2,66.0,49.0,44.0,71.0,61.0,44.0,51.0,45.0,39.0,64.0,...,54.0,48.0,65.0,66.0,69.0,6.0,11.0,10.0,8.0,8.0
3,65.0,48.0,43.0,70.0,60.0,43.0,50.0,44.0,38.0,63.0,...,53.0,47.0,62.0,63.0,66.0,5.0,10.0,9.0,7.0,7.0
4,65.0,48.0,43.0,70.0,60.0,43.0,50.0,44.0,38.0,63.0,...,53.0,47.0,62.0,63.0,66.0,5.0,10.0,9.0,7.0,7.0


In [13]:
X.iloc[2]

potential             66.0
crossing              49.0
finishing             44.0
heading_accuracy      71.0
short_passing         61.0
volleys               44.0
dribbling             51.0
curve                 45.0
free_kick_accuracy    39.0
long_passing          64.0
ball_control          49.0
acceleration          60.0
sprint_speed          64.0
agility               59.0
reactions             47.0
balance               65.0
shot_power            55.0
jumping               58.0
stamina               54.0
strength              76.0
long_shots            35.0
aggression            63.0
interceptions         41.0
positioning           45.0
vision                54.0
penalties             48.0
marking               65.0
standing_tackle       66.0
sliding_tackle        69.0
gk_diving              6.0
gk_handling           11.0
gk_kicking            10.0
gk_positioning         8.0
gk_reflexes            8.0
Name: 2, dtype: float64

In [14]:
X_train, X_test, y_train, y_test = train_test_split(X,y,test_size=0.33,random_state=5)

### Linear Regression :

In [15]:
reg = LinearRegression()
reg.fit(X_train,y_train)

LinearRegression()

In [16]:
y_prediction = reg.predict(X_test)

In [17]:
y_prediction

array([[74.92715609],
       [70.70034563],
       [67.8844181 ],
       ...,
       [67.00234121],
       [57.86383979],
       [59.91298824]])

In [18]:
y_test.describe()

Unnamed: 0,overall_rating
count,59517.0
mean,68.645093
std,7.023278
min,35.0
25%,64.0
50%,69.0
75%,73.0
max,94.0


In [19]:
print(".............Evaluation metrics for Linear Regression..............")
from sklearn import metrics
score = reg.score(X_test, y_test)
n=len(df[target])
p=len(features)
adjr= 1-(1-score)*(n-1)/(n-p-1)
print("RSquared: ",score)
print("AdjustedRSquared: ",adjr)
print('MAE', metrics.mean_absolute_error(y_test, y_prediction))
print('MSE', metrics.mean_squared_error(y_test, y_prediction))
print('RMSE', np.sqrt(metrics.mean_squared_error(y_test, y_prediction)))


.............Evaluation metrics for Linear Regression..............
RSquared:  0.8423506999006632
AdjustedRSquared:  0.842320974379762
MAE 2.128822383074202
MSE 7.776147811466318
RMSE 2.7885745124465147


###  Decision Tree :

In [20]:
regre = DecisionTreeRegressor(max_depth=20)
regre.fit(X_train,y_train)

DecisionTreeRegressor(max_depth=20)

In [21]:
y_prediction = regre.predict(X_test)
y_prediction

array([72.        , 73.        , 67.69767442, ..., 74.        ,
       58.        , 66.        ])

In [22]:
print(".............Evaluation metrics for Decision tree Regression..............")
from sklearn import metrics
score = regre.score(X_test, y_test)
n=len(df[target])
p=len(features)
adjr= 1-(1-score)*(n-1)/(n-p-1)
print("RSquared: ",score)
print("AdjustedRSquared: ",adjr)
print('MAE', metrics.mean_absolute_error(y_test, y_prediction))
print('MSE', metrics.mean_squared_error(y_test, y_prediction))
print('RMSE', np.sqrt(metrics.mean_squared_error(y_test, y_prediction)))

.............Evaluation metrics for Decision tree Regression..............
RSquared:  0.9563546645823251
AdjustedRSquared:  0.9563464350479766
MAE 0.7377035182271786
MSE 2.152832770427848
RMSE 1.4672534785877482
