# Regressão Linear

In [1]:
# NOTE(review): pandas-profiling is not imported in any cell visible in this notebook;
# confirm it is still needed. If it is, consider `%pip` with a pinned version so the
# install targets the kernel's environment and stays reproducible.
!pip install pandas-profiling



In [2]:
import pandas as pd
import numpy as np

In [3]:
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier

#### Tratando o Dataframe

In [4]:
# Load the scouting spreadsheet and preview its first rows.
df = pd.read_excel(io='Scout exemplo.xlsx')
df.head(n=5)

Unnamed: 0.1,Unnamed: 0,Clube,Rodada,Resultado,Posse de Bola,Gols Feitos,Gols Sofridos,Total de Chutes,Faltas Cometidas,Escanteio,Impedimento,Chute ao Gol,Passes Certos,Total de Passes,Defesas,Faltas sofridas,Chutes Errados,Divididas Ganhas
0,0,Santos,1,D,66,0,3,12,9,11,2,4,501,563,0,18,5,9
1,18,Flamengo,1,V,62,1,0,14,25,3,0,10,440,503,5,21,3,9
2,17,Palmeiras,1,D,38,0,1,12,21,4,3,5,244,300,8,25,6,11
3,16,Corinthians,1,D,61,0,1,15,18,10,1,6,431,518,1,14,5,10
4,15,Atlético Goianiense,1,V,40,1,0,7,14,3,5,2,254,351,6,15,3,11


In [5]:
# One-hot encode the categorical 'Resultado' column, producing one indicator
# column per outcome (Resultado_D, Resultado_E, Resultado_V).
df1 = pd.get_dummies(data=df, columns=['Resultado'])
df1.head(n=5)

Unnamed: 0.1,Unnamed: 0,Clube,Rodada,Posse de Bola,Gols Feitos,Gols Sofridos,Total de Chutes,Faltas Cometidas,Escanteio,Impedimento,Chute ao Gol,Passes Certos,Total de Passes,Defesas,Faltas sofridas,Chutes Errados,Divididas Ganhas,Resultado_D,Resultado_E,Resultado_V
0,0,Santos,1,66,0,3,12,9,11,2,4,501,563,0,18,5,9,1,0,0
1,18,Flamengo,1,62,1,0,14,25,3,0,10,440,503,5,21,3,9,0,0,1
2,17,Palmeiras,1,38,0,1,12,21,4,3,5,244,300,8,25,6,11,1,0,0
3,16,Corinthians,1,61,0,1,15,18,10,1,6,431,518,1,14,5,10,1,0,0
4,15,Atlético Goianiense,1,40,1,0,7,14,3,5,2,254,351,6,15,3,11,0,0,1


In [6]:
# Remove the columns that are not used as features and keep only the win
# indicator ('Resultado_V'), renamed to 'Vitória', as the binary target.
df2 = (
    df1
    .drop(columns=['Unnamed: 0', 'Clube', 'Rodada', 'Resultado_E', 'Resultado_D'])
    .rename(columns={'Resultado_V': 'Vitória'})
)
df2

Unnamed: 0,Posse de Bola,Gols Feitos,Gols Sofridos,Total de Chutes,Faltas Cometidas,Escanteio,Impedimento,Chute ao Gol,Passes Certos,Total de Passes,Defesas,Faltas sofridas,Chutes Errados,Divididas Ganhas,Vitória
0,66,0,3,12,9,11,2,4,501,563,0,18,5,9,0
1,62,1,0,14,25,3,0,10,440,503,5,21,3,9,1
2,38,0,1,12,21,4,3,5,244,300,8,25,6,11,0
3,61,0,1,15,18,10,1,6,431,518,1,14,5,10,0
4,40,1,0,7,14,3,5,2,254,351,6,15,3,11,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
705,63,2,0,12,23,4,2,5,450,530,3,15,5,3,1
706,72,0,2,21,6,12,2,8,559,636,2,13,5,2,0
707,62,0,1,21,16,11,2,4,593,662,4,8,11,8,0
708,41,1,0,4,15,1,2,1,313,387,4,14,3,10,1


In [7]:
# Create the scaler (nothing is scaled in this cell; it is fitted later, once per
# regression section) and split the frame into feature matrix X and target Y.
scaler = StandardScaler()

X = df2[[
    'Posse de Bola', 'Gols Feitos', 'Gols Sofridos', 'Total de Chutes',
    'Faltas Cometidas', 'Escanteio', 'Impedimento', 'Chute ao Gol',
    'Passes Certos', 'Total de Passes', 'Defesas', 'Faltas sofridas',
    'Chutes Errados', 'Divididas Ganhas',
]]

Y = df2['Vitória']

In [8]:
# Rank the features with a Random Forest classifier.
# random_state is fixed so the importance ranking is reproducible after a kernel
# restart; without a seed every run yields different scores.
# NOTE(review): X contains 'Gols Feitos' and 'Gols Sofridos', which presumably
# determine the match result directly — confirm this leakage is intended.
randomForest = RandomForestClassifier(n_estimators=14, random_state=42)
randomForest.fit(X, Y)
RandomForestClassifier(bootstrap=True, ccp_alpha=0.0, class_weight=None,
                       criterion='entropy', max_depth=None, max_features='auto',
                       max_leaf_nodes=None, max_samples=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=7,
                       n_jobs=None, oob_score=False, random_state=None,
                       verbose=0, warm_start=False)

# Raw importance scores of the fitted forest, one value per column of X.
print(randomForest.feature_importances_)

[0.04226497 0.34120119 0.26807003 0.03005069 0.02512567 0.0221526
 0.01834282 0.07900462 0.03439078 0.0513186  0.01355136 0.02526838
 0.01769071 0.03156756]


In [9]:
# Label each importance score with its feature name and order from most to
# least important.
feature_importances = pd.DataFrame(
    {'Score': randomForest.feature_importances_},
    index=X.columns,
).sort_values(by='Score', ascending=False)
feature_importances

Unnamed: 0,Score
Gols Feitos,0.341201
Gols Sofridos,0.26807
Chute ao Gol,0.079005
Total de Passes,0.051319
Posse de Bola,0.042265
Passes Certos,0.034391
Divididas Ganhas,0.031568
Total de Chutes,0.030051
Faltas sofridas,0.025268
Faltas Cometidas,0.025126


In [10]:
# Three highest-scoring features.
feature_importances.iloc[:3]

Unnamed: 0,Score
Gols Feitos,0.341201
Gols Sofridos,0.26807
Chute ao Gol,0.079005


## Regressão Múltipla

In [11]:
from scipy import stats
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error

In [12]:
# Single LinearRegression instance shared by every regression section below.
# Each section refits it, so model.coef_ always reflects the most recent fit;
# cells that read model.coef_ must run right after their own section's fit.
model = LinearRegression()

In [24]:
# Individual scouting columns, taken from the raw frame `df`, used by the
# per-predictor stats.linregress calls in the sections below.
X1_1 = df['Total de Passes']    # total passes
X1_2 = df['Chute ao Gol']       # shots on goal
X1_3 = df['Posse de Bola']      # ball possession
X1_4 = df['Escanteio']          # corners
X1_5 = df['Total de Chutes']    # total shots
X1_6 = df['Divididas Ganhas']   # duels won
X1_7 = df['Defesas']            # saves
X1_8 = df['Impedimento']        # offsides

#### Total de Passes

In [14]:
# Multiple linear regression with 'Total de Passes' as the response.
y1 = df2['Total de Passes']
x1 = df2[[
    'Posse de Bola', 'Escanteio', 'Total de Chutes',
    'Divididas Ganhas', 'Defesas', 'Impedimento',
]]

# Standardise the predictors (refits the shared scaler on this section's columns).
x1_std = scaler.fit_transform(x1)

# Refit the shared model on the standardised predictors.
model.fit(x1_std, y1)

a1 = model.coef_                  # slope of each standardised predictor
b1 = model.intercept_             # intercept
r2_1 = model.score(x1_std, y1)    # R² on the same data used for fitting

# Root mean squared error of the in-sample predictions.
y_p1 = model.predict(x1_std)
e1 = np.sqrt(mean_squared_error(y1, y_p1))

print(f''' Total de Passes

Constante = {b1}
R² = {r2_1}
RMSE = {e1}
''')

 Total de Passes

Constante = 426.8183098591549
R² = 0.7847627262238468
RMSE = 45.53765716269531



In [15]:
# p-value of each predictor of 'Total de Passes' (X1_1), obtained from one simple
# linear regression per predictor. The list order follows x1.columns.
# NOTE(review): these are univariate p-values, not the p-values of the
# coefficients of the multiple regression fitted above.
p_value_x1_3 = stats.linregress(X1_3, X1_1).pvalue
p_value_x1_4 = stats.linregress(X1_4, X1_1).pvalue
p_value_x1_5 = stats.linregress(X1_5, X1_1).pvalue
p_value_x1_6 = stats.linregress(X1_6, X1_1).pvalue
p_value_x1_7 = stats.linregress(X1_7, X1_1).pvalue
# The last call is unpacked in full so slope/intercept/r_value/std_err end up
# holding the values of this final regression, as before.
slope, intercept, r_value, p_value_x1_8, std_err = stats.linregress(X1_8, X1_1)

p_value_TP = [
    p_value_x1_3, p_value_x1_4, p_value_x1_5,
    p_value_x1_6, p_value_x1_7, p_value_x1_8,
]

In [16]:
# Coefficient and p-value per predictor of the 'Total de Passes' model.
# Reads model.coef_, so it must run while the model is still fitted on x1.
corr_TP = pd.DataFrame({
    'Parametros': x1.columns,
    'coefs': model.coef_,
    'p value': p_value_TP,
})
corr_TP.round(4).sort_values(by='coefs', ascending=False)

Unnamed: 0,Parametros,coefs,p value
0,Posse de Bola,90.7153,0.0
3,Divididas Ganhas,-0.4906,0.0665
2,Total de Chutes,-1.5601,0.0
4,Defesas,-4.1221,0.0
5,Impedimento,-5.6479,0.0545
1,Escanteio,-10.6824,0.0


In [18]:
# Summary rows for the 'Total de Passes' model, rounded and ordered by coefficient.
TP = pd.DataFrame({
    'Scout': 'Total de Passes',
    'R²': r2_1,
    'Parametros': x1.columns,
    'coefs': model.coef_,
    'p value': p_value_TP,
}).round(4).sort_values(by='coefs', ascending=False)

# Keep only the row labelled 0; the other predictors are removed by index label.
Total_de_Passes = TP.drop(index=[4, 3, 5, 1, 2])
Total_de_Passes

Unnamed: 0,Scout,R²,Parametros,coefs,p value
0,Total de Passes,0.7848,Posse de Bola,90.7153,0.0


#### Posse de Bola

In [19]:
# Multiple linear regression with 'Posse de Bola' as the response.
y2 = df2['Posse de Bola']
x2 = df2[[
    'Total de Passes', 'Chute ao Gol', 'Escanteio', 'Total de Chutes',
    'Divididas Ganhas', 'Defesas', 'Impedimento',
]]

# Standardise the predictors (refits the shared scaler on this section's columns).
x2_std = scaler.fit_transform(x2)

# Refit the shared model on the standardised predictors.
model.fit(x2_std, y2)

a2 = model.coef_                  # slope of each standardised predictor
b2 = model.intercept_             # intercept
r2_2 = model.score(x2_std, y2)    # R² on the same data used for fitting

# Root mean squared error of the in-sample predictions.
y_p2 = model.predict(x2_std)
e2 = np.sqrt(mean_squared_error(y2, y_p2))

print(f''' Posse de Bola

Constante = {b2}
R² = {r2_2}
RMSE = {e2}
''')

 Posse de Bola

Constante = 50.053521126760565
R² = 0.8155819237185589
RMSE = 4.302911408691422



In [21]:
# p-value of each predictor of 'Posse de Bola' (X1_3), obtained from one simple
# linear regression per predictor. The list order must follow x2.columns:
# Total de Passes, Chute ao Gol, Escanteio, Total de Chutes, Divididas Ganhas,
# Defesas, Impedimento.
# NOTE(review): these are univariate p-values, not the p-values of the
# coefficients of the multiple regression fitted above.
slope, intercept, r_value, p_value_x2_1, std_err = stats.linregress(X1_1, X1_3)
# Fixed: this entry previously regressed Posse de Bola on itself (X1_3, X1_3),
# so the 'Chute ao Gol' row reported a meaningless p-value.
slope, intercept, r_value, p_value_x2_2, std_err = stats.linregress(X1_2, X1_3)
slope, intercept, r_value, p_value_x2_4, std_err = stats.linregress(X1_4, X1_3)
slope, intercept, r_value, p_value_x2_5, std_err = stats.linregress(X1_5, X1_3)
slope, intercept, r_value, p_value_x2_6, std_err = stats.linregress(X1_6, X1_3)
slope, intercept, r_value, p_value_x2_7, std_err = stats.linregress(X1_7, X1_3)
slope, intercept, r_value, p_value_x2_8, std_err = stats.linregress(X1_8, X1_3)

p_value_PB = [p_value_x2_1,p_value_x2_2,p_value_x2_4,p_value_x2_5,p_value_x2_6,p_value_x2_7,p_value_x2_8]

In [22]:
# Coefficient and p-value per predictor of the 'Posse de Bola' model.
# Reads model.coef_, so it must run while the model is still fitted on x2.
corr_PB = pd.DataFrame({
    'Parametros': x2.columns,
    'coefs': model.coef_,
    'p value': p_value_PB,
})
corr_PB.round(4).sort_values(by='coefs', ascending=False)

Unnamed: 0,Parametros,coefs,p value
0,Total de Passes,7.9538,0.0
2,Escanteio,1.6678,0.0
3,Total de Chutes,1.0501,0.0
6,Impedimento,0.3372,0.7541
5,Defesas,-0.0388,0.0
4,Divididas Ganhas,-0.1027,0.0693
1,Chute ao Gol,-0.361,0.0


In [23]:
# Summary rows for the 'Posse de Bola' model, rounded and ordered by coefficient.
PB = pd.DataFrame({
    'Scout': 'Posse de Bola',
    'R²': r2_2,
    'Parametros': x2.columns,
    'coefs': model.coef_,
    'p value': p_value_PB,
}).round(4).sort_values(by='coefs', ascending=False)

# Remove the rows labelled 5, 6, 4 and 1; the remaining predictors are kept.
Posse_de_Bola = PB.drop(index=[5, 6, 4, 1])
Posse_de_Bola

Unnamed: 0,Scout,R²,Parametros,coefs,p value
0,Posse de Bola,0.8156,Total de Passes,7.9538,0.0
2,Posse de Bola,0.8156,Escanteio,1.6678,0.0
3,Posse de Bola,0.8156,Total de Chutes,1.0501,0.0


#### Chute ao Gol

In [None]:
# Multiple linear regression with 'Chute ao Gol' as the response.
y3 = df2['Chute ao Gol']
x3 = df2[[
    'Total de Passes', 'Posse de Bola', 'Escanteio', 'Total de Chutes',
    'Divididas Ganhas', 'Defesas', 'Impedimento',
]]

# Standardise the predictors (refits the shared scaler on this section's columns).
x3_std = scaler.fit_transform(x3)

# Refit the shared model on the standardised predictors.
model.fit(x3_std, y3)

a3 = model.coef_                  # slope of each standardised predictor
b3 = model.intercept_             # intercept
r2_3 = model.score(x3_std, y3)    # R² on the same data used for fitting

# Root mean squared error of the in-sample predictions.
y_p3 = model.predict(x3_std)
e3 = np.sqrt(mean_squared_error(y3, y_p3))

print(f''' Chute ao Gol

Constante = {b3}
R² = {r2_3}
RMSE = {e3}
''')

In [None]:
# p-value of each predictor of 'Chute ao Gol' (X1_2), obtained from one simple
# linear regression per predictor. The list order must follow x3.columns:
# Total de Passes, Posse de Bola, Escanteio, Total de Chutes, Divididas Ganhas,
# Defesas, Impedimento.
# Fixed: every call previously used X1_3 ('Posse de Bola') as the response, so
# the p-values described the previous section's target instead of 'Chute ao Gol'.
# NOTE(review): these are univariate p-values, not the p-values of the
# coefficients of the multiple regression fitted above.
slope, intercept, r_value, p_value_x3_1, std_err = stats.linregress(X1_1, X1_2)
slope, intercept, r_value, p_value_x3_3, std_err = stats.linregress(X1_3, X1_2)
slope, intercept, r_value, p_value_x3_4, std_err = stats.linregress(X1_4, X1_2)
slope, intercept, r_value, p_value_x3_5, std_err = stats.linregress(X1_5, X1_2)
slope, intercept, r_value, p_value_x3_6, std_err = stats.linregress(X1_6, X1_2)
slope, intercept, r_value, p_value_x3_7, std_err = stats.linregress(X1_7, X1_2)
slope, intercept, r_value, p_value_x3_8, std_err = stats.linregress(X1_8, X1_2)

p_value_CG = [p_value_x3_1,p_value_x3_3,p_value_x3_4,p_value_x3_5,p_value_x3_6,p_value_x3_7,p_value_x3_8]

In [None]:
# Coefficient and p-value per predictor of the 'Chute ao Gol' model.
# Reads model.coef_, so it must run while the model is still fitted on x3.
corr_CG = pd.DataFrame({
    'Parametros': x3.columns,
    'coefs': model.coef_,
    'p value': p_value_CG,
})
corr_CG.round(4).sort_values(by='coefs', ascending=False)

In [None]:
# Summary rows for the 'Chute ao Gol' model, rounded and ordered by coefficient.
CG = pd.DataFrame({
    'Scout': 'Chutes ao Gol',
    'R²': r2_3,
    'Parametros': x3.columns,
    'coefs': model.coef_,
    'p value': p_value_CG,
}).round(4).sort_values(by='coefs', ascending=False)

# Remove the rows labelled 4, 6, 2 and 1.
# NOTE(review): these labels are hard-coded and this cell has no saved output;
# verify they still select the intended predictors.
Chute_ao_Gol = CG.drop(index=[4, 6, 2, 1])
Chute_ao_Gol

In [None]:
# Combine the three per-scout summaries with outer merges on their shared columns,
# then display them grouped by scout, R² and predictor, highest R² first.
df_scout = (
    Total_de_Passes
    .merge(Posse_de_Bola, how='outer')
    .merge(Chute_ao_Gol, how='outer')
)
df_scout.groupby(['Scout', 'R²', 'Parametros']).sum().sort_values(by='R²', ascending=False)