# Regressão Linear

In [1]:
!pip install pandas-profiling



In [2]:
import pandas as pd
import numpy as np
import seaborn as sn
import matplotlib.pyplot as plt

In [3]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import LogisticRegression
from sklearn import metrics
from sklearn.metrics import mean_squared_error
from sklearn.preprocessing import StandardScaler
from scipy import stats

#### Tratando o Dataframe

In [4]:
df = pd.read_excel('Scout final.xlsx')
df.head()

Unnamed: 0.1,Unnamed: 0,Clube,Rodada,Resultado,Posse de Bola,Gols Feitos,Gols Sofridos,Total de Chutes,Faltas Cometidas,Escanteio,Impedimento,Chute ao Gol,Passes Certos,Total de Passes,Defesas,Faltas sofridas,Chutes Errados,Divididas Ganhas
0,0,Santos,1,D,66,0,3,12,9,11,2,4,501,563,0,18,5,9
1,18,Flamengo,1,V,62,1,0,14,25,3,0,10,440,503,5,21,3,9
2,17,Palmeiras,1,D,38,0,1,12,21,4,3,5,244,300,8,25,6,11
3,16,Corinthians,1,D,61,0,1,15,18,10,1,6,431,518,1,14,5,10
4,15,Atlético Goianiense,1,V,40,1,0,7,14,3,5,2,254,351,6,15,3,11


In [5]:
# Utilizando o dummie para transformar a variavel categorica em binaria
df1 = pd.get_dummies(df, columns=['Resultado'])
df1.head()

Unnamed: 0.1,Unnamed: 0,Clube,Rodada,Posse de Bola,Gols Feitos,Gols Sofridos,Total de Chutes,Faltas Cometidas,Escanteio,Impedimento,Chute ao Gol,Passes Certos,Total de Passes,Defesas,Faltas sofridas,Chutes Errados,Divididas Ganhas,Resultado_D,Resultado_E,Resultado_V
0,0,Santos,1,66,0,3,12,9,11,2,4,501,563,0,18,5,9,1,0,0
1,18,Flamengo,1,62,1,0,14,25,3,0,10,440,503,5,21,3,9,0,0,1
2,17,Palmeiras,1,38,0,1,12,21,4,3,5,244,300,8,25,6,11,1,0,0
3,16,Corinthians,1,61,0,1,15,18,10,1,6,431,518,1,14,5,10,1,0,0
4,15,Atlético Goianiense,1,40,1,0,7,14,3,5,2,254,351,6,15,3,11,0,0,1


In [6]:
# excluindo as colunas que não serão utilizadas
df2=df1.drop(columns=['Unnamed: 0','Clube','Rodada','Resultado_E','Resultado_D'])
df2=df2.rename(columns={'Resultado_V':'Vitória'})
df2.head()

Unnamed: 0,Posse de Bola,Gols Feitos,Gols Sofridos,Total de Chutes,Faltas Cometidas,Escanteio,Impedimento,Chute ao Gol,Passes Certos,Total de Passes,Defesas,Faltas sofridas,Chutes Errados,Divididas Ganhas,Vitória
0,66,0,3,12,9,11,2,4,501,563,0,18,5,9,0
1,62,1,0,14,25,3,0,10,440,503,5,21,3,9,1
2,38,0,1,12,21,4,3,5,244,300,8,25,6,11,0
3,61,0,1,15,18,10,1,6,431,518,1,14,5,10,0
4,40,1,0,7,14,3,5,2,254,351,6,15,3,11,1


## Testando a Logistica

In [7]:
# Normalizando os dados

scaler = StandardScaler()

X=df2[['Posse de Bola','Gols Feitos','Gols Sofridos','Total de Chutes','Faltas Cometidas','Escanteio','Impedimento','Chute ao Gol','Passes Certos','Total de Passes','Defesas','Faltas sofridas','Chutes Errados','Divididas Ganhas']]

Y=df2['Vitória']

In [8]:
logreg = LogisticRegression(max_iter=2000)

# normalizando os dados
scaler.fit(X)
X_std = scaler.transform(X)

# carregando o modelo
logrec.fit(X_std, Y)

# retorna a
a = logrec.coef_

# retorna b
b = logrec.intercept_

# retorna R²
r2 = logrec.score(X_std,Y)

# retorna o erro
Y_pred=logrec.predict(X_std)
erro_2=np.sqrt(mean_squared_error(Y, Y_pred))

print(f''' Vitória

Constante = {b}
R² = {r2}
RMSE = {erro_2}
''')

 Vitória

Constante = [-2.46571795]
R² = 1.0
RMSE = 0.0



In [31]:
logrec = LogisticRegression(max_iter=4000)
from sklearn.feature_selection import RFE
rfe = RFE(logrec, 4)
fit = rfe.fit(X, Y)

# Mostrando o número de features:
print ("Número de features: {}".format(fit.n_features_))  



Número de features: 4


In [32]:
cols = fit.get_support(indices=True)
df2.iloc[:,cols]

Unnamed: 0,Gols Feitos,Gols Sofridos,Chute ao Gol,Chutes Errados
0,0,3,4,5
1,1,0,10,3
2,0,1,5,6
3,0,1,6,5
4,1,0,2,3
...,...,...,...,...
375,0,2,1,5
376,2,0,5,5
377,0,2,8,5
378,2,1,5,3


## Testando a Linear

In [9]:
model = LinearRegression()

In [10]:
# carregando o modelo
model.fit(X_std, Y)

# retorna a
a = model.coef_

# retorna b
b = model.intercept_

# retorna R²
r2 = model.score(X_std,Y)

# retorna o erro
Y_pred=model.predict(X_std)
erro_2=np.sqrt(mean_squared_error(Y, Y_pred))

print(f''' Vitória

Constante = {b}
R² = {r2}
RMSE = {erro_2}
''')

 Vitória

Constante = 0.3315789473684211
R² = 0.6046884682928819
RMSE = 0.29599765876463535



In [11]:
corr = pd.DataFrame({'Columns':X.columns,'coefs':model.coef_})
corr.round(4)

Unnamed: 0,Columns,coefs
0,Posse de Bola,-0.1274
1,Gols Feitos,0.2985
2,Gols Sofridos,-0.2215
3,Total de Chutes,0.0112
4,Faltas Cometidas,-0.0108
5,Escanteio,0.0265
6,Impedimento,0.0202
7,Chute ao Gol,0.0104
8,Passes Certos,0.0602
9,Total de Passes,0.0411


## Testando a Random Forest

In [33]:
from sklearn.ensemble import RandomForestClassifier

# feature extraction
randomForest = RandomForestClassifier(n_estimators=14)
randomForest.fit(X, Y)
RandomForestClassifier(bootstrap=True, ccp_alpha=0.0, class_weight=None,
                       criterion='gini', max_depth=None, max_features='auto',
                       max_leaf_nodes=None, max_samples=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=14,
                       n_jobs=None, oob_score=False, random_state=None,
                       verbose=0, warm_start=False)

print(randomForest.feature_importances_)

[0.0458505  0.32955415 0.22244149 0.03314831 0.03667748 0.02798478
 0.02248871 0.07992654 0.03797502 0.03420695 0.03029455 0.03731976
 0.02940334 0.03272841]


In [35]:
feature_importances = pd.DataFrame(randomForest.feature_importances_,
                                   index = X.columns,
                                   columns=['Score']).sort_values('Score', ascending=False)
feature_importances

Unnamed: 0,Score
Gols Feitos,0.329554
Gols Sofridos,0.222441
Chute ao Gol,0.079927
Posse de Bola,0.045851
Passes Certos,0.037975
Faltas sofridas,0.03732
Faltas Cometidas,0.036677
Total de Passes,0.034207
Total de Chutes,0.033148
Divididas Ganhas,0.032728


In [36]:
Y1=df['Resultado']

randomForest = RandomForestClassifier(n_estimators=14)
randomForest.fit(X, Y)
RandomForestClassifier(bootstrap=True, ccp_alpha=0.0, class_weight=None,
                       criterion='gini', max_depth=None, max_features='auto',
                       max_leaf_nodes=None, max_samples=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=14,
                       n_jobs=None, oob_score=False, random_state=None,
                       verbose=0, warm_start=False)

print(randomForest.feature_importances_)

[0.05216712 0.3036067  0.23581572 0.03588417 0.04192879 0.0257687
 0.0325101  0.05041404 0.05084664 0.03735474 0.03387308 0.02811504
 0.02922696 0.04248821]


In [37]:
feature_importances1 = pd.DataFrame(randomForest.feature_importances_,
                                   index = X.columns,
                                   columns=['Score']).sort_values('Score', ascending=False)
feature_importances1

Unnamed: 0,Score
Gols Feitos,0.303607
Gols Sofridos,0.235816
Posse de Bola,0.052167
Passes Certos,0.050847
Chute ao Gol,0.050414
Divididas Ganhas,0.042488
Faltas Cometidas,0.041929
Total de Passes,0.037355
Total de Chutes,0.035884
Defesas,0.033873
