# Campeonato Brasileiro - Série A

## Radom Forest Classifier

### 1. Import packages

In [523]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

In [524]:
from sklearn.ensemble import RandomForestClassifier

In [525]:
from sklearn.ensemble import AdaBoostClassifier

In [526]:
from sklearn.model_selection import cross_val_score

In [527]:
from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score,precision_score, recall_score
from sklearn.metrics import f1_score

In [528]:
from sklearn import datasets

### 2. Reading data

In [529]:
jogos = pd.read_csv("data/jogos-resultado-posicao.csv",sep=",")

In [530]:
del jogos["Unnamed: 0"]

In [531]:
#jogos.query('ano > 2012', inplace=True)

In [532]:
jogos.head(3)

Unnamed: 0,campeonato,ano,rodada,data,mandante,visitante,gols_mandante,gols_visitante,resultado,posicao_m,posicao_v,aproveitamento_5r_v,aproveitamento_5r_m,aproveitamento_10r_v,aproveitamento_10r_m
0,brasileiro-a,2018,5,14-05,Botafogo,Fluminense,2,1,MANDANTE,6.0,9.0,0.78,0.44,0.78,0.44
1,brasileiro-a,2018,5,14-05,Ceará,América-MG,2,2,EMPATE,19.0,10.0,0.33,0.22,0.33,0.22
2,brasileiro-a,2018,5,13-05,Santos,Paraná,3,1,MANDANTE,13.0,20.0,0.11,0.0,0.11,0.0


In [533]:
jogos.keys()

Index(['campeonato', 'ano', 'rodada', 'data', 'mandante', 'visitante',
       'gols_mandante', 'gols_visitante', 'resultado', 'posicao_m',
       'posicao_v', 'aproveitamento_5r_v', 'aproveitamento_5r_m',
       'aproveitamento_10r_v', 'aproveitamento_10r_m'],
      dtype='object')

In [534]:
times = np.concatenate((jogos["visitante"].unique(), jogos["mandante"].unique()), axis=0)

In [535]:
times_dict = dict(zip(times, range(1, len(times))))

In [536]:
times_dict

{'América-MG': 56,
 'América-RN': 78,
 'Atlético-GO': 62,
 'Atlético-MG': 58,
 'Atlético-PR': 45,
 'Avaí': 63,
 'Bahia': 46,
 'Barueri': 73,
 'Botafogo': 42,
 'Brasiliense': 40,
 'Ceará': 43,
 'Chapecoense': 47,
 'Corinthians': 48,
 'Coritiba': 64,
 'Criciúma': 70,
 'Cruzeiro': 50,
 'Figueirense': 66,
 'Flamengo': 53,
 'Fluminense': 59,
 'Fortaleza': 79,
 'Goias': 68,
 'Grêmio': 51,
 'Guarani': 74,
 'Internacional': 57,
 'Ipatinga': 76,
 'Joinville': 69,
 'Juventude': 77,
 'Náutico': 72,
 'Palmeiras': 60,
 'Paraná': 61,
 'Paysandu': 81,
 'Ponte Preta': 65,
 'Portuguesa': 71,
 'Santa Cruz': 67,
 'Santo Andre': 75,
 'Santos': 44,
 'Sao Caetano': 80,
 'Sport Recife': 52,
 'São Paulo': 55,
 'Vasco': 49,
 'Vitória': 54}

In [537]:
jogos = jogos.applymap(lambda s: times_dict.get(s) if s in times_dict else s)

### 3. Define target and data subset

In [538]:
jogos_X_df = jogos.copy()

In [539]:
del jogos_X_df["resultado"]

In [540]:
del jogos_X_df["campeonato"]

In [541]:
del jogos_X_df["data"]

In [542]:
del jogos_X_df["gols_mandante"]

In [543]:
del jogos_X_df["gols_visitante"]

In [544]:
del jogos_X_df["aproveitamento_5r_v"]

In [545]:
del jogos_X_df["aproveitamento_5r_m"]

In [546]:
jogos_X_df.head()

Unnamed: 0,ano,rodada,mandante,visitante,posicao_m,posicao_v,aproveitamento_10r_v,aproveitamento_10r_m
0,2018,5,42,59,6.0,9.0,0.78,0.44
1,2018,5,43,56,19.0,10.0,0.33,0.22
2,2018,5,44,61,13.0,20.0,0.11,0.0
3,2018,5,45,58,15.0,3.0,0.78,0.22
4,2018,5,46,55,17.0,12.0,0.33,0.44


In [547]:
jogos_X_df.keys()

Index(['ano', 'rodada', 'mandante', 'visitante', 'posicao_m', 'posicao_v',
       'aproveitamento_10r_v', 'aproveitamento_10r_m'],
      dtype='object')

In [548]:
y = jogos["resultado"].values

In [549]:
X = jogos_X_df.values

### 4. Train and Test Split

In [550]:
from sklearn.model_selection import train_test_split

In [551]:
X_train, X_test, y_train, y_test = train_test_split(
                                        X,
                                        y, 
                                        test_size=0.2, 
                                        random_state=1)

### 4.1 Grid Search

In [552]:
from sklearn.grid_search import GridSearchCV

In [553]:
from sklearn.datasets import make_classification

In [554]:
from sklearn.ensemble import RandomForestClassifier

In [555]:
rfc = RandomForestClassifier(n_jobs=-1,max_features= 'sqrt' ,n_estimators=50, oob_score = True)

In [556]:
param_grid = {     
    'n_estimators': [800],
    'min_samples_leaf': [10],
    'max_features': ['log2'],
    'n_jobs': [-1],
    'min_samples_split': [3],
    'warm_start': [False],
    'oob_score':  [False], 
    'bootstrap': [True],
    'criterion': ['entropy'],
    'class_weight': ['balanced', 'balanced_subsample', None]
}

In [557]:
CV_rfc = GridSearchCV(estimator=rfc, param_grid=param_grid, cv= 5)

In [558]:
CV_rfc.fit(X, y)

GridSearchCV(cv=5, error_score='raise',
       estimator=RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='sqrt', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=50, n_jobs=-1,
            oob_score=True, random_state=None, verbose=0, warm_start=False),
       fit_params={}, iid=True, n_jobs=1,
       param_grid={'n_estimators': [800], 'min_samples_leaf': [10], 'max_features': ['log2'], 'n_jobs': [-1], 'min_samples_split': [3], 'warm_start': [False], 'oob_score': [False], 'bootstrap': [True], 'criterion': ['entropy'], 'class_weight': ['balanced', 'balanced_subsample', None]},
       pre_dispatch='2*n_jobs', refit=True, scoring=None, verbose=0)

In [559]:
print (CV_rfc.best_params_)

{'bootstrap': True, 'class_weight': None, 'criterion': 'entropy', 'max_features': 'log2', 'min_samples_leaf': 10, 'min_samples_split': 3, 'n_estimators': 800, 'n_jobs': -1, 'oob_score': False, 'warm_start': False}


### 5. Apply Random Forest Classifier

In [560]:
rf_model = RandomForestClassifier(random_state=1, criterion='entropy', max_features='log2', n_estimators=800, min_samples_leaf=10)

In [561]:
rf_model.fit(X_train, y_train)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='entropy',
            max_depth=None, max_features='log2', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=10, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=800, n_jobs=1,
            oob_score=False, random_state=1, verbose=0, warm_start=False)

### 6. Model Cross-Validation

In [562]:
cv_scores_rf = cross_val_score(rf_model, X_train, y_train, 
                               cv=5,scoring="accuracy")
cv_scores_rf

array([ 0.57591093,  0.58603239,  0.57489879,  0.59008097,  0.58805668])

In [563]:
cv_scores_rf_mean =  np.mean(cv_scores_rf)
cv_scores_rf_mean

0.582995951417004

### Apply AdaBoostClassifier

In [564]:
abc_model = AdaBoostClassifier(random_state=1, n_estimators=50, learning_rate=1., algorithm="SAMME.R")

In [565]:
abc_model.fit(X_train, y_train)

AdaBoostClassifier(algorithm='SAMME.R', base_estimator=None,
          learning_rate=1.0, n_estimators=50, random_state=1)

In [566]:
cv_scores_abc = cross_val_score(abc_model, X_train, y_train, 
                               cv=5,scoring="accuracy")
cv_scores_abc

array([ 0.57995951,  0.59109312,  0.54554656,  0.58502024,  0.59210526])

In [567]:
cv_scores_abc_mean =  np.mean(cv_scores_abc)
cv_scores_abc_mean

0.57874493927125514

### 7. Model Test

In [568]:
y_test_pred = rf_model.predict(X_test)

In [569]:
y_test.shape

(1235,)

In [570]:
# Results
pd.crosstab(y_test, y_test_pred,
            rownames=['Actual'], 
            colnames=['Predicted'])

Predicted,EMPATE,MANDANTE,VISITANTE
Actual,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
EMPATE,38,214,66
MANDANTE,15,553,54
VISITANTE,22,135,138


In [571]:
confusion_matrix(y_test, y_test_pred)

array([[ 38, 214,  66],
       [ 15, 553,  54],
       [ 22, 135, 138]])

In [572]:
accuracy_score_test = accuracy_score(y_test, y_test_pred)
accuracy_score_test

0.59028340080971664

In [573]:
#precision_score_forest = precision_score(y_test, y_test_pred)
#precision_score_forest

In [574]:
#recall_score_forest = recall_score(y_test, y_test_pred)
#recall_score_forest

In [575]:
#f1_score_forest = f1_score(y_test, y_test_pred)
#f1_score_forest

### 8. Comparison accuracy: Validation vs Test

In [576]:
comparison = {"Validation": [cv_scores_rf_mean],
             "Test": [accuracy_score_test]}
pd.DataFrame(comparison, index = ["Accuracy"])

Unnamed: 0,Test,Validation
Accuracy,0.590283,0.582996


### 9. Feature Importance

In [577]:
rf_model.feature_importances_

array([ 0.06948818,  0.13273535,  0.08164099,  0.08564545,  0.21740812,
        0.19722893,  0.10139086,  0.11446213])

In [578]:
# View a list of the features and their importance scores
imp_list = list(zip(jogos_X_df.keys(), 
                    rf_model.feature_importances_))
imp_df = pd.DataFrame(imp_list, columns = ["Features", "Importance"])
imp_df.sort_values(by = "Importance", ascending = False)

Unnamed: 0,Features,Importance
4,posicao_m,0.217408
5,posicao_v,0.197229
1,rodada,0.132735
7,aproveitamento_10r_m,0.114462
6,aproveitamento_10r_v,0.101391
3,visitante,0.085645
2,mandante,0.081641
0,ano,0.069488


### 10. Predict

In [579]:
rodada = pd.read_csv("data/rodada.csv",sep=",")

In [580]:
rodada

Unnamed: 0,mandante,visitante,placar
0,Internacional,Chapecoense,
1,Fluminense,Atlético-PR,
2,América-MG,Botafogo,
3,Paraná,Grêmio,\t
4,São Paulo,Santos,
5,Sport Recife,Corinthians,
6,Vitória,Ceará,
7,Palmeiras,Bahia,
8,Flamengo,Vasco,
9,Atlético-MG,Cruzeiro,


In [581]:
rodada_DF = rodada.copy()

In [582]:
rodada_DF = rodada_DF.applymap(lambda s: times_dict.get(s) if s in times_dict else s)

In [583]:
rodada_DF

Unnamed: 0,mandante,visitante,placar
0,57,47,
1,59,45,
2,56,42,
3,61,51,\t
4,55,44,
5,52,48,
6,54,43,
7,60,46,
8,53,49,
9,58,50,


In [584]:
jogos.head(3)

Unnamed: 0,campeonato,ano,rodada,data,mandante,visitante,gols_mandante,gols_visitante,resultado,posicao_m,posicao_v,aproveitamento_5r_v,aproveitamento_5r_m,aproveitamento_10r_v,aproveitamento_10r_m
0,brasileiro-a,2018,5,14-05,42,59,2,1,MANDANTE,6.0,9.0,0.78,0.44,0.78,0.44
1,brasileiro-a,2018,5,14-05,43,56,2,2,EMPATE,19.0,10.0,0.33,0.22,0.33,0.22
2,brasileiro-a,2018,5,13-05,44,61,3,1,MANDANTE,13.0,20.0,0.11,0.0,0.11,0.0


In [585]:
jogos.query("visitante == 54 and rodada == 30 and ano == 2017")

Unnamed: 0,campeonato,ano,rodada,data,mandante,visitante,gols_mandante,gols_visitante,resultado,posicao_m,posicao_v,aproveitamento_5r_v,aproveitamento_5r_m,aproveitamento_10r_v,aproveitamento_10r_m
133,brasileiro-a,2017,30,22-10,46,54,2,1,MANDANTE,12.0,17.0,0.47,0.53,0.47,0.4


In [586]:
ano = jogos["ano"].max()

In [587]:
rod = jogos[jogos["ano"] == ano]["rodada"].max()

In [588]:
rodada_DF["ano"] = ano

In [589]:
rodada_DF["rodada"] = rod + 1

In [590]:
def getPosicao(jogos, ano, rodada, time):    

    ultimo_jogo = jogos.query("ano == " + str(ano) + " and rodada == " + str(rodada) + " and mandante == " + str(time))
    if not ultimo_jogo.empty:
        return ultimo_jogo["posicao_m"].values[0]
    
    ultimo_jogo = jogos.query("ano == " + str(ano) + " and rodada == " + str(rodada) + " and visitante == " + str(time))        
    return ultimo_jogo["posicao_v"].values[0]

In [591]:
def getAproveitamento(jogos, ano, rodada, time):    

    ultimo_jogo = jogos.query("ano == " + str(ano) + " and rodada == " + str(rodada) + " and mandante == " + str(time))
    if not ultimo_jogo.empty:
        return ultimo_jogo["aproveitamento_10r_m"].values[0]
    
    ultimo_jogo = jogos.query("ano == " + str(ano) + " and rodada == " + str(rodada) + " and visitante == " + str(time))        
    return ultimo_jogo["aproveitamento_10r_v"].values[0]

In [592]:
jogos

Unnamed: 0,campeonato,ano,rodada,data,mandante,visitante,gols_mandante,gols_visitante,resultado,posicao_m,posicao_v,aproveitamento_5r_v,aproveitamento_5r_m,aproveitamento_10r_v,aproveitamento_10r_m
0,brasileiro-a,2018,5,14-05,42,59,2,1,MANDANTE,6.0,9.0,0.78,0.44,0.78,0.44
1,brasileiro-a,2018,5,14-05,43,56,2,2,EMPATE,19.0,10.0,0.33,0.22,0.33,0.22
2,brasileiro-a,2018,5,13-05,44,61,3,1,MANDANTE,13.0,20.0,0.11,0.00,0.11,0.00
3,brasileiro-a,2018,5,13-05,45,58,1,2,VISITANTE,15.0,3.0,0.78,0.22,0.78,0.22
4,brasileiro-a,2018,5,13-05,46,55,2,2,EMPATE,17.0,12.0,0.33,0.44,0.33,0.44
5,brasileiro-a,2018,5,13-05,47,53,3,2,MANDANTE,14.0,1.0,1.00,0.33,1.00,0.33
6,brasileiro-a,2018,5,13-05,48,60,1,0,MANDANTE,2.0,5.0,0.78,0.44,0.78,0.44
7,brasileiro-a,2018,5,13-05,49,54,2,3,VISITANTE,7.0,18.0,0.00,0.44,0.00,0.44
8,brasileiro-a,2018,5,13-05,50,52,2,0,MANDANTE,8.0,11.0,0.78,0.44,0.78,0.44
9,brasileiro-a,2018,5,12-05,51,57,0,0,EMPATE,4.0,16.0,0.11,0.44,0.11,0.44


In [593]:
rod

5

In [594]:
ano

2018

In [595]:
rodada_DF["posicao_m"] = rodada_DF.apply(lambda row: getPosicao(jogos, ano, rod, row["mandante"]),axis=1)

In [596]:
rodada_DF.head(10)

Unnamed: 0,mandante,visitante,placar,ano,rodada,posicao_m
0,57,47,,2018,6,16.0
1,59,45,,2018,6,9.0
2,56,42,,2018,6,10.0
3,61,51,\t,2018,6,20.0
4,55,44,,2018,6,12.0
5,52,48,,2018,6,11.0
6,54,43,,2018,6,18.0
7,60,46,,2018,6,5.0
8,53,49,,2018,6,1.0
9,58,50,,2018,6,3.0


In [597]:
rodada_DF["posicao_v"] = rodada_DF.apply(lambda row: getPosicao(jogos, ano, rod, row["visitante"]),axis=1)

In [598]:
rodada_DF["aproveitamento_10r_m"] = rodada_DF.apply(lambda row: getAproveitamento(jogos, ano, rod, row["mandante"]),axis=1)

In [599]:
rodada_DF["aproveitamento_10r_v"] = rodada_DF.apply(lambda row: getAproveitamento(jogos, ano, rod, row["visitante"]),axis=1)

In [600]:
del rodada_DF["placar"]

In [601]:
rodada_DF

Unnamed: 0,mandante,visitante,ano,rodada,posicao_m,posicao_v,aproveitamento_10r_m,aproveitamento_10r_v
0,57,47,2018,6,16.0,14.0,0.11,0.33
1,59,45,2018,6,9.0,15.0,0.78,0.22
2,56,42,2018,6,10.0,6.0,0.33,0.44
3,61,51,2018,6,20.0,4.0,0.11,0.44
4,55,44,2018,6,12.0,13.0,0.33,0.0
5,52,48,2018,6,11.0,2.0,0.78,0.44
6,54,43,2018,6,18.0,19.0,0.0,0.22
7,60,46,2018,6,5.0,17.0,0.78,0.44
8,53,49,2018,6,1.0,7.0,1.0,0.44
9,58,50,2018,6,3.0,8.0,0.78,0.44


In [602]:
rodada["prediction_RFC"] = rf_model.predict(rodada_DF[["ano", "rodada", "mandante", "visitante", "posicao_m", "posicao_v", "aproveitamento_10r_m", "aproveitamento_10r_v"]].values)

In [603]:
rodada["prediction_ABC"] = abc_model.predict(rodada_DF[["ano", "rodada", "mandante", "visitante", "posicao_m", "posicao_v", "aproveitamento_10r_m", "aproveitamento_10r_v"]].values)

In [604]:
rodada

Unnamed: 0,mandante,visitante,placar,prediction_RFC,prediction_ABC
0,Internacional,Chapecoense,,VISITANTE,VISITANTE
1,Fluminense,Atlético-PR,,MANDANTE,MANDANTE
2,América-MG,Botafogo,,VISITANTE,VISITANTE
3,Paraná,Grêmio,\t,VISITANTE,VISITANTE
4,São Paulo,Santos,,MANDANTE,MANDANTE
5,Sport Recife,Corinthians,,EMPATE,EMPATE
6,Vitória,Ceará,,EMPATE,MANDANTE
7,Palmeiras,Bahia,,MANDANTE,MANDANTE
8,Flamengo,Vasco,,MANDANTE,MANDANTE
9,Atlético-MG,Cruzeiro,,MANDANTE,MANDANTE


In [605]:
rodada.to_csv(path_or_buf="data/predicao-rodada-6.csv",sep=";")