# Campeonato Brasileiro - Série A

## Radom Forest Classifier

### 1. Import packages

In [377]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

In [378]:
from sklearn.ensemble import RandomForestClassifier

In [379]:
from sklearn.ensemble import AdaBoostClassifier

In [380]:
from sklearn.model_selection import cross_val_score

In [381]:
from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score,precision_score, recall_score
from sklearn.metrics import f1_score

In [382]:
from sklearn import datasets

### 2. Reading data

In [383]:
jogos = pd.read_csv("data/matches-results-positions.9.csv",sep=",")

In [384]:
del jogos["Unnamed: 0"]

In [385]:
#jogos.query('ano > 2012', inplace=True)

In [386]:
jogos.head(3)

Unnamed: 0,Unnamed: 0.1,campeonato,ano,rodada,data,mandante,visitante,gols_mandante,gols_visitante,resultado,posicao_m,posicao_v,aproveitamento_5r_v,aproveitamento_5r_m,aproveitamento_10r_v,aproveitamento_10r_m
0,0,brasileiro-a,2018,9,04-06,Paraná,Fluminense,2,1,MANDANTE,19.0,8.0,0.67,0.2,0.67,0.14
1,1,brasileiro-a,2018,9,03-06,Ceará,Cruzeiro,0,1,VISITANTE,20.0,2.0,0.8,0.13,0.62,0.14
2,2,brasileiro-a,2018,9,03-06,América-MG,Atlético-PR,3,1,MANDANTE,10.0,16.0,0.27,0.27,0.29,0.33


In [387]:
jogos.keys()

Index(['Unnamed: 0.1', 'campeonato', 'ano', 'rodada', 'data', 'mandante',
       'visitante', 'gols_mandante', 'gols_visitante', 'resultado',
       'posicao_m', 'posicao_v', 'aproveitamento_5r_v', 'aproveitamento_5r_m',
       'aproveitamento_10r_v', 'aproveitamento_10r_m'],
      dtype='object')

In [388]:
times = np.concatenate((jogos["visitante"].unique(), jogos["mandante"].unique()), axis=0)

In [389]:
times_dict = dict(zip(times, range(1, len(times))))

In [390]:
times_dict

{'América-MG': 44,
 'América-RN': 78,
 'Atlético-GO': 62,
 'Atlético-MG': 50,
 'Atlético-PR': 52,
 'Avaí': 63,
 'Bahia': 45,
 'Barueri': 73,
 'Botafogo': 61,
 'Brasiliense': 40,
 'Ceará': 43,
 'Chapecoense': 56,
 'Corinthians': 53,
 'Coritiba': 64,
 'Criciúma': 70,
 'Cruzeiro': 54,
 'Figueirense': 66,
 'Flamengo': 46,
 'Fluminense': 60,
 'Fortaleza': 79,
 'Goias': 68,
 'Grêmio': 55,
 'Guarani': 74,
 'Internacional': 51,
 'Ipatinga': 76,
 'Joinville': 69,
 'Juventude': 77,
 'Náutico': 72,
 'Palmeiras': 48,
 'Paraná': 42,
 'Paysandu': 81,
 'Ponte Preta': 65,
 'Portuguesa': 71,
 'Santa Cruz': 67,
 'Santo Andre': 75,
 'Santos': 47,
 'Sao Caetano': 80,
 'Sport Recife': 58,
 'São Paulo': 57,
 'Vasco': 49,
 'Vitória': 59}

In [391]:
jogos = jogos.applymap(lambda s: times_dict.get(s) if s in times_dict else s)

### 3. Define target and data subset

In [392]:
jogos_X_df = jogos.copy()

In [393]:
del jogos_X_df["resultado"]

In [394]:
del jogos_X_df["campeonato"]

In [395]:
del jogos_X_df["data"]

In [396]:
del jogos_X_df["gols_mandante"]

In [397]:
del jogos_X_df["gols_visitante"]

In [398]:
del jogos_X_df["aproveitamento_5r_v"]

In [399]:
del jogos_X_df["aproveitamento_5r_m"]

In [400]:
del jogos_X_df["Unnamed: 0.1"]

In [401]:
jogos_X_df.head()

Unnamed: 0,ano,rodada,mandante,visitante,posicao_m,posicao_v,aproveitamento_10r_v,aproveitamento_10r_m
0,2018,9,42,60,19.0,8.0,0.67,0.14
1,2018,9,43,54,20.0,2.0,0.62,0.14
2,2018,9,44,52,10.0,16.0,0.29,0.33
3,2018,9,45,55,18.0,3.0,0.48,0.38
4,2018,9,46,53,1.0,7.0,0.52,0.76


In [402]:
jogos_X_df.keys()

Index(['ano', 'rodada', 'mandante', 'visitante', 'posicao_m', 'posicao_v',
       'aproveitamento_10r_v', 'aproveitamento_10r_m'],
      dtype='object')

In [403]:
y = jogos["resultado"].values

In [404]:
X = jogos_X_df.values

### 4. Train and Test Split

In [405]:
from sklearn.model_selection import train_test_split

In [406]:
X_train, X_test, y_train, y_test = train_test_split(
                                        X,
                                        y, 
                                        test_size=0.2, 
                                        random_state=1)

### 4.1 Grid Search

In [407]:
from sklearn.grid_search import GridSearchCV

In [408]:
from sklearn.datasets import make_classification

In [409]:
from sklearn.ensemble import RandomForestClassifier

In [410]:
rfc = RandomForestClassifier(n_jobs=-1,max_features= 'sqrt' ,n_estimators=50, oob_score = True)

In [411]:
param_grid = {     
    'n_estimators': [800],
    'min_samples_leaf': [10],
    'max_features': ['log2'],
    'n_jobs': [-1],
    'min_samples_split': [3],
    'warm_start': [False],
    'oob_score':  [False], 
    'bootstrap': [True],
    'criterion': ['entropy'],
    'class_weight': ['balanced', 'balanced_subsample', None]
}

In [412]:
CV_rfc = GridSearchCV(estimator=rfc, param_grid=param_grid, cv= 5)

In [413]:
CV_rfc.fit(X, y)

GridSearchCV(cv=5, error_score='raise',
       estimator=RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='sqrt', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=50, n_jobs=-1,
            oob_score=True, random_state=None, verbose=0, warm_start=False),
       fit_params={}, iid=True, n_jobs=1,
       param_grid={'n_estimators': [800], 'min_samples_leaf': [10], 'max_features': ['log2'], 'n_jobs': [-1], 'min_samples_split': [3], 'warm_start': [False], 'oob_score': [False], 'bootstrap': [True], 'criterion': ['entropy'], 'class_weight': ['balanced', 'balanced_subsample', None]},
       pre_dispatch='2*n_jobs', refit=True, scoring=None, verbose=0)

In [414]:
print (CV_rfc.best_params_)

{'bootstrap': True, 'class_weight': None, 'criterion': 'entropy', 'max_features': 'log2', 'min_samples_leaf': 10, 'min_samples_split': 3, 'n_estimators': 800, 'n_jobs': -1, 'oob_score': False, 'warm_start': False}


### 5. Apply Random Forest Classifier

In [415]:
rf_model = RandomForestClassifier(random_state=1, criterion='entropy', max_features='log2', n_estimators=800, min_samples_leaf=10)

In [416]:
rf_model.fit(X_train, y_train)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='entropy',
            max_depth=None, max_features='log2', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=10, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=800, n_jobs=1,
            oob_score=False, random_state=1, verbose=0, warm_start=False)

### 6. Model Cross-Validation

In [417]:
cv_scores_rf = cross_val_score(rf_model, X_train, y_train, 
                               cv=5,scoring="accuracy")
cv_scores_rf

array([ 0.59798995,  0.5915493 ,  0.59255533,  0.59959759,  0.59054326])

In [418]:
cv_scores_rf_mean =  np.mean(cv_scores_rf)
cv_scores_rf_mean

0.59444708451715322

### Apply AdaBoostClassifier

In [419]:
abc_model = AdaBoostClassifier(random_state=1, n_estimators=50, learning_rate=1., algorithm="SAMME.R")

In [420]:
abc_model.fit(X_train, y_train)

AdaBoostClassifier(algorithm='SAMME.R', base_estimator=None,
          learning_rate=1.0, n_estimators=50, random_state=1)

In [421]:
cv_scores_abc = cross_val_score(abc_model, X_train, y_train, 
                               cv=5,scoring="accuracy")
cv_scores_abc

array([ 0.58291457,  0.58249497,  0.58148893,  0.57545272,  0.58853119])

In [422]:
cv_scores_abc_mean =  np.mean(cv_scores_abc)
cv_scores_abc_mean

0.58217647594107358

### 7. Model Test

In [423]:
y_test_pred = rf_model.predict(X_test)

In [424]:
y_test.shape

(1243,)

In [425]:
# Results
pd.crosstab(y_test, y_test_pred,
            rownames=['Actual'], 
            colnames=['Predicted'])

Predicted,EMPATE,MANDANTE,VISITANTE
Actual,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
EMPATE,39,241,61
MANDANTE,21,544,44
VISITANTE,21,141,131


In [426]:
confusion_matrix(y_test, y_test_pred)

array([[ 39, 241,  61],
       [ 21, 544,  44],
       [ 21, 141, 131]])

In [427]:
accuracy_score_test = accuracy_score(y_test, y_test_pred)
accuracy_score_test

0.57441673370876911

In [428]:
#precision_score_forest = precision_score(y_test, y_test_pred)
#precision_score_forest

In [429]:
#recall_score_forest = recall_score(y_test, y_test_pred)
#recall_score_forest

In [430]:
#f1_score_forest = f1_score(y_test, y_test_pred)
#f1_score_forest

### 8. Comparison accuracy: Validation vs Test

In [431]:
comparison = {"Validation": [cv_scores_rf_mean],
             "Test": [accuracy_score_test]}
pd.DataFrame(comparison, index = ["Accuracy"])

Unnamed: 0,Test,Validation
Accuracy,0.574417,0.594447


### 9. Feature Importance

In [432]:
rf_model.feature_importances_

array([ 0.06803089,  0.13292503,  0.08262914,  0.08378799,  0.21138386,
        0.20205659,  0.1039028 ,  0.11528371])

In [433]:
# View a list of the features and their importance scores
imp_list = list(zip(jogos_X_df.keys(), 
                    rf_model.feature_importances_))
imp_df = pd.DataFrame(imp_list, columns = ["Features", "Importance"])
imp_df.sort_values(by = "Importance", ascending = False)

Unnamed: 0,Features,Importance
4,posicao_m,0.211384
5,posicao_v,0.202057
1,rodada,0.132925
7,aproveitamento_10r_m,0.115284
6,aproveitamento_10r_v,0.103903
3,visitante,0.083788
2,mandante,0.082629
0,ano,0.068031


### 10. Predict

In [434]:
rodada = pd.read_csv("data/turn.10.csv",sep=",")

In [435]:
rodada

Unnamed: 0,mandante,visitante,placar
0,São Paulo,Internacional,
1,Botafogo,Ceará,
2,Vitória,Chapecoense,
3,Corinthians,Santos,
4,Sport Recife,Atlético-PR,
5,Cruzeiro,Vasco,
6,Grêmio,Palmeiras,
7,Fluminense,Flamengo,
8,Paraná,Bahia,
9,América-MG,Atlético-MG,


In [436]:
rodada_DF = rodada.copy()

In [437]:
rodada_DF = rodada_DF.applymap(lambda s: times_dict.get(s) if s in times_dict else s)

In [438]:
rodada_DF

Unnamed: 0,mandante,visitante,placar
0,57,51,
1,61,43,
2,59,56,
3,53,47,
4,58,52,
5,54,49,
6,55,48,
7,60,46,
8,42,45,
9,44,50,


In [439]:
jogos.head(3)

Unnamed: 0,Unnamed: 0.1,campeonato,ano,rodada,data,mandante,visitante,gols_mandante,gols_visitante,resultado,posicao_m,posicao_v,aproveitamento_5r_v,aproveitamento_5r_m,aproveitamento_10r_v,aproveitamento_10r_m
0,0,brasileiro-a,2018,9,04-06,42,60,2,1,MANDANTE,19.0,8.0,0.67,0.2,0.67,0.14
1,1,brasileiro-a,2018,9,03-06,43,54,0,1,VISITANTE,20.0,2.0,0.8,0.13,0.62,0.14
2,2,brasileiro-a,2018,9,03-06,44,52,3,1,MANDANTE,10.0,16.0,0.27,0.27,0.29,0.33


In [440]:
jogos.query("visitante == 54 and rodada == 30 and ano == 2017")

Unnamed: 0,Unnamed: 0.1,campeonato,ano,rodada,data,mandante,visitante,gols_mandante,gols_visitante,resultado,posicao_m,posicao_v,aproveitamento_5r_v,aproveitamento_5r_m,aproveitamento_10r_v,aproveitamento_10r_m


In [441]:
ano = jogos["ano"].max()

In [442]:
rod = jogos[jogos["ano"] == ano]["rodada"].max()

In [443]:
rodada_DF["ano"] = ano

In [444]:
rodada_DF["rodada"] = rod + 1

In [445]:
def getPosicao(jogos, ano, rodada, time):    

    ultimo_jogo = jogos.query("ano == " + str(ano) + " and rodada == " + str(rodada) + " and mandante == " + str(time))
    if not ultimo_jogo.empty:
        return ultimo_jogo["posicao_m"].values[0]
    
    ultimo_jogo = jogos.query("ano == " + str(ano) + " and rodada == " + str(rodada) + " and visitante == " + str(time))        
    return ultimo_jogo["posicao_v"].values[0]

In [446]:
def getAproveitamento(jogos, ano, rodada, time):    

    ultimo_jogo = jogos.query("ano == " + str(ano) + " and rodada == " + str(rodada) + " and mandante == " + str(time))
    if not ultimo_jogo.empty:
        return ultimo_jogo["aproveitamento_10r_m"].values[0]
    
    ultimo_jogo = jogos.query("ano == " + str(ano) + " and rodada == " + str(rodada) + " and visitante == " + str(time))        
    return ultimo_jogo["aproveitamento_10r_v"].values[0]

In [447]:
jogos

Unnamed: 0,Unnamed: 0.1,campeonato,ano,rodada,data,mandante,visitante,gols_mandante,gols_visitante,resultado,posicao_m,posicao_v,aproveitamento_5r_v,aproveitamento_5r_m,aproveitamento_10r_v,aproveitamento_10r_m
0,0,brasileiro-a,2018,9,04-06,42,60,2,1,MANDANTE,19.0,8.0,0.67,0.20,0.67,0.14
1,1,brasileiro-a,2018,9,03-06,43,54,0,1,VISITANTE,20.0,2.0,0.80,0.13,0.62,0.14
2,2,brasileiro-a,2018,9,03-06,44,52,3,1,MANDANTE,10.0,16.0,0.27,0.27,0.29,0.33
3,3,brasileiro-a,2018,9,03-06,45,55,0,2,VISITANTE,18.0,3.0,0.60,0.27,0.48,0.38
4,4,brasileiro-a,2018,9,03-06,46,53,1,0,MANDANTE,1.0,7.0,0.53,0.67,0.52,0.76
5,5,brasileiro-a,2018,9,03-06,47,59,5,2,MANDANTE,15.0,17.0,0.47,0.20,0.33,0.14
6,6,brasileiro-a,2018,9,02-06,48,57,3,1,MANDANTE,6.0,4.0,0.73,0.40,0.62,0.48
7,7,brasileiro-a,2018,9,02-06,49,61,1,2,VISITANTE,13.0,12.0,0.27,0.47,0.38,0.38
8,8,brasileiro-a,2018,9,02-06,50,56,3,3,EMPATE,9.0,14.0,0.47,0.47,0.43,0.62
9,9,brasileiro-a,2018,9,02-06,51,58,0,0,EMPATE,11.0,5.0,0.67,0.47,0.67,0.38


In [448]:
rod

9

In [449]:
ano

2018

In [450]:
rodada_DF["posicao_m"] = rodada_DF.apply(lambda row: getPosicao(jogos, ano, rod, row["mandante"]),axis=1)

In [451]:
del rodada["placar"]

In [452]:
rodada_DF.head(10)

Unnamed: 0,mandante,visitante,placar,ano,rodada,posicao_m
0,57,51,,2018,10,4.0
1,61,43,,2018,10,12.0
2,59,56,,2018,10,17.0
3,53,47,,2018,10,7.0
4,58,52,,2018,10,5.0
5,54,49,,2018,10,2.0
6,55,48,,2018,10,3.0
7,60,46,,2018,10,8.0
8,42,45,,2018,10,19.0
9,44,50,,2018,10,10.0


In [453]:
rodada_DF["posicao_v"] = rodada_DF.apply(lambda row: getPosicao(jogos, ano, rod, row["visitante"]),axis=1)

In [454]:
rodada_DF["aproveitamento_10r_m"] = rodada_DF.apply(lambda row: getAproveitamento(jogos, ano, rod, row["mandante"]),axis=1)

In [455]:
rodada_DF["aproveitamento_10r_v"] = rodada_DF.apply(lambda row: getAproveitamento(jogos, ano, rod, row["visitante"]),axis=1)

In [456]:
rodada_DF

Unnamed: 0,mandante,visitante,placar,ano,rodada,posicao_m,posicao_v,aproveitamento_10r_m,aproveitamento_10r_v
0,57,51,,2018,10,4.0,11.0,0.62,0.38
1,61,43,,2018,10,12.0,20.0,0.38,0.14
2,59,56,,2018,10,17.0,14.0,0.33,0.43
3,53,47,,2018,10,7.0,15.0,0.52,0.14
4,58,52,,2018,10,5.0,16.0,0.67,0.29
5,54,49,,2018,10,2.0,13.0,0.62,0.38
6,55,48,,2018,10,3.0,6.0,0.48,0.48
7,60,46,,2018,10,8.0,1.0,0.67,0.76
8,42,45,,2018,10,19.0,18.0,0.14,0.38
9,44,50,,2018,10,10.0,9.0,0.33,0.62


In [457]:
rodada["prediction_RFC"] = rf_model.predict(rodada_DF[["ano", "rodada", "mandante", "visitante", "posicao_m", "posicao_v", "aproveitamento_10r_m", "aproveitamento_10r_v"]].values)

In [458]:
rodada["prediction_ABC"] = abc_model.predict(rodada_DF[["ano", "rodada", "mandante", "visitante", "posicao_m", "posicao_v", "aproveitamento_10r_m", "aproveitamento_10r_v"]].values)

In [459]:
rodada

Unnamed: 0,mandante,visitante,prediction_RFC,prediction_ABC
0,São Paulo,Internacional,MANDANTE,MANDANTE
1,Botafogo,Ceará,MANDANTE,MANDANTE
2,Vitória,Chapecoense,VISITANTE,VISITANTE
3,Corinthians,Santos,MANDANTE,MANDANTE
4,Sport Recife,Atlético-PR,MANDANTE,MANDANTE
5,Cruzeiro,Vasco,MANDANTE,MANDANTE
6,Grêmio,Palmeiras,MANDANTE,MANDANTE
7,Fluminense,Flamengo,VISITANTE,VISITANTE
8,Paraná,Bahia,EMPATE,VISITANTE
9,América-MG,Atlético-MG,VISITANTE,VISITANTE


In [460]:
rodada.to_csv(path_or_buf="data/prediction.10.csv",sep=";")