# Campeonato Brasileiro - Série A

## Radom Forest Classifier

### 1. Import packages

In [49]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

In [50]:
from sklearn.ensemble import RandomForestClassifier

In [51]:
from sklearn.model_selection import cross_val_score

In [52]:
from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score,precision_score, recall_score
from sklearn.metrics import f1_score

In [53]:
from sklearn import datasets

### 2. Reading data

In [54]:
jogos = pd.read_csv("data/jogos-resultado-posicao.csv",sep=",")

In [55]:
del jogos["Unnamed: 0"]

In [56]:
#jogos.query('ano > 2012', inplace=True)

In [57]:
jogos.head(3)

Unnamed: 0,campeonato,ano,rodada,data,mandante,visitante,gols_mandante,gols_visitante,resultado,posicao_m,posicao_v
0,brasileiro-a,2017,30,23-10,Botafogo,Corinthians,2,1,MANDANTE,6.0,1.0
1,brasileiro-a,2017,30,22-10,Chapecoense,Fluminense,2,0,MANDANTE,11.0,13.0
2,brasileiro-a,2017,30,22-10,Ponte Preta,Avaí,1,2,VISITANTE,18.0,16.0


In [58]:
jogos.keys()

Index(['campeonato', 'ano', 'rodada', 'data', 'mandante', 'visitante',
       'gols_mandante', 'gols_visitante', 'resultado', 'posicao_m',
       'posicao_v'],
      dtype='object')

In [59]:
times_visitantes = jogos["visitante"].unique()

In [60]:
times_mandantes = jogos["mandante"].unique()

In [61]:
times = np.concatenate((times_visitantes, times_mandantes), axis=0)

In [62]:
type(times_mandantes)

numpy.ndarray

In [63]:
times_dict = dict(zip(times, range(1, len(times))))

In [64]:
jogos = jogos.applymap(lambda s: times_dict.get(s) if s in times_dict else s)

### 3. Define target and data subset

In [65]:
jogos_X = ['campeonato', 'ano', 'rodada', 'data', 'mandante', 'visitante', 'gols_mandante', 'gols_visitante', 'posicao_m', 'posicao_v']

In [70]:
jogos_X_df = jogos.copy()

In [72]:
del jogos_X_df["resultado"]

In [73]:
del jogos_X_df["campeonato"]

In [74]:
del jogos_X_df["data"]

In [75]:
del jogos_X_df["gols_mandante"]

In [76]:
del jogos_X_df["gols_visitante"]

In [77]:
jogos_X_df.head()

Unnamed: 0,ano,rodada,mandante,visitante,posicao_m,posicao_v
0,2017,30,42,57,6.0,1.0
1,2017,30,43,58,11.0,13.0
2,2017,30,44,56,18.0,16.0
3,2017,30,45,53,9.0,15.0
4,2017,30,46,55,12.0,17.0


In [28]:
jogos_X_df.keys()

Index(['ano', 'rodada', 'mandante', 'visitante', 'posicao_m', 'posicao_v'], dtype='object')

In [29]:
y = jogos["resultado"].values

In [30]:
X = jogos_X_df.values

### 4. Train and Test Split

In [31]:
from sklearn.model_selection import train_test_split

In [32]:
X_train, X_test, y_train, y_test = train_test_split(
                                        X,
                                        y, 
                                        test_size=0.2, 
                                        random_state=1)

### 5. Apply Random Forest Classifier

In [33]:
rf_model = RandomForestClassifier()

In [34]:
X_train

array([[  2.00800000e+03,   2.00000000e+01,   6.60000000e+01,
          7.00000000e+01,   1.10000000e+01,   1.70000000e+01],
       [  2.00900000e+03,   2.60000000e+01,   6.60000000e+01,
          4.80000000e+01,   2.00000000e+00,   6.00000000e+00],
       [  2.00600000e+03,   1.00000000e+01,   5.70000000e+01,
          5.20000000e+01,   1.80000000e+01,   1.00000000e+01],
       ..., 
       [  2.00700000e+03,   1.30000000e+01,   4.80000000e+01,
          5.20000000e+01,   4.00000000e+00,   1.40000000e+01],
       [  2.01700000e+03,   7.00000000e+00,   5.20000000e+01,
          4.40000000e+01,   1.10000000e+01,   9.00000000e+00],
       [  2.00400000e+03,   2.90000000e+01,   5.10000000e+01,
          8.10000000e+01,   1.60000000e+01,   1.70000000e+01]])

In [35]:
rf_model.fit(X_train, y_train)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=1,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False)

### 6. Model Cross-Validation

In [36]:
cv_scores_rf = cross_val_score(rf_model, X_train, y_train, 
                               cv=5, scoring="accuracy")
cv_scores_rf

array([ 0.50413223,  0.51756198,  0.5       ,  0.50672182,  0.50051813])

In [37]:
cv_scores_rf_mean =  np.mean(cv_scores_rf)
cv_scores_rf_mean

0.50578683393062129

### 7. Model Test

In [38]:
y_test_pred = rf_model.predict(X_test)

In [39]:
y_test.shape

(1210,)

In [40]:
# Results
pd.crosstab(y_test, y_test_pred,
            rownames=['Actual'], 
            colnames=['Predicted'])

Predicted,EMPATE,MANDANTE,VISITANTE
Actual,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
EMPATE,84,166,62
MANDANTE,111,455,51
VISITANTE,71,112,98


In [41]:
confusion_matrix(y_test, y_test_pred)

array([[ 84, 166,  62],
       [111, 455,  51],
       [ 71, 112,  98]])

In [42]:
accuracy_score_test = accuracy_score(y_test, y_test_pred)
accuracy_score_test

0.52644628099173552

In [43]:
#precision_score_forest = precision_score(y_test, y_test_pred)
#precision_score_forest

In [44]:
#recall_score_forest = recall_score(y_test, y_test_pred)
#recall_score_forest

In [45]:
#f1_score_forest = f1_score(y_test, y_test_pred)
#f1_score_forest

### 8. Comparison accuracy: Validation vs Test

In [46]:
comparison = {"Validation": [cv_scores_rf_mean],
             "Test": [accuracy_score_test]}
pd.DataFrame(comparison, index = ["Accuracy"])

Unnamed: 0,Test,Validation
Accuracy,0.526446,0.505787


### 9. Feature Importance

In [47]:
rf_model.feature_importances_

array([ 0.13384634,  0.19943317,  0.16438573,  0.16159084,  0.17032106,
        0.17042286])

In [48]:
# View a list of the features and their importance scores
imp_list = list(zip(jogos_X_df.keys(), 
                    rf_model.feature_importances_))
imp_df = pd.DataFrame(imp_list, columns = ["Features", "Importance"])
imp_df.sort_values(by = "Importance", ascending = False)

Unnamed: 0,Features,Importance
1,rodada,0.199433
5,posicao_v,0.170423
4,posicao_m,0.170321
2,mandante,0.164386
3,visitante,0.161591
0,ano,0.133846
