# Campeonato Brasileiro - Série A

## Radom Forest Classifier

### 1. Import packages

In [1]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

In [2]:
from sklearn.ensemble import RandomForestClassifier

In [3]:
from sklearn.model_selection import cross_val_score

In [4]:
from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score,precision_score, recall_score
from sklearn.metrics import f1_score

In [5]:
from sklearn import datasets

### 2. Reading data

In [6]:
jogos = pd.read_csv("data/jogos-resultado-posicao.csv",sep=",")

In [7]:
del jogos["Unnamed: 0"]

In [8]:
#jogos.query('ano > 2012', inplace=True)

In [9]:
jogos.head(3)

Unnamed: 0,campeonato,ano,rodada,data,mandante,visitante,gols_mandante,gols_visitante,resultado,posicao_m,posicao_v,aproveitamento_5r_v,aproveitamento_5r_m,aproveitamento_10r_v,aproveitamento_10r_m
0,brasileiro-a,2017,30,23-10,Botafogo,Corinthians,2,1,MANDANTE,6.0,1.0,0.4,0.47,0.4,0.63
1,brasileiro-a,2017,30,22-10,Chapecoense,Fluminense,2,0,MANDANTE,11.0,13.0,0.47,0.47,0.4,0.43
2,brasileiro-a,2017,30,22-10,Ponte Preta,Avaí,1,2,VISITANTE,18.0,16.0,0.13,0.27,0.43,0.3


In [10]:
jogos.keys()

Index(['campeonato', 'ano', 'rodada', 'data', 'mandante', 'visitante',
       'gols_mandante', 'gols_visitante', 'resultado', 'posicao_m',
       'posicao_v', 'aproveitamento_5r_v', 'aproveitamento_5r_m',
       'aproveitamento_10r_v', 'aproveitamento_10r_m'],
      dtype='object')

In [11]:
times_visitantes = jogos["visitante"].unique()

In [12]:
times_mandantes = jogos["mandante"].unique()

In [13]:
times = np.concatenate((times_visitantes, times_mandantes), axis=0)

In [14]:
type(times_mandantes)

numpy.ndarray

In [15]:
times_dict = dict(zip(times, range(1, len(times))))

In [16]:
jogos = jogos.applymap(lambda s: times_dict.get(s) if s in times_dict else s)

### 3. Define target and data subset

In [17]:
jogos_X = ['campeonato', 'ano', 'rodada', 'data', 'mandante', 'visitante', 'gols_mandante', 'gols_visitante', 'posicao_m', 'posicao_v']

In [18]:
jogos_X_df = jogos.copy()

In [19]:
del jogos_X_df["resultado"]

In [20]:
del jogos_X_df["campeonato"]

In [21]:
del jogos_X_df["data"]

In [22]:
del jogos_X_df["gols_mandante"]

In [23]:
del jogos_X_df["gols_visitante"]

In [24]:
del jogos_X_df["aproveitamento_5r_v"]

In [25]:
del jogos_X_df["aproveitamento_5r_m"]

In [26]:
jogos_X_df.head()

Unnamed: 0,ano,rodada,mandante,visitante,posicao_m,posicao_v,aproveitamento_10r_v,aproveitamento_10r_m
0,2017,30,42,57,6.0,1.0,0.4,0.63
1,2017,30,43,58,11.0,13.0,0.4,0.43
2,2017,30,44,56,18.0,16.0,0.43,0.3
3,2017,30,45,53,9.0,15.0,0.23,0.4
4,2017,30,46,55,12.0,17.0,0.47,0.4


In [27]:
jogos_X_df.keys()

Index(['ano', 'rodada', 'mandante', 'visitante', 'posicao_m', 'posicao_v',
       'aproveitamento_10r_v', 'aproveitamento_10r_m'],
      dtype='object')

In [28]:
y = jogos["resultado"].values

In [29]:
X = jogos_X_df.values

### 4. Train and Test Split

In [30]:
from sklearn.model_selection import train_test_split

In [31]:
X_train, X_test, y_train, y_test = train_test_split(
                                        X,
                                        y, 
                                        test_size=0.2, 
                                        random_state=1)

### 5. Apply Random Forest Classifier

In [32]:
rf_model = RandomForestClassifier()

In [33]:
X_train

array([[  2.00800000e+03,   2.00000000e+01,   6.60000000e+01, ...,
          1.70000000e+01,   2.30000000e-01,   4.70000000e-01],
       [  2.00900000e+03,   2.60000000e+01,   6.60000000e+01, ...,
          6.00000000e+00,   6.00000000e-01,   5.30000000e-01],
       [  2.00600000e+03,   1.00000000e+01,   5.70000000e+01, ...,
          1.00000000e+01,   4.60000000e-01,   3.80000000e-01],
       ..., 
       [  2.00700000e+03,   1.30000000e+01,   4.80000000e+01, ...,
          1.40000000e+01,   4.70000000e-01,   5.00000000e-01],
       [  2.01700000e+03,   7.00000000e+00,   5.20000000e+01, ...,
          9.00000000e+00,   4.70000000e-01,   4.00000000e-01],
       [  2.00400000e+03,   2.90000000e+01,   5.10000000e+01, ...,
          1.70000000e+01,   5.00000000e-01,   4.00000000e-01]])

In [34]:
rf_model.fit(X_train, y_train)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=1,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False)

### 6. Model Cross-Validation

In [35]:
cv_scores_rf = cross_val_score(rf_model, X_train, y_train, 
                               cv=5, scoring="accuracy")
cv_scores_rf

array([ 0.54855372,  0.56508264,  0.53719008,  0.52119959,  0.53056995])

In [36]:
cv_scores_rf_mean =  np.mean(cv_scores_rf)
cv_scores_rf_mean

0.54051919616341093

### 7. Model Test

In [37]:
y_test_pred = rf_model.predict(X_test)

In [38]:
y_test.shape

(1210,)

In [39]:
# Results
pd.crosstab(y_test, y_test_pred,
            rownames=['Actual'], 
            colnames=['Predicted'])

Predicted,EMPATE,MANDANTE,VISITANTE
Actual,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
EMPATE,83,165,64
MANDANTE,103,465,49
VISITANTE,73,104,104


In [40]:
confusion_matrix(y_test, y_test_pred)

array([[ 83, 165,  64],
       [103, 465,  49],
       [ 73, 104, 104]])

In [41]:
accuracy_score_test = accuracy_score(y_test, y_test_pred)
accuracy_score_test

0.53884297520661162

In [42]:
#precision_score_forest = precision_score(y_test, y_test_pred)
#precision_score_forest

In [43]:
#recall_score_forest = recall_score(y_test, y_test_pred)
#recall_score_forest

In [44]:
#f1_score_forest = f1_score(y_test, y_test_pred)
#f1_score_forest

### 8. Comparison accuracy: Validation vs Test

In [45]:
comparison = {"Validation": [cv_scores_rf_mean],
             "Test": [accuracy_score_test]}
pd.DataFrame(comparison, index = ["Accuracy"])

Unnamed: 0,Test,Validation
Accuracy,0.538843,0.540519


### 9. Feature Importance

In [46]:
rf_model.feature_importances_

array([ 0.10012764,  0.13789505,  0.12395651,  0.11587545,  0.14895458,
        0.14138967,  0.11855572,  0.11324539])

In [47]:
# View a list of the features and their importance scores
imp_list = list(zip(jogos_X_df.keys(), 
                    rf_model.feature_importances_))
imp_df = pd.DataFrame(imp_list, columns = ["Features", "Importance"])
imp_df.sort_values(by = "Importance", ascending = False)

Unnamed: 0,Features,Importance
4,posicao_m,0.148955
5,posicao_v,0.14139
1,rodada,0.137895
2,mandante,0.123957
6,aproveitamento_10r_v,0.118556
3,visitante,0.115875
7,aproveitamento_10r_m,0.113245
0,ano,0.100128
