# Modelo B - Random Forest

Este notebook contém a modelagem e a avaliação do modelo A, que utiliza uma regressão logística.

### Imports

#### Importação das bibliotecas utilizadas

In [1]:
import pandas as pd
from sklearn.preprocessing import LabelEncoder
from sklearn.ensemble import RandomForestClassifier
from sklearn.multiclass import OneVsRestClassifier
import pickle

#### Importação do conjunto de treinamento e teste

O conjuntos de treinamento e de teste estão em formato JSON, portando é necessário usar a função __read_json__ do pandas para realizar a leitura dos arquivos:

In [2]:
train = pd.read_csv('../dados/train/processed_train.csv', encoding='utf-8')
train_target = pd.read_csv('../dados/train/train_target.csv', encoding='utf-8')
test = pd.read_csv('../dados/test/processed_test.csv', encoding='utf-8')

In [3]:
# visualização das cinco primeira receitas presentes no conjunto de treino
train.head(2)

Unnamed: 0,cuisine,id,ingredients,ingredients_text,ingredients_qtt,0,1,2,3,4,...,2720,2721,2722,2723,2724,2725,2726,2727,2728,2729
0,greek,10259,"['romaine lettuce', 'seasoning', 'grape tomato...",romaine lettuce seasoning grape tomato purple ...,9,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,southern_us,25693,"['milk', 'ground pepper', 'thyme', 'salt', 'gr...",milk ground pepper thyme salt ground black pep...,11,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


### Modelagem

Os seguintes tratamentos serão realizados nos dois conjuntos de dados:

In [4]:
X_train = train.copy()
X_train = X_train.drop(['cuisine','id','ingredients','ingredients_text','ingredients_qtt'], axis=1)

In [5]:
X_train.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,2720,2721,2722,2723,2724,2725,2726,2727,2728,2729
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [6]:
random = RandomForestClassifier(random_state=42) #, multi_class='ovr'

In [7]:
model = OneVsRestClassifier(random, n_jobs=1)
model.fit(X_train, train_target)



OneVsRestClassifier(estimator=RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators='warn', n_jobs=None,
            oob_score=False, random_state=42, verbose=0, warm_start=False),
          n_jobs=1)

In [8]:
with open('../models/random.pkl', 'wb') as local_model_file:
    pickle.dump(model, local_model_file)

In [9]:
test.head(2)

Unnamed: 0,id,ingredients,ingredients_qtt,ingredients_text,0,1,2,3,4,5,...,2720,2721,2722,2723,2724,2725,2726,2727,2728,2729
0,18009,"['milk', 'baking powder', 'white sugar', 'rais...",6,milk baking powder white sugar raisin egg purp...,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,28583,"['milk', 'toasted pecan', 'banana', 'vanilla e...",11,milk toasted pecan banana vanilla extract corn...,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [10]:
X_test = test.copy()
X_test = X_test.drop(['id','ingredients','ingredients_text','ingredients_qtt'], axis=1)

In [11]:
y_pred_test = model.predict(X_test)

In [12]:
encoder = LabelEncoder()
classes = encoder.fit_transform(train.cuisine)

In [13]:
list(zip(encoder.classes_, encoder.transform(encoder.classes_)))

[('brazilian', 0),
 ('british', 1),
 ('cajun_creole', 2),
 ('chinese', 3),
 ('filipino', 4),
 ('french', 5),
 ('greek', 6),
 ('indian', 7),
 ('irish', 8),
 ('italian', 9),
 ('jamaican', 10),
 ('japanese', 11),
 ('korean', 12),
 ('mexican', 13),
 ('moroccan', 14),
 ('russian', 15),
 ('southern_us', 16),
 ('spanish', 17),
 ('thai', 18),
 ('vietnamese', 19)]

In [14]:
classes

array([ 6, 16,  4, ...,  8,  3, 13])

In [15]:
y_pred_test

array([16, 16, 14, ...,  9, 16, 13])

In [16]:
y_test_decoded = encoder.inverse_transform(y_pred_test)

In [17]:
y_test_decoded

array(['southern_us', 'southern_us', 'moroccan', ..., 'italian',
       'southern_us', 'mexican'], dtype=object)

In [18]:
submission = pd.concat([test['id'],pd.DataFrame(y_test_decoded, columns=[ 'cuisine'])], axis=1)

In [19]:
submission.to_csv("../dados/test/submission_randomforest.csv", index=False)