# Bibliotecas

In [89]:
# ! pip install xgboost
# ! pip install scikit-optimize

In [90]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
from sklearn.model_selection import (train_test_split,
                                     KFold,
                                     cross_val_score,
                                     RandomizedSearchCV)
from sklearn.ensemble import RandomForestClassifier
import xgboost as xgb
from skopt import BayesSearchCV
from skopt.space import Real, Categorical, Integer

# Dataset

In [91]:
treino = pd.read_csv('/content/drive/MyDrive/Datasets/Titanic/train.csv')

In [92]:
treino.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [93]:
teste = pd.read_csv('/content/drive/MyDrive/Datasets/Titanic/test.csv')

In [94]:
teste.head()

Unnamed: 0,PassengerId,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,892,3,"Kelly, Mr. James",male,34.5,0,0,330911,7.8292,,Q
1,893,3,"Wilkes, Mrs. James (Ellen Needs)",female,47.0,1,0,363272,7.0,,S
2,894,2,"Myles, Mr. Thomas Francis",male,62.0,0,0,240276,9.6875,,Q
3,895,3,"Wirz, Mr. Albert",male,27.0,0,0,315154,8.6625,,S
4,896,3,"Hirvonen, Mrs. Alexander (Helga E Lindqvist)",female,22.0,1,1,3101298,12.2875,,S


In [95]:
treino.isnull().sum()

PassengerId      0
Survived         0
Pclass           0
Name             0
Sex              0
Age            177
SibSp            0
Parch            0
Ticket           0
Fare             0
Cabin          687
Embarked         2
dtype: int64

'Embarked' é variável categórica -> os valores NA serão subtituídos pela moda

'Age' é variável contínua -> os valores NA serão subtituídos pela média

In [96]:
treino['Embarked'].value_counts()

S    644
C    168
Q     77
Name: Embarked, dtype: int64

In [97]:
moda_embarked = treino['Embarked'].mode()[0]

treino.loc[treino['Embarked'].isnull(), 'Embarked'] = moda_embarked

treino['Embarked'].isnull().sum()

0

In [98]:
media_age = treino['Age'].mean()

treino.loc[treino['Age'].isnull(), 'Age'] = media_age

treino['Age'].isnull().sum()

0

## Treino e teste

In [99]:
X_treino = treino.drop(['Survived', 'Name', 'Ticket', 'Cabin'], axis='columns') # Exclui features não interessantes

X_treino['Sex'] = X_treino['Sex'].map({'male': 0, 'female': 1}) # Sex variável discreta

X_treino['Embarked'] = X_treino['Embarked'].map({'C': 0, 'S': 1, 'Q': 2}) # Embarked variável discreta


y_treino = treino['Survived']

In [100]:
teste.loc[treino['Embarked'].isnull(), 'Embarked'] = moda_embarked

teste['Embarked'].isnull().sum()

0

In [101]:
teste.loc[treino['Age'].isnull(), 'Age'] = media_age

teste['Age'].isnull().sum()

86

O teste é como se fosse a produção, onde os dados não são conhecidos

Por essa razão os valores NA do teste foram substituídos pela moda Embarked e média Age do treino (dados conhecidos)

In [102]:
X_teste = treino.drop(['Survived', 'Name', 'Ticket', 'Cabin'], axis='columns') # Exclui features não interessantes

X_teste['Sex'] = X_teste['Sex'].map({'male': 0, 'female': 1}) # Sex variável discreta

X_teste['Embarked'] = X_teste['Embarked'].map({'C': 0, 'S': 1, 'Q': 2}) # Embarked variável discreta

In [103]:
X_teste.isnull().sum()

PassengerId    0
Pclass         0
Sex            0
Age            0
SibSp          0
Parch          0
Fare           0
Embarked       0
dtype: int64

# Modelo

In [104]:
cv = KFold(n_splits=10, random_state=1, shuffle=True) # Cross validation

clf = RandomForestClassifier(random_state=31) # Classificador

scores = cross_val_score(clf, X_treino, y_treino, scoring='accuracy', cv=cv, n_jobs=-1)

media_scores = scores.mean()

media_scores

0.8193508114856428

## Tunning randomizado

In [105]:
param_space = {'bootstrap': [True],
               'max_depth': [6, 8, 10, 12, 14],
               'max_features': ['auto', 'sqrt', 'log2'],
               'min_samples_leaf': [2, 3, 4],
               'min_samples_split': [2, 3, 4, 5],
               'n_estimators': [100, 200, 300, 400, 500, 600, 700, 800, 900, 1000]}

forest_rand_search = RandomizedSearchCV(clf,
                                        param_space,
                                        n_iter=32,
                                        scoring='accuracy',
                                        cv=5,
                                        n_jobs=-1)

forest_rand_search.fit(X_treino, y_treino)

RandomizedSearchCV(cv=5, estimator=RandomForestClassifier(random_state=31),
                   n_iter=32, n_jobs=-1,
                   param_distributions={'bootstrap': [True],
                                        'max_depth': [6, 8, 10, 12, 14],
                                        'max_features': ['auto', 'sqrt',
                                                         'log2'],
                                        'min_samples_leaf': [2, 3, 4],
                                        'min_samples_split': [2, 3, 4, 5],
                                        'n_estimators': [100, 200, 300, 400,
                                                         500, 600, 700, 800,
                                                         900, 1000]},
                   scoring='accuracy')

In [106]:
forest_rand_search.best_params_

{'n_estimators': 1000,
 'min_samples_split': 2,
 'min_samples_leaf': 4,
 'max_features': 'log2',
 'max_depth': 12,
 'bootstrap': True}

In [107]:
forest_rand_search.best_estimator_

RandomForestClassifier(max_depth=12, max_features='log2', min_samples_leaf=4,
                       n_estimators=1000, random_state=31)

In [108]:
forest_rand_search.best_score_

0.8271734354403364

### Modelo tunado

In [109]:
clf = RandomForestClassifier(max_depth=10,
                             max_features='log2',
                             min_samples_leaf=3,
                             min_samples_split=4,
                             n_estimators=900,
                             random_state=31) # Classificador

scores = cross_val_score(clf, X_treino, y_treino, scoring='accuracy', cv=cv, n_jobs=-1)

media_scores = scores.mean()

media_scores

0.8216354556803994

## Tunning bayesiano

In [110]:
search_space = {'bootstrap': Categorical([True, False]),
                'max_depth': Integer(6, 20),
                'max_features': Categorical(['auto', 'sqrt', 'log2']),
                'min_samples_leaf': Integer(2, 10),
                'min_samples_split': Integer(2, 10),
                'n_estimators': Integer(100, 500)}

forest_bayes_search = BayesSearchCV(clf,
                                    search_space,
                                    n_iter=32,
                                    scoring='accuracy',
                                    n_jobs=-1,
                                    cv=5)

forest_bayes_search.fit(X_treino, y_treino)

BayesSearchCV(cv=5,
              estimator=RandomForestClassifier(max_depth=10,
                                               max_features='log2',
                                               min_samples_leaf=3,
                                               min_samples_split=4,
                                               n_estimators=900,
                                               random_state=31),
              n_iter=32, n_jobs=-1, scoring='accuracy',
              search_spaces={'bootstrap': Categorical(categories=(True, False), prior=None),
                             'max_depth': Integer(low=6, high=20, prior='uniform', transform='normalize'),
                             'max_features': Categorical(categories=('auto', 'sqrt', 'log2'), prior=None),
                             'min_samples_leaf': Integer(low=2, high=10, prior='uniform', transform='normalize'),
                             'min_samples_split': Integer(low=2, high=10, prior='uniform', transform='normali

In [111]:
forest_bayes_search.best_params_

OrderedDict([('bootstrap', False),
             ('max_depth', 19),
             ('max_features', 'log2'),
             ('min_samples_leaf', 10),
             ('min_samples_split', 2),
             ('n_estimators', 140)])

In [112]:
forest_bayes_search.best_estimator_

RandomForestClassifier(bootstrap=False, max_depth=19, max_features='log2',
                       min_samples_leaf=10, n_estimators=140, random_state=31)

In [113]:
forest_bayes_search.best_score_

0.8249387985688281

### Modelo tunado

In [114]:
clf = RandomForestClassifier(max_depth=17,
                             max_features='log2',
                             min_samples_leaf=4,
                             min_samples_split=5,
                             n_estimators=358,
                             random_state=31) # Classificador

scores = cross_val_score(clf, X_treino, y_treino, scoring='accuracy', cv=cv, n_jobs=-1)

media_scores = scores.mean()

media_scores

0.8238701622971286