# Análise Titanic (Parte 2)- Feature Scaling

Neste notebook iremos continuar com a análise de dados dos passageiros do Titanic. 
Iremos padronizar os valores quantitativos de idade ('age') e 'familia' tirando a média e dividindo pela variância (um dos possíveis tratamentos que podemos fazer com os dados).


## Baixar os dados e realizar tratamentos feitos na Parte 1

In [24]:
import pandas as pd

dados = pd.read_excel('http://biostat.mc.vanderbilt.edu/wiki/pub/Main/DataSets/titanic3.xls')
## Criar variável "familia" como a soma entre 'sibsp' e 'parch'
dados['familia'] = dados['sibsp'] + dados['parch'] 

from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(dados.iloc[:,dados.columns.values !='survived'], dados['survived'], test_size=0.33, random_state=42)

## Inserir Idades faltantes com a Mediana
X_train['age'].fillna(X_train['age'].median(), inplace = True)
X_test['age'].fillna(X_test['age'].median(), inplace = True)

#Criar dummies
X_train = pd.get_dummies(X_train, columns=["pclass","sex"])
X_test = pd.get_dummies(X_test, columns=["pclass","sex"])

#Apagar as dummies que servirão de base
del X_train['sex_female']
del X_train['pclass_1']

del X_test['sex_female']
del X_test['pclass_1']

from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix

# Tabela com resultados finais
resultados_acuracia = pd.DataFrame(columns = ['modelo', 'acuracia'])

In [25]:
#### Padronizar as variáveis quantitativas
from sklearn import preprocessing

std_scale = preprocessing.StandardScaler().fit(X_train[['age', 'familia']])
X_train[['age', 'familia']] = std_scale.transform(X_train[['age','familia']])

std_scale2 = preprocessing.StandardScaler().fit(X_test[['age', 'familia']])
X_test[['age', 'familia']] = std_scale2.transform(X_test[['age','familia']])


## Rodar modelos

In [42]:

#######################################################
#
#               Rodar Modelos
#
#######################################################

from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix

from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn import svm
from sklearn.tree import DecisionTreeClassifier

modelos = [
    [LogisticRegression(fit_intercept = True),'lr_sklearn'],
    [KNeighborsClassifier(n_neighbors=3),'knn'],
    [GaussianNB(),'naive_bayes'],
    [LinearDiscriminantAnalysis(),'lda'],
    [svm.SVC(gamma=0.1, C=1.0),'svm'],
    [DecisionTreeClassifier(), 'arvore_decisao']
]

# Criar uma função para rodar todos os modelos 
def analise(modelo, nome, tabela_acuracia):
    
    modelo.fit(X_train[['pclass_2','pclass_3','age','sex_male','familia']],y_train)
    previsto = modelo.predict(X_test[['pclass_2','pclass_3','age','sex_male','familia']])
    acuracia = accuracy_score(y_test,previsto)
    tabela_acuracia = tabela_acuracia.append({'modelo':nome, 'acuracia':acuracia}, ignore_index = True)
                                             
    return tabela_acuracia

for item in modelos:
    resultados_acuracia = analise(item[0],item[1], resultados_acuracia)

In [43]:
resultados_acuracia

Unnamed: 0,modelo,acuracia
0,lr_sklearn,0.803241
1,knn,0.752315
2,naive_bayes,0.784722
3,lda,0.800926
4,svm,0.798611
5,arvore_decisao,0.766204
