# Kaggle desafio Titanic

In [1]:
import pandas as pd
import numpy as np
from sklearn import tree

import warnings
warnings.filterwarnings('ignore')

In [2]:
# Load the labelled training set from the shared data directory.
train_path = '../../data/Bruno-Godoi-Eilliar/train.csv'
train = pd.read_csv(filepath_or_buffer=train_path)

In [3]:
# Preview the first three training rows.
train.head(3)

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S


### DESCRIÇÕES DAS VARIÁVEIS

In [4]:
# Load the test set (same directory as the training set).
test_path = '../../data/Bruno-Godoi-Eilliar/test.csv'
test = pd.read_csv(filepath_or_buffer=test_path)

In [5]:
# Preview the first three test rows.
test.head(3)

Unnamed: 0,PassengerId,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,892,3,"Kelly, Mr. James",male,34.5,0,0,330911,7.8292,,Q
1,893,3,"Wilkes, Mrs. James (Ellen Needs)",female,47.0,1,0,363272,7.0,,S
2,894,2,"Myles, Mr. Thomas Francis",male,62.0,0,0,240276,9.6875,,Q


In [6]:
# Report (rows, columns) for both data sets.
shape_report = (
    ("Train data set shape", train.shape),
    ("Test data set shape", test.shape),
)
for message, shape in shape_report:
    print(message, shape)

Train data set shape (891, 12)
Test data set shape (418, 11)


In [7]:
# Summary statistics of the numeric training columns.
train.describe()

Unnamed: 0,PassengerId,Survived,Pclass,Age,SibSp,Parch,Fare
count,891.0,891.0,891.0,714.0,891.0,891.0,891.0
mean,446.0,0.383838,2.308642,29.699118,0.523008,0.381594,32.204208
std,257.353842,0.486592,0.836071,14.526497,1.102743,0.806057,49.693429
min,1.0,0.0,1.0,0.42,0.0,0.0,0.0
25%,223.5,0.0,2.0,20.125,0.0,0.0,7.9104
50%,446.0,0.0,3.0,28.0,0.0,0.0,14.4542
75%,668.5,1.0,3.0,38.0,1.0,0.0,31.0
max,891.0,1.0,3.0,80.0,8.0,6.0,512.3292


In [8]:
# How many people survived? Count the rows per value of 'Survived'.
train['Survived'].value_counts()

0    549
1    342
Name: Survived, dtype: int64

In [9]:
# Same split, expressed as a percentage of all training rows.
survival_share = train['Survived'].value_counts(normalize=True)
survival_share * 100

0    61.616162
1    38.383838
Name: Survived, dtype: float64

In [10]:
# 'Survived' values of male passengers: absolute counts, then proportions.
male_survival = train.loc[train['Sex'] == 'male', 'Survived']
print("Quantos homens morreram?")
print(male_survival.value_counts())
print(male_survival.value_counts(normalize=True))

Quantos homens morreram?
0    468
1    109
Name: Survived, dtype: int64
0    0.811092
1    0.188908
Name: Survived, dtype: float64


In [11]:
# 'Survived' values of female passengers: absolute counts, then proportions.
female_survival = train.loc[train['Sex'] == 'female', 'Survived']
print("Quantas mulheres sobreviveram?")
print(female_survival.value_counts())
print(female_survival.value_counts(normalize=True))

Quantas mulheres sobreviveram?
1    233
0     81
Name: Survived, dtype: int64
1    0.742038
0    0.257962
Name: Survived, dtype: float64


### Verificar consistência dos dados (data consistency)

In [12]:
# Print the number of missing values for every training column.
missing_per_column = train.isnull().sum()
for column_name, n_missing in missing_per_column.items():
    print("Numeros de dados faltantes {}: {}".format(column_name, n_missing))

Numeros de dados faltantes PassengerId: 0
Numeros de dados faltantes Survived: 0
Numeros de dados faltantes Pclass: 0
Numeros de dados faltantes Name: 0
Numeros de dados faltantes Sex: 0
Numeros de dados faltantes Age: 177
Numeros de dados faltantes SibSp: 0
Numeros de dados faltantes Parch: 0
Numeros de dados faltantes Ticket: 0
Numeros de dados faltantes Fare: 0
Numeros de dados faltantes Cabin: 687
Numeros de dados faltantes Embarked: 2


### Tratamento dos dados (feature engineering)

In [13]:
# Create the 'Child' indicator column, initially NaN for every row.
train['Child'] = np.nan

In [14]:
# Flag minors (Age < 18) with 1 and everyone aged 18 or more with 0.
# Rows whose Age is missing match neither mask and keep their existing value.
# .loc writes to `train` itself; the chained form `train.Child[mask] = ...`
# can assign to a temporary object and is a no-op under pandas copy-on-write.
train.loc[train['Age'] < 18, 'Child'] = 1
train.loc[train['Age'] >= 18, 'Child'] = 0
train.head(3)

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,Child
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S,0.0
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C,0.0
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S,0.0


In [15]:
# 'Survived' values of passengers flagged as minors (Child == 1).
minor_survival = train.loc[train['Child'] == 1, 'Survived']
print("Pessoas abaixo de 18 anos: ")
print(minor_survival.value_counts())
print(minor_survival.value_counts(normalize=True))

Pessoas abaixo de 18 anos: 
1    61
0    52
Name: Survived, dtype: int64
1    0.539823
0    0.460177
Name: Survived, dtype: float64


In [16]:
# 'Survived' values of passengers flagged as adults (Child == 0).
adult_survival = train.loc[train['Child'] == 0, 'Survived']
print("Pessoas acima de 18 anos: ")
print(adult_survival.value_counts())
print(adult_survival.value_counts(normalize=True))

Pessoas acima de 18 anos: 
0    372
1    229
Name: Survived, dtype: int64
0    0.618968
1    0.381032
Name: Survived, dtype: float64


#### Convertendo dados

In [17]:
# Encode the Sex column numerically: 'male' -> 0, 'female' -> 1.
# .loc assigns on `train` directly; the chained form
# `train['Sex'][mask] = ...` may write to a temporary copy and silently
# leave the frame unchanged under pandas copy-on-write.
train.loc[train['Sex'] == 'male', 'Sex'] = 0
train.loc[train['Sex'] == 'female', 'Sex'] = 1
train.head(3)

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,Child
0,1,0,3,"Braund, Mr. Owen Harris",0,22.0,1,0,A/5 21171,7.25,,S,0.0
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",1,38.0,1,0,PC 17599,71.2833,C85,C,0.0
2,3,1,3,"Heikkinen, Miss. Laina",1,26.0,0,0,STON/O2. 3101282,7.925,,S,0.0


In [18]:
# Only 2 training rows lack a port of embarkation; fill them with 'S'.
train['Embarked'] = train['Embarked'].fillna(value='S')

In [19]:
# Encode the port of embarkation numerically: 'S' -> 0, 'C' -> 1, 'Q' -> 2.
# .loc is used instead of chained indexing so the assignment is guaranteed
# to land on `train` rather than on a temporary copy.
train.loc[train['Embarked'] == 'S', 'Embarked'] = 0
train.loc[train['Embarked'] == 'C', 'Embarked'] = 1
train.loc[train['Embarked'] == 'Q', 'Embarked'] = 2
train.head(3)

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,Child
0,1,0,3,"Braund, Mr. Owen Harris",0,22.0,1,0,A/5 21171,7.25,,0,0.0
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",1,38.0,1,0,PC 17599,71.2833,C85,1,0.0
2,3,1,3,"Heikkinen, Miss. Laina",1,26.0,0,0,STON/O2. 3101282,7.925,,0,0.0


### Quais atributos vamos considerar para o meu algoritmo de machine learning?

<ul>
    <li>Pclass</li>
    <li>Sex</li>
    <li>Age</li>
    <li>Fare</li>
</ul>

In [20]:
# Total number of missing cells across the four columns chosen for the model.
model_columns = ['Pclass', 'Sex', 'Age', 'Fare']
total_missing = train[model_columns].isnull().values.sum()
print("Números de dados faltantes: {}".format(total_missing))

Números de dados faltantes: 177


Pela análise anterior, já havíamos detectado que os 177 dados faltantes eram da idade.

In [21]:
# Keep only the model inputs plus the label for the first tree.
tree_one_columns = ['Pclass', 'Sex', 'Age', 'Fare', 'Survived']
train2 = train[tree_one_columns]

In [22]:
# Age has many missing values; drop every row that contains a NaN.
# Reassigning (rather than inplace=True on a frame derived from `train`)
# avoids SettingWithCopyWarning and keeps the cell safe to re-run.
train2 = train2.dropna(axis=0)

In [23]:
# Compare the full frame with the reduced one after dropping incomplete rows.
print(
    "Train shape: {}".format(train.shape),
    "Train2 shape: {}".format(train2.shape),
    sep="\n",
)

Train shape: (891, 13)
Train2 shape: (714, 5)


### Vamos separar nossos atributos (feature) do que queremos prever (target)

In [24]:
# Split train2 into the feature matrix and the label vector (NumPy arrays).
feature_columns_one = ['Pclass', 'Sex', 'Age', 'Fare']
features_one = train2[feature_columns_one].values
target = train2['Survived'].values

# Iniciando o processo de Machine Learning

Temos de decidir qual modelo de Machine Learning empregar em nossa análise. Nesse caso vamos utilizar o Decision Tree (árvore de decisão).

In [25]:
# Train the first decision tree with default (unconstrained) growth settings.
# random_state is fixed so that repeated runs build the same tree; without it
# tie-breaking between equally good splits is random. The value 1 matches the
# seed used by the other trees in this notebook.
my_tree_one = tree.DecisionTreeClassifier(random_state=1)
my_tree_one = my_tree_one.fit(features_one, target)

In [26]:
# Show how much weight the fitted tree gives to each chosen feature.
feature_list = ['Pclass', 'Sex', 'Age', 'Fare']
importances = my_tree_one.feature_importances_

for feature_name, importance in zip(feature_list, importances):
    print("Atributo: {} -> Importante: {}".format(feature_name, importance))

Atributo: Pclass -> Importante: 0.14851988665700472
Atributo: Sex -> Importante: 0.3005122109582393
Atributo: Age -> Importante: 0.2578366816682424
Atributo: Fare -> Importante: 0.29313122071651354


In [27]:
# Illustration only: this scores the model on the very data it was trained on;
# a separate hold-out set would be needed for a meaningful estimate.
tree_one_train_score = my_tree_one.score(features_one, target)
print("Score: {}".format(tree_one_train_score))

Score: 0.9845938375350141


98% de acerto (óbvio, porque avaliamos nos próprios dados em que treinamos)

# Já treinei meu algoritmo, agora vamos fazer algumas predições

In [28]:
# Print the number of missing values for every test column.
test_missing = test.isnull().sum()
for column_name, n_missing in test_missing.items():
    print("Dados faltantes {} : {}".format(column_name, n_missing))

Dados faltantes PassengerId : 0
Dados faltantes Pclass : 0
Dados faltantes Name : 0
Dados faltantes Sex : 0
Dados faltantes Age : 86
Dados faltantes SibSp : 0
Dados faltantes Parch : 0
Dados faltantes Ticket : 0
Dados faltantes Fare : 1
Dados faltantes Cabin : 327
Dados faltantes Embarked : 0


In [29]:
# Fill the missing test values with the column median.
# fillna replaces whichever rows are NaN, so no row label is hard-coded
# (the previous `test.Fare[152] = ...` was also a chained assignment, which
# may write to a temporary copy instead of `test`).
test['Fare'] = test['Fare'].fillna(test['Fare'].median())
test['Age'] = test['Age'].fillna(test['Age'].median())

In [30]:
# Re-check the missing values per test column after imputation.
test_missing_after = test.isnull().sum()
for column_name, n_missing in test_missing_after.items():
    print("Dados faltantes {} : {}".format(column_name, n_missing))

Dados faltantes PassengerId : 0
Dados faltantes Pclass : 0
Dados faltantes Name : 0
Dados faltantes Sex : 0
Dados faltantes Age : 0
Dados faltantes SibSp : 0
Dados faltantes Parch : 0
Dados faltantes Ticket : 0
Dados faltantes Fare : 0
Dados faltantes Cabin : 327
Dados faltantes Embarked : 0


In [31]:
# Encode Sex in the test set the same way as in training: male -> 0, female -> 1.
# .loc guarantees the write hits `test`; chained `test['Sex'][mask] = ...`
# may modify a temporary copy instead.
test.loc[test['Sex'] == 'male', 'Sex'] = 0
test.loc[test['Sex'] == 'female', 'Sex'] = 1

In [32]:
# Fill any missing port of embarkation with 'S'.
test['Embarked'] = test['Embarked'].fillna('S')
# Encode the port numerically: 'S' -> 0, 'C' -> 1, 'Q' -> 2.
# .loc is used instead of chained indexing so the assignment is applied to
# `test` itself and not to a temporary copy.
test.loc[test['Embarked'] == 'S', 'Embarked'] = 0
test.loc[test['Embarked'] == 'C', 'Embarked'] = 1
test.loc[test['Embarked'] == 'Q', 'Embarked'] = 2

In [33]:
# Preview the test rows after imputation and encoding.
test.head(3)

Unnamed: 0,PassengerId,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,892,3,"Kelly, Mr. James",0,34.5,0,0,330911,7.8292,,2
1,893,3,"Wilkes, Mrs. James (Ellen Needs)",1,47.0,1,0,363272,7.0,,0
2,894,2,"Myles, Mr. Thomas Francis",0,62.0,0,0,240276,9.6875,,2


### Vamos separar os valores dos atributos que vamos utilizar da base de teste

In [35]:
# Test feature matrix, columns in the same order used to fit the first tree.
test_feature_columns = ['Pclass', 'Sex', 'Age', 'Fare']
test_features = test[test_feature_columns].values

### Vamos fazer a predição com essa base de teste

In [36]:
# Predict the label for every test row with the first tree.
my_prediction = my_tree_one.predict(test_features)

### Criando o DataFrame com duas colunas, ID do passageiro e a previsão (Morreu ou não)

In [37]:
# Integer passenger ids; reused as the index of every submission frame.
PassengerId = test['PassengerId'].values.astype(int)

# One 'Survived' column of predictions, indexed by passenger id.
my_solution = pd.DataFrame({'Survived': my_prediction}, index=PassengerId)

print(my_solution)

      Survived
892          0
893          0
894          1
895          1
896          0
897          0
898          0
899          0
900          1
901          0
902          0
903          0
904          1
905          1
906          1
907          1
908          0
909          1
910          1
911          0
912          0
913          1
914          1
915          0
916          1
917          0
918          1
919          1
920          1
921          0
...        ...
1280         0
1281         0
1282         0
1283         1
1284         1
1285         0
1286         0
1287         1
1288         0
1289         1
1290         0
1291         0
1292         1
1293         0
1294         1
1295         1
1296         0
1297         0
1298         0
1299         0
1300         1
1301         1
1302         1
1303         1
1304         0
1305         0
1306         1
1307         0
1308         0
1309         0

[418 rows x 1 columns]


# Gerando o CSV da minha predição

In [38]:
# Write the first submission; the index column is labelled 'PassengerId'.
my_solution.to_csv('my_solution_one.csv', index_label=['PassengerId'])

# Vamos controlar o overfitting

In [39]:
# Build a wider feature frame for the second model (seven input columns).
tree_two_columns = ['Pclass', 'Age', 'Sex', 'Fare', 'SibSp', 'Parch', 'Embarked']
train3 = train[tree_two_columns]
train3.head(3)

Unnamed: 0,Pclass,Age,Sex,Fare,SibSp,Parch,Embarked
0,3,22.0,0,7.25,1,0,0
1,1,38.0,1,71.2833,1,0,1
2,3,26.0,1,7.925,0,0,0


In [40]:
# Drop every row that still contains a missing value.
# Reassigning (rather than inplace=True on a frame derived from `train`)
# avoids SettingWithCopyWarning and keeps the cell safe to re-run.
train3 = train3.dropna(axis=0)

In [41]:
# Feature matrix for the second model: all columns of train3 as an array.
feature_two = train3.values

In [42]:
# Growth limits for the decision tree: maximum depth and the minimum number
# of samples a node needs before it may be split.
max_depth, min_samples_split = 10, 5

In [43]:
# Second tree: growth is limited by max_depth / min_samples_split, and the
# seed is fixed so the fit is repeatable.
my_tree_two = tree.DecisionTreeClassifier(
    max_depth=max_depth,
    min_samples_split=min_samples_split,
    random_state=1,
)
# Take the labels from the same rows that make up train3 (and therefore
# feature_two) instead of relying on `target`, which was built from a
# different frame and only lines up while both dropped identical rows.
target_two = train.loc[train3.index, 'Survived'].values
# Fit on the seven-column feature matrix.
my_tree_two = my_tree_two.fit(feature_two, target_two)

In [49]:
# Accuracy of the second tree on its own training data.
tree_two_train_score = my_tree_two.score(feature_two, target)
print("Score:", tree_two_train_score)

Score: 0.9103641456582633


In [52]:
# Feature weights of the second tree, listed in training-column order.
feature_list = ['Pclass', 'Age', 'Sex', 'Fare', 'SibSp', 'Parch', 'Embarked']
importances = my_tree_two.feature_importances_

for feature_name, importance in zip(feature_list, importances):
    print("Atributo: {} -> Importante {}".format(feature_name, importance))

Atributo: Pclass -> Importante 0.16563960458268956
Atributo: Age -> Importante 0.20814617627635315
Atributo: Sex -> Importante 0.39599613972986275
Atributo: Fare -> Importante 0.14912301201968503
Atributo: SibSp -> Importante 0.060127643699185586
Atributo: Parch -> Importante 0.011709952159711616
Atributo: Embarked -> Importante 0.00925747153251226


# Vamos efetuar a segunda tentativa de previsão

In [54]:
# Test feature matrix for the second tree, same column order as in training.
test_columns_two = ['Pclass', 'Age', 'Sex', 'Fare', 'SibSp', 'Parch', 'Embarked']
test_features = test[test_columns_two].values

In [56]:
# Predict the label for every test row with the second tree.
my_prediction2 = my_tree_two.predict(test_features)

In [59]:
# Second submission frame: one 'Survived' column indexed by passenger id.
my_solution2 = pd.DataFrame({'Survived': my_prediction2}, index=PassengerId)

In [60]:
# Show the second set of predictions.
print(my_solution2)

      Survived
892          0
893          0
894          0
895          0
896          0
897          0
898          0
899          0
900          0
901          0
902          0
903          1
904          1
905          0
906          1
907          1
908          0
909          0
910          0
911          0
912          1
913          1
914          1
915          0
916          1
917          0
918          1
919          0
920          0
921          1
...        ...
1280         1
1281         0
1282         0
1283         1
1284         1
1285         0
1286         0
1287         1
1288         0
1289         1
1290         0
1291         0
1292         1
1293         0
1294         1
1295         0
1296         0
1297         0
1298         0
1299         0
1300         1
1301         1
1302         1
1303         1
1304         1
1305         0
1306         1
1307         0
1308         0
1309         1

[418 rows x 1 columns]


# Gerando o CSV da segunda tentativa de previsão

In [61]:
# Write the second submission; the index column is labelled 'PassengerId'.
my_solution2.to_csv('my_solution_two.csv', index_label=['PassengerId'])

# Feature Engineering

In [62]:
# Derive a new feature on a copy of the training frame:
# family_size = siblings/spouses + parents/children + 1.
# Both operands are read from the copy; the previous version mixed
# train_two.SibSp with train.Parch.
train_two = train.copy()
train_two['family_size'] = train_two['SibSp'] + train_two['Parch'] + 1

In [65]:
# Feature frame for the third model (note: rebinds the name train3), followed
# by a per-column count of missing values.
tree_three_columns = ['Pclass', 'Sex', 'Age', 'Fare', 'SibSp', 'Parch', 'family_size']
train3 = train_two[tree_three_columns]
for column_name, n_missing in train3.isnull().sum().items():
    print('Dados faltantes {}: {}'.format(column_name, n_missing))

Dados faltantes Pclass: 0
Dados faltantes Sex: 0
Dados faltantes Age: 177
Dados faltantes Fare: 0
Dados faltantes SibSp: 0
Dados faltantes Parch: 0
Dados faltantes family_size: 0


In [66]:
# Impute missing ages with the median age of the training set.
# assign() returns a new frame, so nothing is written into a frame derived
# from train_two (attribute assignment there raises SettingWithCopyWarning).
train3 = train3.assign(Age=train3['Age'].fillna(train['Age'].median()))

In [67]:
# Labels for the third model: the full 'Survived' column of `train`.
# Note: this rebinds `target`, which the earlier models also used.
target = train['Survived']

### Vamos criar mais uma vez nossa base de treino

In [68]:
# Feature matrix for the third model: all columns of train3 as an array.
features_three = train3.values

In [70]:
# Third tree: same growth limits and seed as the second, fitted on the
# features that include family_size.
my_tree_three = tree.DecisionTreeClassifier(
    max_depth=max_depth,
    min_samples_split=min_samples_split,
    random_state=1,
).fit(features_three, target)

In [71]:
# Accuracy of the third tree on its own training data.
tree_three_train_score = my_tree_three.score(features_three, target)
print("Score: ", tree_three_train_score)

Score:  0.898989898989899


### Vamos pegar nossa base de teste e criar esse novo atributo

In [72]:
# Copy of the test frame with the same derived family_size column appended.
test_three = test.assign(family_size=test['SibSp'] + test['Parch'] + 1)

In [73]:
# Extract the test features for the third tree, in training-column order.
test_columns_three = ['Pclass', 'Sex', 'Age', 'Fare', 'SibSp', 'Parch', 'family_size']
test_features = test_three[test_columns_three].values

### Vamos fazer nossa terceira predição

In [75]:
# Predict the label for every test row with the third tree.
my_prediction3 = my_tree_three.predict(test_features)

## Vamos gerar nossa terceira solução

In [76]:
# Third submission frame: one 'Survived' column indexed by passenger id.
my_solution3 = pd.DataFrame({'Survived': my_prediction3}, index=PassengerId)
print(my_solution3)

      Survived
892          0
893          0
894          0
895          0
896          0
897          0
898          0
899          0
900          1
901          0
902          0
903          0
904          1
905          0
906          1
907          1
908          0
909          0
910          1
911          0
912          0
913          1
914          1
915          1
916          1
917          0
918          1
919          0
920          0
921          0
...        ...
1280         0
1281         0
1282         1
1283         1
1284         1
1285         0
1286         0
1287         1
1288         0
1289         1
1290         0
1291         0
1292         1
1293         0
1294         1
1295         1
1296         0
1297         0
1298         0
1299         0
1300         1
1301         1
1302         1
1303         1
1304         1
1305         0
1306         1
1307         0
1308         0
1309         0

[418 rows x 1 columns]


# Gerando arquivo final da terceira solução

In [78]:
# Write the third submission; the index column is labelled 'PassengerId'.
my_solution3.to_csv('my_solution_three.csv', index_label=['PassengerId'])