# Get data

In [119]:


import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

import os

# Collect the full path of every file found anywhere below the Kaggle input directory
datas = [
    os.path.join(folder, fname)
    for folder, _subdirs, fnames in os.walk('/kaggle/input')
    for fname in fnames
]


In [120]:
# NOTE(review): positions 1 and 2 rely on the order in which os.walk listed the
# input files, which is not guaranteed to be stable — confirm, or select by file name.
test_path, train_path = datas[1], datas[2]
test_data = pd.read_csv(test_path)
train_data = pd.read_csv(train_path)

# Preview the first rows of the training frame
train_data.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [121]:
# Preview the first rows of the test frame
test_data.head()

Unnamed: 0,PassengerId,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,892,3,"Kelly, Mr. James",male,34.5,0,0,330911,7.8292,,Q
1,893,3,"Wilkes, Mrs. James (Ellen Needs)",female,47.0,1,0,363272,7.0,,S
2,894,2,"Myles, Mr. Thomas Francis",male,62.0,0,0,240276,9.6875,,Q
3,895,3,"Wirz, Mr. Albert",male,27.0,0,0,315154,8.6625,,S
4,896,3,"Hirvonen, Mrs. Alexander (Helga E Lindqvist)",female,22.0,1,1,3101298,12.2875,,S


# # Transform Sex to binary and get Y (Survived)

In [122]:
# Split the target off the training frame. pop() returns the 'Survived' column and
# removes it from train_data in one step. The guard keeps this cell safe to re-run:
# once the column is gone, y simply keeps the value from the earlier run instead of
# the cell raising on the missing column.
if 'Survived' in train_data.columns:
    y = train_data.pop('Survived')
train_data.head()

Unnamed: 0,PassengerId,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [126]:
# Work on copies so the frames loaded above are not modified by the steps below
train_copy, test_copy = train_data.copy(), test_data.copy()

def sex_binary(value):
    """Encode a Sex value as an integer flag.

    Returns 1 when ``value`` equals the string 'female' and 0 for any other
    value (including missing values).
    """
    return 1 if value == 'female' else 0

# Both frames must expose the same columns before the same transformation is applied.
# (Previously this comparison was evaluated and silently discarded.)
assert set(train_copy) == set(test_copy), "train and test columns differ"

# Add the 0/1 encoding of Sex produced by sex_binary
train_copy['Binary_sex'] = train_copy['Sex'].map(sex_binary)
test_copy['Binary_sex'] = test_copy['Sex'].map(sex_binary)

# The original text column is no longer needed once the encoded column exists
train_copy = train_copy.drop(columns='Sex')
test_copy = test_copy.drop(columns='Sex')

train_copy.head()

Unnamed: 0,PassengerId,Pclass,Name,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,Binary_sex
0,1,3,"Braund, Mr. Owen Harris",22.0,1,0,A/5 21171,7.25,,S,0
1,2,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",38.0,1,0,PC 17599,71.2833,C85,C,1
2,3,3,"Heikkinen, Miss. Laina",26.0,0,0,STON/O2. 3101282,7.925,,S,1
3,4,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",35.0,1,0,113803,53.1,C123,S,1
4,5,3,"Allen, Mr. William Henry",35.0,0,0,373450,8.05,,S,0


# Handle Missing values

In [127]:
# These features are assumed not to be useful for the model, so they are removed
# from both frames.
unused_cols = ['Ticket', 'Cabin', 'PassengerId', 'Name']
for frame in (train_copy, test_copy):
    frame.drop(columns=unused_cols, inplace=True)

In [128]:
# Partition the training columns by dtype: 'object' columns are treated as
# categorical, everything else as numerical.
object_cols, n_cols = [], []
for col in train_copy.columns:
    if train_copy[col].dtype == 'object':
        object_cols.append(col)
    else:
        n_cols.append(col)

print('Categorical variables: {}\nNumerical variables: {}'.format(object_cols, n_cols))

Categorical variables: ['Embarked']
Numerical variables: ['Pclass', 'Age', 'SibSp', 'Parch', 'Fare', 'Binary_sex']


In [129]:
#Separate high and low cardinality to use Label and One hot encoding
high_cardinality = [col for col in object_cols if train_copy[col].nunique() > 10]
low_cardinality = set(object_cols) - set(high_cardinality)

print('High cardinality: {}\nLow cardinality: {}'.format(high_cardinality, low_cardinality))

High cardinality: []
Low cardinality: {'Embarked'}


# Pipeline

In [135]:
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder

# Numeric columns: strategy='constant' replaces missing values with fill_value,
# which SimpleImputer defaults to 0 for numerical data.
# NOTE(review): if most-frequent (or median) imputation was intended for the numeric
# columns, the strategy argument needs to change — confirm the intended behaviour.
numerical_transformer = SimpleImputer(strategy = 'constant')

# Categorical columns: fill missing values with the most frequent category, then
# one-hot encode; categories not seen during fit are ignored at transform time.
categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy= 'most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown='ignore'))
])

# Route the numeric transformer to n_cols and the categorical one to object_cols
preprocessor = ColumnTransformer(transformers=[
    ('num', numerical_transformer, n_cols),
    ('cat', categorical_transformer, object_cols)
])

# Tests with model

In [147]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error
import numpy as np
# Hold out 20% of the labelled rows for validation. random_state is fixed so the
# split, and therefore the accuracies printed below, are reproducible across runs.
X_train, X_valid, y_train, y_valid = train_test_split(
    train_copy, y, train_size=0.8, test_size=0.2, random_state=0
)

# Despite the name, these are candidate values for max_leaf_nodes (the cap on the
# number of leaves per tree), not the number of trees in the forest.
trees = [25, 50, 100, 150, 200, 250, 300]

for n_leaves in trees:
    model = RandomForestClassifier(max_leaf_nodes=n_leaves, random_state=0)
    my_pipeline = Pipeline(steps=[
        ('preprocessor', preprocessor),
        ('model', model)
    ])
    my_pipeline.fit(X_train, y_train)
    preds = my_pipeline.predict(X_valid)
    # Validation accuracy: fraction of held-out rows predicted correctly
    print(np.mean(y_valid == preds))

0.7988826815642458
0.8156424581005587
0.8324022346368715
0.8044692737430168
0.8044692737430168
0.8044692737430168
0.8044692737430168


# Submission

In [149]:
# Final model: refit on every labelled row, then predict the competition test set.
# NOTE(review): n_estimators=5 differs from the validation runs, which left
# RandomForestClassifier's number of trees at its default — confirm this is intentional.
model = RandomForestClassifier(max_leaf_nodes=100, n_estimators=5, random_state=0)
my_pipeline = Pipeline(steps=[('preprocessor', preprocessor), ('model', model)])
preds = my_pipeline.fit(train_copy, y).predict(test_copy)

# One prediction per PassengerId; the index is written out so that PassengerId
# becomes the first column of the CSV.
sub = pd.Series(data=preds, name='Survived', index=test_data['PassengerId'])
sub.to_csv('titanic_second_model2.csv', index=True)

# Your submission scored 0.75119, which is an improvement of your previous score of 0.72488. Great job!