### Titanic - Submissão 3

In [180]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.impute import KNNImputer
from sklearn.metrics import roc_auc_score
from skopt import gp_minimize
from skopt import dummy_minimize

In [223]:
# Dados
train = pd.read_csv('datasets/titanic/train.csv')
train.head(3)

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S


In [224]:
# Describing...
train.describe()

Unnamed: 0,PassengerId,Survived,Pclass,Age,SibSp,Parch,Fare
count,891.0,891.0,891.0,714.0,891.0,891.0,891.0
mean,446.0,0.383838,2.308642,29.699118,0.523008,0.381594,32.204208
std,257.353842,0.486592,0.836071,14.526497,1.102743,0.806057,49.693429
min,1.0,0.0,1.0,0.42,0.0,0.0,0.0
25%,223.5,0.0,2.0,20.125,0.0,0.0,7.9104
50%,446.0,0.0,3.0,28.0,0.0,0.0,14.4542
75%,668.5,1.0,3.0,38.0,1.0,0.0,31.0
max,891.0,1.0,3.0,80.0,8.0,6.0,512.3292


In [225]:
# Shape
train.shape # 891 rows / 12 columns

(891, 12)

In [226]:
# Infos
train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 12 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   PassengerId  891 non-null    int64  
 1   Survived     891 non-null    int64  
 2   Pclass       891 non-null    int64  
 3   Name         891 non-null    object 
 4   Sex          891 non-null    object 
 5   Age          714 non-null    float64
 6   SibSp        891 non-null    int64  
 7   Parch        891 non-null    int64  
 8   Ticket       891 non-null    object 
 9   Fare         891 non-null    float64
 10  Cabin        204 non-null    object 
 11  Embarked     889 non-null    object 
dtypes: float64(2), int64(5), object(5)
memory usage: 83.7+ KB


In [227]:
# NaN values
train.isna().sum()

PassengerId      0
Survived         0
Pclass           0
Name             0
Sex              0
Age            177
SibSp            0
Parch            0
Ticket           0
Fare             0
Cabin          687
Embarked         2
dtype: int64

In [228]:
# Removing some features (won't be used)
train = train.drop(['PassengerId','Name','Cabin','Ticket'],axis=1)

In [229]:
# The data:
train.head(3)

Unnamed: 0,Survived,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked
0,0,3,male,22.0,1,0,7.25,S
1,1,1,female,38.0,1,0,71.2833,C
2,1,3,female,26.0,0,0,7.925,S


In [230]:
# Adjusting Embarked
train.groupby('Embarked').size()

Embarked
C    168
Q     77
S    644
dtype: int64

In [231]:
# Transforming the two NaN values into 'S' class
train.loc[train.Embarked.isnull(),'Embarked'] = 'S'

In [232]:
# Converting some variables into numeric (LabelEncoder)
train['Sex_Binary'] = LabelEncoder().fit_transform(train['Sex'])
train['Embarked_Binary'] = LabelEncoder().fit_transform(train['Embarked'])

In [233]:
sex_orders = dict(zip(train['Sex'].unique(),train['Sex_Binary'].unique()))
embarked_orders = dict(zip(train['Embarked'].unique(),train['Embarked_Binary'].unique()))

In [234]:
sex_orders, embarked_orders

({'male': 1, 'female': 0}, {'S': 2, 'C': 0, 'Q': 1})

In [221]:
# The data:
train.head(3)

Unnamed: 0,Survived,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked,Sex_Binary,Embarked_Binary
0,0,3,male,22.0,1,0,7.25,S,1,2
1,1,1,female,38.0,1,0,71.2833,C,0,0
2,1,3,female,26.0,0,0,7.925,S,0,2


In [236]:
# Imputing mean of age by using KNN
train = train.drop(['Sex','Embarked'], axis = 1)
traindf = pd.DataFrame(KNNImputer(n_neighbors = 5).fit_transform(train), columns=train.columns)

In [237]:
# Verifying NaN values
traindf.isna().sum()

Survived           0
Pclass             0
Age                0
SibSp              0
Parch              0
Fare               0
Sex_Binary         0
Embarked_Binary    0
dtype: int64

In [238]:
# Splitting into Train and Test (Traindf)
x = traindf.drop(['Survived'],axis=1)
y = traindf['Survived']

In [239]:
x_tr, x_ts, y_tr, y_ts = train_test_split(x,y, random_state=0,train_size=0.6)
x_tr.shape, x_ts.shape, y_tr.shape, y_ts.shape

((534, 7), (357, 7), (534,), (357,))

In [240]:
# Building the model
# ?GradientBoostingClassifier
model = GradientBoostingClassifier() # using default values

# Fitting the model
model.fit(x_tr, y_tr)

# Predicting using test
p = model.predict(x_ts)

# Calculating roc_auc_score
print('My accuracy, using only the dafault parameters, is {0:0.2f}%'.format(100 * roc_auc_score(p,y_ts)))

My accuracy, using only the dafault parameters, is 78.72%


In [241]:
# Let's try to tune this model by using skopt.gp_minimize 
def fit_model_func(params):
    
    # Selecting some parameters
    learning_rate = params[0]
    n_estimators = params[1]
    max_depth = params[2]
    min_samples_split = params[3]
    subsample = params[4]
    
    # Params that are using
    print(params, '\n')
    
    # Fit the model
    modelfit = GradientBoostingClassifier(
        learning_rate=learning_rate,
        n_estimators=n_estimators,
        max_depth=max_depth,
        min_samples_split=min_samples_split,
        subsample=subsample)
    
    modelfit.fit(x_tr,y_tr)
    
    # Viewing the score
    prediction = modelfit.predict(x_ts)
    
    # ROC_AUC_SCORE
    try:
        return -roc_auc_score(prediction, y_ts)
    except ValueError:
        pass

# Defining the intervals to be optimized
space = [(1e-4,1e-1), # learning_rate
        (100,1000), # n_estimators
        (2,5), # max_depth
        (2,3), # min_samples_split
        (0.2,1)] # subsample

In [242]:
# Let's rock
results = dummy_minimize(fit_model_func, 
                           space, 
                           random_state=1, 
                           verbose=1, 
                           n_calls = 35)

Iteration No: 1 started. Evaluating function at random point.
[0.09971876261279299, 172, 5, 3, 0.44186605810547186] 

Iteration No: 1 ended. Evaluation done at random point.
Time taken: 0.1875
Function value obtained: -0.8048
Current minimum: -0.8048
Iteration No: 2 started. Evaluating function at random point.
[0.014760913492629596, 244, 3, 2, 0.510328592929606] 

Iteration No: 2 ended. Evaluation done at random point.
Time taken: 0.1616
Function value obtained: -0.8171
Current minimum: -0.8171
Iteration No: 3 started. Evaluating function at random point.
[0.06700762907666767, 381, 4, 2, 0.5353556115226359] 

Iteration No: 3 ended. Evaluation done at random point.
Time taken: 0.3178
Function value obtained: -0.8031
Current minimum: -0.8171
Iteration No: 4 started. Evaluating function at random point.
[0.06855342808963628, 352, 4, 2, 0.22191007455834094] 

Iteration No: 4 ended. Evaluation done at random point.
Time taken: 0.2479
Function value obtained: -0.7986
Current minimum: -0.817

Iteration No: 34 ended. Evaluation done at random point.
Time taken: 0.2940
Function value obtained: -0.8260
Current minimum: -0.8372
Iteration No: 35 started. Evaluating function at random point.
[0.02469648565354429, 789, 3, 3, 0.3854067716170667] 

Iteration No: 35 ended. Evaluation done at random point.
Time taken: 0.4791
Function value obtained: -0.8146
Current minimum: -0.8372


In [270]:
# Getting the best results
best_parameters = dict(zip(['learning_rate','n_estimators','max_depth','min_samples_split','subsample'], results.x))
best_parameters

{'learning_rate': 0.06244059564908761,
 'n_estimators': 164,
 'max_depth': 2,
 'min_samples_split': 3,
 'subsample': 0.4243551936515242}

### Adjusting the Training Dataset

In [272]:
train = pd.read_csv('datasets/titanic/train.csv')

train = train.drop(['PassengerId','Name','Cabin','Ticket'],axis=1)

train.loc[train.Embarked.isnull(),'Embarked'] = 'S'

train['Sex_Binary'] = LabelEncoder().fit_transform(train['Sex'])
train['Embarked_Binary'] = LabelEncoder().fit_transform(train['Embarked'])

train = train.drop(['Sex','Embarked'], axis = 1)
traindf = pd.DataFrame(KNNImputer(n_neighbors = 5).fit_transform(train), columns=train.columns)

x = traindf.drop('Survived',axis = 1)
y = traindf['Survived']

### Predicting the Official Test Dataset

In [277]:
test = pd.read_csv('datasets/titanic/test.csv')
test.head(3)

Unnamed: 0,PassengerId,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,892,3,"Kelly, Mr. James",male,34.5,0,0,330911,7.8292,,Q
1,893,3,"Wilkes, Mrs. James (Ellen Needs)",female,47.0,1,0,363272,7.0,,S
2,894,2,"Myles, Mr. Thomas Francis",male,62.0,0,0,240276,9.6875,,Q


In [278]:
# Adjusting the model
test = test.drop(['Name','Cabin','Ticket'],axis=1)

In [249]:
test.isna().sum()

Pclass       0
Sex          0
Age         86
SibSp        0
Parch        0
Fare         1
Embarked     0
dtype: int64

In [251]:
sex_orders, embarked_orders

({'male': 1, 'female': 0}, {'S': 2, 'C': 0, 'Q': 1})

In [279]:
test.loc[test.Sex == 'male','Sex_Binary'] = 1
test.loc[test.Sex == 'female','Sex_Binary'] = 0

In [280]:
test.loc[test.Embarked == 'Q','Embarked_Binary'] = 1
test.loc[test.Embarked == 'C','Embarked_Binary'] = 0
test.loc[test.Embarked == 'S','Embarked_Binary'] = 2

In [281]:
# Imputing mean of age by using KNN
test = test.drop(['Sex','Embarked'], axis = 1)
testdf = pd.DataFrame(KNNImputer(n_neighbors = 5).fit_transform(test), columns=test.columns)

In [282]:
testdf.head(3)

Unnamed: 0,PassengerId,Pclass,Age,SibSp,Parch,Fare,Sex_Binary,Embarked_Binary
0,892.0,3.0,34.5,0.0,0.0,7.8292,1.0,1.0
1,893.0,3.0,47.0,1.0,0.0,7.0,0.0,2.0
2,894.0,2.0,62.0,0.0,0.0,9.6875,1.0,1.0


### Fitting the model

In [283]:
#best_parameters
titanic_model = GradientBoostingClassifier(learning_rate=0.06244059564908761, 
                                           n_estimators=164,
                                           max_depth=2,
                                           min_samples_split=3,
                                           subsample=0.4243551936515242)

titanic_model.fit(x,y)

p = titanic_model.predict(testdf.drop('PassengerId',axis=1))

In [293]:
submission = pd.Series(pd.to_numeric(p,downcast='integer'), pd.to_numeric(testdf.PassengerId,downcast='integer'),
                       name='Survived')

In [294]:
submission.to_csv('datasets/titanic/submission_Titanic.csv')