# Predicting Titanic survivors  

In [513]:
import numpy as np
import pandas as pd

In [514]:
# Import train & test data 
train = pd.read_csv('./train.csv')
test = pd.read_csv('./test.csv')

#We need this Id for the final submission
test_PassengerId = test["PassengerId"]

In [515]:
#View the first 10 training examples to get the idea about data set
train.head(10)

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S
5,6,0,3,"Moran, Mr. James",male,,0,0,330877,8.4583,,Q
6,7,0,1,"McCarthy, Mr. Timothy J",male,54.0,0,0,17463,51.8625,E46,S
7,8,0,3,"Palsson, Master. Gosta Leonard",male,2.0,3,1,349909,21.075,,S
8,9,1,3,"Johnson, Mrs. Oscar W (Elisabeth Vilhelmina Berg)",female,27.0,0,2,347742,11.1333,,S
9,10,1,2,"Nasser, Mrs. Nicholas (Adele Achem)",female,14.0,1,0,237736,30.0708,,C


## Missing Values 

In [516]:
train.isnull().sum()

PassengerId      0
Survived         0
Pclass           0
Name             0
Sex              0
Age            177
SibSp            0
Parch            0
Ticket           0
Fare             0
Cabin          687
Embarked         2
dtype: int64

In [517]:
test.isnull().sum()


PassengerId      0
Pclass           0
Name             0
Sex              0
Age             86
SibSp            0
Parch            0
Ticket           0
Fare             1
Cabin          327
Embarked         0
dtype: int64

## Data Preprocessing 

In [518]:
#Extracting Honorific name from the name field

def _extract_title(names):
    """Return the honorific of each passenger name as a list of strings.

    Names look like "Braund, Mr. Owen Harris": the title is the text between
    the first comma and the following period, stripped of whitespace ("Mr").
    """
    return names.str.split(",").str[1].str.split(".").str[0].str.strip().tolist()


# Assigning a plain list is positional, so the titles line up with the rows
# even if the frame does not carry a default 0..n-1 index.
train_title = _extract_title(train["Name"])
train["Title"] = train_title

# Repeat the same extraction for the test set
test_title = _extract_title(test["Name"])
test["Title"] = test_title
test["Title"].head()

0     Mr
1    Mrs
2     Mr
3     Mr
4    Mrs
Name: Title, dtype: object

In [519]:
# normalize the titles
# Source : https://ipfs.io/ipfs/QmXoypizjW3WknFiJnKLwHCnL72vedxjQkDDP1mXWo6uco/wiki/English_honorifics.html

normalized_titles = {
    "Capt":       "Officer",
    "Col":        "Officer",
    "Major":      "Officer",
    "Jonkheer":   "Royalty",
    "Don":        "Royalty",
    "Sir" :       "Royalty",
    "Dr":         "Officer",
    "Rev":        "Officer",
    "the Countess":"Royalty",
    "Dona":       "Royalty",
    "Mme":        "Mrs",
    "Mlle":       "Miss",
    "Ms":         "Mrs",
    "Mr" :        "Mr",
    "Mrs" :       "Mrs",
    "Miss" :      "Miss",
    "Master" :    "Master",
    "Lady" :      "Royalty"
}

train.Title = train.Title.map(normalized_titles)
test.Title = test.Title.map(normalized_titles)

In [520]:
train.Title.value_counts()

Mr         517
Miss       184
Mrs        127
Master      40
Officer     18
Royalty      5
Name: Title, dtype: int64

### Age 

In [521]:
#Filling the missing Age field
#Instead of filling the field with the overall median, we'll fill it with the median of the corresponding group

group = train.groupby(['Sex','Pclass', 'Title'])  
# view the median Age by the grouped features 
group.Age.median()

Sex     Pclass  Title  
female  1       Miss       30.0
                Mrs        40.0
                Officer    49.0
                Royalty    40.5
        2       Miss       24.0
                Mrs        31.5
        3       Miss       18.0
                Mrs        31.0
male    1       Master      4.0
                Mr         40.0
                Officer    51.0
                Royalty    40.0
        2       Master      1.0
                Mr         31.0
                Officer    46.5
        3       Master      4.0
                Mr         26.0
Name: Age, dtype: float64

In [522]:
# Fill each missing Age with the median Age of the passenger's (Sex, Pclass, Title) group.
# transform() returns a Series aligned to train's own index, so the result does not
# depend on how groupby.apply() labels its output (newer pandas may prepend the group
# keys), and rows that fall in no group keep whatever Age they already had.
train["Age"] = train["Age"].fillna(group["Age"].transform("median"))

In [523]:
# Repeat the group-median imputation for the test set, using the test set's own groups.
group = test.groupby(['Sex','Pclass', 'Title'])
# transform() keeps the result aligned to test's index on every pandas version.
test["Age"] = test["Age"].fillna(group["Age"].transform("median"))

In [524]:
#Create Family size attribute

# Create a family size descriptor from SibSp and Parch
train["Family_size"] = train["SibSp"] + train["Parch"] + 1
test["Family_size"] = test["SibSp"] + test["Parch"] + 1

### Cabin 

In [525]:
# To fill the missing values for the Cabin, let's understand if there's any pattern for cabin allocation
group_n = train.groupby(['Sex','Pclass', 'Title','Cabin'])
group_n.Age.count()

Sex     Pclass  Title    Cabin          
female  1       Miss     B18                1
                         B22                1
                         B28                1
                         B35                1
                         B39                1
                         B42                1
                         B5                 2
                         B57 B59 B63 B66    2
                         B73                1
                         B77                1
                         B79                1
                         B80                1
                         B96 B98            1
                         C103               1
                         C125               1
                         C22 C26            1
                         C23 C25 C27        2
                         C32                1
                         C45                1
                         C49                1
                         C54           

In [526]:
#It looks like passengers with title 'Miss' are allocated with B,C,D cabins
#Those with title 'Mr' are allocated with Cabin 'E'. 
#We'll extract first letter of the cabin as it is possible that some cabins were closer to the life boats
#Also, We'll  Assign all the null values as "N"

# Assign the result back instead of calling fillna(inplace=True) on the column
# accessor: in-place mutation through a selected column is not guaranteed to
# reach the parent frame (it is a no-op under pandas Copy-on-Write).
# Missing cabins become "N"; every other value is reduced to its first letter.
train["Cabin"] = train["Cabin"].fillna("N").str[0]
test["Cabin"] = test["Cabin"].fillna("N").str[0]



### Embarked 

In [527]:
train.Embarked.value_counts()

S    644
C    168
Q     77
Name: Embarked, dtype: int64

In [528]:
# 'S' is the most frequent Embarked value in the training set, so use it for the missing entries.
# The result is assigned back explicitly; fillna(inplace=True) on a selected column
# is not guaranteed to modify the parent frame on newer pandas.
train["Embarked"] = train["Embarked"].fillna("S")
test["Embarked"] = test["Embarked"].fillna("S")

### Fare 

In [529]:
#There are missing values for 'Fare' in the test set. 

# Fill missing fares with each set's own median. Assign back explicitly:
# fillna(inplace=True) on a selected column may leave the parent frame untouched
# on newer pandas (Copy-on-Write).
train['Fare'] = train['Fare'].fillna(train['Fare'].median())
test['Fare'] = test['Fare'].fillna(test['Fare'].median())

In [530]:
#Check if all missing values are filled
train.isnull().sum()

PassengerId    0
Survived       0
Pclass         0
Name           0
Sex            0
Age            0
SibSp          0
Parch          0
Ticket         0
Fare           0
Cabin          0
Embarked       0
Title          0
Family_size    0
dtype: int64

In [531]:

test.isnull().sum()

PassengerId    0
Pclass         0
Name           0
Sex            0
Age            0
SibSp          0
Parch          0
Ticket         0
Fare           0
Cabin          0
Embarked       0
Title          0
Family_size    0
dtype: int64

## Handle categorical data 

In [532]:
cat = train.select_dtypes(exclude=np.number)
cat.describe()


Unnamed: 0,Name,Sex,Ticket,Cabin,Embarked,Title
count,891,891,891,891,891,891
unique,891,2,681,9,3,6
top,"Hansen, Mr. Henry Damsgaard",male,CA. 2343,N,S,Mr
freq,1,577,7,687,646,517


In [533]:
#We'll drop Ticket column as it would not help in modelling

train.drop(['Ticket'], axis=1, inplace = True)
test.drop(['Ticket'], axis=1, inplace = True)


### Sex 

In [534]:
# Binary flag for Sex: 1 where the value is 'male', 0 for anything else
train['sex_enc'] = (train.Sex == 'male').astype('int64')
test['sex_enc'] = (test.Sex == 'male').astype('int64')

### Cabin 

In [535]:
train.Cabin.value_counts()


N    687
C     59
B     47
D     33
E     32
A     15
F     13
G      4
T      1
Name: Cabin, dtype: int64

In [536]:
test.Cabin.value_counts()

N    327
C     35
B     18
D     13
E      9
F      8
A      7
G      1
Name: Cabin, dtype: int64

In [541]:
# 'T' appears once in the training set and never in the test set, so fold it
# into the "unknown" bucket 'N'.
# Only the Cabin column is rewritten; a row-wide replace('T', 'N') would also
# change any other column of that row whose value happens to equal 'T'.
train.loc[train.Cabin == 'T', 'Cabin'] = 'N'

In [542]:
#Create dummy variable for Cabin

# dtype=int keeps the indicators numeric: newer pandas emits bool columns by
# default, and the later select_dtypes(include=np.number) would silently drop them.
train_cabin_dummies = pd.get_dummies(train.Cabin, prefix="Cabin", dtype=int)

# Align the test indicators to the training columns so both frames expose the
# same features; a deck letter absent from the test set becomes an all-zero column.
test_cabin_dummies = (
    pd.get_dummies(test.Cabin, prefix="Cabin", dtype=int)
    .reindex(columns=train_cabin_dummies.columns, fill_value=0)
)

In [543]:
#Append dummy variables to train & test set

train = pd.concat([train, train_cabin_dummies], axis=1)
test = pd.concat([test, test_cabin_dummies], axis=1)

### Embarked 

In [544]:
test.shape, train.shape,

((418, 21), (891, 22))

In [545]:
from sklearn.preprocessing import LabelEncoder, OneHotEncoder

# Fit the encoder on the training labels only and reuse that mapping for the
# test set. Re-fitting on the test set would assign different integer codes
# whenever the two sets do not contain exactly the same categories.
lbl = LabelEncoder()
train['Embarked'] = lbl.fit_transform(list(train['Embarked'].values))
test['Embarked'] = lbl.transform(list(test['Embarked'].values))

### Title 

In [546]:
train.Title.value_counts()

Mr         517
Miss       184
Mrs        127
Master      40
Officer     18
Royalty      5
Name: Title, dtype: int64

In [547]:
#create dummy variables for Title

# dtype=int keeps the indicators numeric: newer pandas emits bool columns by
# default, and the later select_dtypes(include=np.number) would silently drop them.
train_title_dummies = pd.get_dummies(train.Title, prefix="Title", dtype=int)

# Align the test indicators to the training columns so both frames expose the
# same features; a title absent from the test set becomes an all-zero column.
test_title_dummies = (
    pd.get_dummies(test.Title, prefix="Title", dtype=int)
    .reindex(columns=train_title_dummies.columns, fill_value=0)
)

In [548]:
#Append dummy variables to train & test set

train = pd.concat([train, train_title_dummies], axis=1)
test = pd.concat([test, test_title_dummies], axis=1)

In [550]:
#Let's Check our cleaned data set 
train.select_dtypes(include=np.number).head()

Unnamed: 0,PassengerId,Survived,Pclass,Age,SibSp,Parch,Fare,Embarked,Family_size,sex_enc,...,Cabin_E,Cabin_F,Cabin_G,Cabin_N,Title_Master,Title_Miss,Title_Mr,Title_Mrs,Title_Officer,Title_Royalty
0,1,0,3,22.0,1,0,7.25,2,2,1,...,0,0,0,1,0,0,1,0,0,0
1,2,1,1,38.0,1,0,71.2833,0,2,0,...,0,0,0,0,0,0,0,1,0,0
2,3,1,3,26.0,0,0,7.925,2,1,0,...,0,0,0,1,0,1,0,0,0,0
3,4,1,1,35.0,1,0,53.1,2,2,0,...,0,0,0,0,0,0,0,1,0,0
4,5,0,3,35.0,0,0,8.05,2,1,1,...,0,0,0,1,0,0,1,0,0,0


In [551]:
#Looks like PassengerId wouldn't help in predictions. Hence we'll drop it from data set

train.drop(['PassengerId'], axis=1, inplace = True)
test.drop(['PassengerId'], axis=1, inplace = True)

In [552]:
train.shape,test.shape

((891, 27), (418, 26))

## Outlier Detection 

In [553]:
from collections import Counter

def get_outlier_indices(train,n,features):
    """Return the index labels of rows that are outliers in more than ``n`` features.

    A value is treated as an outlier for a feature when it lies more than
    1.5 times the interquartile range below the 25th percentile or above the
    75th percentile of that feature.

    Parameters
    ----------
    train : pandas.DataFrame
        Frame holding the columns named in ``features``.
    n : int
        A row is reported only when it is flagged in strictly more than ``n``
        of the given features.
    features : iterable
        Labels of the numeric columns to inspect.

    Returns
    -------
    list
        Index labels of the offending rows, in order of first detection.
    """
    outlier_counts = Counter()

    for feature in features:
        values = train[feature]

        # Compute both quartiles once per feature. The NaN-aware variant is used
        # so that one missing value does not turn the bounds into NaN and
        # silently disable detection for the whole column.
        q1, q3 = np.nanpercentile(values, [25, 75])

        # Fence distance = 1.5 * interquartile range
        fence = 1.5 * (q3 - q1)

        # Comparisons against NaN are False, so missing values are never flagged
        is_outlier = (values < q1 - fence) | (values > q3 + fence)
        outlier_counts.update(values[is_outlier].index)

    # Keep rows that were flagged in more than n features
    return [idx for idx, hits in outlier_counts.items() if hits > n]


indices_to_drop = get_outlier_indices(train,2,["Age","SibSp","Parch","Fare"])


In [554]:
#List the outliers to be removed
train.loc[indices_to_drop]

Unnamed: 0,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Fare,Cabin,Embarked,...,Cabin_E,Cabin_F,Cabin_G,Cabin_N,Title_Master,Title_Miss,Title_Mr,Title_Mrs,Title_Officer,Title_Royalty
438,0,1,"Fortune, Mr. Mark",male,64.0,1,4,263.0,C,2,...,0,0,0,0,0,0,1,0,0,0
745,0,1,"Crosby, Capt. Edward Gifford",male,70.0,1,1,71.0,B,2,...,0,0,0,0,0,0,0,0,1,0
27,0,1,"Fortune, Mr. Charles Alexander",male,19.0,3,2,263.0,C,2,...,0,0,0,0,0,0,1,0,0,0
88,1,1,"Fortune, Miss. Mabel Helen",female,23.0,3,2,263.0,C,2,...,0,0,0,0,0,1,0,0,0,0
159,0,3,"Sage, Master. Thomas Henry",male,4.0,8,2,69.55,N,2,...,0,0,0,1,1,0,0,0,0,0
180,0,3,"Sage, Miss. Constance Gladys",female,18.0,8,2,69.55,N,2,...,0,0,0,1,0,1,0,0,0,0
201,0,3,"Sage, Mr. Frederick",male,26.0,8,2,69.55,N,2,...,0,0,0,1,0,0,1,0,0,0
324,0,3,"Sage, Mr. George John Jr",male,26.0,8,2,69.55,N,2,...,0,0,0,1,0,0,1,0,0,0
341,1,1,"Fortune, Miss. Alice Elizabeth",female,24.0,3,2,263.0,C,2,...,0,0,0,0,0,1,0,0,0,0
792,0,3,"Sage, Miss. Stella Anna",female,18.0,8,2,69.55,N,2,...,0,0,0,1,0,1,0,0,0,0


In [555]:
# Drop outliers
train = train.drop(indices_to_drop, axis = 0).reset_index(drop=True)

In [556]:

# Keep only the numeric columns; interpolate()/dropna() remove any value that
# is still missing at this point.
train_temp = train.select_dtypes(include=[np.number]).interpolate().dropna()

# Take the target from the same filtered frame as the features, so X_train and
# Y_train keep matching rows even if dropna() removed some.
Y_train = train_temp["Survived"]

X_train = train_temp.drop(labels = ["Survived"],axis = 1)


In [557]:
test.select_dtypes(include=[np.number])

Unnamed: 0,Pclass,Age,SibSp,Parch,Fare,Embarked,Family_size,sex_enc,Cabin_A,Cabin_B,...,Cabin_E,Cabin_F,Cabin_G,Cabin_N,Title_Master,Title_Miss,Title_Mr,Title_Mrs,Title_Officer,Title_Royalty
0,3,34.5,0,0,7.8292,1,1,1,0,0,...,0,0,0,1,0,0,1,0,0,0
1,3,47.0,1,0,7.0000,2,2,0,0,0,...,0,0,0,1,0,0,0,1,0,0
2,2,62.0,0,0,9.6875,1,1,1,0,0,...,0,0,0,1,0,0,1,0,0,0
3,3,27.0,0,0,8.6625,2,1,1,0,0,...,0,0,0,1,0,0,1,0,0,0
4,3,22.0,1,1,12.2875,2,3,0,0,0,...,0,0,0,1,0,0,0,1,0,0
5,3,14.0,0,0,9.2250,2,1,1,0,0,...,0,0,0,1,0,0,1,0,0,0
6,3,30.0,0,0,7.6292,1,1,0,0,0,...,0,0,0,1,0,1,0,0,0,0
7,2,26.0,1,1,29.0000,2,3,1,0,0,...,0,0,0,1,0,0,1,0,0,0
8,3,18.0,0,0,7.2292,0,1,0,0,0,...,0,0,0,1,0,0,0,1,0,0
9,3,21.0,2,0,24.1500,2,3,1,0,0,...,0,0,0,1,0,0,1,0,0,0


In [559]:
test = test.select_dtypes(include=[np.number]).interpolate().dropna()
test = test[X_train.columns]

## Feature Scaling 

In [560]:
from sklearn.preprocessing import StandardScaler
sc = StandardScaler()
# Learn the per-feature mean and standard deviation from the training data only
X_train = sc.fit_transform(X_train)

# Apply the training statistics to the test set. Re-fitting the scaler on the
# test set would put the two sets on different scales than the models were trained on.
test = sc.transform(test)

## Building Machine Learning Models 

In [562]:
X_train.shape, Y_train.shape

((879, 22), (879,))

In [563]:
import time, datetime

import warnings
warnings.filterwarnings('ignore')

# Machine learning
from sklearn.model_selection import train_test_split
from sklearn import model_selection, tree, preprocessing, metrics, linear_model
from sklearn.svm import LinearSVC,SVC
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.linear_model import LinearRegression, LogisticRegression, SGDClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier, GradientBoostingClassifier,AdaBoostClassifier

from sklearn.model_selection import GridSearchCV, cross_val_score, StratifiedKFold
kfold = StratifiedKFold(n_splits=10)

In [564]:
# Generic helper used to evaluate each candidate model.
# It takes an estimator instance, the training features/labels and a CV setting.

def train_model(model, X_train, Y_train, cv):
    """Fit ``model`` and report its training score and cross-validated accuracy.

    Parameters
    ----------
    model : estimator
        Object exposing ``fit`` and ``score``; ``fit`` is called on it directly.
    X_train, Y_train :
        Training features and labels.
    cv :
        Passed unchanged to ``cross_val_predict`` as its ``cv`` argument.

    Returns
    -------
    tuple
        ``(train_pred, acc, accuracy_cv)``: the cross-validated predictions,
        the value of ``score`` on the training data as a percentage, and the
        accuracy of the cross-validated predictions as a percentage (both
        rounded to two decimals).
    """
    # Score measured on the same data the model was fitted on
    fitted = model.fit(X_train, Y_train)
    train_score = fitted.score(X_train, Y_train)
    acc = round(train_score * 100, 2)

    # Cross-validated predictions for every training row
    train_pred = model_selection.cross_val_predict(
        model, X_train, Y_train, cv=cv, n_jobs=-1
    )

    # Accuracy of those cross-validated predictions
    cv_score = metrics.accuracy_score(Y_train, train_pred)
    accuracy_cv = round(cv_score * 100, 2)

    return train_pred, acc, accuracy_cv

In [565]:
train_predictions = dict()
accuracy = dict()
accuracy_cv = dict()


### Logistic Regression 

In [566]:
tic = time.time()
train_predict, acc, acc_cv = train_model(LogisticRegression(),X_train,Y_train,10)
toc = time.time()

time_taken=toc-tic
print("Accuracy: %s" % acc)
print("Accuracy CV 10-Fold: %s" % acc_cv)
print("Time Taken: %s" % datetime.timedelta(seconds=time_taken))

#Store the values in dictionary for future reference
# update() (as in the other model cells) adds to the shared dictionaries instead
# of rebinding them, so results already stored for other models are not lost
# when this cell is re-run.
train_predictions.update({'Log':train_predict})
accuracy.update({'Log':acc})
accuracy_cv.update({'Log':acc_cv})

Accuracy: 83.96
Accuracy CV 10-Fold: 82.48
Time Taken: 0:00:05.498349


### KNN 

In [567]:
tic = time.time()
train_predict, acc, acc_cv = train_model(KNeighborsClassifier(),X_train,Y_train,10)
toc = time.time()

time_taken=toc-tic
print("Accuracy: %s" % acc)
print("Accuracy CV 10-Fold: %s" % acc_cv)
print("Time Taken: %s" % datetime.timedelta(seconds=time_taken))

#Store the values in dictionary for future reference
train_predictions.update({'KNN':train_predict})
accuracy.update({'KNN':acc})
accuracy_cv.update({'KNN':acc_cv})

Accuracy: 87.03
Accuracy CV 10-Fold: 81.68
Time Taken: 0:00:00.108937


### Linear Support Vector Machines (SVC)

In [568]:
tic = time.time()
train_predict, acc, acc_cv = train_model(LinearSVC(),X_train,Y_train,10)
toc = time.time()

time_taken=toc-tic
print("Accuracy: %s" % acc)
print("Accuracy CV 10-Fold: %s" % acc_cv)
print("Time Taken: %s" % datetime.timedelta(seconds=time_taken))

#Store the values in dictionary for future reference
train_predictions.update({'SVC':train_predict})
accuracy.update({'SVC':acc})
accuracy_cv.update({'SVC':acc_cv})

Accuracy: 83.85
Accuracy CV 10-Fold: 82.48
Time Taken: 0:00:00.578712


### SGD

In [569]:
tic = time.time()
train_predict, acc, acc_cv = train_model(SGDClassifier(),X_train,Y_train,10)
toc = time.time()

time_taken=toc-tic
print("Accuracy: %s" % acc)
print("Accuracy CV 10-Fold: %s" % acc_cv)
print("Time Taken: %s" % datetime.timedelta(seconds=time_taken))

#Store the values in dictionary for future reference
train_predictions.update({'SGD':train_predict})
accuracy.update({'SGD':acc})
accuracy_cv.update({'SGD':acc_cv})

Accuracy: 78.61
Accuracy CV 10-Fold: 75.88
Time Taken: 0:00:00.073955


### Decision Tree Classifier 

In [570]:
tic = time.time()
train_predict, acc, acc_cv = train_model(DecisionTreeClassifier(),X_train,Y_train,10)
toc = time.time()

time_taken=toc-tic
print("Accuracy: %s" % acc)
print("Accuracy CV 10-Fold: %s" % acc_cv)
print("Time Taken: %s" % datetime.timedelta(seconds=time_taken))

#Store the values in dictionary for future reference
train_predictions.update({'DTC':train_predict})
accuracy.update({'DTC':acc})
accuracy_cv.update({'DTC':acc_cv})

Accuracy: 98.75
Accuracy CV 10-Fold: 78.95
Time Taken: 0:00:00.131149


### Gradient Boost Trees 

In [571]:
tic = time.time()
train_predict, acc, acc_cv = train_model(GradientBoostingClassifier(),X_train,Y_train,10)
toc = time.time()

time_taken=toc-tic
print("Accuracy: %s" % acc)
print("Accuracy CV 10-Fold: %s" % acc_cv)
print("Time Taken: %s" % datetime.timedelta(seconds=time_taken))

#Store the values in dictionary for future reference
train_predictions.update({'GBT':train_predict})
accuracy.update({'GBT':acc})
accuracy_cv.update({'GBT':acc_cv})

Accuracy: 89.65
Accuracy CV 10-Fold: 82.82
Time Taken: 0:00:00.700733


## Grid Search 

### Extra Tree Classifier 

In [573]:
parameters = {"max_depth":  [n for n in range(9, 11)],  
              "max_features": [1, 3, 10],
              "min_samples_split": [n for n in range(4, 9)],
              "min_samples_leaf": [n for n in range(2, 4)],
              "bootstrap": [False],
              "n_estimators" :[n for n in range(10, 50, 10)],
              "criterion": ["gini"]}


ETC = ExtraTreesClassifier()

GS_ETC = GridSearchCV(ETC,param_grid = parameters, cv=kfold, scoring="accuracy", n_jobs= 4, verbose = 1)

GS_ETC.fit(X_train,Y_train)

ETC_best = GS_ETC.best_estimator_



print("Best score: {}".format(GS_ETC.best_score_))
print("Optimal params: {}".format(GS_ETC.best_estimator_))

Fitting 10 folds for each of 240 candidates, totalling 2400 fits


[Parallel(n_jobs=4)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=4)]: Done  49 tasks      | elapsed:    5.2s
[Parallel(n_jobs=4)]: Done 945 tasks      | elapsed:   18.5s


Best score: 0.838452787258248
Optimal params: ExtraTreesClassifier(bootstrap=False, class_weight=None, criterion='gini',
           max_depth=10, max_features=3, max_leaf_nodes=None,
           min_impurity_decrease=0.0, min_impurity_split=None,
           min_samples_leaf=2, min_samples_split=4,
           min_weight_fraction_leaf=0.0, n_estimators=20, n_jobs=None,
           oob_score=False, random_state=None, verbose=0, warm_start=False)


[Parallel(n_jobs=4)]: Done 2400 out of 2400 | elapsed:   41.4s finished


### Random Forest Classifier 

In [574]:
# create param grid object 
parameters = dict(     
    max_depth = [n for n in range(9, 12)],     
    min_samples_split = [n for n in range(4, 9)], 
    min_samples_leaf = [n for n in range(2, 4)],     
    n_estimators = [n for n in range(10, 60, 10)],
)


RFC = RandomForestClassifier()

# build and fit model 
GS_RFC = GridSearchCV(estimator=RFC, param_grid=parameters, cv=5) 
GS_RFC.fit(X_train, Y_train)

RFC_best = GS_RFC.best_estimator_

print("Best score: {}".format(GS_RFC.best_score_))
print("Optimal params: {}".format(GS_RFC.best_estimator_))

Best score: 0.8350398179749715
Optimal params: RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=9, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=3, min_samples_split=7,
            min_weight_fraction_leaf=0.0, n_estimators=40, n_jobs=None,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False)


### Gradient Boost 

In [575]:
# 'loss' is deliberately left out of the grid: the only value searched was the
# estimator's default, and the name "deviance" is rejected by recent
# scikit-learn releases (renamed "log_loss"). Omitting it searches the same
# candidates on old and new versions alike.
parameters = {'n_estimators' : [n for n in range(10, 60, 10)],
              'learning_rate': [0.1, 0.05, 0.01],
              'max_depth':  [n for n in range(9, 14)],
              'min_samples_leaf': [n for n in range(2, 5)],
              'max_features': [0.3, 0.1]
              }

GBC = GradientBoostingClassifier()

GS_GBC = GridSearchCV(GBC,param_grid = parameters, cv=kfold, scoring="accuracy", n_jobs= 4, verbose = 1)

GS_GBC.fit(X_train,Y_train)

GBC_best = GS_GBC.best_estimator_

print("Best score: {}".format(GS_GBC.best_score_))
print("Optimal params: {}".format(GS_GBC.best_estimator_))

Fitting 10 folds for each of 450 candidates, totalling 4500 fits


[Parallel(n_jobs=4)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=4)]: Done  76 tasks      | elapsed:    4.6s
[Parallel(n_jobs=4)]: Done 376 tasks      | elapsed:   18.8s
[Parallel(n_jobs=4)]: Done 876 tasks      | elapsed:   45.1s
[Parallel(n_jobs=4)]: Done 1417 tasks      | elapsed:  1.3min
[Parallel(n_jobs=4)]: Done 2264 tasks      | elapsed:  2.1min
[Parallel(n_jobs=4)]: Done 3061 tasks      | elapsed:  2.9min
[Parallel(n_jobs=4)]: Done 4201 tasks      | elapsed:  4.0min
[Parallel(n_jobs=4)]: Done 4500 out of 4500 | elapsed:  4.3min finished


Best score: 0.8407281001137656
Optimal params: GradientBoostingClassifier(criterion='friedman_mse', init=None,
              learning_rate=0.1, loss='deviance', max_depth=12,
              max_features=0.3, max_leaf_nodes=None,
              min_impurity_decrease=0.0, min_impurity_split=None,
              min_samples_leaf=3, min_samples_split=2,
              min_weight_fraction_leaf=0.0, n_estimators=20,
              n_iter_no_change=None, presort='auto', random_state=None,
              subsample=1.0, tol=0.0001, validation_fraction=0.1,
              verbose=0, warm_start=False)


### Ada Boost 

In [576]:
parameters = {"base_estimator__criterion" : ["gini", "entropy"],
              "base_estimator__splitter" :   ["best", "random"],
              "algorithm" : ["SAMME","SAMME.R"],
              "n_estimators" :[30],
              "learning_rate":  [0.0001, 0.001, 0.01, 0.1, 0.2, 0.3,1.5]}


DTC = DecisionTreeClassifier()

ADA_DTC = AdaBoostClassifier(DTC, random_state=7)

GS_ADA_DTC = GridSearchCV(ADA_DTC,param_grid = parameters, cv=kfold, scoring="accuracy", n_jobs= 4, verbose = 1)

GS_ADA_DTC.fit(X_train,Y_train)

ADA_DTC_best = GS_ADA_DTC.best_estimator_

print("Best score: {}".format(GS_ADA_DTC.best_score_))
print("Optimal params: {}".format(GS_ADA_DTC.best_estimator_))

Fitting 10 folds for each of 56 candidates, totalling 560 fits


[Parallel(n_jobs=4)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=4)]: Done  76 tasks      | elapsed:    4.0s
[Parallel(n_jobs=4)]: Done 376 tasks      | elapsed:   16.6s


Best score: 0.8191126279863481
Optimal params: AdaBoostClassifier(algorithm='SAMME',
          base_estimator=DecisionTreeClassifier(class_weight=None, criterion='entropy', max_depth=None,
            max_features=None, max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, presort=False, random_state=None,
            splitter='random'),
          learning_rate=0.1, n_estimators=30, random_state=7)


[Parallel(n_jobs=4)]: Done 560 out of 560 | elapsed:   24.5s finished


### SVC 

In [577]:
parameters = {'kernel': ['rbf'], 
                  'gamma': [ 0.001, 0.01, 0.1, 1],
                  'C': [1, 10, 50, 100,200,300, 1000]}

SVMC = SVC(probability=True)

GS_SVMC = GridSearchCV(SVMC,param_grid = parameters, cv=kfold, scoring="accuracy", n_jobs= 4, verbose = 1)

GS_SVMC.fit(X_train,Y_train)

SVMC_best = GS_SVMC.best_estimator_

print("Best score: {}".format(GS_SVMC.best_score_))
print("Optimal params: {}".format(GS_SVMC.best_estimator_))

Fitting 10 folds for each of 28 candidates, totalling 280 fits


[Parallel(n_jobs=4)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=4)]: Done  42 tasks      | elapsed:    2.6s
[Parallel(n_jobs=4)]: Done 192 tasks      | elapsed:   18.9s
[Parallel(n_jobs=4)]: Done 280 out of 280 | elapsed:   46.3s finished


Best score: 0.8270762229806599
Optimal params: SVC(C=100, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape='ovr', degree=3, gamma=0.01, kernel='rbf',
  max_iter=-1, probability=True, random_state=None, shrinking=True,
  tol=0.001, verbose=False)


## Ensemble 

In [578]:
#We'll use Voting classifier to fit the best model we've built

from sklearn.ensemble import VotingClassifier

votingC = VotingClassifier(estimators=[('RFC', RFC_best), ('ETC', ETC_best),('Log',LogisticRegression()),('KNN',KNeighborsClassifier())
,('GBC',GBC_best),('Adaboost',ADA_DTC_best),('SVC',SVMC_best)], voting='soft', n_jobs=4)

votingC = votingC.fit(X_train, Y_train)

## Predict the test set 

In [579]:

test_Survived = pd.Series(votingC.predict(test), name="Survived")


In [580]:
Submission = pd.concat([test_PassengerId,test_Survived],axis=1)
Submission.to_csv("submission.csv",index=False)