In [151]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

# Newer matplotlib releases renamed the bundled seaborn style sheets to
# "seaborn-v0_8*"; pick whichever name this installation provides.
plt.style.use('seaborn-v0_8' if 'seaborn-v0_8' in plt.style.available else 'seaborn')
sns.set(font_scale=2)
import missingno as msno

#ignore warnings
import warnings
warnings.filterwarnings('ignore')

%matplotlib inline

# 1. Check Data

In [152]:
# Read the training and test splits from the Kaggle input directory.
train_csv = '../input/titanic/train.csv'
test_csv = '../input/titanic/test.csv'
df_train, df_test = pd.read_csv(train_csv), pd.read_csv(test_csv)

In [153]:
# Preview the first rows of the training frame.
df_train.head()

In [154]:
# Summary statistics of the training frame.
df_train.describe()

Seems like there exists null data. Visualize using msno.

## 1.1 Check Null Data

In [155]:
# Print, for every training column, the percentage of rows that are NaN.
n_rows = len(df_train)
for col in df_train.columns:
    nan_pct = 100 * (df_train[col].isnull().sum() / n_rows)
    print('column: {:>10}\t Percent of NaN value: {:.2f}%'.format(col, nan_pct))

In [156]:
# Print, for every test column, the percentage of rows that are NaN.
n_rows = len(df_test)
for col in df_test.columns:
    nan_pct = 100 * (df_test[col].isnull().sum() / n_rows)
    print('column: {:>10}\t Percent of NaN value: {:.2f}%'.format(col, nan_pct))

Both train and test sets have null data in Age (20%) and Cabin (80%). Train set also has null data in Embarked (0.22%). Test set has null data in Fare (0.24%).

In [157]:
# Visualize where values are missing in the training set.
msno.matrix(df=df_train, figsize=(8, 8), color=(0.8, 0.5, 0.2))

In [158]:
# Bar chart of missingness per column for the training set.
msno.bar(df=df_train, figsize=(8, 8), color=(0.8, 0.5, 0.2))

In [159]:
# Bar chart of missingness per column for the test set.
msno.bar(df=df_test, figsize=(8, 8), color=(0.8, 0.5, 0.2))

## 1.2 Check Target Label

We check the distribution of the target label, as **the distribution of the target label determines the evaluation metric.**

(e.g. if the distribution shows that everyone but one died, it wouldn't make sense to evaluate on the accuracy of the model that only returns 0.)

In [160]:
# Target distribution shown two ways: share (pie) and absolute counts (bars).
f, ax = plt.subplots(1, 2, figsize=(18, 8))

df_train['Survived'].value_counts().plot.pie(explode=[0, 0.1], autopct='%1.1f%%', ax=ax[0], shadow=True)
ax[0].set_title('Pie plot - Survived')
ax[0].set_ylabel('')
# Name the column with x=...: newer seaborn treats the first positional
# argument as `data`, so the positional form fails there.
sns.countplot(x='Survived', data=df_train, ax=ax[1])
ax[1].set_title('Count plot - Survived')

plt.show()

Only 38.4% of the passengers survived. Still, we consider the distribution of the target label balanced, as it is not too extreme.

# 2. EDA

## 2.1 Pclass

We check Pclass first. Pclass column is *ordinal*.

In [161]:
# Number of (non-null) Survived entries per passenger class.
df_train.groupby('Pclass')[['Survived']].count()

In [162]:
# Sum of the Survived column per passenger class.
df_train.groupby('Pclass')[['Survived']].sum()

In [163]:
# Pclass vs. Survived as a contingency table (with totals), colour-graded.
pclass_vs_survived = pd.crosstab(index=df_train['Pclass'], columns=df_train['Survived'], margins=True)
pclass_vs_survived.style.background_gradient(cmap='summer_r')

In [164]:
# Mean of Survived per Pclass, sorted from highest to lowest, as a bar chart.
pclass_rate = df_train.groupby('Pclass')[['Survived']].mean()
pclass_rate.sort_values(by='Survived', ascending=False).plot.bar()

In [165]:
# Passenger counts per Pclass (left) and the survived/dead split per Pclass (right).

y_position = 1.02
f, ax = plt.subplots(1, 2, figsize=(18, 8))
df_train['Pclass'].value_counts().plot.bar(color=['#CD7F32','#FFDF00','#D3D3D3'], ax=ax[0])
ax[0].set_title('Number of Passengers By Pclass', y=y_position)
ax[0].set_ylabel('Count')
# Keyword x=...: newer seaborn no longer accepts the column positionally.
sns.countplot(x='Pclass', hue='Survived', data=df_train, ax=ax[1])
ax[1].set_title('Pclass: Survived vs Dead', y=y_position)
plt.show()

The survival rate shows a decreasing trend as Pclass ordinally increases (i.e. Passenger class gets lower). Hence, we can determine that Pclass affects survival, and that we will be using this feature for our model.

## 2.2 Sex

In [166]:
# Mean of Survived per Sex (left) and survived/dead counts per Sex (right).
f, ax = plt.subplots(1, 2, figsize=(18, 8))
df_train[['Sex', 'Survived']].groupby(['Sex'], as_index=True).mean().plot.bar(ax=ax[0])
ax[0].set_title('Survived vs Sex')
# Keyword x=...: newer seaborn no longer accepts the column positionally.
sns.countplot(x='Sex', hue='Survived', data=df_train, ax=ax[1])
ax[1].set_title('Sex: Survived vs Dead')
plt.show()

As shown, women are more likely to survive.

In [167]:
# Mean of Survived per Sex as a flat table, highest first.
sex_rate = df_train.groupby('Sex', as_index=False)[['Survived']].mean()
sex_rate.sort_values(by='Survived', ascending=False)

In [168]:
# Sex vs. Survived as a contingency table (with totals), colour-graded.
sex_vs_survived = pd.crosstab(index=df_train['Sex'], columns=df_train['Survived'], margins=True)
sex_vs_survived.style.background_gradient(cmap='summer_r')

Like Pclass, Sex is an important feature to determine survival.

## 2.3 Sex and Pclass

We determine the survival rate based on **both** Sex and Pclass.

(Use factorplot for three-dimensional plotting)

In [169]:
# Point plot of Survived against Pclass, one line per Sex.
# `factorplot` and its `size=` argument were replaced by `catplot(kind='point')`
# and `height=`; arguments are passed by keyword for newer seaborn.
sns.catplot(x='Pclass', y='Survived', hue='Sex', data=df_train,
            kind='point', height=6, aspect=1.5)

Findings:

1. In all classes, female are more likely to survive than male.
2. For both male and female, higher Pclass shows higher survival rate.

## 2.4 Age

In [170]:
# Max, min and mean of the training Age column.
age = df_train['Age']
print('The oldest passenger : {:.1f} Years'.format(age.max()))
print('The youngest passenger : {:.1f} Years'.format(age.min()))
print('Passenger mean age : {:.1f} Years'.format(age.mean()))

In [171]:
# Age density for survivors (drawn first) and non-survivors on one axis.
fig, ax = plt.subplots(1, 1, figsize=(9, 5))
for survived_flag in (1, 0):
    sns.kdeplot(df_train[df_train['Survived'] == survived_flag]['Age'], ax=ax)
plt.legend(['Survived', 'Deceased'])
plt.show()

Younger age seems to show higher survival rate.

In [172]:
# Age distribution within classes: one KDE curve per Pclass on a shared figure.
plt.figure(figsize=(8, 6))
for pclass in (1, 2, 3):
    df_train['Age'][df_train['Pclass'] == pclass].plot(kind='kde')

plt.xlabel('Age')
plt.title('Age Distribution within classes')
plt.legend(['1st Class', '2nd Class', '3rd Class'])

Higher class tends to have more aged people.

In [173]:
# Survival rate of all passengers younger than each age limit (1..79).
age_limits = list(range(1, 80))
cummulate_survival_ratio = [
    df_train.loc[df_train['Age'] < limit, 'Survived'].mean()
    for limit in age_limits
]

plt.figure(figsize=(7, 7))
# Plot against the age limits themselves; plotting the bare list would use
# its 0-based index and shift the curve one year to the left.
plt.plot(age_limits, cummulate_survival_ratio)
plt.title('Survival rate change depending on range of Age', y=1.02)
plt.ylabel('Survival rate')
plt.xlabel('Range of Age(0~x)')
plt.show()

As shown, the younger the passengers are, the higher their survival rate is. Hence, Age can also be an important feature.

## 2.5 Pclass, Sex, Age

In [174]:
# Split violins of Age by Survived, grouped by Pclass (left) and Sex (right).
f, ax = plt.subplots(1, 2, figsize=(18, 8))
# x/y are passed by keyword: newer seaborn rejects them as positional arguments.
sns.violinplot(x="Pclass", y="Age", hue="Survived", data=df_train, scale='count', split=True, ax=ax[0])
ax[0].set_title('Pclass and Age vs Survived')
ax[0].set_yticks(range(0, 110, 10))
sns.violinplot(x="Sex", y="Age", hue="Survived", data=df_train, scale='count', split=True, ax=ax[1])
ax[1].set_title('Sex and Age vs Survived')
ax[1].set_yticks(range(0, 110, 10))
plt.show()

By observing the survived on the left subfigure, we know that in every class, younger passengers seemed to have survived more. (See 0 vs. 1)

By observing the right subfigure, we can see that female passengers clearly survived at a higher rate.

Hence, we can determine that they prioritized the survival of women and babies.

## 2.6 Embarked

In [175]:
# Mean of Survived per port of embarkation, highest first.
f, ax = plt.subplots(1, 1, figsize=(7, 7))
embarked_rate = df_train.groupby('Embarked')[['Survived']].mean()
embarked_rate.sort_values(by='Survived', ascending=False).plot.bar(ax=ax)

It seems that passengers who embarked at harbor C survived at a higher rate.

**We observe Embarked vs. other features to see whether other features affect Embarked.**

In [176]:
# Four count plots of Embarked: overall, and split by Sex, Survived and Pclass.
f, ax = plt.subplots(2, 2, figsize=(20, 15))
embarked_panels = [
    (ax[0, 0], None, '(1) No. Of Passengers Boarded'),
    (ax[0, 1], 'Sex', '(2) Male-Female Split for Embarked'),
    (ax[1, 0], 'Survived', '(3) Embarked vs Survived'),
    (ax[1, 1], 'Pclass', '(4) Embarked vs Pclass'),
]
for axis, hue, title in embarked_panels:
    # Keyword x=...: newer seaborn no longer accepts the column positionally.
    sns.countplot(x='Embarked', hue=hue, data=df_train, ax=axis)
    axis.set_title(title)
plt.subplots_adjust(wspace=0.2, hspace=0.5)
plt.show()

Figure(1) - Overall, people embarked at Harbor S the most.

Figure(2) - Harbor C and Q show similar gender distribution, whereas in Harbor S it is male-dominant.

Figure(3) - As we already demonstrated, people who embarked at Harbor S show a lower survival rate.

Figure(4) - By splitting into class, we can infer that Harbor C shows a better survival rate as it has high proportion of 1st Pclass. Harbor S has high proportion of 3rd class, which affects its low survival rate.

## 2.7 SibSp + Parch

SibSp is the number of siblings and spouses on board. Parch is the number of parents and children on board. **Together, we can make a new feature column, FamilySize.**

In [177]:
# FamilySize = siblings/spouses + parents/children + the passenger themself.
for frame in (df_train, df_test):
    frame['FamilySize'] = frame['SibSp'] + frame['Parch'] + 1

In [178]:
# Range of the engineered FamilySize feature in the training set.
family_size = df_train['FamilySize']
print("Maximum size of Family: ", family_size.max())
print("Minimum size of Family: ", family_size.min())

In [179]:
# FamilySize: overall counts, counts split by Survived, and mean of Survived per size.
f, ax = plt.subplots(1, 3, figsize=(40, 10))
# Keyword x=...: newer seaborn no longer accepts the column positionally.
sns.countplot(x='FamilySize', data=df_train, ax=ax[0])
ax[0].set_title('(1) No. Of Passengers Boarded', y=1.02)

sns.countplot(x='FamilySize', hue='Survived', data=df_train, ax=ax[1])
ax[1].set_title('(2) Survived countplot depending on FamilySize',  y=1.02)

df_train[['FamilySize', 'Survived']].groupby(['FamilySize'], as_index=True).mean().sort_values(by='Survived', ascending=False).plot.bar(ax=ax[2])
ax[2].set_title('(3) Survived rate depending on FamilySize',  y=1.02)

plt.subplots_adjust(wspace=0.2, hspace=0.5)
plt.show()

Figure (1) - Shows the distribution of Family Size. The maximum size of family is 11, and the frequency of family size is the highest in 1, 2, 3, and then 4.

Figure (2), (3) - Survival rate is the highest for Family Size of 4. This shows that family size of 3-4 shows a better survival rate, indicating that **too small or too big of a family size affects the survival rate negatively.**

## 2.8 Fare

In [180]:
# Distribution of Fare, with its skewness shown in the legend.
fig, ax = plt.subplots(1, 1, figsize=(8, 8))
skew_label = 'Skewness : {:.2f}'.format(df_train['Fare'].skew())
g = sns.distplot(df_train['Fare'], color='b', label=skew_label, ax=ax)
g = g.legend(loc='best')

Fare is very skewed. We can take care of this during the feature engineering.

## 2.9 Cabin & Ticket

Cabin has 80% null value. We exclude this feature from our model.

Ticket does not have any null values, but it is string data.

In [181]:
# Count how often each distinct Ticket value occurs.
df_train['Ticket'].value_counts()

For now, we exclude Ticket from our features, but **we should include it during feature engineering to ameliorate our model.**

# 3. Feature Engineering

## 3.1 Fill Null Data

### 3.1.1 Fill Null in Age using title

As we've seen, Age has 177 null data. **We can try various ways to fill in null data**, and here we use *title + statistics*.

We see titles such as 'Mr., Mrs., Miss'. We first make an Initial column and extract the titles there.

In [182]:
# Extract the salutation: the run of letters directly before a period in Name.
# Raw string avoids the invalid "\." escape warning; expand=False returns a
# Series (not a one-column DataFrame), which is what a column assignment wants.
title_pattern = r'([A-Za-z]+)\.'
df_train['Initial'] = df_train['Name'].str.extract(title_pattern, expand=False)
df_test['Initial'] = df_test['Name'].str.extract(title_pattern, expand=False)

In [183]:
# Cross-tabulate the extracted titles against Sex (titles as columns).
title_by_sex = pd.crosstab(index=df_train['Initial'], columns=df_train['Sex'])
title_by_sex.T.style.background_gradient(cmap='summer_r')

We replace rare titles with representative ones. (Changing 'Dr.' to 'Mr.' may be controversial, but given the population distribution, maybe...)

In [184]:
# Collapse rare titles into the five representative groups used downstream
# (Master, Miss, Mr, Mrs, Other); titles not listed here are left untouched.
# 'Dona' is a female honorific, so it goes to 'Mrs' rather than 'Mr'.
title_map = {
    'Mlle': 'Miss', 'Mme': 'Miss', 'Ms': 'Miss',
    'Dr': 'Mr', 'Major': 'Mr', 'Capt': 'Mr', 'Sir': 'Mr', 'Don': 'Mr',
    'Lady': 'Mrs', 'Countess': 'Mrs', 'Dona': 'Mrs',
    'Jonkheer': 'Other', 'Col': 'Other', 'Rev': 'Other',
}

# Assign the result back instead of `df[col].replace(..., inplace=True)`:
# the chained in-place form is not guaranteed to modify the frame
# (it becomes a silent no-op under pandas copy-on-write).
df_train['Initial'] = df_train['Initial'].replace(title_map)
df_test['Initial'] = df_test['Initial'].replace(title_map)

In [185]:
# Per-title means of the numeric columns. The frame still holds string
# columns (Name, Sex, Ticket, ...), which newer pandas refuses to average
# unless numeric_only=True is given.
df_train.groupby('Initial').mean(numeric_only=True)

We can see that the survival rates of Miss and Mrs are high, as expected.

In [186]:
# Bar chart of the mean of Survived per title group.
title_survival = df_train['Survived'].groupby(df_train['Initial']).mean()
title_survival.plot.bar()

**We use the mean age to fill in the null value.**

In [187]:
# Fill missing ages with a fixed per-title value; the same constants are
# applied to both the training and the test frame.
age_fill_by_title = {'Mr': 33, 'Mrs': 36, 'Master': 5, 'Miss': 22, 'Other': 46}
for frame in (df_train, df_test):
    for title, fill_age in age_fill_by_title.items():
        needs_fill = frame['Age'].isnull() & (frame['Initial'] == title)
        frame.loc[needs_fill, 'Age'] = fill_age

### 3.1.2 Fill Null in Embarked

In [188]:
# Number of missing Embarked values in the training set.
n_missing_embarked = df_train['Embarked'].isnull().sum()
print('Embarked has ', n_missing_embarked, ' Null values')

There are only 2 null values in Embarked. Replace them with S, the most frequent value.

In [189]:
# Fill missing Embarked with 'S'. Assign back rather than calling
# fillna(inplace=True) on the selected column: the chained in-place form is
# not guaranteed to write through to df_train (no-op under copy-on-write).
df_train['Embarked'] = df_train['Embarked'].fillna('S')

### 3.1.3 Fill Null in Fare

In [190]:
# Number of missing Fare values in the test set.
n_missing_fare = df_test['Fare'].isnull().sum()
print('Fare has ', n_missing_fare, ' Null values')

There is only 1 null value in Fare in the test dataset. Replace it with the median value, for the distribution of Fare is skewed.

In [191]:
# Fill missing test Fare with the test-set median. Assign back rather than
# calling fillna(inplace=True) on the selected column: the chained in-place
# form is not guaranteed to write through to df_test.
df_test['Fare'] = df_test['Fare'].fillna(df_test['Fare'].median())

## 3.2 Apply log to Fare

Since Fare is too skewed, applying to our model directly might harm the quality of the model. For instance, if we have a high fare value, the weight applied to that specific entry would be over-emphasized. To reduce the effect of outliers, we apply log to Fare.

In [192]:
# Safety net in case a null Fare is still present: use the median (as
# described in section 3.1.3) rather than the mean, which the skewed tail
# would distort. A no-op when Fare has already been filled.
df_test['Fare'] = df_test['Fare'].fillna(df_test['Fare'].median())

# Log-transform Fare to tame the right skew. Non-positive fares map to 0
# because log is undefined there.
# NOTE: this overwrites Fare, so re-running the cell applies the log twice.
for frame in (df_train, df_test):
    frame['Fare'] = frame['Fare'].map(lambda fare: np.log(fare) if fare > 0 else 0)

In [193]:
# Distribution of Fare (as currently stored), with its skewness in the legend.
fig, ax = plt.subplots(1, 1, figsize=(8, 8))
skew_label = 'Skewness : {:.2f}'.format(df_train['Fare'].skew())
g = sns.distplot(df_train['Fare'], color='b', label=skew_label, ax=ax)
g = g.legend(loc='best')

We can see that Fare is now less skewed.

## 3.3 Change Age (continuous to categorical)

Age is a continuous feature, which can make the model work. But we can also group Age into certain categories and make it categorical.

**Note that converting a continuous data to categorical may cause information loss.**

In [194]:
def category_age(x):
    """Bucket an age into a decade-wide ordinal category.

    Values below 10 map to 0, [10, 20) to 1, and so on up to [60, 70) -> 6.
    Anything that is not below 70 (including values such as NaN, for which
    every `<` comparison is false) maps to 7.
    """
    decade_upper_bounds = (10, 20, 30, 40, 50, 60, 70)
    for category, upper_bound in enumerate(decade_upper_bounds):
        if x < upper_bound:
            return category
    return len(decade_upper_bounds)
    
df_train['Age_cat'] = df_train['Age'].apply(category_age)
# Bucket the test set's OWN ages. Using df_train['Age'] here would align on
# the row index and give each test passenger the age bucket of an unrelated
# training passenger.
df_test['Age_cat'] = df_test['Age'].apply(category_age)

In [195]:
# The raw Age column is superseded by Age_cat; remove it from both frames.
df_train = df_train.drop(columns=['Age'])
df_test = df_test.drop(columns=['Age'])

## 3.4 Change Initial, Embarked and Sex (string to numerical)

**We need to change string data to numerical so that when we put these columns into model, the machine can detect them.**

We use the map method to do so.

In [196]:
# Integer-encode the title groups; the same code table is used for both frames.
initial_codes = {'Master': 0, 'Miss': 1, 'Mr': 2, 'Mrs': 3, 'Other': 4}
for frame in (df_train, df_test):
    frame['Initial'] = frame['Initial'].map(initial_codes)

In [197]:
# Integer-encode the port of embarkation for both frames.
embarked_codes = {'C': 0, 'Q': 1, 'S': 2}
for frame in (df_train, df_test):
    frame['Embarked'] = frame['Embarked'].map(embarked_codes)

In [198]:
# Integer-encode Sex for both frames.
sex_codes = {'female': 0, 'male': 1}
for frame in (df_train, df_test):
    frame['Sex'] = frame['Sex'].map(sex_codes)

Now, we find the correlation between the features:

In [199]:
# Pearson correlation between the target and the engineered features.
corr_columns = ['Survived', 'Pclass', 'Sex', 'Fare', 'Embarked', 'FamilySize', 'Initial', 'Age_cat']
corr_matrix = df_train[corr_columns].astype(float).corr()

colormap = plt.cm.RdBu
plt.figure(figsize=(14, 12))
plt.title('Pearson Correlation of Features', y=1.05, size=15)
sns.heatmap(corr_matrix, linewidths=0.1, vmax=1.0,
            square=True, cmap=colormap, linecolor='white', annot=True, annot_kws={"size": 16})

# The matrix is only needed for this figure.
del corr_matrix

1. As we already demonstrated during the EDA, Sex and Pclass are somewhat correlated to Survival.
2. Fare and Pclass are also negatively correlated.
3. **There are no features that show extremely high correlation.** This indicates that no features suffer from multicollinearity. Multicollinearity would mean that some features are redundant.

## 3.5 One-hot encoding on Initial and Embarked

Numerized categorical data can be directly applied, but to enhance the model we can do the one-hot encoding.

In [200]:
# One-hot encode the Initial column in both frames.
df_train, df_test = (
    pd.get_dummies(frame, columns=['Initial'], prefix='Initial')
    for frame in (df_train, df_test)
)

In [201]:
# One-hot encode the Embarked column in both frames.
df_train, df_test = (
    pd.get_dummies(frame, columns=['Embarked'], prefix='Embarked')
    for frame in (df_train, df_test)
)

In [202]:
# Preview the training frame after one-hot encoding.
df_train.head()

**If there are too many categories, using one-hot encoding may not be the most efficient approach.**

## 3.6 Drop columns

Remove all unnecessary columns.

In [203]:
# Identifier, free-text and already-summarised columns are not fed to the models.
unused_columns = ['PassengerId', 'Name', 'SibSp', 'Parch', 'Ticket', 'Cabin']
df_train = df_train.drop(columns=unused_columns)
df_test = df_test.drop(columns=unused_columns)

In [204]:
# Preview the final training frame.
df_train.head()

In [205]:
# Preview the final test frame.
df_test.head()

# 4. Building machine learning model and prediction using the trained model

In [206]:
#importing all the required ML packages

from sklearn import metrics # for the evaluation metric

from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier, GradientBoostingClassifier, ExtraTreesClassifier, VotingClassifier
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.svm import SVC
from sklearn.model_selection import GridSearchCV, cross_val_score, StratifiedKFold, learning_curve, train_test_split
from sklearn.naive_bayes import GaussianNB #Naive bayes
from sklearn.metrics import confusion_matrix #for confusion matrix

**This is a binary classification problem.**

## 4.1 Preparation - Split dataset into train, valid, test set

In [207]:
# X_train = df_train.drop('Survived', axis=1).values
# target_label = df_train['Survived'].values
# X_test = df_test.values

In [208]:
# X_tr, X_vld, y_tr, y_vld = train_test_split(X_train, target_label, test_size=0.3, random_state=42)

Think of it as this:

the model is trained on the data (train), goes through a scrimmage to evaluate how well it has been trained (valid), and is then put to the test (test).

In [209]:
# Separate the target column from the feature columns of the training frame.
y_train = df_train['Survived']
X_train = df_train.drop(columns='Survived')

## 4.2 Model generation and comparison

We try different models.

In [210]:
# Stratified 70/30 hold-out split of the training frame.
train, vld = train_test_split(df_train, test_size=0.3, random_state=0, stratify=df_train['Survived'])

# Select features/target by NAME rather than by column position, so the split
# stays correct if the column order ever changes. Targets are 1-D Series
# instead of one-column DataFrames, which is the shape scikit-learn expects.
train_X = train.drop(columns='Survived')
train_Y = train['Survived']
vld_X = vld.drop(columns='Survived')
vld_Y = vld['Survived']

# Full training data, used by the cross-validated evaluations below.
X = df_train.drop(columns='Survived')
target_column = df_train['Survived']

### 4.2.1 Radial Support Vector Machines(rbf-SVM)

In [211]:
# RBF-kernel SVM: fit on the training split, score on the validation split.
model = SVC(kernel='rbf', C=1, gamma=0.1).fit(train_X, train_Y)
prediction1 = model.predict(vld_X)
rbf_accuracy = metrics.accuracy_score(vld_Y, prediction1)
print('Accuracy for rbf SVM is ', rbf_accuracy)

### 4.2.2 Linear Support Vector Machine(linear-SVM)

In [212]:
# Linear-kernel SVM: fit on the training split, score on the validation split.
# Use the imported `SVC` class; the `svm` module name is never imported in
# this notebook, so `svm.SVC` raises NameError on a fresh kernel.
# `gamma` is omitted because the linear kernel does not use it.
model = SVC(kernel='linear', C=0.1)
model.fit(train_X, train_Y)
prediction2 = model.predict(vld_X)
print('Accuracy for linear SVM is', metrics.accuracy_score(vld_Y, prediction2))

### 4.2.3 Logistic Regression

In [213]:
# Logistic regression with default settings: fit on train, score on validation.
model = LogisticRegression().fit(train_X, train_Y)
prediction3 = model.predict(vld_X)
logreg_accuracy = metrics.accuracy_score(vld_Y, prediction3)
print('Accuracy for Logistic Regression is', logreg_accuracy)

### 4.2.4 Decision Tree

In [214]:
# Decision tree: fit on train, score on validation.
# Seeded (42, as in the ensemble cells) so the reported accuracy is reproducible.
model = DecisionTreeClassifier(random_state=42)
model.fit(train_X, train_Y)
prediction4 = model.predict(vld_X)
print('Accuracy for Decision Tree is', metrics.accuracy_score(vld_Y, prediction4))

### 4.2.5 k-Nearest Neighbors (kNN)

In [215]:
# k-nearest neighbours with default settings: fit on train, score on validation.
model = KNeighborsClassifier().fit(train_X, train_Y)
prediction5 = model.predict(vld_X)
knn_accuracy = metrics.accuracy_score(vld_Y, prediction5)
print('Accuracy for KNN is', knn_accuracy)

We find the ideal number of neighbors (`n_neighbors`).

In [216]:
# Validation accuracy of kNN for n_neighbors = 1..10.
a_index = list(range(1, 11))
x = list(range(11))  # tick positions on the x axis

# Collect scores in a list and build the Series once: `Series.append` was
# removed in pandas 2.0, and growing a Series inside a loop is quadratic.
knn_scores = []
for n_neighbors in a_index:
    model = KNeighborsClassifier(n_neighbors=n_neighbors)
    model.fit(train_X, train_Y)
    prediction = model.predict(vld_X)
    knn_scores.append(metrics.accuracy_score(vld_Y, prediction))
a = pd.Series(knn_scores, index=a_index)

plt.plot(a_index, a.values)
plt.xticks(x)
fig = plt.gcf()
fig.set_size_inches(12, 6)
plt.show()
print('Accuracies for different values of n are:',a.values,'with the max value as ',a.values.max())

### 4.2.6 Gaussian Naive Bayes

In [217]:
# Gaussian naive Bayes: fit on train, score on validation.
model = GaussianNB().fit(train_X, train_Y)
prediction6 = model.predict(vld_X)
nb_accuracy = metrics.accuracy_score(vld_Y, prediction6)
print('Accuracy for Naive Bayes is', nb_accuracy)

### 4.2.7 Random Forests

In [218]:
# Random forest (100 trees): fit on train, score on validation.
# Seeded (42, as in the ensemble cells) so the reported accuracy is reproducible.
model = RandomForestClassifier(n_estimators=100, random_state=42)
model.fit(train_X, train_Y)
prediction7 = model.predict(vld_X)
print('Accuracy for Random Forests is', metrics.accuracy_score(vld_Y, prediction7))

## 4.3 Cross validation

In [219]:
from sklearn.model_selection import KFold #for K-fold cross validation
from sklearn.model_selection import cross_val_score #score evaluation
from sklearn.model_selection import cross_val_predict #prediction
# One shuffled 10-fold splitter shared by every model, so scores are comparable.
kfold = KFold(n_splits=10, random_state=42, shuffle=True)

xyz = []       # mean CV accuracy per model
accuracy = []  # per-fold accuracies per model (used by the box plot below)
std = []       # standard deviation of the per-fold accuracies

classifiers = ['Linear Svm', 'Radial Svm', 'Logistic Regression', 'KNN', 'Decision Tree', 'Naive Bayes', 'Random Forest']
# `SVC` is the imported class; `svm.SVC` raised NameError because the `svm`
# module itself is never imported. Tree-based models are seeded for
# reproducible scores.
models = [SVC(kernel='linear'), SVC(kernel='rbf'), LogisticRegression(),
          KNeighborsClassifier(n_neighbors=9), DecisionTreeClassifier(random_state=42), GaussianNB(),
          RandomForestClassifier(n_estimators=100, random_state=42)]

for model in models:
    cv_result = cross_val_score(model, X, target_column, cv=kfold, scoring="accuracy")
    xyz.append(cv_result.mean())
    std.append(cv_result.std())
    accuracy.append(cv_result)

new_models_dataframe2 = pd.DataFrame({'CV Mean': xyz, 'Std': std}, index=classifiers)
new_models_dataframe2

In [220]:
# Box plot of the per-fold CV accuracies, one box per classifier.
fig, ax = plt.subplots(figsize=(12, 6))
# index=classifiers (not [classifiers]): wrapping the list in another list
# builds a one-level MultiIndex, which shows up as tuple labels on the axis.
box = pd.DataFrame(accuracy, index=classifiers)
box.T.boxplot(ax=ax)

In [221]:
# Horizontal bar chart of the mean CV accuracy per classifier.
cv_mean_axis = new_models_dataframe2['CV Mean'].plot.barh(width=0.8)
cv_mean_axis.set_title('Average CV Mean Accuracy')
fig = cv_mean_axis.get_figure()
fig.set_size_inches(8, 5)
plt.show()

## 4.4 Confusion Matrix

In [222]:
# Confusion matrices from 10-fold cross-validated predictions, one per model.
f, ax = plt.subplots(3, 3, figsize=(12, 10))

# `SVC` is the imported class (`svm.SVC` raised NameError: `svm` is never
# imported). Tree-based models are seeded so the matrices are reproducible.
confusion_models = [
    ('Matrix for rbf-SVM', SVC(kernel='rbf')),
    ('Matrix for Linear-SVM', SVC(kernel='linear')),
    ('Matrix for KNN', KNeighborsClassifier(n_neighbors=9)),
    ('Matrix for Random-Forests', RandomForestClassifier(n_estimators=100, random_state=42)),
    ('Matrix for Logistic Regression', LogisticRegression()),
    ('Matrix for Decision Tree', DecisionTreeClassifier(random_state=42)),
    ('Matrix for Naive Bayes', GaussianNB()),
]

# ravel() walks the grid row by row, matching the original panel placement.
panel_axes = ax.ravel()
for axis, (title, estimator) in zip(panel_axes, confusion_models):
    y_pred = cross_val_predict(estimator, X, target_column, cv=10)
    sns.heatmap(confusion_matrix(target_column, y_pred), ax=axis, annot=True, fmt='2.0f')
    axis.set_title(title)

# Seven models on a 3x3 grid: hide the two panels that stay empty.
for axis in panel_axes[len(confusion_models):]:
    axis.set_visible(False)

plt.subplots_adjust(hspace=0.2, wspace=0.2)
plt.show()

## 4.5 Hyperparameter tuning

### 4.5.1 SVM

In [223]:
from sklearn.model_selection import GridSearchCV

# Grid search over SVM hyperparameters.
C = [0.05, 0.1, 0.2, 0.3, 0.25, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1]
gamma = [0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1.0]
# Two sub-grids: gamma only matters for the rbf kernel, so crossing it with
# the linear kernel would refit identical models once per gamma value.
hyper = [
    {'kernel': ['rbf'], 'C': C, 'gamma': gamma},
    {'kernel': ['linear'], 'C': C},
]
gd = GridSearchCV(estimator=SVC(), param_grid=hyper, verbose=True)
gd.fit(X, target_column)
print(gd.best_score_)
print(gd.best_estimator_)

### 4.5.2 Random Forest

In [224]:
# Grid search over the number of trees (100..900 in steps of 100) for a seeded forest.
n_estimators = range(100, 1000, 100)
hyper = dict(n_estimators=n_estimators)
forest = RandomForestClassifier(random_state=42)
gd = GridSearchCV(estimator=forest, param_grid=hyper, verbose=True)
gd.fit(X, target_column)
print(gd.best_score_)
print(gd.best_estimator_)

## 4.6 Ensembling

### 4.6.1 Voting Classifier

In [225]:
from sklearn.ensemble import VotingClassifier
# Soft-voting ensemble over all seven base models.
# `SVC` is the imported class; `svm.SVC` raised NameError because the `svm`
# module is never imported. probability=True is required for soft voting.
ensemble_lin_rbf = VotingClassifier(
    estimators=[
        ('KNN', KNeighborsClassifier(n_neighbors=9)),
        ('RBF', SVC(probability=True, kernel='rbf', C=1, gamma=0.3)),
        ('RFor', RandomForestClassifier(n_estimators=700, random_state=42)),
        ('LR', LogisticRegression(C=0.05)),
        ('DT', DecisionTreeClassifier(random_state=42)),
        ('NB', GaussianNB()),
        ('svm', SVC(kernel='linear', probability=True)),
    ],
    voting='soft',
).fit(train_X, train_Y)
print('The accuracy for ensembled model is:', ensemble_lin_rbf.score(vld_X, vld_Y))
cross = cross_val_score(ensemble_lin_rbf, X, target_column, cv=10, scoring="accuracy")
print('The cross validated score is', cross.mean())

### 4.6.2 Bagging

#### 4.6.2.1 Bagged kNN

In [226]:
from sklearn.ensemble import BaggingClassifier
# Bagging over 100 kNN (k=9) estimators.
# The base estimator is passed positionally: its keyword was renamed from
# `base_estimator` to `estimator` in newer scikit-learn, and the positional
# form works with both spellings.
model = BaggingClassifier(KNeighborsClassifier(n_neighbors=9), random_state=42, n_estimators=100)
model.fit(train_X, train_Y)
prediction = model.predict(vld_X)
print('The accuracy for bagged KNN is:', metrics.accuracy_score(vld_Y, prediction))
result = cross_val_score(model, X, target_column, cv=10, scoring='accuracy')
print('The cross validated score for bagged KNN is:', result.mean())

#### 4.6.2.2 Bagged Decision Tree

In [227]:
# Bagging over 100 decision trees.
# The base estimator is passed positionally: its keyword was renamed from
# `base_estimator` to `estimator` in newer scikit-learn, and the positional
# form works with both spellings.
model = BaggingClassifier(DecisionTreeClassifier(), random_state=42, n_estimators=100)
model.fit(train_X, train_Y)
prediction = model.predict(vld_X)
print('The accuracy for bagged Decision Tree is:', metrics.accuracy_score(vld_Y, prediction))
result = cross_val_score(model, X, target_column, cv=10, scoring='accuracy')
print('The cross validated score for bagged Decision Tree is:', result.mean())

### 4.6.3 Boosting

#### 4.6.3.1 AdaBoost

In [228]:
from sklearn.ensemble import AdaBoostClassifier
# 10-fold cross-validated accuracy of AdaBoost.
ada = AdaBoostClassifier(learning_rate=0.1, n_estimators=200, random_state=42)
result = cross_val_score(ada, X, target_column, cv=10, scoring='accuracy')
ada_cv_mean = result.mean()
print('The cross validated score for AdaBoost is:', ada_cv_mean)

#### 4.6.3.2 Stochastic Gradient Boosting

In [229]:
from sklearn.ensemble import GradientBoostingClassifier
# 10-fold cross-validated accuracy of gradient boosting.
grad = GradientBoostingClassifier(learning_rate=0.1, n_estimators=500, random_state=42)
result = cross_val_score(grad, X, target_column, cv=10, scoring='accuracy')
grad_cv_mean = result.mean()
print('The cross validated score for Gradient Boosting is:', grad_cv_mean)

#### 4.6.3.3 XGBoost

In [230]:
import xgboost as xg
# 10-fold cross-validated accuracy of XGBoost.
xgboost = xg.XGBClassifier(learning_rate=0.1, n_estimators=900)
result = cross_val_score(xgboost, X, target_column, cv=10, scoring='accuracy')
xgb_cv_mean = result.mean()
print('The cross validated score for XGBoost is:', xgb_cv_mean)

#### 4.6.3.4 Boosting Hyperparameter Tuning

We apply hyperparameter tuning to Stochastic Gradient Boosting, the boosting model with the highest accuracy.

In [231]:
# n_estimators=list(range(100,1100,100))
# learn_rate=[0.05,0.1,0.2,0.3,0.25,0.4,0.5,0.6,0.7,0.8,0.9,1]
# hyper={'n_estimators':n_estimators,'learning_rate':learn_rate}
# gd=GridSearchCV(estimator=GradientBoostingClassifier(),param_grid=hyper,verbose=True)
# gd.fit(X,target_column)
# print(gd.best_score_)
# print(gd.best_estimator_)

In [232]:
# Confusion matrix of 10-fold cross-validated gradient boosting predictions.
gd = GradientBoostingClassifier(learning_rate=0.05, n_estimators=300, random_state=42)
result = cross_val_predict(gd, X, target_column, cv=10)
gb_confusion = confusion_matrix(target_column, result)
sns.heatmap(gb_confusion, cmap='winter', annot=True, fmt='2.0f')
plt.show()

## 4.7 Feature Importance

In [233]:
# Feature importances of four ensemble models, each fitted on the full training data.
f, ax = plt.subplots(2, 2, figsize=(15, 12))

# (estimator, target axis, extra bar styling, panel title)
importance_panels = [
    (RandomForestClassifier(n_estimators=500, random_state=42),
     ax[0, 0], {}, 'Feature Importance in Random Forests'),
    (AdaBoostClassifier(n_estimators=200, learning_rate=0.05, random_state=42),
     ax[0, 1], {'color': '#ddff11'}, 'Feature Importance in AdaBoost'),
    (GradientBoostingClassifier(n_estimators=500, learning_rate=0.1, random_state=42),
     ax[1, 0], {'cmap': 'RdYlGn_r'}, 'Feature Importance in Gradient Boosting'),
    (xg.XGBClassifier(n_estimators=300, learning_rate=0.1),
     ax[1, 1], {'color': '#FD0F00'}, 'Feature Importance in XgBoost'),
]

for model, axis, bar_style, title in importance_panels:
    model.fit(X, target_column)
    importances = pd.Series(model.feature_importances_, X.columns).sort_values(ascending=True)
    importances.plot.barh(width=0.8, ax=axis, **bar_style)
    axis.set_title(title)
plt.show()

Note that feature importance shows the importance **in this model**. If we use a different model, feature importance may return a different result.

**Using feature importance, we can use feature selection to increase accuracy, or we can remove features to increase speed.**

In [234]:
# from pandas import Series

# feature_importance = model.feature_importances_
# Series_feat_imp = Series(feature_importance, index=df_test.columns)

# plt.figure(figsize=(8, 8))
# Series_feat_imp.sort_values(ascending=True).plot.barh()
# plt.xlabel('Feature importance')
# plt.ylabel('Feature')
# plt.show()

## 4.8 Prediction on Test Set

In [235]:
# Load the sample submission file; its 'Survived' column is overwritten below.
submission = pd.read_csv('../input/titanic/gender_submission.csv')

In [236]:
# Preview the submission template.
submission.head()

In [237]:
# Soft-voting ensemble restricted to four base models; used for the submission.
# `SVC` is the imported class; `svm.SVC` raised NameError because the `svm`
# module is never imported.
ensemble_selective = VotingClassifier(
    estimators=[
        ('KNN', KNeighborsClassifier(n_neighbors=9)),
        ('RBF', SVC(probability=True, kernel='rbf', C=1, gamma=0.3)),
        ('RFor', RandomForestClassifier(n_estimators=700, random_state=42)),
        ('LR', LogisticRegression(C=0.05)),
    ],
    voting='soft',
).fit(train_X, train_Y)

# Evaluate THIS ensemble; scoring `ensemble_lin_rbf` here would just repeat
# the numbers of the seven-model ensemble.
print('The accuracy for ensembled model is:', ensemble_selective.score(vld_X, vld_Y))
cross = cross_val_score(ensemble_selective, X, target_column, cv=10, scoring="accuracy")
print('The cross validated score is', cross.mean())

In [238]:
# Select the test features by name, in the same order as the training
# features, instead of passing a bare array whose column order is only
# assumed to match. A missing column fails loudly here with a KeyError.
X_test = df_test[X.columns]
assert not X_test.isnull().values.any(), "test features contain NaN"

prediction = ensemble_selective.predict(X_test)
submission['Survived'] = prediction

In [239]:
# Write the predictions to a CSV file without the index column.
submission.to_csv('./my_second_submission.csv', index=False)