In [1]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import confusion_matrix, roc_auc_score, precision_recall_curve, precision_score
from sklearn.metrics import accuracy_score, recall_score, classification_report
from sklearn.metrics import f1_score, roc_curve, auc
from sklearn.model_selection import GridSearchCV

In [2]:
train_df = pd.read_csv('train.csv')
test_df = pd.read_csv('test.csv')
combined = [train_df, test_df]

In [3]:
import re
for df in combined:
    df['Title'] = df['Name'].apply(lambda x: re.search(' ([A-Z][a-z]+)\.', x).group(1))
train_df.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,Title
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S,Mr
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C,Mrs
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S,Miss
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S,Mrs
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S,Mr


In [4]:
for df in combined:
    df['Title'] = df['Title'].replace('Mlle', 'Miss')
    df['Title'] = df['Title'].replace('Mme', 'Mrs')
    df.loc[(~df['Title'].isin(['Mr', 'Mrs', 'Miss', 'Master'])), 'Title'] = 'Rare'
train_df['Title'].unique()

array(['Mr', 'Mrs', 'Miss', 'Master', 'Rare'], dtype=object)

In [5]:
titles = ['Mr', 'Mrs', 'Miss', 'Master', 'Rare']
for df in combined:
    for title in titles:
        age_to_impute = df.groupby('Title')['Age'].median()[titles.index(title)]
        df.loc[(df['Age'].isnull()) & (df['Title'] == title), 'Age'] = age_to_impute
train_df.isnull().sum()

PassengerId      0
Survived         0
Pclass           0
Name             0
Sex              0
Age              0
SibSp            0
Parch            0
Ticket           0
Fare             0
Cabin          687
Embarked         2
Title            0
dtype: int64

In [6]:
mapping = {'Master':1, 'Rare':2, 'Mrs':3, 'Miss':4, 'Mr':5}
for df in combined:
    df.replace({'Title':mapping}, inplace=True)
train_df.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,Title
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S,5
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C,3
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S,4
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S,3
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S,5


In [7]:
for df in combined:
    df['Fsize'] = df['SibSp'] + df['Parch']+1

The minimum supported version is 2.4.6



In [8]:
temp = train_df['Ticket'].value_counts().reset_index(name='Tsize')
train_df = train_df.merge(temp, left_on='Ticket', right_on='index',how='inner').drop('index', axis=1)
temp = test_df['Ticket'].value_counts().reset_index(name='Tsize')
test_df = test_df.merge(temp, left_on='Ticket', right_on='index',how='inner').drop('index', axis=1)
train_df.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,Title,Fsize,Tsize
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S,5,2,1
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C,3,2,1
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S,4,1,1
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S,3,2,2
4,138,0,1,"Futrelle, Mr. Jacques Heath",male,37.0,1,0,113803,53.1,C123,S,5,2,2


In [9]:
train_df[["Tsize", "Survived"]].groupby(['Tsize'], as_index=False).mean().sort_values(by='Survived', ascending=False)

Unnamed: 0,Tsize,Survived
2,3,0.698413
1,2,0.574468
3,4,0.5
0,1,0.297989
6,7,0.238095
4,5,0.0
5,6,0.0


In [10]:
train_df[["Fsize", "Survived"]].groupby(['Fsize'], as_index=False).mean().sort_values(by='Survived', ascending=False)

Unnamed: 0,Fsize,Survived
3,4,0.724138
2,3,0.578431
1,2,0.552795
6,7,0.333333
0,1,0.303538
4,5,0.2
5,6,0.136364
7,8,0.0
8,11,0.0


In [11]:
combined = [train_df, test_df]

In [12]:
for df in combined:
    df.drop(['Cabin', 'Ticket', 'Name', 'SibSp', 'Parch'], axis=1, inplace=True)

In [13]:
train_df.head()

Unnamed: 0,PassengerId,Survived,Pclass,Sex,Age,Fare,Embarked,Title,Fsize,Tsize
0,1,0,3,male,22.0,7.25,S,5,2,1
1,2,1,1,female,38.0,71.2833,C,3,2,1
2,3,1,3,female,26.0,7.925,S,4,1,1
3,4,1,1,female,35.0,53.1,S,3,2,2
4,138,0,1,male,37.0,53.1,S,5,2,2


In [14]:
train_df['AgeBand'] = pd.cut(train_df['Age'], 5)
train_df[['AgeBand', 'Survived']].groupby(['AgeBand'], as_index=False).mean().sort_values(by='AgeBand', ascending=True)

Unnamed: 0,AgeBand,Survived
0,"(0.34, 16.336]",0.315068
1,"(16.336, 32.252]",0.411028
2,"(32.252, 48.168]",0.404145
3,"(48.168, 64.084]",0.434783
4,"(64.084, 80.0]",0.090909


In [15]:
for dataset in combined:    
    dataset.loc[ dataset['Age'] <= 16, 'Age'] = 0
    dataset.loc[(dataset['Age'] > 16) & (dataset['Age'] <= 32), 'Age'] = 1
    dataset.loc[(dataset['Age'] > 32) & (dataset['Age'] <= 48), 'Age'] = 2
    dataset.loc[(dataset['Age'] > 48) & (dataset['Age'] <= 64), 'Age'] = 3
    dataset.loc[ dataset['Age'] > 64, 'Age'] = 4
train_df.head()

Unnamed: 0,PassengerId,Survived,Pclass,Sex,Age,Fare,Embarked,Title,Fsize,Tsize,AgeBand
0,1,0,3,male,1.0,7.25,S,5,2,1,"(16.336, 32.252]"
1,2,1,1,female,2.0,71.2833,C,3,2,1,"(32.252, 48.168]"
2,3,1,3,female,1.0,7.925,S,4,1,1,"(16.336, 32.252]"
3,4,1,1,female,2.0,53.1,S,3,2,2,"(32.252, 48.168]"
4,138,0,1,male,2.0,53.1,S,5,2,2,"(32.252, 48.168]"


In [16]:
train_df.drop('AgeBand', axis=1, inplace=True)
combined = [train_df, test_df]
train_df.head()

Unnamed: 0,PassengerId,Survived,Pclass,Sex,Age,Fare,Embarked,Title,Fsize,Tsize
0,1,0,3,male,1.0,7.25,S,5,2,1
1,2,1,1,female,2.0,71.2833,C,3,2,1
2,3,1,3,female,1.0,7.925,S,4,1,1
3,4,1,1,female,2.0,53.1,S,3,2,2
4,138,0,1,male,2.0,53.1,S,5,2,2


In [17]:
for dataset in combined:
    dataset['Sex'] = dataset['Sex'].map({'female':1, 'male':0}).astype(int)
train_df.head()

Unnamed: 0,PassengerId,Survived,Pclass,Sex,Age,Fare,Embarked,Title,Fsize,Tsize
0,1,0,3,0,1.0,7.25,S,5,2,1
1,2,1,1,1,2.0,71.2833,C,3,2,1
2,3,1,3,1,1.0,7.925,S,4,1,1
3,4,1,1,1,2.0,53.1,S,3,2,2
4,138,0,1,0,2.0,53.1,S,5,2,2


In [18]:
for df in combined:
    df['Fare'].fillna(df['Fare'].median(), inplace = True)

    # Making Bins
    df['FareBin'] = pd.qcut(df['Fare'], 5)

    label = LabelEncoder()
    df['FareBin_Code'] = label.fit_transform(df['FareBin'])

In [19]:
train_df.head()

Unnamed: 0,PassengerId,Survived,Pclass,Sex,Age,Fare,Embarked,Title,Fsize,Tsize,FareBin,FareBin_Code
0,1,0,3,0,1.0,7.25,S,5,2,1,"(-0.001, 7.854]",0
1,2,1,1,1,2.0,71.2833,C,3,2,1,"(39.688, 512.329]",4
2,3,1,3,1,1.0,7.925,S,4,1,1,"(7.854, 10.5]",1
3,4,1,1,1,2.0,53.1,S,3,2,2,"(39.688, 512.329]",4
4,138,0,1,0,2.0,53.1,S,5,2,2,"(39.688, 512.329]",4


In [20]:
test_df.head()

Unnamed: 0,PassengerId,Pclass,Sex,Age,Fare,Embarked,Title,Fsize,Tsize,FareBin,FareBin_Code
0,892,3,0,2.0,7.8292,Q,5,1,1,"(7.796, 11.342]",1
1,893,3,1,2.0,7.0,S,3,2,1,"(-0.001, 7.796]",0
2,894,2,0,3.0,9.6875,Q,5,1,1,"(7.796, 11.342]",1
3,895,3,0,1.0,8.6625,S,5,1,1,"(7.796, 11.342]",1
4,896,3,1,1.0,12.2875,S,3,3,1,"(11.342, 21.196]",2


In [21]:
for df in combined:
    df.drop(['Fare', 'FareBin'], axis=1, inplace=True)

In [22]:
train_df.head()

Unnamed: 0,PassengerId,Survived,Pclass,Sex,Age,Embarked,Title,Fsize,Tsize,FareBin_Code
0,1,0,3,0,1.0,S,5,2,1,0
1,2,1,1,1,2.0,C,3,2,1,4
2,3,1,3,1,1.0,S,4,1,1,1
3,4,1,1,1,2.0,S,3,2,2,4
4,138,0,1,0,2.0,S,5,2,2,4


In [23]:
freq_port = train_df.Embarked.dropna().mode()[0]
for dataset in combined:
    dataset['Embarked'] = dataset['Embarked'].fillna(freq_port)

In [24]:
for dataset in combined:
    dataset['Embarked'] = dataset['Embarked'].map( {'S': 0, 'C': 1, 'Q': 2} ).astype(int)

In [25]:
for df in combined:
    df['Age'] = df['Age'].astype(int)

In [26]:
train_df.head()

Unnamed: 0,PassengerId,Survived,Pclass,Sex,Age,Embarked,Title,Fsize,Tsize,FareBin_Code
0,1,0,3,0,1,0,5,2,1,0
1,2,1,1,1,2,1,3,2,1,4
2,3,1,3,1,1,0,4,1,1,1
3,4,1,1,1,2,0,3,2,2,4
4,138,0,1,0,2,0,5,2,2,4


In [27]:
test_df.head()

Unnamed: 0,PassengerId,Pclass,Sex,Age,Embarked,Title,Fsize,Tsize,FareBin_Code
0,892,3,0,2,2,5,1,1,1
1,893,3,1,2,0,3,2,1,0
2,894,2,0,3,2,5,1,1,1
3,895,3,0,1,0,5,1,1,1
4,896,3,1,1,0,3,3,1,2


In [28]:
Y_pred = test_df['PassengerId']

In [29]:
X_train = train_df.drop(['PassengerId', 'Survived'], axis=1)

In [30]:
Y_train = train_df['Survived']

In [31]:
from sklearn.model_selection import train_test_split
xtrain, xtest, ytrain, ytest = train_test_split(X_train, Y_train)

In [32]:
n_neighbors = [6,7,8,9,10,11,12,14,16,18,20,22]
algorithm = ['auto']
weights = ['uniform', 'distance']
leaf_size = list(range(1,50,5))
hyperparams = {'algorithm': algorithm, 'weights': weights, 'leaf_size': leaf_size, 
               'n_neighbors': n_neighbors}
gd=GridSearchCV(estimator = KNeighborsClassifier(), param_grid = hyperparams, verbose=True, 
                cv=10, scoring = "accuracy")
gd.fit(xtrain, ytrain)
print(gd.best_score_)
print(gd.best_estimator_)

Fitting 10 folds for each of 240 candidates, totalling 2400 fits


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


0.8143712574850299
KNeighborsClassifier(algorithm='auto', leaf_size=21, metric='minkowski',
           metric_params=None, n_jobs=None, n_neighbors=20, p=2,
           weights='uniform')


[Parallel(n_jobs=1)]: Done 2400 out of 2400 | elapsed:   25.3s finished


In [33]:
knn = KNeighborsClassifier(algorithm='auto', leaf_size=21, metric='minkowski',
           metric_params=None, n_jobs=None, n_neighbors=20, p=2,
           weights='uniform')

In [34]:
knn.fit(xtrain, ytrain)

KNeighborsClassifier(algorithm='auto', leaf_size=21, metric='minkowski',
           metric_params=None, n_jobs=None, n_neighbors=20, p=2,
           weights='uniform')

In [36]:
ypred = knn.predict(xtest)

In [37]:
print(confusion_matrix(ytest, ypred))
print(accuracy_score(ytest,ypred))
print(classification_report(ytest, ypred))

[[118  17]
 [ 25  63]]
0.8116591928251121
              precision    recall  f1-score   support

           0       0.83      0.87      0.85       135
           1       0.79      0.72      0.75        88

   micro avg       0.81      0.81      0.81       223
   macro avg       0.81      0.79      0.80       223
weighted avg       0.81      0.81      0.81       223



In [38]:
knn2 = KNeighborsClassifier(algorithm='auto', leaf_size=26, metric='minkowski', metric_params=None, n_jobs=-1,
                          n_neighbors=6, p=2, weights='uniform')
knn2.fit(xtrain, ytrain)
ypred2 = knn2.predict(xtest)
print(confusion_matrix(ytest, ypred2))
print(accuracy_score(ytest,ypred2))
print(classification_report(ytest, ypred2))

[[114  21]
 [ 26  62]]
0.7892376681614349
              precision    recall  f1-score   support

           0       0.81      0.84      0.83       135
           1       0.75      0.70      0.73        88

   micro avg       0.79      0.79      0.79       223
   macro avg       0.78      0.77      0.78       223
weighted avg       0.79      0.79      0.79       223



In [39]:
import xgboost as xgb

In [40]:
xgbclr = xgb.XGBClassifier()

In [41]:
xgbclr.fit(xtrain, ytrain)
ypred3 = xgbclr.predict(xtest)
print(confusion_matrix(ytest, ypred3))
print(accuracy_score(ytest,ypred3))
print(classification_report(ytest, ypred3))

[[115  20]
 [ 24  64]]
0.8026905829596412
              precision    recall  f1-score   support

           0       0.83      0.85      0.84       135
           1       0.76      0.73      0.74        88

   micro avg       0.80      0.80      0.80       223
   macro avg       0.79      0.79      0.79       223
weighted avg       0.80      0.80      0.80       223

