In [1]:
import pandas as pd
import numpy as np



In [2]:
train_df = pd.read_csv("train.csv", header=0)
test_df = pd.read_csv("test.csv")

In [3]:
train_df.columns

Index(['PassengerId', 'Survived', 'Pclass', 'Name', 'Sex', 'Age', 'SibSp',
       'Parch', 'Ticket', 'Fare', 'Cabin', 'Embarked'],
      dtype='object')

# Cleaning the training data

In [4]:
# Extract title
# extracting the titel
train_df["Title"] = train_df["Name"].str.extract('([A-Za-z]+)\.',expand=False)
test_df["Title"] = test_df["Name"].str.extract('([A-Za-z]+)\.',expand=False)

for i in [train_df, test_df]:
    i['Title'] = i['Title'].replace('Mr', 'Mr')
    i['Title'] = i['Title'].replace(('Mme', 'Ms'), 'Mrs')
    i['Title'] = i['Title'].replace('Mlle', 'Miss')
    i['Title'] = i['Title'].replace(('Capt', 'Col', 'Major', 'Dr','Rev'), 'Officer')
    i['Title'] = i['Title'].replace(('Jonkheer', 'Don', 'Sir', 'Countess','Dona', 'Lady'), 'Royalty')

In [5]:
train_df.groupby(["Sex", "Survived"]).count()

Unnamed: 0_level_0,Unnamed: 1_level_0,PassengerId,Pclass,Name,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,Title
Sex,Survived,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
female,0,81,81,81,64,81,81,81,81,6,81,81
female,1,233,233,233,197,233,233,233,233,91,231,233
male,0,468,468,468,360,468,468,468,468,62,468,468
male,1,109,109,109,93,109,109,109,109,45,109,109


In [6]:
# Calculating mean age
train_df[["Age", "Title"]].groupby("Title").mean().to_dict()

{'Age': {'Master': 4.574166666666667,
  'Miss': 21.804054054054053,
  'Mr': 32.368090452261306,
  'Mrs': 35.71818181818182,
  'Officer': 46.705882352941174,
  'Royalty': 41.6}}

In [7]:
# Filling age with mean age based on title
mean_age_title = {'Master': 4.574166666666667,
  'Miss': 21.804054054054053,
  'Mr': 32.368090452261306,
  'Mrs': 35.71818181818182,
  'Officer': 46.705882352941174,
  'Royalty': 41.6}

In [8]:
train_df['Age'] = train_df['Age'].fillna(train_df['Title'].map(mean_age_title))
test_df['Age'] = test_df['Age'].fillna(test_df['Title'].map(mean_age_title))

In [9]:
test_df.Age.unique()

array([34.5       , 47.        , 62.        , 27.        , 22.        ,
       14.        , 30.        , 26.        , 18.        , 21.        ,
       32.36809045, 46.        , 23.        , 63.        , 24.        ,
       35.        , 45.        , 55.        ,  9.        , 35.71818182,
       48.        , 50.        , 22.5       , 41.        , 33.        ,
       18.5       , 21.80405405, 25.        , 39.        , 60.        ,
       36.        , 20.        , 28.        , 10.        , 17.        ,
       32.        , 13.        , 31.        , 29.        , 28.5       ,
       32.5       ,  6.        , 67.        , 49.        ,  2.        ,
       76.        , 43.        , 16.        ,  1.        , 12.        ,
       42.        , 53.        , 26.5       , 40.        , 61.        ,
       60.5       ,  7.        , 15.        , 54.        , 64.        ,
       37.        , 34.        , 11.5       ,  8.        ,  0.33      ,
       38.        , 57.        , 40.5       ,  4.57416667,  0.92

In [10]:
# Bin the categories - 
# 1.Age
# 2.
# 3.
# 4.
# 5.
train_df['age_feature'] = pd.cut(train_df["Age"], bins=[0, 0.99, 7, 23, 58, 100], labels=['infant', 'child', 'young', 'adult', 'senior'], include_lowest=True)
train_df['fare_feature'] = pd.cut(train_df['Fare'], bins=[0, 12, 40, 80, 1000], labels=['least', 'low', 'mid', 'high'])

test_df['age_feature'] = pd.cut(test_df["Age"], bins=[0, 0.99, 7, 23, 58, 100], labels=['infant', 'child', 'young', 'adult', 'senior'], include_lowest=True)
test_df['fare_feature'] = pd.cut(test_df['Fare'], bins=[0, 12, 40, 80, 1000], labels=['least', 'low', 'mid', 'high'])

In [11]:
train_df

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,Title,age_feature,fare_feature
0,1,0,3,"Braund, Mr. Owen Harris",male,22.000000,1,0,A/5 21171,7.2500,,S,Mr,young,least
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.000000,1,0,PC 17599,71.2833,C85,C,Mrs,adult,mid
2,3,1,3,"Heikkinen, Miss. Laina",female,26.000000,0,0,STON/O2. 3101282,7.9250,,S,Miss,adult,least
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.000000,1,0,113803,53.1000,C123,S,Mrs,adult,mid
4,5,0,3,"Allen, Mr. William Henry",male,35.000000,0,0,373450,8.0500,,S,Mr,adult,least
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
886,887,0,2,"Montvila, Rev. Juozas",male,27.000000,0,0,211536,13.0000,,S,Officer,adult,low
887,888,1,1,"Graham, Miss. Margaret Edith",female,19.000000,0,0,112053,30.0000,B42,S,Miss,young,low
888,889,0,3,"Johnston, Miss. Catherine Helen ""Carrie""",female,21.804054,1,2,W./C. 6607,23.4500,,S,Miss,young,low
889,890,1,1,"Behr, Mr. Karl Howell",male,26.000000,0,0,111369,30.0000,C148,C,Mr,adult,low


In [12]:
# Getting the cabin type
train_df['c_type'] = train_df['Cabin'].str[0]
test_df['c_type'] = test_df['Cabin'].str[0]
train_df['c_type'] = train_df['c_type'].fillna('unknown')
test_df['c_type'] = test_df['c_type'].fillna('unknown')

In [13]:
train_df

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,Title,age_feature,fare_feature,c_type
0,1,0,3,"Braund, Mr. Owen Harris",male,22.000000,1,0,A/5 21171,7.2500,,S,Mr,young,least,unknown
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.000000,1,0,PC 17599,71.2833,C85,C,Mrs,adult,mid,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.000000,0,0,STON/O2. 3101282,7.9250,,S,Miss,adult,least,unknown
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.000000,1,0,113803,53.1000,C123,S,Mrs,adult,mid,C
4,5,0,3,"Allen, Mr. William Henry",male,35.000000,0,0,373450,8.0500,,S,Mr,adult,least,unknown
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
886,887,0,2,"Montvila, Rev. Juozas",male,27.000000,0,0,211536,13.0000,,S,Officer,adult,low,unknown
887,888,1,1,"Graham, Miss. Margaret Edith",female,19.000000,0,0,112053,30.0000,B42,S,Miss,young,low,B
888,889,0,3,"Johnston, Miss. Catherine Helen ""Carrie""",female,21.804054,1,2,W./C. 6607,23.4500,,S,Miss,young,low,unknown
889,890,1,1,"Behr, Mr. Karl Howell",male,26.000000,0,0,111369,30.0000,C148,C,Mr,adult,low,C


In [14]:
train_df['fam_count'] = train_df['SibSp'] + train_df['Parch']
test_df['fam_count'] = test_df['SibSp'] + test_df['Parch']

In [15]:
train_df['fam_size'] = pd.cut(train_df['fam_count'], bins=[0, 1, 4, 11], labels=['alone', 'small', 'large'], include_lowest=True, right=False)
test_df['fam_size'] = pd.cut(test_df['fam_count'], bins=[0, 1, 4, 11], labels=['alone', 'small', 'large'], include_lowest=True, right=False)
train_df['is_alone'] = train_df['fam_count'] == 0
test_df['is_alone'] = test_df['fam_count'] == 0

In [16]:
test_df.tail(30)

Unnamed: 0,PassengerId,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,Title,age_feature,fare_feature,c_type,fam_count,fam_size,is_alone
388,1280,3,"Canavan, Mr. Patrick",male,21.0,0,0,364858,7.75,,Q,Mr,young,least,unknown,0,alone,True
389,1281,3,"Palsson, Master. Paul Folke",male,6.0,3,1,349909,21.075,,S,Master,child,low,unknown,4,large,False
390,1282,1,"Payne, Mr. Vivian Ponsonby",male,23.0,0,0,12749,93.5,B24,S,Mr,young,high,B,0,alone,True
391,1283,1,"Lines, Mrs. Ernest H (Elizabeth Lindsey James)",female,51.0,0,1,PC 17592,39.4,D28,S,Mrs,adult,low,D,1,small,False
392,1284,3,"Abbott, Master. Eugene Joseph",male,13.0,0,2,C.A. 2673,20.25,,S,Master,young,low,unknown,2,small,False
393,1285,2,"Gilbert, Mr. William",male,47.0,0,0,C.A. 30769,10.5,,S,Mr,adult,least,unknown,0,alone,True
394,1286,3,"Kink-Heilmann, Mr. Anton",male,29.0,3,1,315153,22.025,,S,Mr,adult,low,unknown,4,large,False
395,1287,1,"Smith, Mrs. Lucien Philip (Mary Eloise Hughes)",female,18.0,1,0,13695,60.0,C31,S,Mrs,young,mid,C,1,small,False
396,1288,3,"Colbert, Mr. Patrick",male,24.0,0,0,371109,7.25,,Q,Mr,adult,least,unknown,0,alone,True
397,1289,1,"Frolicher-Stehli, Mrs. Maxmillian (Margaretha ...",female,48.0,1,1,13567,79.2,B41,C,Mrs,adult,mid,B,2,small,False


In [17]:
from sklearn.preprocessing import MinMaxScaler, StandardScaler, LabelEncoder, LabelBinarizer, scale, Normalizer, PowerTransformer, MaxAbsScaler, 

In [18]:
# Convery labels to binary in cases where there are only 2 possible values 
lb = LabelBinarizer()
for i in ['Sex', 'is_alone']:
    train_df[i] = lb.fit_transform(train_df[i])
    test_df[i] = lb.fit_transform(test_df[i])

In [19]:
# Scale the age and fare in a range of -1 to 1
mm = MinMaxScaler()
for i in ['Age', 'Fare']:
    train_df[i] = mm.fit_transform(train_df[i].values.reshape(-1, 1))
    test_df[i] = mm.fit_transform(test_df[i].values.reshape(-1, 1))

In [20]:
train_df

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,Title,age_feature,fare_feature,c_type,fam_count,fam_size,is_alone
0,1,0,3,"Braund, Mr. Owen Harris",1,0.271174,1,0,A/5 21171,0.014151,,S,Mr,young,least,unknown,1,small,0
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",0,0.472229,1,0,PC 17599,0.139136,C85,C,Mrs,adult,mid,C,1,small,0
2,3,1,3,"Heikkinen, Miss. Laina",0,0.321438,0,0,STON/O2. 3101282,0.015469,,S,Miss,adult,least,unknown,0,alone,1
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",0,0.434531,1,0,113803,0.103644,C123,S,Mrs,adult,mid,C,1,small,0
4,5,0,3,"Allen, Mr. William Henry",1,0.434531,0,0,373450,0.015713,,S,Mr,adult,least,unknown,0,alone,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
886,887,0,2,"Montvila, Rev. Juozas",1,0.334004,0,0,211536,0.025374,,S,Officer,adult,low,unknown,0,alone,1
887,888,1,1,"Graham, Miss. Margaret Edith",0,0.233476,0,0,112053,0.058556,B42,S,Miss,young,low,B,0,alone,1
888,889,0,3,"Johnston, Miss. Catherine Helen ""Carrie""",0,0.268711,1,2,W./C. 6607,0.045771,,S,Miss,young,low,unknown,3,small,0
889,890,1,1,"Behr, Mr. Karl Howell",1,0.321438,0,0,111369,0.058556,C148,C,Mr,adult,low,C,0,alone,1


In [21]:
features = ['Pclass', 'Sex', 'Age', 'SibSp', 'Parch', 'Fare', 'fam_count', 'is_alone']
X_actual_train = train_df[features]
Y_actual_train = train_df['Survived']

X_actual_test = test_df[features]
X_actual_test

Unnamed: 0,Pclass,Sex,Age,SibSp,Parch,Fare,fam_count,is_alone
0,3,1,0.452723,0,0,0.015282,0,1
1,3,0,0.617566,1,0,0.013663,1,0
2,2,1,0.815377,0,0,0.018909,0,1
3,3,1,0.353818,0,0,0.016908,0,1
4,3,0,0.287881,1,1,0.023984,2,0
...,...,...,...,...,...,...,...,...
413,3,1,0.424609,0,0,0.015713,0,1
414,1,0,0.512066,0,0,0.212559,0,1
415,3,1,0.505473,0,0,0.014151,0,1
416,3,1,0.424609,0,0,0.015713,0,1


In [22]:
from sklearn.model_selection import train_test_split

In [55]:
X_train, X_test, Y_train, Y_test = train_test_split(X_actual_train, Y_actual_train, test_size=0.3)

# Models 

In [56]:
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV
from sklearn.preprocessing import MinMaxScaler, StandardScaler, LabelEncoder, LabelBinarizer, scale, Normalizer, PowerTransformer, MaxAbsScaler
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

from sklearn.naive_bayes import GaussianNB, MultinomialNB, BernoulliNB
from sklearn.svm import SVC, NuSVC, LinearSVC
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier

# 1. Gaussian Naive Bayes

In [57]:
nb = GaussianNB()

In [58]:
nb.fit(X_train, Y_train)

GaussianNB(priors=None, var_smoothing=1e-09)

In [59]:
Y_pred = nb.predict(X_test)

In [60]:
print(accuracy_score(Y_pred, Y_test))

0.7873134328358209


In [61]:
# Testing on actual test data after training on complete data
X_train = X_actual_train
Y_train = Y_actual_train
nb_complete = GaussianNB()

In [62]:
nb_complete.fit(X_train, Y_train)
X_actual_test.dtypes

Pclass         int64
Sex            int64
Age          float64
SibSp          int64
Parch          int64
Fare         float64
fam_count      int64
is_alone       int64
dtype: object

In [63]:
X_actual_test['Fare'] = X_actual_test["Fare"].fillna(1.1)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """Entry point for launching an IPython kernel.


In [64]:
Y_test_predict = nb_complete.predict(X_actual_test)

In [65]:
Y_test_predict

array([0, 1, 0, 0, 1, 0, 1, 0, 1, 0, 0, 0, 1, 0, 1, 1, 0, 0, 1, 1, 1, 0,
       1, 1, 1, 0, 1, 0, 0, 0, 0, 0, 1, 1, 1, 0, 1, 1, 0, 0, 0, 0, 0, 1,
       1, 0, 0, 0, 1, 1, 1, 0, 1, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 1, 1,
       1, 0, 0, 1, 1, 0, 1, 0, 1, 1, 0, 1, 0, 1, 0, 1, 0, 0, 0, 0, 1, 1,
       1, 0, 1, 0, 1, 0, 0, 0, 1, 0, 1, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0,
       0, 1, 1, 1, 1, 0, 0, 1, 1, 1, 1, 0, 1, 0, 0, 1, 0, 1, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 1,
       0, 0, 1, 1, 0, 1, 1, 0, 1, 0, 0, 1, 0, 0, 1, 1, 0, 0, 0, 0, 0, 1,
       1, 1, 1, 1, 0, 1, 1, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1,
       0, 1, 1, 0, 1, 1, 0, 0, 1, 0, 1, 0, 0, 0, 0, 1, 0, 0, 1, 1, 1, 0,
       1, 0, 1, 0, 1, 1, 0, 1, 0, 0, 0, 1, 0, 0, 1, 0, 1, 0, 1, 1, 1, 1,
       1, 0, 0, 1, 1, 0, 1, 1, 1, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 1,
       0, 0, 0, 0, 1, 0, 1, 0, 1, 1, 0, 1, 0, 0, 0, 0, 1, 0, 1, 1, 1, 0,
       0, 1, 0, 0, 0, 1, 0, 1, 0, 0, 1, 0, 0, 0, 0,

In [66]:
result_df = pd.DataFrame({
    "PassengerId": test_df['PassengerId'], 
    "Survived": Y_test_predict
})

In [67]:
result_df.to_csv("submission.csv",index=False)

# 2. Logistic regression

In [121]:
lr = LogisticRegression(C = 1, penalty= 'l2', solver= 'liblinear')

In [122]:
X_train, X_test, Y_train, Y_test = train_test_split(X_actual_train, Y_actual_train, test_size=0.3)
lr.fit(X_train, Y_train)

LogisticRegression(C=1, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='auto', n_jobs=None, penalty='l2',
                   random_state=None, solver='liblinear', tol=0.0001, verbose=0,
                   warm_start=False)

In [123]:
lr.fit(X_train, Y_train)
Y_predicted = lr.predict(X_test)

In [124]:
print(accuracy_score(Y_predicted, Y_test))

0.7761194029850746


In [125]:
# Training on complete training data and testing on complete test data
X_train = X_actual_train
Y_train = Y_actual_train
lr.fit(X_train, Y_train)

LogisticRegression(C=1, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='auto', n_jobs=None, penalty='l2',
                   random_state=None, solver='liblinear', tol=0.0001, verbose=0,
                   warm_start=False)

In [126]:
# Get the predicted values on actual test data
Y_test_predicted = lr.predict(X_actual_test)

In [127]:
result_df = pd.DataFrame({
    "PassengerId": test_df['PassengerId'], 
    "Survived": Y_test_predicted
})

In [128]:
result_df.to_csv("submission.csv", index= False)

# 3. K Nearest neighbors 

In [164]:
model = KNeighborsClassifier()

In [165]:
hyperparameters = {
    "n_neighbors" : range(1,20,2),
    'weights' : ['uniform', 'distance'],
    'p' : [1, 2]
}

In [166]:
# Create a grid and fit the classifer on the grid
X_train, X_test, Y_train, Y_test = train_test_split(X_actual_train, Y_actual_train, test_size=0.3)

grid = GridSearchCV(model, param_grid=hyperparameters, cv=10)
grid.fit(X_train, Y_train)

GridSearchCV(cv=10, error_score=nan,
             estimator=KNeighborsClassifier(algorithm='auto', leaf_size=30,
                                            metric='minkowski',
                                            metric_params=None, n_jobs=None,
                                            n_neighbors=5, p=2,
                                            weights='uniform'),
             iid='deprecated', n_jobs=None,
             param_grid={'n_neighbors': range(1, 20, 2), 'p': [1, 2],
                         'weights': ['uniform', 'distance']},
             pre_dispatch='2*n_jobs', refit=True, return_train_score=False,
             scoring=None, verbose=0)

In [167]:
Y_predicted = grid.predict(X_test)

In [168]:
print(accuracy_score(Y_predicted, Y_test))

0.8171641791044776


In [169]:
# Train on complete data and test on actual test data
grid.fit(X_actual_train, Y_actual_train)

GridSearchCV(cv=10, error_score=nan,
             estimator=KNeighborsClassifier(algorithm='auto', leaf_size=30,
                                            metric='minkowski',
                                            metric_params=None, n_jobs=None,
                                            n_neighbors=5, p=2,
                                            weights='uniform'),
             iid='deprecated', n_jobs=None,
             param_grid={'n_neighbors': range(1, 20, 2), 'p': [1, 2],
                         'weights': ['uniform', 'distance']},
             pre_dispatch='2*n_jobs', refit=True, return_train_score=False,
             scoring=None, verbose=0)

In [170]:
Y_test_predicted = grid.predict(X_actual_test)

In [171]:

Y_test_predicted
result_df = pd.DataFrame({
    "PassengerId": test_df['PassengerId'], 
    "Survived": Y_test_predicted
})

In [172]:
result_df.to_csv("submission.csv", index= False)