In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load in 

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the "../input/" directory.
# For example, running this (by clicking run or pressing Shift+Enter) will list the files in the input directory

import os
print(os.listdir("../input"))

import warnings
# Silence every warning category for the rest of the session.
# NOTE(review): this blanket filter also hides DeprecationWarning/FutureWarning
# messages raised by the libraries used below, so API changes that affect later
# cells go unnoticed.
warnings.filterwarnings('ignore')

# Any results you write to the current directory are saved as output.

['train.csv', 'gender_submission.csv', 'test.csv']


In [2]:
#Load Titanic data

def load_titanic_data(filename):
    """Read one Titanic CSV file into a DataFrame.

    Parameters
    ----------
    filename : str or file-like
        Anything accepted by ``pd.read_csv`` (path or open buffer).

    Returns
    -------
    pandas.DataFrame
        The parsed file, with pandas' default parsing options.
    """
    titanic_frame = pd.read_csv(filename)
    return titanic_frame

In [3]:
# Read the training file and the test file from the Kaggle input directory.
train_data, test_data = (
    load_titanic_data(csv_path)
    for csv_path in ("../input/train.csv", "../input/test.csv")
)

In [4]:
train_data.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


The attributes have the following meaning:

**Survived:** that's the target, 0 means the passenger did not survive, while 1 means he/she survived.

**Pclass:** passenger class.

**Name, Sex, Age:** self-explanatory

**SibSp:** how many siblings & spouses of the passenger aboard the Titanic.

**Parch:** how many children & parents of the passenger aboard the Titanic.

**Ticket:** ticket id

**Fare:** price paid (in pounds)

**Cabin:** passenger's cabin number

**Embarked:** where the passenger embarked the Titanic


In [5]:
#data info

train_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 12 columns):
PassengerId    891 non-null int64
Survived       891 non-null int64
Pclass         891 non-null int64
Name           891 non-null object
Sex            891 non-null object
Age            714 non-null float64
SibSp          891 non-null int64
Parch          891 non-null int64
Ticket         891 non-null object
Fare           891 non-null float64
Cabin          204 non-null object
Embarked       889 non-null object
dtypes: float64(2), int64(5), object(5)
memory usage: 83.6+ KB


**Age**, **Cabin** and **Embarked** have missing values.

**Age** is missing for about **20%** of the rows (177 of 891), **Cabin** for about **77%** (we will ignore this column for now), and **Embarked** for only **2 values.** We can impute Embarked with the most frequently occurring value in the data.
**Age** can be replaced by median values $(?)$

Also, **Name** and **Ticket** are of **object** (string) type, so we need to check later whether we can extract some information from them.

In [6]:
#Embarked

train_data["Embarked"].value_counts()

S    644
C    168
Q     77
Name: Embarked, dtype: int64

In [7]:
#Replace NaN in Embarked by category S (the most frequent value, see value_counts above)

# Assign the filled column back instead of calling fillna(..., inplace=True) on
# train_data["Embarked"]: the in-place call acts on an intermediate Series and,
# with pandas copy-on-write, never reaches train_data.
train_data["Embarked"] = train_data["Embarked"].fillna('S')


In [8]:
#Get Median Age and replace NaN with Median Age

# Median of the observed ages (median() skips NaN); kept in age_median so the
# same value can be reused wherever Age must be imputed.
age_median = train_data["Age"].median()

# Assign the result back rather than using a chained in-place fillna, which
# operates on an intermediate Series and is not guaranteed to update train_data.
train_data["Age"] = train_data["Age"].fillna(age_median)

In [9]:
#Let's describe the data now.

train_data.describe()

Unnamed: 0,PassengerId,Survived,Pclass,Age,SibSp,Parch,Fare
count,891.0,891.0,891.0,891.0,891.0,891.0,891.0
mean,446.0,0.383838,2.308642,29.361582,0.523008,0.381594,32.204208
std,257.353842,0.486592,0.836071,13.019697,1.102743,0.806057,49.693429
min,1.0,0.0,1.0,0.42,0.0,0.0,0.0
25%,223.5,0.0,2.0,22.0,0.0,0.0,7.9104
50%,446.0,0.0,3.0,28.0,0.0,0.0,14.4542
75%,668.5,1.0,3.0,35.0,1.0,0.0,31.0
max,891.0,1.0,3.0,80.0,8.0,6.0,512.3292


All Numerical columns are free of **NaN** now.

Let's make some notes about the data distribution now.

* The mean of Survived is 0.38, which means only about 38% of the passengers in the training set survived.
* The average Fare is about 32 (pounds), and the average Age is about 29 years.

In [10]:
#A quick look at categorical features.

#Pclass

train_data["Pclass"].value_counts()

3    491
1    216
2    184
Name: Pclass, dtype: int64

In [11]:
#Let's see how is the distribution for survived passengers.

train_data[['Pclass', 'Survived']].groupby(['Pclass'], as_index=False).mean().sort_values(by='Survived', ascending=False)

Unnamed: 0,Pclass,Survived
0,1,0.62963
1,2,0.472826
2,3,0.242363


**Observation:** The higher the passenger class (1 is the highest), the greater the chance of survival.

In [12]:
#Sex

train_data["Sex"].value_counts()


male      577
female    314
Name: Sex, dtype: int64

In [13]:
train_data[['Sex', 'Survived']].groupby(['Sex'], as_index=False).mean().sort_values(by='Survived', ascending=False)

Unnamed: 0,Sex,Survived
0,female,0.742038
1,male,0.188908


**Observation:** Females have a much higher chance of survival than males.

In [14]:
#Embarked

# Survival rate per port of embarkation, highest first.
# The bare value_counts() call that used to precede this line was dropped: as a
# non-final expression in the cell its result was discarded and never displayed
# (the counts are already shown by the earlier value_counts() cell).
train_data[['Embarked', 'Survived']].groupby(['Embarked'], as_index=False).mean().sort_values(by='Survived', ascending=False)

Unnamed: 0,Embarked,Survived
0,C,0.553571
1,Q,0.38961
2,S,0.339009


**Observation:** From this initial summary, it looks like even the port of embarkation is related to the chance of survival.

We will also need an imputer for string categorical values.

In [15]:
train_data.head() 

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [16]:
#Drop columns Cabin, Name, PassengerId, Ticket

# These four columns are not used as features in this first pass.
columns_to_drop = ["PassengerId", "Name", "Ticket", "Cabin"]
train_data = train_data.drop(columns=columns_to_drop)

In [17]:
train_data.head()

Unnamed: 0,Survived,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked
0,0,3,male,22.0,1,0,7.25,S
1,1,1,female,38.0,1,0,71.2833,C
2,1,3,female,26.0,0,0,7.925,S
3,1,1,female,35.0,1,0,53.1,S
4,0,3,male,35.0,0,0,8.05,S


* First, we will import LabelEncoder and change the categorical values into numerical codes, one per category.
* Second, we will call OneHotEncoder and convert the required variables into one-hot encoding.
* Third, we will concatenate the original data set with the one-hot encoded columns.
* Fourth, we will remove the unnecessary columns.

In [18]:
from sklearn.preprocessing import LabelEncoder

le_pClass = LabelEncoder()
le_sex = LabelEncoder()
le_embarked = LabelEncoder()

# (source column, new integer-coded column, encoder fitted on that column)
label_encoding_plan = (
    ('Pclass', 'PClass_encoded', le_pClass),
    ('Sex', 'Sex_encoded', le_sex),
    ('Embarked', 'Embarked_encoded', le_embarked),
)
for source_col, encoded_col, encoder in label_encoding_plan:
    train_data[encoded_col] = encoder.fit_transform(train_data[source_col])

train_data.head()

Unnamed: 0,Survived,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked,PClass_encoded,Sex_encoded,Embarked_encoded
0,0,3,male,22.0,1,0,7.25,S,2,1,2
1,1,1,female,38.0,1,0,71.2833,C,0,0,0
2,1,3,female,26.0,0,0,7.925,S,2,0,2
3,1,1,female,35.0,1,0,53.1,S,0,0,2
4,0,3,male,35.0,0,0,8.05,S,2,1,2


In [19]:
#One hot encoding for categorical columns (PClass, Sex, Embarked)

from sklearn.preprocessing import OneHotEncoder

# One encoder per categorical column.
pClass_ohe, sex_ohe, embarked_ohe = OneHotEncoder(), OneHotEncoder(), OneHotEncoder()

# Selecting with a one-element list gives the 2-D (n_rows, 1) array the encoder
# needs; toarray() turns the sparse result into a dense matrix.
Xp = pClass_ohe.fit_transform(train_data[["PClass_encoded"]].values).toarray()
Xs = sex_ohe.fit_transform(train_data[["Sex_encoded"]].values).toarray()
Xe = embarked_ohe.fit_transform(train_data[["Embarked_encoded"]].values).toarray()


In [20]:
#Add back to original dataframe

# Append each one-hot matrix as columns named <prefix><category index>,
# in the order PClass, Sex, Embarked.
for column_prefix, one_hot_matrix in (("PClass_", Xp), ("Sex_", Xs), ("Embarked_", Xe)):
    one_hot_names = [column_prefix + str(int(i)) for i in range(one_hot_matrix.shape[1])]
    train_dataOneHot = pd.DataFrame(one_hot_matrix, columns=one_hot_names)
    train_data = pd.concat([train_data, train_dataOneHot], axis=1)

In [21]:
train_data.head()

Unnamed: 0,Survived,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked,PClass_encoded,Sex_encoded,Embarked_encoded,PClass_0,PClass_1,PClass_2,Sex_0,Sex_1,Embarked_0,Embarked_1,Embarked_2
0,0,3,male,22.0,1,0,7.25,S,2,1,2,0.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0
1,1,1,female,38.0,1,0,71.2833,C,0,0,0,1.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0
2,1,3,female,26.0,0,0,7.925,S,2,0,2,0.0,0.0,1.0,1.0,0.0,0.0,0.0,1.0
3,1,1,female,35.0,1,0,53.1,S,0,0,2,1.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0
4,0,3,male,35.0,0,0,8.05,S,2,1,2,0.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0


In [22]:
#Drop unnecessary columns

# The raw categorical columns and their integer codes are now represented by
# the one-hot columns.
redundant_columns = [
    "Pclass", "Sex", "Embarked",
    "PClass_encoded", "Sex_encoded", "Embarked_encoded",
]
train_data = train_data.drop(columns=redundant_columns)

In [23]:
train_data.head()

Unnamed: 0,Survived,Age,SibSp,Parch,Fare,PClass_0,PClass_1,PClass_2,Sex_0,Sex_1,Embarked_0,Embarked_1,Embarked_2
0,0,22.0,1,0,7.25,0.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0
1,1,38.0,1,0,71.2833,1.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0
2,1,26.0,0,0,7.925,0.0,0.0,1.0,1.0,0.0,0.0,0.0,1.0
3,1,35.0,1,0,53.1,1.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0
4,0,35.0,0,0,8.05,0.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0


In [24]:
train_data.shape

(891, 13)

## Split into train and Test Data

In [25]:
train_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 13 columns):
Survived      891 non-null int64
Age           891 non-null float64
SibSp         891 non-null int64
Parch         891 non-null int64
Fare          891 non-null float64
PClass_0      891 non-null float64
PClass_1      891 non-null float64
PClass_2      891 non-null float64
Sex_0         891 non-null float64
Sex_1         891 non-null float64
Embarked_0    891 non-null float64
Embarked_1    891 non-null float64
Embarked_2    891 non-null float64
dtypes: float64(10), int64(3)
memory usage: 90.6 KB


In [26]:
#Feature Matrix

# Every column except the target.
X = train_data.drop(columns=['Survived'])
X.shape

(891, 12)

In [27]:
#Target Vector

# The Survived column (0 = did not survive, 1 = survived).
y = train_data.loc[:, 'Survived']
y.shape

(891,)

In [28]:
#Split into train test data

from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for validation; the fixed seed makes the split
# reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [29]:
print(X_train.shape,X_test.shape,y_train.shape,y_test.shape)

(712, 12) (179, 12) (712,) (179,)


In [30]:
#SGDClassifier

from sklearn.linear_model import SGDClassifier

# fit() returns the estimator itself, so construction and training are chained;
# the trailing expression displays the fitted model.
sgd_clf = SGDClassifier(random_state=42).fit(X_train, y_train)
sgd_clf

SGDClassifier(alpha=0.0001, average=False, class_weight=None,
       early_stopping=False, epsilon=0.1, eta0=0.0, fit_intercept=True,
       l1_ratio=0.15, learning_rate='optimal', loss='hinge', max_iter=None,
       n_iter=None, n_iter_no_change=5, n_jobs=None, penalty='l2',
       power_t=0.5, random_state=42, shuffle=True, tol=None,
       validation_fraction=0.1, verbose=0, warm_start=False)

In [31]:
#SupportVectorMachine

from sklearn.svm import SVC

# Support vector classifier with library defaults, trained on the training split.
svm_clf = SVC().fit(X_train, y_train)
svm_clf

SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape='ovr', degree=3, gamma='auto_deprecated',
  kernel='rbf', max_iter=-1, probability=False, random_state=None,
  shrinking=True, tol=0.001, verbose=False)

In [32]:
#RandomForestClassifier

from sklearn.ensemble import RandomForestClassifier

# Seeded random forest; the variable name fr_clf is kept because later cells
# refer to it.
fr_clf = RandomForestClassifier(random_state=42).fit(X_train, y_train)
fr_clf

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=None,
            oob_score=False, random_state=42, verbose=0, warm_start=False)

In [33]:
#LogisticRegression

from sklearn.linear_model import LogisticRegression

# Seeded logistic regression; this fitted instance is the one used for the
# final predictions further down.
lr_clf = LogisticRegression(random_state=42).fit(X_train, y_train)
lr_clf

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='warn',
          n_jobs=None, penalty='l2', random_state=42, solver='warn',
          tol=0.0001, verbose=0, warm_start=False)

In [34]:
#KNearestNeighbors

from sklearn.neighbors import KNeighborsClassifier

# Classify by majority vote among the 3 nearest training points.
knn_clf = KNeighborsClassifier(n_neighbors=3).fit(X_train, y_train)
knn_clf

KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
           metric_params=None, n_jobs=None, n_neighbors=3, p=2,
           weights='uniform')

In [35]:
#DecisionTree

from sklearn.tree import DecisionTreeClassifier

# Seeded decision tree with library defaults.
dt_clf = DecisionTreeClassifier(random_state=42).fit(X_train, y_train)
dt_clf

DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=None,
            max_features=None, max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, presort=False, random_state=42,
            splitter='best')

In [36]:
#Measuring accuracy Using Cross Validation

from sklearn.model_selection import cross_val_score

#SGDClassifier
# Accuracy on each of 10 cross-validation folds of the training split, then
# the average over the folds.
sgd_clf_score = cross_val_score(
    estimator=sgd_clf, X=X_train, y=y_train, scoring="accuracy", cv=10
)
sgd_mean = np.mean(sgd_clf_score)
sgd_mean

0.5575067069081154

In [37]:
#SVMClassifier
# 10-fold cross-validation scores (the estimator's default scorer), then their average.
svm_clf_score = cross_val_score(estimator=svm_clf, X=X_train, y=y_train, cv=10)
svm_mean = np.mean(svm_clf_score)
svm_mean

0.7161535881958419

In [38]:
#RandomForestClassifier
# 10-fold cross-validation scores (the estimator's default scorer), then their average.
rf_clf_score = cross_val_score(estimator=fr_clf, X=X_train, y=y_train, cv=10)
rf_mean = np.mean(rf_clf_score)
rf_mean

0.7865649452269171

In [39]:
#LogisticRegression
# 10-fold cross-validation scores (the estimator's default scorer), then their average.
lr_clf_score = cross_val_score(estimator=lr_clf, X=X_train, y=y_train, cv=10)
lr_mean = np.mean(lr_clf_score)
lr_mean

0.8005739995528728

In [40]:
#KNearestNeighbors
# 10-fold cross-validation scores (the estimator's default scorer), then their average.
knn_clf_score = cross_val_score(estimator=knn_clf, X=X_train, y=y_train, cv=10)
knn_mean = np.mean(knn_clf_score)
knn_mean

0.7315699754080036

In [41]:
#DecisionTreeClassifier
# 10-fold cross-validation scores (the estimator's default scorer), then their average.
dt_clf_score = cross_val_score(estimator=dt_clf, X=X_train, y=y_train, cv=10)
dt_mean = np.mean(dt_clf_score)
dt_mean

0.7514671361502347

In [42]:
#Model Evaluations

# (display name, mean 10-fold cross-validation score) for each classifier above
model_scores = [
    ('SGDClassifier', sgd_mean),
    ('SupportVectorMachine', svm_mean),
    ('RandomForestClassifier', rf_mean),
    ('LogisticRegression', lr_mean),
    ('KNearestNeighbors', knn_mean),
    ('DecisionTreeClassifier', dt_mean),
]
models = pd.DataFrame(model_scores, columns=['Model', 'Score'])

# Best-scoring model first.
models.sort_values(by='Score', ascending=False)

Unnamed: 0,Model,Score
3,LogisticRegression,0.800574
2,RandomForestClassifier,0.786565
5,DecisionTreeClassifier,0.751467
4,KNearestNeighbors,0.73157
1,SupportVectorMachine,0.716154
0,SGDClassifier,0.557507


# Predictions on test data

In [43]:
test_data.head()

Unnamed: 0,PassengerId,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,892,3,"Kelly, Mr. James",male,34.5,0,0,330911,7.8292,,Q
1,893,3,"Wilkes, Mrs. James (Ellen Needs)",female,47.0,1,0,363272,7.0,,S
2,894,2,"Myles, Mr. Thomas Francis",male,62.0,0,0,240276,9.6875,,Q
3,895,3,"Wirz, Mr. Albert",male,27.0,0,0,315154,8.6625,,S
4,896,3,"Hirvonen, Mrs. Alexander (Helga E Lindqvist)",female,22.0,1,1,3101298,12.2875,,S


In [44]:
test_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 418 entries, 0 to 417
Data columns (total 11 columns):
PassengerId    418 non-null int64
Pclass         418 non-null int64
Name           418 non-null object
Sex            418 non-null object
Age            332 non-null float64
SibSp          418 non-null int64
Parch          418 non-null int64
Ticket         418 non-null object
Fare           417 non-null float64
Cabin          91 non-null object
Embarked       418 non-null object
dtypes: float64(2), int64(4), object(5)
memory usage: 36.0+ KB


In [45]:
#We need to do the same transformation on test data file, so we can predict.

#Replace NaN in Embarked by category S
# Assign back: a chained fillna(..., inplace=True) acts on an intermediate
# Series and is not guaranteed to update test_data.
test_data["Embarked"] = test_data["Embarked"].fillna('S')

#Replace NaN in Age with the median age learned from the training data
# age_median was computed from train_data in the training imputation cell.
# Reusing it (instead of overwriting it with the test-set median) applies the
# exact transformation the model was trained with and avoids deriving
# statistics from the data being predicted.
test_data["Age"] = test_data["Age"].fillna(age_median)

#Drop columns Cabin, Name, Ticket (PassengerId is kept for the submission file)
test_data = test_data.drop(columns=["Name", "Ticket", "Cabin"])

In [46]:
#Label-encode the categorical columns of the test data

# Reuse the LabelEncoders already fitted on train_data and only transform here.
# Fitting fresh encoders on test_data would derive the integer codes from
# whatever categories happen to be present in the test set, so the codes could
# silently differ from the ones the model was trained on. transform() raises a
# ValueError if the test data contains a category never seen in training.
test_data['PClass_encoded'] = le_pClass.transform(test_data.Pclass)
test_data['Sex_encoded'] = le_sex.transform(test_data.Sex)
test_data['Embarked_encoded'] = le_embarked.transform(test_data.Embarked)

test_data.head()

Unnamed: 0,PassengerId,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked,PClass_encoded,Sex_encoded,Embarked_encoded
0,892,3,male,34.5,0,0,7.8292,Q,2,1,1
1,893,3,female,47.0,1,0,7.0,S,2,0,2
2,894,2,male,62.0,0,0,9.6875,Q,1,1,1
3,895,3,male,27.0,0,0,8.6625,S,2,1,2
4,896,3,female,22.0,1,1,12.2875,S,2,0,2


In [47]:
# Reuse the OneHotEncoders fitted on train_data and only transform here.
# Re-fitting on test_data would size and order the one-hot columns by the
# categories present in the test set, which can produce a different number of
# columns (or a different meaning per column) than the model was trained with.
# With the default settings, transform() raises on a category unseen in training.
Xp = pClass_ohe.transform(test_data.PClass_encoded.values.reshape(-1,1)).toarray()
Xs = sex_ohe.transform(test_data.Sex_encoded.values.reshape(-1,1)).toarray()
Xe = embarked_ohe.transform(test_data.Embarked_encoded.values.reshape(-1,1)).toarray()

test_data.head()

Unnamed: 0,PassengerId,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked,PClass_encoded,Sex_encoded,Embarked_encoded
0,892,3,male,34.5,0,0,7.8292,Q,2,1,1
1,893,3,female,47.0,1,0,7.0,S,2,0,2
2,894,2,male,62.0,0,0,9.6875,Q,1,1,1
3,895,3,male,27.0,0,0,8.6625,S,2,1,2
4,896,3,female,22.0,1,1,12.2875,S,2,0,2


In [48]:
#Add back to original dataframe

# Append each one-hot matrix as columns named <prefix><category index>,
# in the order PClass, Sex, Embarked.
for column_prefix, one_hot_matrix in (("PClass_", Xp), ("Sex_", Xs), ("Embarked_", Xe)):
    one_hot_names = [column_prefix + str(int(i)) for i in range(one_hot_matrix.shape[1])]
    test_dataOneHot = pd.DataFrame(one_hot_matrix, columns=one_hot_names)
    test_data = pd.concat([test_data, test_dataOneHot], axis=1)

test_data.head()

Unnamed: 0,PassengerId,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked,PClass_encoded,Sex_encoded,Embarked_encoded,PClass_0,PClass_1,PClass_2,Sex_0,Sex_1,Embarked_0,Embarked_1,Embarked_2
0,892,3,male,34.5,0,0,7.8292,Q,2,1,1,0.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0
1,893,3,female,47.0,1,0,7.0,S,2,0,2,0.0,0.0,1.0,1.0,0.0,0.0,0.0,1.0
2,894,2,male,62.0,0,0,9.6875,Q,1,1,1,0.0,1.0,0.0,0.0,1.0,0.0,1.0,0.0
3,895,3,male,27.0,0,0,8.6625,S,2,1,2,0.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0
4,896,3,female,22.0,1,1,12.2875,S,2,0,2,0.0,0.0,1.0,1.0,0.0,0.0,0.0,1.0


In [49]:
#Drop unnecessary columns

# The raw categorical columns and their integer codes are now represented by
# the one-hot columns.
redundant_columns = [
    "Pclass", "Sex", "Embarked",
    "PClass_encoded", "Sex_encoded", "Embarked_encoded",
]
test_data = test_data.drop(columns=redundant_columns)
test_data.head()

Unnamed: 0,PassengerId,Age,SibSp,Parch,Fare,PClass_0,PClass_1,PClass_2,Sex_0,Sex_1,Embarked_0,Embarked_1,Embarked_2
0,892,34.5,0,0,7.8292,0.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0
1,893,47.0,1,0,7.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,1.0
2,894,62.0,0,0,9.6875,0.0,1.0,0.0,0.0,1.0,0.0,1.0,0.0
3,895,27.0,0,0,8.6625,0.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0
4,896,22.0,1,1,12.2875,0.0,0.0,1.0,1.0,0.0,0.0,0.0,1.0


In [50]:
#Predictions
# Matrix used for prediction: test_data without the passenger identifier
# (test_data itself keeps PassengerId).
test_data_pred = test_data.drop(columns=["PassengerId"])
test_data_pred.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 418 entries, 0 to 417
Data columns (total 12 columns):
Age           418 non-null float64
SibSp         418 non-null int64
Parch         418 non-null int64
Fare          417 non-null float64
PClass_0      418 non-null float64
PClass_1      418 non-null float64
PClass_2      418 non-null float64
Sex_0         418 non-null float64
Sex_1         418 non-null float64
Embarked_0    418 non-null float64
Embarked_1    418 non-null float64
Embarked_2    418 non-null float64
dtypes: float64(10), int64(2)
memory usage: 39.3 KB


In [51]:
#FARE is missing one entry, so impute it with the median fare of the training data.

# Use the training-set median so the test data is transformed only with
# statistics learned from the training data; train_data still carries its
# Fare column at this point.
fare_median = train_data["Fare"].median()

# Assign back rather than using a chained in-place fillna, which acts on an
# intermediate Series and is not guaranteed to update test_data_pred.
test_data_pred["Fare"] = test_data_pred["Fare"].fillna(fare_median)
test_data_pred.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 418 entries, 0 to 417
Data columns (total 12 columns):
Age           418 non-null float64
SibSp         418 non-null int64
Parch         418 non-null int64
Fare          418 non-null float64
PClass_0      418 non-null float64
PClass_1      418 non-null float64
PClass_2      418 non-null float64
Sex_0         418 non-null float64
Sex_1         418 non-null float64
Embarked_0    418 non-null float64
Embarked_1    418 non-null float64
Embarked_2    418 non-null float64
dtypes: float64(10), int64(2)
memory usage: 39.3 KB


In [52]:
#Predict using LogisticRegression model

# Select the features by name, in the order used to fit the model, so the
# prediction does not rely on test_data_pred happening to share that column
# order; a missing feature raises a KeyError instead of being misaligned.
Y_pred = lr_clf.predict(test_data_pred[X_train.columns])

In [53]:
#Make submission file version 1

# One row per test passenger: the passenger id next to the predicted label.
submission = test_data[["PassengerId"]].assign(Survived=Y_pred)

# Written without the index column.
submission.to_csv('Titanic_Prediction_v1.csv', index=False)

***Without spending much effort on preprocessing (only removing NaN and one-hot encoding), Logistic Regression gives a cross-validation accuracy of slightly more than 80%.***

***Let's see what we can do with feature engineering. A visual inspection of the data might also help in improving accuracy.***