## ML on Titanic dataset

- Applying different preprocessing methods to the Titanic dataset
- Using different machine learning models to predict passenger survival

In [61]:
#Importing libraries
from sklearn import preprocessing
from sklearn.preprocessing import StandardScaler,LabelEncoder
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

In [62]:
#Read the training CSV (path relative to the working directory) and preview the first rows
train = pd.read_csv('train.csv')
train.head(n=5)

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


### Data Preprocessing

In [63]:
#Removing the columns that are not used as features
#NOTE(review): the head() preview lists an 'Unnamed: 0' column; if that is a real CSV
#column rather than the rendered index, it is kept as a feature - confirm.
train = train.drop(columns=['Ticket','Fare','PassengerId'])

#Changing cabin details to deck details
#cabin_list is not referenced again in this cell; kept as a module-level name.
cabin_list = ['A', 'B', 'C', 'D', 'E', 'F', 'T', 'G', 'Unknown']
train = train.rename(columns={'Cabin':'Deck'})

#The deck is the first character of the cabin string; any non-string value
#(e.g. the NaN pandas stores for a missing cabin) becomes 'Unknown'.
#Done column-wise so the result does not depend on the row labels being 0..n-1,
#which an enumerate() position used as a .loc label would require.
train['Deck'] = train['Deck'].map(
    lambda cabin: cabin[0] if isinstance(cabin, str) else 'Unknown'
)
        
#Extracting title from complete name and combining rare type into "Others"
#The pattern is a raw string: '\.' is not a valid Python string escape, so a non-raw
#literal triggers an invalid-escape warning. The regex itself is unchanged: it captures
#the run of letters that follows a space and is terminated by a period.
train['Name'] = train.Name.str.extract(r' ([A-Za-z]+)\.', expand=False)
train = train.rename(columns={'Name':'Title'})

#Rare titles are pooled into a single 'Others' category
rare_titles = ['Dr','Rev','Sir','Lady','Countess','Col','Major','Capt','Jonkheer','Don']
#Alternative spellings are folded into their common equivalents
title_aliases = {'Ms':'Miss', 'Mme':'Mrs', 'Mlle':'Miss'}
train['Title'] = train['Title'].replace(rare_titles,'Others').replace(title_aliases)


#IMPUTATION OF EMBARKED COLUMN
#Missing ports (NaN) are filled with 'S'.
#NOTE(review): 'S' is presumably the most frequent port in the data - confirm.
#fillna works on the whole column at once, so it does not depend on the row labels
#being 0..n-1.
train['Embarked'] = train['Embarked'].fillna('S')

#IMPUTATION OF AGE COLUMN
#A missing age is replaced by the mean recorded age of passengers with the same title.
#Missing values stay NaN until they are filled, so no 0.0 sentinel is needed and a
#genuine age can never be mistaken for a missing one.

title_list = sorted(['Mr','Mrs','Master','Miss','Others'])

#mean() skips NaN, so each entry averages only the passengers whose age is recorded
title_mean_age = train.groupby('Title')['Age'].mean()

#Per-title means as a plain list aligned with title_list
#(NaN for a title that does not occur in the data)
age_avg = [title_mean_age.get(title, np.nan) for title in title_list]

#REPLACING MISSING VALUES BY MEAN
#Rows whose title has no recorded age at all (or whose title is missing) fall back
#to the overall mean age, computed before any value is filled in.
overall_mean_age = train['Age'].mean()
age_fill = train['Title'].map(title_mean_age).fillna(overall_mean_age)
train['Age'] = train['Age'].fillna(age_fill)
            
#LABEL ENCODING CATEGORICAL FEATURES
#Every categorical column is replaced by the integer codes of its own, freshly fitted encoder
categ_col = ['Sex','Title','Deck','Embarked']
for col in categ_col:
    le = LabelEncoder()
    train[col] = le.fit_transform(train[col])
    
#STANDARDISING BECAUSE AGE VARIES OVER LARGE RANGE
#Age is reshaped into a single-column 2-D array before it is handed to the scaler
sc = StandardScaler()
age_2d = train['Age'].values.reshape(-1,1)
train['Age'] = sc.fit_transform(age_2d)

# ONEHOT ENCODING TO GET DIFFERENT COLUMNS FOR CATEGORICAL FEATURES
# (author's note: adding rows is time consuming, adding columns is more efficient)
# Each listed column is expanded into one indicator column per distinct value.
onehot_cols = ['Sex','Embarked','Deck','Title']
train = pd.get_dummies(train, columns=onehot_cols)

In [64]:
#Target vector y is the Survived column; feature matrix X is every remaining column
y = train['Survived'].to_numpy()
X = train.drop(columns='Survived').to_numpy()


In [65]:
#USING LOGISTIC REGRESSION MODEL,CREATING OBJECT AND FITTING
#fit() returns the estimator itself, so construction and fitting are chained.
#NOTE: score() is evaluated on the same X, y used for fitting, so the value shown
#is training accuracy.
log_reg = LogisticRegression().fit(X,y)
log_reg.score(X,y)



0.8395061728395061

In [66]:
#USING KNN CLASSIFIER MODEL,CREATING OBJECT AND FITTING
#fit() returns the estimator itself, so construction and fitting are chained.
#NOTE: score() is evaluated on the same X, y used for fitting, so the value shown
#is training accuracy.
knn_obj = KNeighborsClassifier().fit(X,y)
knn_obj.score(X,y)

0.8641975308641975

In [67]:
#USING DECISION TREE MODEL,CREATING OBJECT AND FITTING
#random_state is fixed so that re-running the notebook builds the same tree; without it
#the order in which features are tried at each split is randomised between runs.
#NOTE: score() is evaluated on the same X, y used for fitting, so the value shown
#is training accuracy.
dec_tree = DecisionTreeClassifier(random_state=0)
dec_tree.fit(X,y)
dec_tree.score(X,y)

0.9506172839506173