# Logistic Regression

Objective: Predict which Titanic passengers survived  
Dataset: titanic_test.csv and titanic_train.csv  
Ref: https://www.kaggle.com/mahendermahi77/beginner-prediction-using-logistic-regression

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

# Load the labelled training set and report its (rows, columns) shape.
titanic_train=pd.read_csv("titanic_train.csv")
print('TRAIN Set', titanic_train.shape)

# Load the test set (no 'Survived' column) and report its shape.
titanic_test=pd.read_csv("titanic_test.csv")
print('TEST Set', titanic_test.shape)

# Keep an unmodified copy of the test data; titanic_test itself is edited in
# later cells, while this copy is used to present the final predictions.
ori_test = titanic_test.copy()

TRAIN Set (891, 12)
TEST Set (418, 11)


In [2]:
# Check for missing values: compare each column's non-null count with the
# total number of rows in the training set.
titanic_train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 12 columns):
PassengerId    891 non-null int64
Survived       891 non-null int64
Pclass         891 non-null int64
Name           891 non-null object
Sex            891 non-null object
Age            714 non-null float64
SibSp          891 non-null int64
Parch          891 non-null int64
Ticket         891 non-null object
Fare           891 non-null float64
Cabin          204 non-null object
Embarked       889 non-null object
dtypes: float64(2), int64(5), object(5)
memory usage: 83.6+ KB


In [3]:
# Same missing-value check for the test set.
titanic_test.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 418 entries, 0 to 417
Data columns (total 11 columns):
PassengerId    418 non-null int64
Pclass         418 non-null int64
Name           418 non-null object
Sex            418 non-null object
Age            332 non-null float64
SibSp          418 non-null int64
Parch          418 non-null int64
Ticket         418 non-null object
Fare           417 non-null float64
Cabin          91 non-null object
Embarked       418 non-null object
dtypes: float64(2), int64(4), object(5)
memory usage: 36.0+ KB


# Fill Null Values

In [4]:
# Impute missing values.
# Each filled column is assigned back to its frame instead of calling
# `frame[col].fillna(..., inplace=True)`: the in-place form mutates a temporary
# Series and, with pandas Copy-on-Write enabled, leaves the frame unchanged.

# Age: fill gaps with the median age of the respective data set.
titanic_train['Age'] = titanic_train['Age'].fillna(titanic_train['Age'].median())
titanic_test['Age'] = titanic_test['Age'].fillna(titanic_test['Age'].median())

# Embarked: only the training set has missing entries; they are filled with 'S'
# (presumably the most common embarkation port -- TODO confirm).
titanic_train['Embarked'] = titanic_train['Embarked'].fillna('S')

# Fare: only the test set has a missing entry; fill it with the test median,
# then truncate fares to whole numbers in both sets.
titanic_test['Fare'] = titanic_test['Fare'].fillna(titanic_test['Fare'].median())
titanic_train['Fare'] = titanic_train['Fare'].astype(int)
titanic_test['Fare'] = titanic_test['Fare'].astype(int)

# Drop columns that are not used as model inputs. PassengerId is kept on the
# test set here and removed later when the feature matrix is built.
titanic_train = titanic_train.drop(columns=['PassengerId', 'Name', 'Ticket', 'Cabin'])
titanic_test = titanic_test.drop(columns=['Name', 'Ticket', 'Cabin'])

# Dummy Variables

In [5]:
# Add the same 0/1 indicator columns, in the same order, to both data sets.
for frame in (titanic_train, titanic_test):
    age = frame['Age']

    # Age bands: <=18, (18, 35] and (35, 50]. Ages above 50 get 0 in all three.
    frame['Age_Child'] = (age <= 18).astype(int)
    frame['Age_Youth'] = ((age > 18) & (age <= 35)).astype(int)
    frame['Age_Middle'] = ((age > 35) & (age <= 50)).astype(int)

    # 1 for male passengers, 0 otherwise.
    frame['Sex_M'] = (frame['Sex'] == 'male').astype(int)

    # 1 when the passenger has at least one Parch / SibSp count.
    frame['Parch_Y'] = (frame['Parch'] >= 1).astype(int)
    frame['SibSp_Y'] = (frame['SibSp'] >= 1).astype(int)

    # Embarkation port: 'S' and 'C' get a flag; any other value is 0 in both.
    frame['Embarked_S'] = (frame['Embarked'] == 'S').astype(int)
    frame['Embarked_C'] = (frame['Embarked'] == 'C').astype(int)

    # Passenger class: 1 and 2 get a flag; any other class is 0 in both.
    frame['Pclass_1'] = (frame['Pclass'] == 1).astype(int)
    frame['Pclass_2'] = (frame['Pclass'] == 2).astype(int)

In [6]:
# Inspect the training set after imputation and the added indicator columns.
print('TRAIN Set', titanic_train.shape)
titanic_train.head()

TRAIN Set (891, 18)


Unnamed: 0,Survived,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked,Age_Child,Age_Youth,Age_Middle,Sex_M,Parch_Y,SibSp_Y,Embarked_S,Embarked_C,Pclass_1,Pclass_2
0,0,3,male,22.0,1,0,7,S,0,1,0,1,0,1,1,0,0,0
1,1,1,female,38.0,1,0,71,C,0,0,1,0,0,1,0,1,1,0
2,1,3,female,26.0,0,0,7,S,0,1,0,0,0,0,1,0,0,0
3,1,1,female,35.0,1,0,53,S,0,1,0,0,0,1,1,0,1,0
4,0,3,male,35.0,0,0,8,S,0,1,0,1,0,0,1,0,0,0


In [7]:
# Inspect the test set after the same imputation and indicator columns.
print('TEST Set', titanic_test.shape)
titanic_test.head()

TEST Set (418, 18)


Unnamed: 0,PassengerId,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked,Age_Child,Age_Youth,Age_Middle,Sex_M,Parch_Y,SibSp_Y,Embarked_S,Embarked_C,Pclass_1,Pclass_2
0,892,3,male,34.5,0,0,7,Q,0,1,0,1,0,0,0,0,0,0
1,893,3,female,47.0,1,0,7,S,0,0,1,0,0,1,1,0,0,0
2,894,2,male,62.0,0,0,9,Q,0,0,0,1,0,0,0,0,0,1
3,895,3,male,27.0,0,0,8,S,0,1,0,1,0,0,1,0,0,0
4,896,3,female,22.0,1,1,12,S,0,1,0,0,1,1,1,0,0,0


# Modelling

In [8]:
from sklearn.linear_model import LogisticRegression

# Raw columns that are represented by the indicator columns created earlier and
# are therefore left out of the feature matrices.
encoded_source_cols = ['Sex', 'SibSp', 'Parch', 'Pclass', 'Embarked', 'Age']

# Logistic regression with the library's default settings.
logreg = LogisticRegression()

# Target and features for training; the test features drop the identifier
# column instead of the (absent) target column.
Y_train = titanic_train['Survived']
X_train = titanic_train.drop(columns=encoded_source_cols + ['Survived'])
X_test = titanic_test.drop(columns=encoded_source_cols + ['PassengerId'])

for label, data in (
    ('X TRAIN Data', X_train),
    ('Y TRAIN Data', Y_train),
    ('X TEST Data', X_test),
):
    print(label, data.shape)

X TRAIN Data (891, 11)
Y TRAIN Data (891,)
X TEST Data (418, 11)


In [9]:
# Fit the model on the training data, then predict labels for the test set.
logreg.fit(X_train, Y_train)
Y_pred = logreg.predict(X_test)

# Accuracy measured on the training data itself (not on held-out data),
# shown as a percentage rounded to two decimals.
train_accuracy = logreg.score(X_train, Y_train)
round(train_accuracy * 100, 2)

79.69

# Prediction

In [10]:
# Wrap the predicted labels in a one-column frame so they can be joined to
# the passenger data.
survival_pred = pd.DataFrame(Y_pred, columns=['Survived Prediction'])
print(survival_pred.shape)
survival_pred.head()

(418, 1)


Unnamed: 0,Survived Prediction
0,0
1,0
2,0
3,0
4,1


In [11]:
# Processed test set (model inputs plus PassengerId), for comparison with the
# original data shown next.
titanic_test.head()

Unnamed: 0,PassengerId,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked,Age_Child,Age_Youth,Age_Middle,Sex_M,Parch_Y,SibSp_Y,Embarked_S,Embarked_C,Pclass_1,Pclass_2
0,892,3,male,34.5,0,0,7,Q,0,1,0,1,0,0,0,0,0,0
1,893,3,female,47.0,1,0,7,S,0,0,1,0,0,1,1,0,0,0
2,894,2,male,62.0,0,0,9,Q,0,0,0,1,0,0,0,0,0,1
3,895,3,male,27.0,0,0,8,S,0,1,0,1,0,0,1,0,0,0
4,896,3,female,22.0,1,1,12,S,0,1,0,0,1,1,1,0,0,0


In [12]:
# Untouched copy of the test data taken right after loading.
ori_test.head()

Unnamed: 0,PassengerId,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,892,3,"Kelly, Mr. James",male,34.5,0,0,330911,7.8292,,Q
1,893,3,"Wilkes, Mrs. James (Ellen Needs)",female,47.0,1,0,363272,7.0,,S
2,894,2,"Myles, Mr. Thomas Francis",male,62.0,0,0,240276,9.6875,,Q
3,895,3,"Wirz, Mr. Albert",male,27.0,0,0,315154,8.6625,,S
4,896,3,"Hirvonen, Mrs. Alexander (Helga E Lindqvist)",female,22.0,1,1,3101298,12.2875,,S


In [13]:
# Attach the predictions to the original test rows. `join` aligns on the row
# index, so this relies on both frames sharing the same index.
# NOTE(review): both appear to use the default 0..417 index -- confirm if the
# loading or prediction steps ever change.
result = ori_test.join(survival_pred, how='outer')
result.head()

Unnamed: 0,PassengerId,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,Survived Prediction
0,892,3,"Kelly, Mr. James",male,34.5,0,0,330911,7.8292,,Q,0
1,893,3,"Wilkes, Mrs. James (Ellen Needs)",female,47.0,1,0,363272,7.0,,S,0
2,894,2,"Myles, Mr. Thomas Francis",male,62.0,0,0,240276,9.6875,,Q,0
3,895,3,"Wirz, Mr. Albert",male,27.0,0,0,315154,8.6625,,S,0
4,896,3,"Hirvonen, Mrs. Alexander (Helga E Lindqvist)",female,22.0,1,1,3101298,12.2875,,S,1


In [14]:
# Simplify the table: hide the columns listed below and keep the rest,
# including the prediction.
hidden_columns = ['Pclass', 'SibSp', 'Parch', 'Ticket', 'Fare', 'Cabin', 'Embarked']
simplify = result.drop(columns=hidden_columns)
simplify.head()

Unnamed: 0,PassengerId,Name,Sex,Age,Survived Prediction
0,892,"Kelly, Mr. James",male,34.5,0
1,893,"Wilkes, Mrs. James (Ellen Needs)",female,47.0,0
2,894,"Myles, Mr. Thomas Francis",male,62.0,0
3,895,"Wirz, Mr. Albert",male,27.0,0
4,896,"Hirvonen, Mrs. Alexander (Helga E Lindqvist)",female,22.0,1
