# Importing Libraries

In [1]:

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix


import os
# Walk the Kaggle input directory and print the full path of every file found.
for root, _subdirs, files in os.walk('/kaggle/input'):
    for name in files:
        print(os.path.join(root, name))



/kaggle/input/titanic/gender_submission.csv
/kaggle/input/titanic/test.csv
/kaggle/input/titanic/train.csv


# Importing Dataset

In [2]:
# Load the three competition CSV files: the labelled training set, the
# unlabelled test set, and the sample submission.
train_data, X_test, submission_data = (
    pd.read_csv(csv_path)
    for csv_path in (
        '/kaggle/input/titanic/train.csv',
        '/kaggle/input/titanic/test.csv',
        '/kaggle/input/titanic/gender_submission.csv',
    )
)

# Understanding Dataset

In [3]:
# DataFrame.info() writes its report to stdout and returns None, so it is
# called directly; wrapping it in print() only appends a stray "None" line.
train_data.info()
print('_' * 50)
X_test.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 12 columns):
PassengerId    891 non-null int64
Survived       891 non-null int64
Pclass         891 non-null int64
Name           891 non-null object
Sex            891 non-null object
Age            714 non-null float64
SibSp          891 non-null int64
Parch          891 non-null int64
Ticket         891 non-null object
Fare           891 non-null float64
Cabin          204 non-null object
Embarked       889 non-null object
dtypes: float64(2), int64(5), object(5)
memory usage: 83.7+ KB
None
__________________________________________________
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 418 entries, 0 to 417
Data columns (total 11 columns):
PassengerId    418 non-null int64
Pclass         418 non-null int64
Name           418 non-null object
Sex            418 non-null object
Age            332 non-null float64
SibSp          418 non-null int64
Parch          418 non-null int64
Ticket      

In [4]:
# Count missing values per column in each dataset and print both reports,
# separated by a horizontal rule.
train_missing = train_data.isna().sum()
test_missing = X_test.isna().sum()
print(train_missing, '_' * 50, test_missing, sep='\n')

PassengerId      0
Survived         0
Pclass           0
Name             0
Sex              0
Age            177
SibSp            0
Parch            0
Ticket           0
Fare             0
Cabin          687
Embarked         2
dtype: int64
__________________________________________________
PassengerId      0
Pclass           0
Name             0
Sex              0
Age             86
SibSp            0
Parch            0
Ticket           0
Fare             1
Cabin          327
Embarked         0
dtype: int64


# Cleaning Train Dataset

Although the Cabin column has many NaN values, one of the available Kaggle notebooks shows that it plays a major role in survival (i.e., the location of a passenger's cabin matters). Most of the NaN values are associated with class 3 passengers, so the missing values themselves correlate with passenger class.

In [5]:
# Impute missing values in the training data.
# The results are assigned back to the frame instead of calling
# fillna(..., inplace=True) on a selected column: that chained in-place form
# operates on a temporary object, emits a FutureWarning on pandas 2.x, and
# leaves the frame unchanged once Copy-on-Write is enabled.
train_data['Age'] = train_data['Age'].fillna(train_data['Age'].mean())
# 'N0' is a placeholder cabin so that passengers without a cabin get deck 'N'.
train_data['Cabin'] = train_data['Cabin'].fillna('N0')
# The deck is the first character of the cabin code.
train_data['Deck'] = train_data['Cabin'].str.slice(0, 1)
# NOTE(review): 'S' is presumably the most frequent port — confirm.
train_data['Embarked'] = train_data['Embarked'].fillna('S')

In [6]:
# One-hot encode the categorical training columns, dropping the first level
# of each to avoid a redundant indicator.
# NOTE(review): the Pclass dummies get the integer column names 2 and 3,
# which end up next to string names; newer scikit-learn versions may reject
# mixed-type column names — verify.
sex_dummy, class_dummy, embarked_dummy, deck_dummy = (
    pd.get_dummies(train_data[column], drop_first=True)
    for column in ('Sex', 'Pclass', 'Embarked', 'Deck')
)

In [7]:
# Replace the raw categorical/text columns with their one-hot encodings.
# This cell is not idempotent: re-running it raises a KeyError because the
# dropped columns no longer exist.
train_data = pd.concat(
    [
        train_data.drop(
            columns=['Pclass', 'Name', 'Cabin', 'Deck', 'Sex', 'Ticket', 'Embarked']
        ),
        sex_dummy,
        class_dummy,
        embarked_dummy,
        deck_dummy,
    ],
    axis=1,
)

## Cleaning Test Dataset

In [8]:
# Impute missing values in the test data.
# The results are assigned back to the frame instead of calling
# fillna(..., inplace=True) on a selected column: that chained in-place form
# operates on a temporary object, emits a FutureWarning on pandas 2.x, and
# leaves the frame unchanged once Copy-on-Write is enabled.
# NOTE(review): the means are computed from the test set itself; consider
# reusing the training-set statistics so both sets are imputed consistently.
X_test['Age'] = X_test['Age'].fillna(X_test['Age'].mean())
# 'N0' is a placeholder cabin so that passengers without a cabin get deck 'N'.
X_test['Cabin'] = X_test['Cabin'].fillna('N0')
# The deck is the first character of the cabin code.
X_test['Deck'] = X_test['Cabin'].str.slice(0, 1)
X_test['Fare'] = X_test['Fare'].fillna(X_test['Fare'].mean())

In [9]:
# One-hot encode the categorical test columns, dropping the first level of
# each. These names deliberately overwrite the training-set dummies, which
# have already been concatenated into train_data.
sex_dummy, class_dummy, embarked_dummy, deck_dummy = (
    pd.get_dummies(X_test[column], drop_first=True)
    for column in ('Sex', 'Pclass', 'Embarked', 'Deck')
)

In [10]:
# Replace the raw categorical/text columns of the test set with their
# one-hot encodings. Not idempotent: re-running raises a KeyError because the
# dropped columns no longer exist.
X_test = pd.concat(
    [
        X_test.drop(
            columns=['Pclass', 'Name', 'Cabin', 'Deck', 'Sex', 'Ticket', 'Embarked']
        ),
        sex_dummy,
        class_dummy,
        embarked_dummy,
        deck_dummy,
    ],
    axis=1,
)

Adding column 'T' filled with zeros, since the test dataset doesn't contain deck 'T'.

In [11]:
# Align the test features with the training features instead of hard-coding
# the single dummy column ('T') that happens to be absent from the test set:
# any training column missing from X_test is added and filled with 0, and the
# columns are put in the same order as in train_data.
# 'Survived' is the target, not a feature; errors='ignore' keeps this cell
# working if the target has already been removed from train_data.
feature_columns = train_data.columns.drop('Survived', errors='ignore')
X_test = X_test.reindex(columns=feature_columns, fill_value=0)

In [12]:
# Show the column labels of both frames to check that they line up
# (train_data still carries the 'Survived' target at this point).
print(train_data.columns, '_' * 50, X_test.columns, sep='\n')

Index(['PassengerId',    'Survived',         'Age',       'SibSp',
             'Parch',        'Fare',        'male',             2,
                   3,           'Q',           'S',           'B',
                 'C',           'D',           'E',           'F',
                 'G',           'N',           'T'],
      dtype='object')
__________________________________________________
Index(['PassengerId',         'Age',       'SibSp',       'Parch',
              'Fare',        'male',             2,             3,
                 'Q',           'S',           'B',           'C',
                 'D',           'E',           'F',           'G',
                 'N',           'T'],
      dtype='object')


# Dependent and Independent Variables

In [13]:
# Separate the target (y) from the feature matrix. Not idempotent: a second
# run raises a KeyError because 'Survived' has already been removed.
target_column = 'Survived'
y = train_data[target_column]
train_data = train_data.drop(columns=[target_column])

# Train - Validation data split

In [14]:
# Hold out 10% of the labelled data for validation; the fixed random_state
# makes the split reproducible.
X_train, X_val, y_train, y_val = train_test_split(
    train_data,
    y,
    test_size=0.1,
    random_state=0,
)

In [15]:
# Summarize the training split: row count, column names, dtypes and
# non-null counts.
X_train.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 801 entries, 815 to 684
Data columns (total 18 columns):
PassengerId    801 non-null int64
Age            801 non-null float64
SibSp          801 non-null int64
Parch          801 non-null int64
Fare           801 non-null float64
male           801 non-null uint8
2              801 non-null uint8
3              801 non-null uint8
Q              801 non-null uint8
S              801 non-null uint8
B              801 non-null uint8
C              801 non-null uint8
D              801 non-null uint8
E              801 non-null uint8
F              801 non-null uint8
G              801 non-null uint8
N              801 non-null uint8
T              801 non-null uint8
dtypes: float64(2), int64(3), uint8(13)
memory usage: 47.7 KB


# Logistic Regression

In [16]:
# Fit a logistic-regression baseline on the training split and predict the
# validation split.
# max_iter is raised above the default of 100: the features are unscaled
# (e.g. PassengerId and Fare span very different ranges), and the default
# solver often stops with a ConvergenceWarning before reaching the optimum.
classifier = LogisticRegression(random_state=0, max_iter=1000)
classifier.fit(X_train, y_train)
y_pred = classifier.predict(X_val)



# Confusion Matrix

In [17]:
# Validation accuracy from the 2x2 confusion matrix: correctly classified
# samples (the two diagonal entries) divided by all samples.
cm = confusion_matrix(y_val, y_pred)
correct = cm[0, 0] + cm[1, 1]
total = cm.sum()
print(correct / total)

0.8333333333333334


# Test data prediction

In [18]:
# Predict survival for the test set. The test columns are selected in the
# exact order of the training features so every value lines up with the
# column the model was fitted on; a feature missing from X_test fails loudly
# with a KeyError instead of producing misaligned predictions.
y_submit = classifier.predict(X_test[X_train.columns])

# Submission Data Frame

In [19]:
# Build the submission table (one row per test passenger) and write it out
# without the index column.
# NOTE(review): the output file name has no ".csv" extension — confirm this
# is what the submission step expects.
results_df = pd.DataFrame(
    {
        'PassengerId': X_test['PassengerId'],
        "Survived": y_submit,
    }
)
results_df.to_csv("Predictions", index=False)