In [58]:
# Logistic Regression
# Inspired from Deeplearning.ai - Coursera
# This notebook is not a copy of the course assignments; it is my personal experimental repository.


In [11]:
# Loading the necessary packages 
# numpy is the fundamental package for scientific computing with Python.
# h5py is a common package to interact with a dataset that is stored on an H5 file.
# matplotlib is a famous library to plot graphs in Python.
# PIL and scipy are used here to test your model with your own picture at the end.

import numpy as np
import matplotlib.pyplot as plt
import h5py
import scipy
import pandas as pd
from PIL import Image
from scipy import ndimage

In [24]:
# Locating the dataset: change the working directory so later relative paths resolve there
# (presumably the folder holding train.csv / test.csv — confirm).
# NOTE(review): a bare `cd` is not Python syntax; presumably this relies on IPython's automagic for `%cd` — confirm.
# NOTE(review): the path is absolute and user-specific, so this cell will fail on any other machine.
cd "C:\users\vimal.kumar\desktop\lr_titanic_dataset"

C:\users\vimal.kumar\desktop\lr_titanic_dataset


In [48]:
# Read the training and test CSV files (in that order) from the current
# working directory into two DataFrames.
train, test = (pd.read_csv(csv_name) for csv_name in ('train.csv', 'test.csv'))

In [54]:
# Study the structure of the loaded training data.
# A notebook cell only echoes its last expression, so the shape is printed
# explicitly; as a bare expression it would be computed and silently dropped.
print(train.shape)
# The column labels are the cell's displayed result.
list(train.columns.values)

['PassengerId',
 'Pclass',
 'Name',
 'Sex',
 'Age',
 'SibSp',
 'Parch',
 'Ticket',
 'Fare',
 'Cabin',
 'Embarked']

In [69]:
# Inspect the distinct values taken by the ground-truth / dependent variable.
train['Survived'].unique()

array([0, 1], dtype=int64)

In [55]:
# Keep the ground-truth / dependent variable in its own Series.
train_set_y = train.loc[:, 'Survived']


In [57]:
# Independent variables only: the training frame without the 'Survived' column.
train_set_x_orig = train.drop(columns=['Survived'])
# The test frame is used as-is; this binds a second name to the same object, not a copy.
test_set_x_orig = test


In [94]:
# EDA: list the distinct values found in the Cabin column.
train_set_x_orig['Cabin'].unique()

array([nan, 'C85', 'C123', 'E46', 'G6', 'C103', 'D56', 'A6',
       'C23 C25 C27', 'B78', 'D33', 'B30', 'C52', 'B28', 'C83', 'F33',
       'F G73', 'E31', 'A5', 'D10 D12', 'D26', 'C110', 'B58 B60', 'E101',
       'F E69', 'D47', 'B86', 'F2', 'C2', 'E33', 'B19', 'A7', 'C49', 'F4',
       'A32', 'B4', 'B80', 'A31', 'D36', 'D15', 'C93', 'C78', 'D35',
       'C87', 'B77', 'E67', 'B94', 'C125', 'C99', 'C118', 'D7', 'A19',
       'B49', 'D', 'C22 C26', 'C106', 'C65', 'E36', 'C54',
       'B57 B59 B63 B66', 'C7', 'E34', 'C32', 'B18', 'C124', 'C91', 'E40',
       'T', 'C128', 'D37', 'B35', 'E50', 'C82', 'B96 B98', 'E10', 'E44',
       'A34', 'C104', 'C111', 'C92', 'E38', 'D21', 'E12', 'E63', 'A14',
       'B37', 'C30', 'D20', 'B79', 'E25', 'D46', 'B73', 'C95', 'B38',
       'B39', 'B22', 'C86', 'C70', 'A16', 'C101', 'C68', 'A10', 'E68',
       'B41', 'A20', 'D19', 'D50', 'D9', 'A23', 'B50', 'A26', 'D48',
       'E58', 'C126', 'B71', 'B51 B53 B55', 'D49', 'B5', 'B20', 'F G63',
       'C62 C64',

In [92]:
# DataFrame.info() prints a concise summary of the variables
# (column names, non-null counts, dtypes and memory usage).
# It must be *called*: without the parentheses the cell merely evaluates
# the bound method object instead of producing the summary.
train_set_x_orig.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 11 columns):
PassengerId    891 non-null int64
Pclass         891 non-null int64
Name           891 non-null object
Sex            891 non-null object
Age            714 non-null float64
SibSp          891 non-null int64
Parch          891 non-null int64
Ticket         891 non-null object
Fare           891 non-null float64
Cabin          204 non-null object
Embarked       889 non-null object
dtypes: float64(2), int64(4), object(5)
memory usage: 76.6+ KB


In [145]:
# The dataset contains both categorical and continuous variables.
# Create separate datasets for the categorical columns (for one-hot encoding)
# and the continuous columns (for normalization).

# List the column labels so each column can be assigned to one of the two groups.
train_set_x_orig.columns.unique()






Index(['PassengerId', 'Pclass', 'Name', 'Sex', 'Age', 'SibSp', 'Parch',
       'Ticket', 'Fare', 'Cabin', 'Embarked'],
      dtype='object')

In [151]:
# Column labels treated as continuous; they are split off from the
# remaining (categorical) columns in the following cell.
Numerical = [
    'PassengerId',
    'Age',
    'Fare',
]

In [159]:
# Split the features into two frames: the columns named in `Numerical`
# and everything else.
train_set_x_orig_num = train_set_x_orig.loc[:, Numerical]
train_set_x_orig_cat = train_set_x_orig.drop(columns=Numerical)

Unnamed: 0,Pclass,Name,Sex,SibSp,Parch,Ticket,Cabin,Embarked
0,3,"Braund, Mr. Owen Harris",male,1,0,A/5 21171,,S
1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,1,0,PC 17599,C85,C
2,3,"Heikkinen, Miss. Laina",female,0,0,STON/O2. 3101282,,S
3,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,1,0,113803,C123,S
4,3,"Allen, Mr. William Henry",male,0,0,373450,,S
5,3,"Moran, Mr. James",male,0,0,330877,,Q
6,1,"McCarthy, Mr. Timothy J",male,0,0,17463,E46,S
7,3,"Palsson, Master. Gosta Leonard",male,3,1,349909,,S
8,3,"Johnson, Mrs. Oscar W (Elisabeth Vilhelmina Berg)",female,0,2,347742,,S
9,2,"Nasser, Mrs. Nicholas (Adele Achem)",female,1,0,237736,,C


In [158]:
# Distinct values of the Embarked column (missing values included).
train_set_x_orig['Embarked'].unique()

array(['S', 'C', 'Q', nan], dtype=object)

In [161]:
# Collect the distinct ticket strings and keep them in ascending order.
ticket_array = np.sort(train_set_x_orig['Ticket'].unique())

In [162]:
# Some ticket values appear to carry a ticket-type prefix in front of the ticket number;
# that prefix could be extracted and turned into a categorical variable.
# Some values also contain full stops, which need to be removed first.
ticket_array

array(['110152', '110413', '110465', '110564', '110813', '111240',
       '111320', '111361', '111369', '111426', '111427', '111428',
       '112050', '112052', '112053', '112058', '112059', '112277',
       '112379', '113028', '113043', '113050', '113051', '113055',
       '113056', '113059', '113501', '113503', '113505', '113509',
       '113510', '113514', '113572', '113760', '113767', '113773',
       '113776', '113781', '113783', '113784', '113786', '113787',
       '113788', '113789', '113792', '113794', '113796', '113798',
       '113800', '113803', '113804', '113806', '113807', '11668', '11751',
       '11752', '11753', '11755', '11765', '11767', '11769', '11771',
       '11774', '11813', '11967', '12233', '12460', '12749', '13049',
       '13213', '13214', '13502', '13507', '13509', '13567', '13568',
       '14311', '14312', '14313', '14973', '1601', '16966', '16988',
       '17421', '17453', '17463', '17464', '17465', '17466', '17474',
       '17764', '19877', '19928', '19943

In [165]:
# Remove the full stops from every ticket string.
# print() returns None, so assigning its result would overwrite ticket_array
# with None and lose the cleaned values; the cleaned list is therefore stored
# first and printed in a separate step.
ticket_array = [s.replace('.', '') for s in ticket_array]
print(ticket_array)


['110152', '110413', '110465', '110564', '110813', '111240', '111320', '111361', '111369', '111426', '111427', '111428', '112050', '112052', '112053', '112058', '112059', '112277', '112379', '113028', '113043', '113050', '113051', '113055', '113056', '113059', '113501', '113503', '113505', '113509', '113510', '113514', '113572', '113760', '113767', '113773', '113776', '113781', '113783', '113784', '113786', '113787', '113788', '113789', '113792', '113794', '113796', '113798', '113800', '113803', '113804', '113806', '113807', '11668', '11751', '11752', '11753', '11755', '11765', '11767', '11769', '11771', '11774', '11813', '11967', '12233', '12460', '12749', '13049', '13213', '13214', '13502', '13507', '13509', '13567', '13568', '14311', '14312', '14313', '14973', '1601', '16966', '16988', '17421', '17453', '17463', '17464', '17465', '17466', '17474', '17764', '19877', '19928', '19943', '19947', '19950', '19952', '19972', '19988', '19996', '2003', '211536', '21440', '218629', '219533', 

In [65]:
# Transpose both feature frames so that each individual data point
# occupies a column instead of a row.
train_set_x_flatten = train_set_x_orig.transpose()
test_set_x_flatten = test_set_x_orig.transpose()