In [58]:
# Logistic Regression
# Inspired by the Deeplearning.ai course on Coursera
# This code base is not a repository of the course assignments, but my personal experimental repository


In [11]:
# Loading the necessary packages
# numpy is the fundamental package for scientific computing with Python.
# h5py is a common package to interact with a dataset that is stored on an H5 file.
# matplotlib is a widely used library to plot graphs in Python.
# pandas is used here to load the CSV data into DataFrames.
# PIL and scipy are used here to test your model with your own picture at the end.

import os

import h5py
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import scipy
from PIL import Image
from scipy import ndimage

In [24]:
# Locating the dataset: make the folder containing the CSV files the working
# directory, so the relative file names used when loading resolve correctly.
# The folder can be overridden through the TITANIC_DATA_DIR environment
# variable so the notebook is not tied to a single machine; when the variable
# is unset, the original location is used. The raw string keeps the Windows
# backslashes from being read as escape sequences.
DATA_DIR = os.environ.get(
    "TITANIC_DATA_DIR",
    r"C:\users\vimal.kumar\desktop\lr_titanic_dataset",
)
os.chdir(DATA_DIR)
# Echo the resulting working directory, as the cd magic did.
print(os.getcwd())

C:\users\vimal.kumar\desktop\lr_titanic_dataset


In [48]:
# Load the training and test datasets from the CSV files in the working directory
train, test = (pd.read_csv(csv_name) for csv_name in ('train.csv', 'test.csv'))

In [54]:
# Studying the structure of the loaded training dataset.
# A notebook cell only displays its last expression, so the shape is printed
# explicitly; as a bare expression in the middle of the cell its value would
# be silently discarded.
print(train.shape)
# Column names as a plain Python list.
train.columns.tolist()

['PassengerId',
 'Pclass',
 'Name',
 'Sex',
 'Age',
 'SibSp',
 'Parch',
 'Ticket',
 'Fare',
 'Cabin',
 'Embarked']

In [69]:
# Inspect which distinct values the ground truth/dependent variable takes
train['Survived'].unique()

array([0, 1], dtype=int64)

In [55]:
# Keep the ground truth/dependent variable in its own, separate Series
train_set_y = train.loc[:, 'Survived']


In [57]:
# Independent variables only: the training frame without its 'Survived'
# column, and the test frame as loaded (same object, not a copy).
train_set_x_orig = train.drop(labels='Survived', axis='columns')
test_set_x_orig = test


In [65]:
# Transpose both feature tables so that each individual datapoint occupies
# one column (the features then run down the rows).
train_set_x_flatten = train_set_x_orig.transpose()
test_set_x_flatten = test_set_x_orig.transpose()

In [94]:
# List the distinct values found in the Cabin column
train_set_x_orig['Cabin'].unique()


array([nan, 'C85', 'C123', 'E46', 'G6', 'C103', 'D56', 'A6',
       'C23 C25 C27', 'B78', 'D33', 'B30', 'C52', 'B28', 'C83', 'F33',
       'F G73', 'E31', 'A5', 'D10 D12', 'D26', 'C110', 'B58 B60', 'E101',
       'F E69', 'D47', 'B86', 'F2', 'C2', 'E33', 'B19', 'A7', 'C49', 'F4',
       'A32', 'B4', 'B80', 'A31', 'D36', 'D15', 'C93', 'C78', 'D35',
       'C87', 'B77', 'E67', 'B94', 'C125', 'C99', 'C118', 'D7', 'A19',
       'B49', 'D', 'C22 C26', 'C106', 'C65', 'E36', 'C54',
       'B57 B59 B63 B66', 'C7', 'E34', 'C32', 'B18', 'C124', 'C91', 'E40',
       'T', 'C128', 'D37', 'B35', 'E50', 'C82', 'B96 B98', 'E10', 'E44',
       'A34', 'C104', 'C111', 'C92', 'E38', 'D21', 'E12', 'E63', 'A14',
       'B37', 'C30', 'D20', 'B79', 'E25', 'D46', 'B73', 'C95', 'B38',
       'B39', 'B22', 'C86', 'C70', 'A16', 'C101', 'C68', 'A10', 'E68',
       'B41', 'A20', 'D19', 'D50', 'D9', 'A23', 'B50', 'A26', 'D48',
       'E58', 'C126', 'B71', 'B51 B53 B55', 'D49', 'B5', 'B20', 'F G63',
       'C62 C64',

In [92]:
# Summarise each column's dtype and non-null count. info is a method and must
# be called: a bare attribute reference only displays the bound method's repr
# instead of the summary.
train_set_x_orig.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 11 columns):
PassengerId    891 non-null int64
Pclass         891 non-null int64
Name           891 non-null object
Sex            891 non-null object
Age            714 non-null float64
SibSp          891 non-null int64
Parch          891 non-null int64
Ticket         891 non-null object
Fare           891 non-null float64
Cabin          204 non-null object
Embarked       889 non-null object
dtypes: float64(2), int64(4), object(5)
memory usage: 76.6+ KB


In [95]:
# Summary statistics (count, mean, std, min, quartiles, max) of the training features
train_set_x_orig.describe()

Unnamed: 0,PassengerId,Pclass,Age,SibSp,Parch,Fare
count,891.0,891.0,714.0,891.0,891.0,891.0
mean,446.0,2.308642,29.699118,0.523008,0.381594,32.204208
std,257.353842,0.836071,14.526497,1.102743,0.806057,49.693429
min,1.0,1.0,0.42,0.0,0.0,0.0
25%,223.5,2.0,20.125,0.0,0.0,7.9104
50%,446.0,3.0,28.0,0.0,0.0,14.4542
75%,668.5,3.0,38.0,1.0,0.0,31.0
max,891.0,3.0,80.0,8.0,6.0,512.3292
