In [3]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

In [4]:
# Read the Titanic CSV (expected in the notebook's working directory) into a DataFrame.
data = pd.read_csv(filepath_or_buffer="Titanic_data_for logistic_regression.csv")

In [5]:
data.head(n=5)  # preview the first five rows to check the columns and their encoding

Unnamed: 0,Id,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked,Survived
0,0,3,0,22.0,1,0,7.25,1,0
1,1,1,1,38.0,1,0,71.2833,0,1
2,2,3,1,26.0,0,0,7.925,1,1
3,3,1,1,35.0,1,0,53.1,1,1
4,4,3,0,35.0,0,0,8.05,1,0


In [6]:
# data pre-processing step
TARGET = "Survived"  # label column
N_TRAIN = 891        # the first N_TRAIN rows form the training set, the rest the test set

# "Id" is dropped so it is not fed to the model as a feature
df = data.drop(columns="Id")

# split data to training and testing data
df_train = df.iloc[:N_TRAIN]
df_test = df.iloc[N_TRAIN:]

# extract features (X) and target (Y) and convert to numpy arrays.
# Features are selected by dropping the target *by name*, so the result does not
# depend on the target happening to be the last column of the file.
X_train = df_train.drop(columns=TARGET).values
Y_train = df_train[TARGET].values
X_test = df_test.drop(columns=TARGET).values
Y_test = df_test[TARGET].values

# print all shapes
print("Shape of X_train : ", X_train.shape)
print("Shape of Y_train : ", Y_train.shape)
print("Shape of X_test : ", X_test.shape)
print("Shape of Y_test : ", Y_test.shape)

Shape of X_train :  (891, 7)
Shape of Y_train :  (891,)
Shape of X_test :  (418, 7)
Shape of Y_test :  (418,)


In [7]:
# reshape matrices
# The model expects X as (n_features, n_samples) and Y as (1, n_samples).
# The arrays are rebuilt from df_train / df_test instead of transposing
# X_train / X_test in place: an in-place `.T` flips the layout back on every
# re-run of this cell (and then breaks the Y reshape), whereas this version
# produces the same result no matter how many times the cell is executed.
X_train = df_train.drop(columns="Survived").to_numpy().T
Y_train = df_train["Survived"].to_numpy().reshape(1, -1)

X_test = df_test.drop(columns="Survived").to_numpy().T
Y_test = df_test["Survived"].to_numpy().reshape(1, -1)

# print new shapes
print("Shape of X_train : ", X_train.shape)
print("Shape of Y_train : ", Y_train.shape)
print("Shape of X_test : ", X_test.shape)
print("Shape of Y_test : ", Y_test.shape)

Shape of X_train :  (7, 891)
Shape of Y_train :  (1, 891)
Shape of X_test :  (7, 418)
Shape of Y_test :  (1, 418)


In [8]:
def sigmoid(x):
    """Element-wise logistic function 1 / (1 + exp(-x)), computed stably.

    The naive formula evaluates exp(-x), which overflows (RuntimeWarning) for
    large negative x. Here exp is only ever applied to a non-positive number,
    so it cannot overflow:

        x >= 0 :  1 / (1 + exp(-x))
        x <  0 :  exp(x) / (1 + exp(x))

    Parameters
    ----------
    x : scalar or array-like

    Returns
    -------
    numpy.ndarray with the same shape as ``x`` and values in [0, 1].
    """
    x = np.asarray(x)
    decay = np.exp(-np.abs(x))  # always in (0, 1], never overflows
    return np.where(x >= 0, 1 / (1 + decay), decay / (1 + decay))

In [9]:
# Model
def LinearRegressionModel(X, Y, learning_rate, iterations):
    """Fit a logistic-regression classifier with batch gradient descent.

    Despite its name (kept so existing calls keep working) this is a
    *logistic* model: a sigmoid is applied to the linear score W.T @ X + B and
    the binary cross-entropy cost is minimised.

    Parameters
    ----------
    X : ndarray, shape (n_features, n_samples)
        One column per sample.
    Y : ndarray, shape (1, n_samples)
        Binary labels (0 or 1).
    learning_rate : float
        Gradient-descent step size.
    iterations : int
        Number of full-batch update steps.

    Returns
    -------
    W : ndarray, shape (n_features, 1)
        Learned weights.
    B : float
        Learned bias.
    cost_list : list of float
        Cross-entropy cost recorded at every iteration.
    """
    # Sizes come from the argument X (not from a notebook global), so the
    # function works for any dataset passed in.
    n, m = X.shape  # n: number of features (rows), m: number of samples (columns)

    # all weights start at 0
    W = np.zeros((n, 1))
    B = 0

    cost_list = []  # cost at every iteration

    # Print progress about ten times; integer arithmetic avoids a float modulus
    # and max(..., 1) keeps the interval valid when iterations < 10.
    report_every = max(iterations // 10, 1)
    eps = 1e-15  # keeps log() away from 0 when the sigmoid saturates

    for i in range(iterations):

        Z = np.dot(W.T, X) + B  # linear scores, shape (1, m)
        A = sigmoid(Z)          # predicted probabilities, shape (1, m)

        # binary cross-entropy cost of logistic regression; probabilities are
        # clipped for the log only, the gradients below use the raw A
        A_safe = np.clip(A, eps, 1 - eps)
        cost = -(1/m)*np.sum(Y*np.log(A_safe) + (1-Y)*np.log(1-A_safe))

        # gradients of the cost
        dW = (1/m)*np.dot(A-Y, X.T)  # shape (1, n): d(cost)/d(w) for all weights
        dB = (1/m)*np.sum(A - Y)     # d(cost)/d(b)

        # gradient-descent update
        W = W - learning_rate*dW.T
        B = B - learning_rate*dB

        cost_list.append(cost)
        # print evolution of cost by iteration
        if i % report_every == 0:
            print("cost after ", i, "iteration is : ", cost)

    return W, B, cost_list  # weights, bias, and cost history
        

In [10]:
# Train on the training split; the call returns the learned weights, the bias,
# and the cost recorded at every iteration.
W, B, cost_list = LinearRegressionModel(
    X_train,
    Y_train,
    learning_rate=0.0015,
    iterations=100000,
)

cost after  0 iteration is :  0.6931471805599454
cost after  10000 iteration is :  0.49652777693895306
cost after  20000 iteration is :  0.46674868550665993
cost after  30000 iteration is :  0.45687787762434423
cost after  40000 iteration is :  0.45288994293089646
cost after  50000 iteration is :  0.45093260252226425
cost after  60000 iteration is :  0.4497708749009468
cost after  70000 iteration is :  0.4489640829216279
cost after  80000 iteration is :  0.4483412696612483
cost after  90000 iteration is :  0.4478304524693579


In [11]:
# Testing model accuracy
def accuracy(X, Y, W, B):
    """Print and return the classification accuracy (in percent) of the model.

    Parameters
    ----------
    X : ndarray, shape (n_features, n_samples)
        One column per sample.
    Y : array-like of 0/1 labels, n_samples values in any shape
        Flattened to a (1, n_samples) row internally, so 1-D and column
        vectors are scored correctly instead of failing or broadcasting.
    W : ndarray, shape (n_features, 1)
        Model weights.
    B : float
        Model bias.

    Returns
    -------
    float
        Percentage of samples whose thresholded prediction equals the label.
    """
    Y = np.asarray(Y).reshape(1, -1)

    Z = np.dot(W.T, X) + B
    probabilities = sigmoid(Z)
    # predict class 1 when the probability is strictly above 0.5
    predictions = np.array(probabilities > 0.5, dtype='int64')

    # |prediction - label| is 1 for every misclassified sample
    acc = (1 - np.sum(np.absolute(predictions - Y))/Y.shape[1])*100
    print("Accuracy of the model is : ", round(acc, 2), "%")
    return acc
    
# Score the trained model on the held-out test split; accuracy() prints the result.
accuracy(X=X_test, Y=Y_test, W=W, B=B);

Accuracy of the model is :  91.39 %


**Our model's accuracy is 91% on the test dataset, which is pretty good.**