# **An Implementation of Logistic Regression using the Titanic Dataset**



### Importing required libraries

In [None]:
import numpy as np
import pandas as pd
from sklearn.impute import KNNImputer
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import scale


### Load the dataset


In [None]:
df_train = pd.read_csv('/content/Titanic_ISTE.csv')

In [None]:
df_train.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,Unnamed: 12,Unnamed: 13,Unnamed: 14,Unnamed: 15,Unnamed: 16,Unnamed: 17
0,1,0.0,3.0,"Braund, Mr. Owen Harris",male,22.0,1.0,0.0,A/5 21171,7.25,,S,,,,,,
1,2,1.0,1.0,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1.0,0.0,PC 17599,71.2833,C85,C,,,,,,
2,3,1.0,3.0,"Heikkinen, Miss. Laina",female,26.0,0.0,0.0,STON/O2. 3101282,7.925,,S,,,,,,
3,4,1.0,1.0,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1.0,0.0,113803,53.1,C123,S,,,,,,
4,5,0.0,3.0,"Allen, Mr. William Henry",male,35.0,0.0,0.0,373450,8.05,,S,,,,,,


In [None]:
df_train.describe()

Unnamed: 0,PassengerId,Survived,Pclass,Age,SibSp,Parch,Fare,Unnamed: 12,Unnamed: 13,Unnamed: 14,Unnamed: 15,Unnamed: 16,Unnamed: 17
count,891.0,889.0,890.0,712.0,890.0,889.0,891.0,0.0,0.0,0.0,0.0,0.0,1.0
mean,446.0,0.382452,2.307865,29.675801,0.517978,0.381327,32.204208,,,,,,0.0
std,257.353842,0.48626,0.83622,14.530556,1.093087,0.806596,49.693429,,,,,,
min,1.0,0.0,1.0,0.42,0.0,0.0,0.0,,,,,,0.0
25%,223.5,0.0,2.0,20.0,0.0,0.0,7.9104,,,,,,0.0
50%,446.0,0.0,3.0,28.0,0.0,0.0,14.4542,,,,,,0.0
75%,668.5,1.0,3.0,38.0,1.0,0.0,31.0,,,,,,0.0
max,891.0,1.0,3.0,80.0,8.0,6.0,512.3292,,,,,,0.0


### Preprocessing the data



In [None]:
df_train.isna().sum()

PassengerId      0
Survived         2
Pclass           1
Name             0
Sex             10
Age            179
SibSp            1
Parch            2
Ticket           0
Fare             0
Cabin          687
Embarked         2
Unnamed: 12    891
Unnamed: 13    891
Unnamed: 14    891
Unnamed: 15    891
Unnamed: 16    891
Unnamed: 17    890
dtype: int64

In [None]:
# drop rows with missing target/categorical values
df_train.dropna(subset=['Survived', 'Embarked'], inplace=True, how='any')
df_train = pd.get_dummies(data = df_train, columns = ['Sex'], dummy_na=True)
df_train.isnull().sum()

PassengerId      0
Survived         0
Pclass           1
Name             0
Age            179
SibSp            1
Parch            2
Ticket           0
Fare             0
Cabin          685
Embarked         0
Unnamed: 12    887
Unnamed: 13    887
Unnamed: 14    887
Unnamed: 15    887
Unnamed: 16    887
Unnamed: 17    886
Sex_female       0
Sex_male         0
Sex_nan          0
dtype: int64

In [None]:
# separate target column
Y_train = df_train['Survived']

In [None]:
# remove columns with maximum NaN values and maximum unique categorical values

columns = ["Unnamed: 12", "Unnamed: 13", "Unnamed: 14", "Unnamed: 15", "Unnamed: 16", 
        "Name","Ticket", "Cabin", "Unnamed: 17"]
df_train.drop(columns, axis=1, inplace = True)
df_train.replace([np.inf, -np.inf], np.nan, inplace=True)
X = df_train.loc[:, df_train.columns!='Survived']

In [None]:
X.nunique()

PassengerId    887
Pclass           3
Age             88
SibSp            7
Parch            7
Fare           247
Embarked         3
Sex_female       2
Sex_male         2
Sex_nan          2
dtype: int64

In [None]:
X.head()

Unnamed: 0,PassengerId,Pclass,Age,SibSp,Parch,Fare,Embarked,Sex_female,Sex_male,Sex_nan
0,1,3.0,22.0,1.0,0.0,7.25,S,0,1,0
1,2,1.0,38.0,1.0,0.0,71.2833,C,1,0,0
2,3,3.0,26.0,0.0,0.0,7.925,S,1,0,0
3,4,1.0,35.0,1.0,0.0,53.1,S,1,0,0
4,5,3.0,35.0,0.0,0.0,8.05,S,0,1,0


In [None]:
X.isna().sum()

PassengerId      0
Pclass           1
Age            179
SibSp            1
Parch            2
Fare             0
Embarked         0
Sex_female       0
Sex_male         0
Sex_nan          0
dtype: int64

In [None]:
from sklearn.preprocessing import OneHotEncoder

# select categorical columns for OneHotEncoding
s = (X.dtypes == 'object')
object_cols = list(s[s].index)


In [None]:
from sklearn.model_selection import train_test_split
X, X_valid, Y_train, y_valid = train_test_split(X, Y_train, train_size=0.8, test_size=0.2,
                                                                random_state=0)

In [None]:
OH_encoder = OneHotEncoder(handle_unknown='ignore', sparse=False)
OH_cols = pd.DataFrame(OH_encoder.fit_transform(X[object_cols]))
OH_cols_valid = pd.DataFrame(OH_encoder.transform(X_valid[object_cols]))

# one-hot encoding removed index, put it back
OH_cols.index = X.index
OH_cols_valid.index = X_valid.index

# remove categorical columns 
num_X = X.drop(object_cols, axis=1)
num_X_valid = X_valid.drop(object_cols, axis=1)

# add one-hot encoded columns to numerical features
OH_X = pd.concat([num_X, OH_cols], axis=1)
OH_X_valid = pd.concat([num_X_valid, OH_cols_valid], axis=1)


In [None]:
X = OH_X
X_valid = OH_X_valid


In [None]:
# normalizing data to improve KNN performance
X_normalized = scale(X)
X_train_try=pd.DataFrame(data=X_normalized,columns=X.columns)
X_valid_normalized=scale(X_valid)
X_train_try.head()

Unnamed: 0,PassengerId,Pclass,Age,SibSp,Parch,Fare,Sex_female,Sex_male,Sex_nan,0,1,2
0,-0.298167,0.843671,0.041596,-0.475933,-0.477752,-0.494249,-0.733458,0.754045,-0.113389,-0.467133,-0.304056,0.598529
1,-0.715153,-1.548979,0.041596,-0.475933,-0.477752,1.056001,1.363404,-1.326181,-0.113389,-0.467133,-0.304056,0.598529
2,0.138124,0.843671,-0.515443,-0.475933,-0.477752,-0.488952,-0.733458,0.754045,-0.113389,-0.467133,-0.304056,0.598529
3,-0.653378,-1.548979,0.529005,-0.475933,0.724278,-0.055093,-0.733458,0.754045,-0.113389,2.140719,-0.304056,-1.670761
4,0.43542,-1.548979,2.269752,-0.475933,-0.477752,-0.116712,-0.733458,0.754045,-0.113389,-0.467133,-0.304056,0.598529


In [None]:
# imputing missing values

X_filled=KNNImputer(n_neighbors=7).fit_transform(X_train_try)
X_filled=pd.DataFrame(data=X_filled,columns=X.columns)

X_valid_filled=KNNImputer(n_neighbors=7).fit_transform(X_valid_normalized)
X_valid_filled=pd.DataFrame(data=X_valid_filled,columns=X_valid.columns)


In [None]:
X_filled.columns=X.columns
X_valid_filled.columns=X_valid.columns
X_filled.head()

Unnamed: 0,PassengerId,Pclass,Age,SibSp,Parch,Fare,Sex_female,Sex_male,Sex_nan,0,1,2
0,-0.298167,0.843671,0.041596,-0.475933,-0.477752,-0.494249,-0.733458,0.754045,-0.113389,-0.467133,-0.304056,0.598529
1,-0.715153,-1.548979,0.041596,-0.475933,-0.477752,1.056001,1.363404,-1.326181,-0.113389,-0.467133,-0.304056,0.598529
2,0.138124,0.843671,-0.515443,-0.475933,-0.477752,-0.488952,-0.733458,0.754045,-0.113389,-0.467133,-0.304056,0.598529
3,-0.653378,-1.548979,0.529005,-0.475933,0.724278,-0.055093,-0.733458,0.754045,-0.113389,2.140719,-0.304056,-1.670761
4,0.43542,-1.548979,2.269752,-0.475933,-0.477752,-0.116712,-0.733458,0.754045,-0.113389,-0.467133,-0.304056,0.598529


In [None]:
# checking if all null values are removed
X_filled.isnull().sum()
X_valid_filled.isnull().sum()

PassengerId    0
Pclass         0
Age            0
SibSp          0
Parch          0
Fare           0
Sex_female     0
Sex_male       0
Sex_nan        0
0              0
1              0
2              0
dtype: int64

### Implementation of Gradient Descent Algorithm

In [None]:
# sigmoid activation function
def sigmoid(x):
    return 1/(1 + np.exp(-x))

In [None]:
# cost function for gradient descent
def cost_function(theta,X, y, r):

    m, n = X.shape
    h = sigmoid(np.dot(X, theta))
    J = (-1/m) * (np.dot(y.T, np.log(h)) + np.dot((1 - y).T, np.log(1-h))) + (r * np.sum(theta))

    return J[0].astype(float).item()


In [None]:
# implementation of gradient descent 

def grad(X, y, theta, alpha, num_iters, r):
    # r - L1 regularization term
    costs = []
    # m - number of samples
    m = X.shape[0]
    
    for i in range(0, num_iters):
        
        z = np.dot(X, theta)
        
        h = sigmoid(z)
        costs.append(cost_function(theta, X, y, r))
        print(f"Iteration {i+1}: Cost {costs[i]}")
        theta = theta - ((alpha / m) * np.dot(X.T, h - y))
        

    return  theta

In [None]:
X = X_filled
np.random.seed(1)
m = X.shape[0]
n = X.shape[1]

theta = np.zeros((n,1))
Y_train = Y_train.to_numpy().reshape(-1,1)
# find optimum values for theta using gradient descent
theta = (grad(X, (Y_train), theta, 0.1, 500, 0.1))

# calculate cost using optimum theta values
J = cost_function(theta, X, Y_train, 0.1)
# J.shape
print(f"The cost after training in 500 iterations is {J}.")

Iteration 1: Cost 0.6931471805599452
Iteration 2: Cost 0.6743175393153248
Iteration 3: Cost 0.6574088253242057
Iteration 4: Cost 0.6421899197859928
Iteration 5: Cost 0.6284543655972562
Iteration 6: Cost 0.6160197865262763
Iteration 7: Cost 0.6047263680718211
Iteration 8: Cost 0.5944348313804512
Iteration 9: Cost 0.5850242123123561
Iteration 10: Cost 0.5763896495652837
Iteration 11: Cost 0.5684403016645329
Iteration 12: Cost 0.5610974535805787
Iteration 13: Cost 0.5542928354638179
Iteration 14: Cost 0.547967152983765
Iteration 15: Cost 0.54206881613084
Iteration 16: Cost 0.5365528473425167
Iteration 17: Cost 0.5313799478327879
Iteration 18: Cost 0.5265157013036994
Iteration 19: Cost 0.5219298957065172
Iteration 20: Cost 0.5175959457345716
Iteration 21: Cost 0.5134904008852439
Iteration 22: Cost 0.5095925260145553
Iteration 23: Cost 0.5058839432200658
Iteration 24: Cost 0.502348325584201
Iteration 25: Cost 0.4989711347836607
Iteration 26: Cost 0.4957393958327345
Iteration 27: Cost 0.4926

### Predicting on Validation Set

In [None]:
X_valid = X_valid_filled
y_valid = y_valid.to_numpy().reshape(-1,1)

In [None]:
# predict for validation set
def predict( X ) :    
        Z =  1/(1 + np.exp(-(np.dot(X, theta))))     
        Y = np.where( Z > 0.5, 1, 0 )        
        return Y

In [None]:
Y_pred = predict(X_valid )    
# counts correctly classifed entries
correctly_classified = 0    

# counter    
count = 0    
for count in range( np.size( Y_pred ) ) :  
    if y_valid[count] == Y_pred[count] :            
        correctly_classified += 1
    count += 1

print( "Accuracy on validation set:  ", ( correctly_classified / count ) * 100 )

Accuracy on validation set:   79.7752808988764
