In [1]:
# importing required libraries
import numpy as np
import pandas as pd
from sklearn.impute import KNNImputer
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import scale


In [2]:
# load the dataset
df_train = pd.read_csv('Titanic_ISTE.csv')

In [3]:
df_train.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,Unnamed: 12,Unnamed: 13,Unnamed: 14,Unnamed: 15,Unnamed: 16,Unnamed: 17
0,1,0.0,3.0,"Braund, Mr. Owen Harris",male,22.0,1.0,0.0,A/5 21171,7.25,,S,,,,,,
1,2,1.0,1.0,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1.0,0.0,PC 17599,71.2833,C85,C,,,,,,
2,3,1.0,3.0,"Heikkinen, Miss. Laina",female,26.0,0.0,0.0,STON/O2. 3101282,7.925,,S,,,,,,
3,4,1.0,1.0,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1.0,0.0,113803,53.1,C123,S,,,,,,
4,5,0.0,3.0,"Allen, Mr. William Henry",male,35.0,0.0,0.0,373450,8.05,,S,,,,,,


In [4]:
df_train.describe()

Unnamed: 0,PassengerId,Survived,Pclass,Age,SibSp,Parch,Fare,Unnamed: 12,Unnamed: 13,Unnamed: 14,Unnamed: 15,Unnamed: 16,Unnamed: 17
count,891.0,889.0,890.0,712.0,890.0,889.0,891.0,0.0,0.0,0.0,0.0,0.0,1.0
mean,446.0,0.382452,2.307865,29.675801,0.517978,0.381327,32.204208,,,,,,0.0
std,257.353842,0.48626,0.83622,14.530556,1.093087,0.806596,49.693429,,,,,,
min,1.0,0.0,1.0,0.42,0.0,0.0,0.0,,,,,,0.0
25%,223.5,0.0,2.0,20.0,0.0,0.0,7.9104,,,,,,0.0
50%,446.0,0.0,3.0,28.0,0.0,0.0,14.4542,,,,,,0.0
75%,668.5,1.0,3.0,38.0,1.0,0.0,31.0,,,,,,0.0
max,891.0,1.0,3.0,80.0,8.0,6.0,512.3292,,,,,,0.0


In [5]:
# check which columns have null values
df_train.isna().sum()

PassengerId      0
Survived         2
Pclass           1
Name             0
Sex             10
Age            179
SibSp            1
Parch            2
Ticket           0
Fare             0
Cabin          687
Embarked         2
Unnamed: 12    891
Unnamed: 13    891
Unnamed: 14    891
Unnamed: 15    891
Unnamed: 16    891
Unnamed: 17    890
dtype: int64

In [6]:
# drop rows with missing target values
df_train.dropna(axis = 0, subset=['Survived'], inplace=True, how='all')

In [7]:
df_train.shape

(889, 18)

In [8]:
# separate target column
Y_train = df_train['Survived']
Y_train = Y_train.to_numpy()

In [9]:
X = df_train.loc[:, df_train.columns!='Survived']

In [10]:
X.nunique()

PassengerId    889
Pclass           3
Name           889
Sex              2
Age             88
SibSp            7
Parch            7
Ticket         680
Fare           248
Cabin          147
Embarked         3
Unnamed: 12      0
Unnamed: 13      0
Unnamed: 14      0
Unnamed: 15      0
Unnamed: 16      0
Unnamed: 17      1
dtype: int64

In [11]:
# remove columns with maximum NaN values and maximum unique categorical values
X = pd.get_dummies(data = X, columns = ["Unnamed: 12", "Unnamed: 13",
                                       "Unnamed: 14", "Unnamed: 15",
                                       "Unnamed: 16"], 
                  dummy_na = True, drop_first = True)
X.replace([np.inf, -np.inf], np.nan, inplace=True)
X.drop([ "Name","Ticket", "Cabin", "Unnamed: 17"], inplace=True, axis = 1)

In [12]:
X.head()

Unnamed: 0,PassengerId,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked
0,1,3.0,male,22.0,1.0,0.0,7.25,S
1,2,1.0,female,38.0,1.0,0.0,71.2833,C
2,3,3.0,female,26.0,0.0,0.0,7.925,S
3,4,1.0,female,35.0,1.0,0.0,53.1,S
4,5,3.0,male,35.0,0.0,0.0,8.05,S


In [13]:
X.describe()

Unnamed: 0,PassengerId,Pclass,Age,SibSp,Parch,Fare
count,889.0,888.0,710.0,888.0,887.0,889.0
mean,446.048369,2.306306,29.684746,0.515766,0.382187,32.250698
std,257.140151,0.836515,14.545959,1.091,0.807302,49.739249
min,1.0,1.0,0.42,0.0,0.0,0.0
25%,224.0,2.0,20.125,0.0,0.0,7.925
50%,446.0,3.0,28.0,0.0,0.0,14.4542
75%,668.0,3.0,38.0,1.0,0.0,31.0
max,891.0,3.0,80.0,8.0,6.0,512.3292


In [14]:
from sklearn.preprocessing import OneHotEncoder

# select categorical columns for OneHotEncoding
s = (X.dtypes == 'object')
object_cols = list(s[s].index)

In [15]:
OH_encoder = OneHotEncoder(handle_unknown='ignore', sparse=False)
OH_cols = pd.DataFrame(OH_encoder.fit_transform(X[object_cols]))

# one-hot encoding removed index, put it back
OH_cols.index = X.index

# remove categorical columns 
num_X = X.drop(object_cols, axis=1)

# add one-hot encoded columns to numerical features
OH_X = pd.concat([num_X, OH_cols], axis=1)



In [16]:
X = OH_X
X.head()

Unnamed: 0,PassengerId,Pclass,Age,SibSp,Parch,Fare,0,1,2,3,4,5,6
0,1,3.0,22.0,1.0,0.0,7.25,0.0,1.0,0.0,0.0,0.0,1.0,0.0
1,2,1.0,38.0,1.0,0.0,71.2833,1.0,0.0,0.0,1.0,0.0,0.0,0.0
2,3,3.0,26.0,0.0,0.0,7.925,1.0,0.0,0.0,0.0,0.0,1.0,0.0
3,4,1.0,35.0,1.0,0.0,53.1,1.0,0.0,0.0,0.0,0.0,1.0,0.0
4,5,3.0,35.0,0.0,0.0,8.05,0.0,1.0,0.0,0.0,0.0,1.0,0.0


In [17]:
# normalizing data to improve KNN performance
X_normalized = scale(X)
X_train_try=pd.DataFrame(data=X_normalized,columns=X.columns)
X_train_try.head()



Unnamed: 0,PassengerId,Pclass,Age,SibSp,Parch,Fare,0,1,2,3,4,5,6
0,-1.731736,0.829734,-0.52868,0.444095,-0.47368,-0.502918,-0.733527,0.751759,-0.106661,-0.480939,-0.307941,0.618532,-0.047485
1,-1.727845,-1.562485,0.572057,0.444095,-0.47368,0.785186,1.363276,-1.330214,-0.106661,2.079267,-0.307941,-1.616732,-0.047485
2,-1.723954,0.829734,-0.253496,-0.473012,-0.47368,-0.48934,1.363276,-1.330214,-0.106661,-0.480939,-0.307941,0.618532,-0.047485
3,-1.720063,-1.562485,0.365669,0.444095,-0.47368,0.419408,1.363276,-1.330214,-0.106661,-0.480939,-0.307941,0.618532,-0.047485
4,-1.716172,0.829734,0.365669,-0.473012,-0.47368,-0.486825,-0.733527,0.751759,-0.106661,-0.480939,-0.307941,0.618532,-0.047485


In [18]:
# imputing missing values

X_filled=KNNImputer(n_neighbors=7).fit_transform(X_train_try)
X_filled=pd.DataFrame(data=X_filled,columns=X.columns)

In [19]:
X_filled.columns=X.columns
X_filled.head()

Unnamed: 0,PassengerId,Pclass,Age,SibSp,Parch,Fare,0,1,2,3,4,5,6
0,-1.731736,0.829734,-0.52868,0.444095,-0.47368,-0.502918,-0.733527,0.751759,-0.106661,-0.480939,-0.307941,0.618532,-0.047485
1,-1.727845,-1.562485,0.572057,0.444095,-0.47368,0.785186,1.363276,-1.330214,-0.106661,2.079267,-0.307941,-1.616732,-0.047485
2,-1.723954,0.829734,-0.253496,-0.473012,-0.47368,-0.48934,1.363276,-1.330214,-0.106661,-0.480939,-0.307941,0.618532,-0.047485
3,-1.720063,-1.562485,0.365669,0.444095,-0.47368,0.419408,1.363276,-1.330214,-0.106661,-0.480939,-0.307941,0.618532,-0.047485
4,-1.716172,0.829734,0.365669,-0.473012,-0.47368,-0.486825,-0.733527,0.751759,-0.106661,-0.480939,-0.307941,0.618532,-0.047485


In [20]:
# checking if all null values are removed
X_filled.isnull().sum()

PassengerId    0
Pclass         0
Age            0
SibSp          0
Parch          0
Fare           0
0              0
1              0
2              0
3              0
4              0
5              0
6              0
dtype: int64

In [21]:
# sigmoid activation function
def sigmoid(x):
    return 1/(1 + np.exp(-x))

In [22]:
# cost function for gradient descent
def cost_function(theta,X, y, r):

    m, n = X.shape
    h = sigmoid(np.dot(X, theta))
    J = (-1/m) * (np.dot(y.T, np.log(h)) + np.dot((1 - y).T, np.log(1-h))) + (r * np.sum(theta))

    return J[0].astype(float).item()


In [23]:
# implementation of gradient descent algorithm  

def grad(X, y, theta, alpha, num_iters, r):
    # r - L1 regularization term
    costs = []
    # m - number of samples
    m = X.shape[0]
    
    for i in range(0, num_iters):
        
        z = np.dot(X, theta)
        
        h = sigmoid(z)
        costs.append(cost_function(theta, X, y, r))
        print(f"Iteration {i+1}: Cost {costs[i]}")
        theta = theta - ((alpha / m) * np.dot(X.T, h - y))
        

    return  theta

In [24]:
X = X_filled
np.random.seed(1)
m = X.shape[0]
n = X.shape[1]

theta = np.zeros((n,1))
Y_train.shape
# find optimum values for theta using gradient descent
theta = (grad(X, (Y_train), theta, 0.1, 500, 0.1))

# calculate cost using optimum theta values
J = cost_function(theta, X, Y_train, 0.1)
print(f"The cost after training in 500 iterations is {J}.")

Iteration 1: Cost 0.6931471805599453
Iteration 2: Cost 0.6859449978053688
Iteration 3: Cost 0.6788834092722247
Iteration 4: Cost 0.6719585029040525
Iteration 5: Cost 0.6651665100278841
Iteration 6: Cost 0.6585037990439261
Iteration 7: Cost 0.6519668694265429
Iteration 8: Cost 0.6455523460200777
Iteration 9: Cost 0.6392569736139555
Iteration 10: Cost 0.6330776117823809
Iteration 11: Cost 0.6270112299747297
Iteration 12: Cost 0.6210549028435179
Iteration 13: Cost 0.6152058057975316
Iteration 14: Cost 0.6094612107683932
Iteration 15: Cost 0.60381848217948
Iteration 16: Cost 0.5982750731067012
Iteration 17: Cost 0.5928285216212316
Iteration 18: Cost 0.5874764473048196
Iteration 19: Cost 0.5822165479288111
Iteration 20: Cost 0.5770465962884979
Iteration 21: Cost 0.5719644371848589
Iteration 22: Cost 0.5669679845461959
Iteration 23: Cost 0.5620552186825557
Iteration 24: Cost 0.5572241836662222
Iteration 25: Cost 0.552472984831925
Iteration 26: Cost 0.5477997863907363
Iteration 27: Cost 0.543