In [12]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split

In [13]:
train = pd.read_csv("train.csv")
train.drop(['PassengerId'], inplace=True, axis=True)

In [14]:
# test.isnull().sum() + train.isnull().sum()

In [15]:
num_cols = ['Pclass', 'Age', 'Fare', 'FamilySize']
target = ['Survived']

In [16]:
def create_honorific(df):
  df['Honorific'] = (df['Name'].apply(lambda name : name.split(',')[1].split('.')[0].strip()))
  df['Honorific'].replace(['Rev', 'Col', 'Dr', 'Major', 'Don', 'Capt'], 'Mr', inplace=True)
  df['Honorific'].replace(['Dona', 'Countess'], 'Mrs', inplace=True)
  df['Honorific'].replace(['Mlle', 'Ms'], 'Miss', inplace=True)
  return df

train = create_honorific(train)

In [17]:
def fix_embarked(df):
  df = df[df['Embarked'].notnull()]
  return df

def fix_fare(df):
  mean_fare = df[df['Fare'].notnull()]['Fare'].mean()
  df.loc[df['Fare'].isnull(), 'Fare'] = mean_fare
  return df

def fix_age(df):
  df['Age'] = df.apply(lambda row: df[df['Honorific'] == row['Honorific']]['Age'].mean() if np.isnan(row['Age']) else row['Age'], axis=1)
  return df

def fix_cabin(df):
  return df.drop(['Cabin'], axis=True)

def fix_dataset(df):
  df = fix_cabin(df)
  df = fix_age(df)
  df = fix_fare(df)
  return df

train = fix_embarked(train)
train = fix_dataset(train)

In [18]:
def create_family_size(df):
  df['FamilySize'] = df['SibSp'] + df['Parch']
  return df

train = create_family_size(train)

In [19]:
def drop_unused_cols(df):
 return  df.drop(['Parch', 'SibSp', 'Ticket', 'Name'], axis=True)

train = drop_unused_cols(train)

In [20]:
X_train, X_test, y_train, y_test = train_test_split(
  train.drop(target, axis=True),
  train[target],
  test_size=0.2
)

In [21]:
train_objs_num = len(X_train)
dataset = pd.concat(objs=[X_train, X_test], axis=0)
dataset_preprocessed = pd.get_dummies(dataset)
X_train = dataset_preprocessed[:train_objs_num]
X_test = dataset_preprocessed[train_objs_num:]

In [22]:
import matplotlib.pyplot as plt

class LogReg():

  def __init__(self):
    self.T = np.array([])

  def sig(self, t):
    return 1 / (1 + np.exp(-t))

  def logloss(Y, predicted):
    return -np.mean(Y * np.log(predicted) + (1 - Y) * np.log(1 - predicted))

  def gradient_descent(self, X, Y, iterations, learning_rate):
    N, P = X.shape
    T = np.zeros(P)
    for iteration in range(iterations):
      t = np.dot(X, T)
      z = self.sig(t)
      dt = ((1 / N) * np.dot(X.T, (z - Y)))
      T -= learning_rate * dt
    self.T = T

  def newton_optimization(self, X, Y, iterations):
    N, P = X.shape
    T = np.zeros(P)
    for iteration in range(iterations):
      t = np.dot(X, T)
      z = self.sig(t)
      dt = (1 / N) * np.dot(X.T, (z - Y))
      H = np.matmul(X.T, (((z * (1 - z)) * X.T).T))
      if np.linalg.det(H) == 0:
        self.T = T
        return
      T -= np.matmul(np.linalg.inv(H), dt)
    self.T = T

  def prob(self, x):
    return self.sig(np.dot(self.T, np.array(x)))


In [23]:
class Metrics():

  def __init__(self, Y_prediction, Y_test):
    TP = np.sum((Y_prediction == 1) & (Y_test == 1))
    TN = np.sum((Y_prediction == 0) & (Y_test == 0))
    FP = np.sum((Y_prediction == 1) & (Y_test == 0))
    FN = np.sum((Y_prediction == 0) & (Y_test == 1))
    accuracy = (TP + TN) / (TP + TN + FP + FN) if (TP + TN + FP + FN) != 0 else 0
    precision = TP / (TP + FP) if (TP + FP) != 0 else 0
    recall = TP / (TP + FN) if (TP + FN) != 0 else 0
    f1_score = 2 * precision * recall / (precision + recall) if (precision + recall) != 0 else 0
    self.accuracy = accuracy
    self.precision = precision
    self.recall = recall
    self.f1 = f1_score


In [24]:
PREDICT_THREASHOLD = 0.6

def predict(clf, X):
  predicted = []
  for x in X:
    predicted.append(int(clf.prob(x) > PREDICT_THREASHOLD))
  return np.array(predicted)

In [25]:
import warnings
warnings.filterwarnings('ignore')

HYPERPARAMS = {
    'iterations': [1000, 5000, 25000],
    'rate': [0.001, 0.01, 0.05, 0.1]
}

df = pd.DataFrame(columns=['Iterations', 'Rate', 'Method', 'Accuracy', 'Precision', 'Recall', 'F1'])
clf = LogReg()

for iterations in HYPERPARAMS['iterations']:
  for rate in HYPERPARAMS['rate']:
    clf.gradient_descent(X_train, np.array(list(map(lambda x: x[0], y_train.to_numpy()))), iterations, rate)
    predicted = predict(clf, X_test.to_numpy())
    mt = Metrics(predicted, np.array(list(map(lambda x: x[0], y_test.to_numpy()))))
    df = df.append({'Iterations': iterations, 'Rate': rate, 'Method': 'gradient', 'Accuracy': mt.accuracy, 'Precision': mt.precision , 'Recall': mt.recall, 'F1': mt.f1}, ignore_index=True)
  clf.newton_optimization(X_train, np.array(list(map(lambda x: x[0], y_train.to_numpy()))), iterations)
  predicted = predict(clf, X_test.to_numpy())
  mt = Metrics(predicted, np.array(list(map(lambda x: x[0], y_test.to_numpy()))))
  df = df.append({'Iterations': iterations, 'Rate': rate, 'Method': 'newton', 'Accuracy': mt.accuracy, 'Precision': mt.precision , 'Recall': mt.recall, 'F1': mt.f1}, ignore_index=True)

df

Unnamed: 0,Iterations,Rate,Method,Accuracy,Precision,Recall,F1
0,1000,0.001,gradient,0.634831,0.8,0.162162,0.269663
1,1000,0.01,gradient,0.646067,0.823529,0.189189,0.307692
2,1000,0.05,gradient,0.685393,0.581818,0.864865,0.695652
3,1000,0.1,gradient,0.646067,0.823529,0.189189,0.307692
4,1000,0.1,newton,0.58427,0.0,0.0,0.0
5,5000,0.001,gradient,0.657303,0.842105,0.216216,0.344086
6,5000,0.01,gradient,0.696629,0.916667,0.297297,0.44898
7,5000,0.05,gradient,0.741573,0.659091,0.783784,0.716049
8,5000,0.1,gradient,0.769663,0.823529,0.567568,0.672
9,5000,0.1,newton,0.58427,0.0,0.0,0.0
