<pre>
Reference: 
Machine Learning- Sudeshna Sarkar (Logistic Regression): https://www.youtube.com/watch?v=CE03E80wbRE
</pre>

## We will build a basic logistic regression model on the Titanic dataset

In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_curve, auc, roc_auc_score

In [2]:
np.random.seed(42)

In [3]:
df = pd.read_csv('../data/titanic_train.csv')

In [4]:
df = df.replace([np.inf, -np.inf], np.nan)

In [5]:
# Impute missing ages with the column mean. The result is assigned back because an
# in-place fillna on the selected column is not guaranteed to propagate to `df`.
df['Age'] = df['Age'].fillna(df['Age'].mean())

In [6]:
# Binary indicator: 1 when a cabin value is recorded, 0 when it is missing.
df['has_cabin'] = df['Cabin'].notna().astype('int64')

In [7]:
df.columns

Index(['PassengerId', 'Survived', 'Pclass', 'Name', 'Sex', 'Age', 'SibSp',
       'Parch', 'Ticket', 'Fare', 'Cabin', 'Embarked', 'has_cabin'],
      dtype='object')

In [8]:
df = df[['Age', 'Survived', 'Sex', 'PassengerId', 'has_cabin']]

In [9]:
# Use the passenger id as the row index so it is not treated as a feature column.
df = df.set_index('PassengerId')

In [10]:
# Encode Sex numerically: male -> 0, female -> 1.
# NOTE(review): re-running this cell on the already-encoded column would no longer match the keys.
df['Sex'] = df['Sex'].map({'male':0, 'female':1})

In [11]:
df.shape

(891, 4)

In [12]:
df.head()

Unnamed: 0_level_0,Age,Survived,Sex,has_cabin
PassengerId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
1,22.0,0,0,0
2,38.0,1,1,1
3,26.0,1,1,0
4,35.0,1,1,1
5,35.0,0,0,0


In [13]:
from sklearn import preprocessing
min_max_scaler = preprocessing.MinMaxScaler(feature_range=(-1,1))

In [14]:
# Temporarily remove the target so the scaler is fitted on the feature columns only,
# then put the target back (it ends up as the last column).
# NOTE(review): this fit sees every row, including rows that are later placed in the test split.
df_survived = df.pop('Survived')
min_max_scaler.fit(df)
df['Survived'] = df_survived

In [15]:
# 70/30 split. The seed is pinned on the call itself so that re-running this cell alone
# yields the same partition instead of depending on how much of the global NumPy RNG
# has already been consumed. Both frames still contain 'Survived' at this point.
X_train, X_test = train_test_split(df, test_size=0.3, random_state=42)

In [16]:
Y_train = X_train.pop('Survived')
Y_test = X_test.pop('Survived')

In [17]:
# Fit the scaler on the training rows only and reuse those statistics for the held-out
# rows, so that nothing from the test split influences the preprocessing.
# Test values outside the training range may therefore fall slightly outside [-1, 1].
X_train = min_max_scaler.fit_transform(X_train)
X_test = min_max_scaler.transform(X_test)

## Build a custom logistic regression model

In [18]:
# sigmoid is simply: 1/(1+ e^(-x))
def sigmoid(values):
    """Element-wise logistic function ``1 / (1 + exp(-x))``.

    The value is computed from ``exp(-|x|)`` only, which lies in (0, 1], so
    inputs of large magnitude cannot overflow ``np.exp``.

    Parameters
    ----------
    values : scalar or array-like
        Input value(s); non-floating input is converted to float.

    Returns
    -------
    NumPy scalar for scalar input, otherwise an ndarray with the shape of
    ``values``, holding probabilities in [0, 1].
    """
    values = np.asarray(values)
    if not np.issubdtype(values.dtype, np.floating):
        values = values.astype(float)

    # exp(-|x|) is bounded by 1, unlike exp(-x) which explodes for very negative x.
    decay = np.exp(-np.abs(values))
    # x >= 0: 1 / (1 + e^-x);  x < 0: e^x / (1 + e^x) -- algebraically the same function.
    result = np.where(values >= 0, 1 / (1 + decay), decay / (1 + decay))
    # Indexing with () turns a 0-d array back into a scalar and leaves n-d arrays as they are.
    return result[()]

In [19]:
sigmoid(np.array([1,2,3]))

array([0.73105858, 0.88079708, 0.95257413])

In [20]:
# log_likelihood which we want to maximise
# Formula:
# Sigma (y_i * log(h(x_i)) + (1-y_i) log(1-h(x_i)))
# where y_i is ith output value
# h(x_i) is sigmoid(B'X) [sigmoid of output of i_th input features]
def log_likelihood(expected_output, predicted_output):
    """Bernoulli log-likelihood of the labels under the predicted probabilities.

    Parameters
    ----------
    expected_output : array-like
        True labels y_i (0 or 1).
    predicted_output : array-like
        Predicted probabilities h(x_i), same shape as ``expected_output``.

    Returns
    -------
    float
        Sum over samples of ``y*log(p) + (1-y)*log(1-p)``; always <= 0 and
        closer to 0 for better predictions.
    """
    expected_output = np.asarray(expected_output, dtype=float)
    # Keep probabilities strictly inside (0, 1): otherwise an exact 0 or 1 yields
    # log(0) = -inf and the product 0 * -inf turns the whole sum into nan.
    eps = np.finfo(float).eps
    probabilities = np.clip(np.asarray(predicted_output, dtype=float), eps, 1 - eps)
    return np.sum(
        expected_output * np.log(probabilities)
        + (1 - expected_output) * np.log(1 - probabilities)
    )

In [21]:
print('log_likelihood 1', log_likelihood(np.array([1,1,0,0]), np.array([0.9,0.8,0.3,0.1])))
print('log_likelihood 2 ', log_likelihood(np.array([1,1,0,0]), np.array([0.9,0.9,0.3,0.1])))
# log_likelihood 2 > 1 because the predicted values are closer to expected values

log_likelihood 1 -0.7905395265685948
log_likelihood 2  -0.6727564909122112


In [22]:
X_train.shape, Y_train.shape

((623, 3), (623,))

In [23]:
Y_train.values

array([1, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 1, 1,
       0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0,
       1, 0, 0, 1, 1, 1, 1, 0, 0, 1, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 1, 1, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 1, 1, 1, 1, 0, 0, 0, 0,
       1, 1, 0, 1, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 1, 1, 0, 0, 1, 1,
       1, 0, 1, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 0, 0, 0, 1,
       1, 0, 1, 0, 1, 1, 1, 0, 1, 0, 0, 0, 1, 1, 0, 1, 1, 0, 0, 0, 1, 0,
       1, 0, 0, 0, 1, 0, 1, 1, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1,
       0, 0, 0, 1, 0, 0, 1, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 1,
       0, 0, 1, 1, 1, 0, 0, 0, 1, 1, 0, 1, 1, 0, 1, 0, 1, 0, 0, 0, 0, 1,
       1, 1, 1, 0, 1, 1, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0,
       1, 0, 0, 0, 0, 0, 1, 1, 0, 1, 0, 0, 1, 0, 1, 0, 1, 0, 0, 0, 0, 1,
       0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 0, 1, 1, 0, 1, 0, 1, 1, 0, 1, 0, 1,
       0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0,

In [24]:
# If we want an intercept value then we add a new feature that is all 1s.
# Formula, for each step:
# compute the prediction using B' * X, where B is the weights and X is the features
# the predicted output is passed to the sigmoid function
# the difference between actual and predicted values gives the error
# this error, scaled by the learning rate, is the gradient step used to update the weights

def logistic_regression(features, target, num_steps, learning_rate, fit_intercept=True, log_steps=100):
    """Fit logistic regression weights with per-sample gradient ascent.

    Every step is one full pass over the rows in their given order (no
    shuffling); the weights are updated after each individual row.

    Parameters
    ----------
    features : array-like, shape (n_samples, n_features)
        Input features.
    target : array-like, shape (n_samples,)
        Binary labels. Converted to an ndarray so that rows are always matched
        by position, even when a pandas Series with a custom index is passed.
    num_steps : int
        Number of passes over the data.
    learning_rate : float
        Multiplier applied to each per-sample gradient.
    fit_intercept : bool, default True
        When True a column of ones is prepended and ``weights[0]`` is the intercept.
    log_steps : int, default 100
        Print the log-likelihood every ``log_steps`` passes; 0 or None disables logging.

    Returns
    -------
    numpy.ndarray
        Learned weights (intercept first when ``fit_intercept`` is True).

    Raises
    ------
    ValueError
        If ``features`` and ``target`` contain a different number of samples.
    """
    features = np.asarray(features, dtype=float)
    target = np.asarray(target)
    if features.shape[0] != target.shape[0]:
        raise ValueError(
            'features and target must have the same number of samples, got '
            + str(features.shape[0]) + ' and ' + str(target.shape[0])
        )

    if fit_intercept:
        # adding intercept, Beta_0
        intercept = np.ones((features.shape[0], 1))
        features = np.hstack((intercept, features))

    weights = np.zeros(features.shape[1])
    for i in range(num_steps):
        # A falsy log_steps switches logging off instead of dividing by zero.
        should_log = bool(log_steps) and i % log_steps == 0
        predicted_outputs = []
        for row, expected in zip(features, target):
            predicted_output = sigmoid(np.dot(row, weights))
            if should_log:
                # Recorded before this row's update, so the logged likelihood reflects
                # the predictions made during the pass, not the end-of-pass weights.
                predicted_outputs.append(predicted_output)
            # Per-sample gradient of the log-likelihood: (y - p) * x
            output_error = expected - predicted_output
            weights += learning_rate * (row * output_error)

        if should_log:
            print('step:', i, 'log_likelihood:', log_likelihood(target, np.array(predicted_outputs)))
    return weights

In [25]:
# Same forward pass as in training, but with fixed weights: no learning rate and no updates.
def logistic_predict(features, weights, fit_intercept=True):
    """Return the predicted probability of the positive class for each row.

    Parameters
    ----------
    features : 2-D array of input features, one row per sample.
    weights : weight vector; its first entry is the intercept when
        ``fit_intercept`` is True.
    fit_intercept : when True, a leading column of ones is prepended to
        ``features`` so that it lines up with the intercept weight.
    """
    design = features
    if fit_intercept:
        # Beta_0 multiplies a constant feature equal to 1
        bias_column = np.ones((design.shape[0], 1))
        design = np.hstack((bias_column, design))
    return sigmoid(np.dot(design, weights))

In [26]:
weights = logistic_regression(X_train, Y_train.values, num_steps=10000, learning_rate=5e-5, log_steps=1000)

step: 0 log_likelihood: -430.5298890220595
step: 1000 log_likelihood: -297.7290414711366
step: 2000 log_likelihood: -296.927818962074
step: 3000 log_likelihood: -296.6645579833639
step: 4000 log_likelihood: -296.5578778378138
step: 5000 log_likelihood: -296.51403678577753
step: 6000 log_likelihood: -296.49592891273306
step: 7000 log_likelihood: -296.48842498333215
step: 8000 log_likelihood: -296.48530816958885
step: 9000 log_likelihood: -296.4840113518603


In [27]:
print('intercept', weights[0], 'coef_', weights[1:])

intercept -0.025388469476834076 coef_ [-0.77621443  1.2261221   0.85275319]


In [28]:
roc_auc_score(Y_test, logistic_predict(X_test, weights))

0.8478223446376313

## Build a logistic regression model with scikit-learn for validation

In [29]:
from sklearn.linear_model import LogisticRegression

In [30]:
# C=5 reduces the regularization applied to the coefficients (C is the inverse
# regularization strength in scikit-learn), because our custom model has no regularization.
model = LogisticRegression(fit_intercept=True, C=5, max_iter=10000, verbose=5)
model = model.fit(X_train, Y_train)

[LibLinear]

In [31]:
roc_auc_score(Y_test, model.predict_proba(X_test)[:,1])

0.8478223446376313

In [32]:
print('intercept', model.intercept_, 'coef', model.coef_)

intercept [-0.02545202] coef [[-0.76697725  1.22293027  0.84869292]]
