Krishna Penukonda

1001781

# Logistic Regression

 **Part (a)**:

> Probabilities lie in the range `[0, 1]`.

> In `Equation 1`, we multiply a potentially large number of probabilities, which results in a vanishingly small value.
> Such values either underflow or lose precision in floating-point arithmetic, or are computationally expensive to represent exactly.

> If we instead use `Equation 2`, we can use the more computationally efficient addition operation without running into the vanishing-value problem.

**Part (b)**:

## Import Dependencies

In [707]:
import os
import numpy as np
import pandas as pd
import itertools

## Constants and Hyperparameters

In [700]:
# Relative path of the training CSV that is loaded below.
PATH = "HW3_data/4/diabetes_train.csv"
# Default fraction of samples that train_test_split() places in the test set.
TEST_FRACTION = 0.2

## Helper Functions

In [794]:
# Data loaders
def split_xy(data):
    """Separate a 2-D array into features and labels.

    Column 0 is returned as the labels; every remaining column is returned
    as the features. The result is the tuple ``(features, labels)``.
    """
    labels = data[:, 0]
    features = data[:, 1:]
    return features, labels

def preprocess(x, y):
    """
    Map labels from the range [-1, 1] onto [0, 1] via ``(y + 1) / 2``.

    The features ``x`` are passed through unchanged; the return value is
    the tuple ``(x, rescaled_labels)``.
    """
    rescaled = (y + 1) / 2
    return x, rescaled

def train_test_split(x, y, fraction=TEST_FRACTION):
    """Split paired samples into a training part and a test part.

    The first ``int(len(x) * fraction)`` samples form the test set and the
    remaining samples form the training set; no shuffling is performed.

    Returns ``(x_train, y_train, x_test, y_test)``.
    """
    assert len(x) == len(y)
    cut = int(len(x) * fraction)
    x_test, y_test = x[:cut], y[:cut]
    x_train, y_train = x[cut:], y[cut:]
    return x_train, y_train, x_test, y_test

## Define Model

In [795]:
eps = np.finfo(np.float64).eps  # float64 machine epsilon; added inside log() to avoid log(0)

class LogisticRegression:
    """Binary logistic-regression model trained one randomly drawn sample at a time.

    Parameters
    ----------
    model_dir : directory that receives weight checkpoints; created if missing.
    save_every : `fit` writes a checkpoint every `save_every` iterations.
    plot_every : `fit` records the full-dataset loss every `plot_every` iterations.
    """

    def __init__(self, model_dir="model", save_every=100, plot_every=100):
        self.save_every = save_every
        self.plot_every = plot_every
        self.model_dir = model_dir
        os.makedirs(self.model_dir, exist_ok=True)

    def concat_ones(self, x):
        """Prepend a constant 1 to a single feature vector or to every row of a matrix.

        The leading 1 pairs with the first weight in `fit`, which therefore
        serves as the intercept term.
        """
        if len(x.shape) == 1:
            concat_shape = (1,)
        else:
            concat_shape = (len(x), 1)
        return np.concatenate((np.ones(concat_shape), x), axis=-1)

    def sigmoid(self, x):
        """Return the element-wise logistic function ``1 / (1 + exp(-x))``."""
        return 1 / (1 + np.exp(-x))

    def loss(self, y_pred, y_true):
        """Return the mean binary cross-entropy of `y_pred` against `y_true`.

        `eps` is added inside both logarithms so predictions of exactly 0 or 1
        do not produce ``log(0)``.
        """
        return np.mean(-y_true * np.log(y_pred + eps) - (1 - y_true) * np.log(1 - y_pred + eps))

    def fit(self, features, labels, lr=0.01, iterations=1):
        """Train the weights with single-sample stochastic gradient descent.

        Parameters
        ----------
        features : 2-D array, one row per sample (a column of ones is prepended here).
        labels : 1-D array with one label per sample; the update ``y_pred - y_true``
            compares against a sigmoid output, so labels are presumably in {0, 1}.
        lr : learning rate applied to every update.
        iterations : number of updates to run; 0 trains indefinitely.

        Side effects: sets `self.W`, writes ``step_<i>.npy`` checkpoints into
        `self.model_dir`, and prints the list of recorded loss values at the end.
        """
        ll_history = []

        assert len(features) == len(labels)
        num_samples = len(features)
        features = self.concat_ones(features)  # Add 1 to feature vectors
        self.W = np.zeros((features.shape[1])) # Initialize parameters to 0

        for i in itertools.count():  # Enables indefinite training by setting `iterations=0`
            if iterations and i == iterations:
                break
            # Draw one sample uniformly at random and take a gradient step on it.
            index = np.random.randint(num_samples)
            x = features[index]
            y_true = labels[index]
            y_pred = self.sigmoid(np.dot(x, self.W))
            self.W -= lr * x * (y_pred - y_true)

            if i % self.save_every == 0:
                # Save into the configured directory (the one created in __init__),
                # not a hard-coded "model/" path.
                np.save(os.path.join(self.model_dir, f"step_{i}"), self.W)

            if i % self.plot_every == 0:
                pred = self.sigmoid(np.dot(features, self.W))
                ll = self.log_likelihood(pred, labels)
                ll_history.append(ll)
        print(ll_history)

    def transform(self, x):
        """Return the sigmoid output for `x` (a single vector or a matrix of rows).

        Requires `fit` to have been called so that `self.W` exists.
        """
        x = self.concat_ones(x)
        return self.sigmoid(np.dot(x, self.W))

    def log_likelihood(self, y_pred, y_true):
        """Return the mean of ``log(1 + exp(-y_pred * y_true))`` over the samples.

        Both inputs are first mapped from [0, 1] to [-1, 1]. Despite the name,
        the returned quantity is a loss: smaller values mean a better fit.

        NOTE(review): `y_pred` is a sigmoid output rescaled to [-1, 1], not a
        raw linear score -- confirm this matches the intended formula.
        """
        y_pred = (y_pred * 2) - 1  # Transform back to [-1, 1]
        y_true = (y_true * 2) - 1
        return np.mean(np.log(1 + np.exp(-y_pred * y_true)), axis=0)

    def evaluate(self, features, labels):
        """Print the `log_likelihood` value of the model's predictions on `features`."""
        y_pred = self.transform(features)
        ll = self.log_likelihood(y_pred, labels)
        print("Log-likelihood:", ll)

## Load Data

In [796]:
# Read the headerless CSV into a NumPy array, then separate features from labels.
data = pd.read_csv(PATH, header=None).to_numpy()
x, y = split_xy(data)
# Rescale the labels with preprocess() before training.
x, y = preprocess(x, y)
# Hold-out split is disabled: the model below is fit and evaluated on the full dataset.
# x_train, y_train, x_test, y_test = train_test_split(x, y)

## Train Model

In [797]:
# Default constructor arguments: checkpoint directory "model", save/record every 100 iterations.
model = LogisticRegression()
# Run 10,000 single-sample updates with learning rate 0.1; fit() prints the recorded loss history.
model.fit(x, y, lr=0.1, iterations=10000)

[0.6545493786710933, 0.466740791498438, 0.4576984276623992, 0.46630203355405486, 0.43459230699763657, 0.4208569137046478, 0.41831796469324034, 0.4086740345764076, 0.40442790054076977, 0.4006075919713521, 0.3972036716500872, 0.395617231112861, 0.39087887208642835, 0.38804356095375925, 0.39218414897668635, 0.3837362032872491, 0.38836038092788766, 0.3806740680792935, 0.3873949478607018, 0.38308516588317304, 0.375539954045283, 0.37459486292591737, 0.37340770253506783, 0.37502047769576724, 0.3763889562670835, 0.3728805822011757, 0.37051863903964705, 0.37190924975061324, 0.3696260523603652, 0.36971435082560183, 0.3689531164104237, 0.37258982102242133, 0.37706000926633093, 0.3665356713440492, 0.3670977868762637, 0.3661475436831668, 0.36486031364760346, 0.36548336942042475, 0.3626275800269112, 0.3623443900768206, 0.361206662854668, 0.3608977292999043, 0.36031858427249136, 0.3597017055906214, 0.35993857781454863, 0.35908367042581896, 0.36265119668811596, 0.3585773618679106, 0.35992535757815614,

In [793]:
# Report the loss on the same data the model was trained on (no held-out set is used).
model.evaluate(x, y)

Log-likelihood: 0.34667505630741235
