Krishna Penukonda

1001781

# Logistic Regression

 **Part (a)**:

> Probabilities lie in the range `[0, 1]`.

> In `Equation 1`, we multiply a potentially large number of probabilities, which would result in a vanishingly small value.
> Such values either result in inaccuracies or are computationally expensive to represent.

> If in its stead we use `Equation 2`, we could use the more computationally efficient addition operation without running into the vanishing value problem.

**Part (b)**:

## Import Dependencies

In [707]:
import os
import numpy as np
import pandas as pd
import itertools

## Constants and Hyperparameters

In [700]:
PATH = "HW3_data/4/diabetes_train.csv"
TEST_FRACTION = 0.2

## Helper Functions

In [701]:
# Data loaders
def split_xy(data):
    return data[:, 1:], data[:, 0]

def preprocess(x, y):
    """
    Transform labels from [-1, 1] to [0, 1]
    """
    return x, (y + 1) / 2

def train_test_split(x, y, fraction=TEST_FRACTION):
    assert len(x) == len(y)
    split_index = int(len(x) * fraction)
    x_train, x_test = x[split_index:], x[:split_index]
    y_train, y_test = y[split_index:], y[:split_index]
    return x_train, y_train, x_test, y_test

## Define Model

In [790]:
eps = np.finfo(np.float64).eps

class LogisticRegression:
    def __init__(self, model_dir="model", save_every=100, plot_every=100):
        self.save_every = save_every
        self.plot_every = plot_every
        self.model_dir = model_dir
        os.makedirs(self.model_dir, exist_ok=True)

    def concat_ones(self, x):
        if len(x.shape) == 1:
            concat_shape = (1,)
        else:
            concat_shape = (len(x), 1)
        return np.concatenate((np.ones(concat_shape), x), axis=-1)

    def sigmoid(self, x):
        return 1 / (1 + np.exp(-x))

    def loss(self, y_pred, y_true):
        return np.mean(-y_true * np.log(y_pred + eps) - (1 - y_true) * np.log(1 - y_pred + eps))

    def fit(self, features, labels, lr=0.01, iterations=1):
        ll_history = []

        assert len(features) == len(labels)
        num_samples = len(features)
        features = self.concat_ones(features)  # Add 1 to feature vectors
        self.W = np.zeros((features.shape[1])) # Initialize parameters to 0

        for i in itertools.count():  # Enables indefinite training by setting `iterations=0`
            if iterations and i == iterations:
                break
            index = np.random.randint(num_samples)
            x = features[index]
            y_true = labels[index]
            y_pred = self.sigmoid(np.dot(x, self.W))
            self.W -= lr * x * (y_pred - y_true)

            if i % self.save_every == 0:
                np.save(f"model/step_{i}", self.W)

            if i % self.plot_every == 0:
                pred = self.sigmoid(np.dot(features, self.W))
                ll = self.log_likelihood(pred, labels)
                ll_history.append(ll)
        print(ll_history)

    def transform(self, x):
        x = self.concat_ones(x)
        return self.sigmoid(np.dot(x, self.W))

    def log_likelihood(self, y_pred, y_true):
        y_pred = (y_pred * 2) - 1  # Transform back to [-1, 1]
        y_true = (y_true * 2) - 1
        return np.mean(np.log(1 + np.exp(-y_pred * y_true)), axis=0)

    def evaluate(self, features, labels):
        y_pred = self.transform(features)
        ll = self.log_likelihood(y_pred, labels)
        print("Log-likelihood:", ll)

## Load Data

In [791]:
data = pd.read_csv(PATH, header=None).to_numpy()
x, y = split_xy(data)
x, y = preprocess(x, y)
# x_train, y_train, x_test, y_test = train_test_split(x, y)

## Train Model

In [792]:
model = LogisticRegression()
model.fit(x, y, lr=0.1, iterations=10000)

[0.653005583448345, 0.46760202123116495, 0.4580056293841321, 0.4447249559618966, 0.4384119320620811, 0.43260983367391465, 0.4173450341537369, 0.4109152829713283, 0.4185961138181349, 0.4024096988054098, 0.39713463590392833, 0.3994667357295151, 0.39421654329572264, 0.38946214455806233, 0.38989212936234285, 0.38526858827054095, 0.38496450554800016, 0.3814532418261249, 0.3805576766849137, 0.3782266599531407, 0.378093253118003, 0.37526367310082914, 0.37404671606175394, 0.37373290721171604, 0.37427073171866276, 0.37152014251025645, 0.3705441095819804, 0.3739383345252063, 0.36898671411456674, 0.3681464291733534, 0.3668170924107653, 0.3692723416792016, 0.36541091767783745, 0.3687280108825226, 0.36974698136184814, 0.36761237893139653, 0.36397603183062466, 0.36378351154403527, 0.36361339624419814, 0.36277448284485203, 0.36167267580269147, 0.36292867081528973, 0.36399925674010153, 0.3617391398945139, 0.3604947198669986, 0.36112871812817426, 0.36049961414629994, 0.359359490787791, 0.35907769091413

In [793]:
model.evaluate(x, y)

Log-likelihood: 0.34667505630741235
