In [1]:
import math
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from tqdm import tqdm

In [2]:
"""
1 - gradient descent
2 - learning rate, maxiteration, tolerance, X, y
3 - add column 0
4 - normalize function
5 - sigmoid function
6 - cost function
7 - cost derivative
8 - predict function 
10 - evaluate function
    precision
    recall
"""

'\n1 - gradient descent\n2 - learning rate, maxiteration, tolerance, X, y\n3 - add column 0\n4 - normalize function\n5 - sigmoid function\n6 - cost function\n7 - cost derivative\n8 - predict function \n10 - evaluate function\n    precision\n    recall\n'

In [11]:
class LogisticRegression:
    def __init__(self, learningRate, maxIteration, tolerance):
        self.learningRate = learningRate
        self.maxIteration = maxIteration
        self.tolerance = tolerance
        self.train_X, self.test_X, self.train_y, self.test_y = self.readDataset()

    def readDataset(self):
        train_df = pd.read_excel('Lab3_data.xls', sheet_name = '2004--2005 Data')
        test_df = pd.read_excel('Lab3_data.xls', sheet_name = '2004--2007 Data')

        train_df, test_df = train_df.values, test_df.values

        train_X, train_y = train_df[:, 1:], train_df[:, 0]
        test_X, test_y = test_df[:, 1:], test_df[:, 0]
        return train_X, test_X, train_y, test_y

    def addX0(self, X):
        return np.column_stack([np.ones([X.shape[0], 1]), X])

    def sigmoid(self, z):
        sig = 1 / (1+ np.exp(-z))
        return sig

    def costFunction(self, X, y):
        sig = self.sigmoid(X.dot(self.w))
        loss = y * np.log(sig) + (1-y) * np.log(1-sig)
        cost = - loss.sum()
        return cost

    def gradient(self, X, y):
        sig = self.sigmoid(X.dot(self.w))
        grad = (sig-y).dot(X)
        return grad

    def gradientDescent(self, X, y):
        losses = []
        prev_loss = float('inf')

        for i in tqdm(range(self.maxIteration), colour = 'red'):
            self.w = self.w - self.learningRate * self.gradient(X, y)
            current_loss = self.costFunction(X, y)
            diff_loss = np.abs(prev_loss - current_loss)
            losses.append(current_loss)

            if diff_loss < self.tolerance:
                print("The model stopped learning")
                break

            prev_loss = current_loss

        #self.plot_cost(losses) ---- need to find the code and add this later from the lecture

    def predict(self, X):
        sig = self.sigmoid(X.dot(self.w))
        return np.around(sig) 

    def fit(self):

        self.w = np.ones(self.train_X.shape[1], dtype = np.float64)
        print('Solving using gradient descent')
        self.gradientDescent(self.train_X, self.train_y)

        print("Evaluating the training results")
        y_hat_train = self.predict(self.train_X)

        recall, precision, f_score = self.evaluateFunction(self.train_y, y_hat_train)
        print("The recall of the model was {}".format(recall))
        print("The precision of the model was {}".format(precision))
        print("The F1 score of the model was {}".format(f_score))



    def evaluateFunction(self, y, y_hat):
        y = (y == 1)
        y_hat = (y_hat ==1)

        precision = (y & y_hat).sum() / y_hat.sum()
        recall = (y & y_hat).sum() / y.sum()

        f_score = 2 * (precision * recall)/ (precision + recall)

        return recall, precision, f_score
    
    # def plot function
    # def plot 3d


In [12]:
lr = LogisticRegression(tolerance=0.0, learningRate=0.1e-5, maxIteration= 10000)

In [13]:
lr.fit()

Solving using gradient descent


  loss = y * np.log(sig) + (1-y) * np.log(1-sig)
  loss = y * np.log(sig) + (1-y) * np.log(1-sig)
100%|[31m██████████[0m| 10000/10000 [00:00<00:00, 36652.60it/s]

Evaluating the training results
The recall of the model was 0.9722222222222222
The precision of the model was 0.9210526315789473
The F1 score of the model was 0.9459459459459458



