In [29]:
import numpy as np
import pandas as pd
from sklearn.utils import shuffle
from sklearn.preprocessing import  StandardScaler as scaler
from sklearn.metrics import confusion_matrix, classification_report, ConfusionMatrixDisplay
from sklearn.model_selection import train_test_split as tts

---
# **Logistic Regression Model**

In [30]:
class LogisticRegression:
    """Binary logistic-regression classifier trained with batch gradient ascent.

    Parameters
    ----------
    lr : float, default=0.1
        Step size applied to the mean log-likelihood gradient on each iteration.
    max_iters : int, default=500
        Number of full-batch gradient steps performed by ``fit``.

    Attributes
    ----------
    weights : numpy.ndarray of shape (n_features + 1,)
        Created by ``fit``. ``weights[0]`` is the intercept and ``weights[1:]``
        are the per-feature coefficients.
    """

    def __init__(self, lr=0.1, max_iters=500):
        self.lr = lr
        self.max_iters = max_iters

    def sigmoid(self, x):
        """Return the logistic function ``1 / (1 + exp(-x))`` element-wise.

        The value is computed through ``exp(-|x|)``, which lies in (0, 1], so
        inputs of large magnitude cannot overflow the exponential.
        """
        z = np.asarray(x, dtype=float)
        decay = np.exp(-np.abs(z))
        # For z >= 0: 1 / (1 + e^-z); for z < 0: e^z / (1 + e^z). Same function.
        return np.where(z >= 0, 1.0 / (1.0 + decay), decay / (1.0 + decay))

    def fit(self, X, y):
        """Learn intercept and coefficients from training data.

        Parameters
        ----------
        X : array-like of shape (n_samples, n_features)
            Feature matrix.
        y : array-like with n_samples entries
            Binary targets encoded as 0/1. Flattened to one dimension so a
            column vector does not broadcast against the predictions.

        Returns
        -------
        LogisticRegression
            The fitted estimator (``self``).

        Raises
        ------
        ValueError
            If ``X`` has no rows or ``X`` and ``y`` disagree on sample count.
        """
        X = np.asarray(X, dtype=float)
        y = np.asarray(y, dtype=float).ravel()
        n_samples = X.shape[0]

        if n_samples == 0:
            raise ValueError("Cannot fit on an empty dataset.")
        if y.shape[0] != n_samples:
            raise ValueError(
                f"X has {n_samples} samples but y has {y.shape[0]}."
            )

        # Prepend a column of ones so the intercept is learned as weights[0].
        design = np.concatenate((np.ones((n_samples, 1)), X), axis=1)

        self.weights = np.zeros(design.shape[1])

        for _ in range(self.max_iters):
            y_hat = self.sigmoid(np.dot(design, self.weights))

            # Mean gradient of the log-likelihood with respect to the weights.
            grad = np.dot(design.T, y - y_hat) / n_samples
            # Scale the step by the configured learning rate.
            self.weights += self.lr * grad

        return self

    def predict(self, X):
        """Return a boolean array: True where the predicted probability is >= 0.5.

        ``X`` is expected to be 2-D, one row per sample; all rows are scored
        with a single matrix product.
        """
        return self._predict(np.asarray(X, dtype=float)) >= 0.5

    def _predict(self, x):
        """Return the predicted probability of class 1 for a row or a matrix of rows."""
        return self.sigmoid(np.dot(x, self.weights[1:]) + self.weights[0])

---
## **Code Usage**

In [31]:
def dataloader(path='../data/cancer_detection.csv'):
    """Read the cancer-detection CSV and encode the diagnosis label as 0/1.

    Parameters
    ----------
    path : str or path-like, default='../data/cancer_detection.csv'
        Location of the CSV file to read.

    Returns
    -------
    pandas.DataFrame
        The data without its first and last columns, with ``diagnosis``
        mapped to 1 for 'M' and 0 for 'B'. Any other label value becomes NaN
        because it has no entry in the mapping.
    """
    raw = pd.read_csv(path)

    # The first and last columns are treated as non-informative and removed.
    # Chaining (rather than inplace=True) leaves `raw` untouched.
    return raw.drop(columns=raw.columns[[0, -1]]).assign(
        # Malignant -> 1, Benign -> 0
        diagnosis=lambda frame: frame['diagnosis'].map({'M': 1, 'B': 0})
    )

def preprocessor(df):
    """Turn the labelled frame into a standardized, shuffled ``(X, y)`` pair.

    The ``diagnosis`` column becomes the target vector. Every other column is
    used as a feature, passed through ``scaler().fit_transform``, and then
    features and targets are shuffled together with a fixed seed (42).

    NOTE(review): the scaler is fitted on every row passed in, so any split
    performed afterwards sees statistics computed from the whole dataset.

    Returns
    -------
    tuple
        ``(X, y)``: the scaled feature matrix and the matching target array.
    """
    targets = df['diagnosis'].to_numpy()
    features = df.drop(columns=['diagnosis']).to_numpy()

    scaled = scaler().fit_transform(features)
    shuffled_X, shuffled_y = shuffle(scaled, targets, random_state=42)

    return shuffled_X, shuffled_y

In [32]:
# Load the CSV, then scale and shuffle it into feature/target arrays.
X, y = preprocessor(dataloader())

print("Feature Shape:", X.shape)
print("Target Shape:", y.shape)

# Hold out a test partition (train_test_split defaults, fixed seed for reproducibility).
train_X, test_X, train_y, test_y = tts(X, y, random_state=42)

print("Number of examples in training set:", len(train_X))
print("Number of examples in test set:", len(test_X))


Feature Shape: (569, 30)
Target Shape: (569,)
Number of examples in training set: 426
Number of examples in test set: 143


In [33]:
# lr=1.0 is the unit step on the mean gradient that produces the metrics reported below.
cls = LogisticRegression(lr=1.0, max_iters=50)
cls.fit(train_X, train_y)

# Performance on training set

train_pred = cls.predict(train_X)

print(classification_report(train_y, train_pred))

              precision    recall  f1-score   support

           0       0.98      0.99      0.99       268
           1       0.99      0.97      0.98       158

    accuracy                           0.98       426
   macro avg       0.98      0.98      0.98       426
weighted avg       0.98      0.98      0.98       426



In [34]:
# Score the held-out partition with the model fitted above.
test_pred = cls.predict(test_X)

print(classification_report(y_true=test_y, y_pred=test_pred))

              precision    recall  f1-score   support

           0       0.97      1.00      0.98        89
           1       1.00      0.94      0.97        54

    accuracy                           0.98       143
   macro avg       0.98      0.97      0.98       143
weighted avg       0.98      0.98      0.98       143

