<h1><center>scikit-learn implementations of Extension 2 - Logistic Regression with L1 regularization</center></h1>

### Imports

In [9]:
from PIL import Image
import numpy as np
import glob
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import f1_score
from sklearn.model_selection import train_test_split
from sklearn.linear_model import SGDClassifier
from sklearn.datasets import load_breast_cancer
import matplotlib.pyplot as plt

### Get the dogs data

I took the Chihuahua and Japanese Spaniel images and put them in the "selected_images/class0" directory and I put the Maltese and Pekinese images in the "selected_images/class1" directory.

So the classifier is distinguishing between \[Chihuahua or Japanese Spaniel\] and \[Maltese or Pekinese\].

In [10]:
def get_all_data():
    pics = list()
    y_all = np.array([], dtype=np.int8)
    new_size = (6, 6)

    for i, folder in enumerate(["selected_images/class0", "selected_images/class1"]):
        for f in glob.glob(f"{folder}/*"):
            curr_pic = np.array(Image.open(f).resize(new_size)).reshape(-1) # shrink and flatten the pic
            pics.append(curr_pic)
            y_all = np.concatenate((y_all, [i])) # all pictures in a given folder have the same label
            
    X_all = np.array(pics)
    return X_all, y_all

In [11]:
X_all, y_all = get_all_data()
X_train, X_test, y_train, y_test = train_test_split(X_all, y_all, train_size=0.8, random_state=0)
X_train = StandardScaler().fit_transform(X_train)
X_test = StandardScaler().fit_transform(X_test)

### Get the breast cancer data

In [12]:
X_all_bc, y_all_bc = load_breast_cancer(return_X_y=True)
X_train_bc, X_test_bc, y_train_bc, y_test_bc = train_test_split(X_all_bc, y_all_bc, train_size=0.8, random_state=0)
scaler = StandardScaler().fit(X_train_bc)
X_train_bc = scaler.transform(X_train_bc)
X_test_bc = scaler.transform(X_test_bc)

# Evaluate scikit-learn's F1 scores

### i) Check the F1 score of sklearn's plain logistic regression on the dogs data

In [13]:
plain_SGD = SGDClassifier(loss="log", shuffle=False, alpha=0, penalty=None, eta0=0.1, learning_rate="constant", tol=None, random_state=0).fit(X_train, y_train)
plain_y_pred = plain_SGD.predict(X_test)
print(f"The F1 score of sklearn's plain LR on the dogs data is {100 * f1_score(y_test, plain_y_pred)}")

The F1 score of sklearn's plain LR on the dogs data is 58.68263473053893


### ii) Check the F1 score of sklearn's logistic regression with L1 regularization on the dogs data

In [14]:
l1_SGD = SGDClassifier(loss="log", shuffle=False, penalty="l1", alpha=0.01, tol=None, random_state=0, learning_rate="constant", eta0=0.1).fit(X_train, y_train)
l1_y_pred = l1_SGD.predict(X_test)
print(f"The F1 score of sklearn's LR with L1 on the dogs data is {100 * f1_score(y_test, l1_y_pred)}")

The F1 score of sklearn's LR with L1 on the dogs data is 70.96774193548386


### iii) Check the F1 score of sklearn's plain logistic regression on the breast cancer data

In [15]:
plain_LR_sk = SGDClassifier(loss="log", shuffle=False, alpha=0, penalty=None, eta0=0.1, learning_rate="constant", tol=None, random_state=0).fit(X_train_bc, y_train_bc)
plain_preds_bc_sk = plain_LR_sk.predict(X_test_bc)
print(f"The F1 score of sklearn's plain LR on the breast cancer data is {100 * f1_score(y_test_bc, plain_preds_bc_sk)}")

The F1 score of sklearn's plain LR on the breast cancer data is 95.45454545454547


### iv) Check the F1 score of sklearn's logistic regression with L1 regularization on the breast cancer data

In [16]:
l1_LR_sk = SGDClassifier(loss="log", shuffle=False, penalty="l1", alpha=0.01, tol=None, random_state=0, learning_rate="constant", eta0=0.1).fit(X_train_bc, y_train_bc)
l1_preds_bc_sk = l1_LR_sk.predict(X_test_bc)
print(f"The F1 score of my LR with L1 on the breast cancer data is {100 * f1_score(y_test_bc, l1_preds_bc_sk)}")

The F1 score of my LR with L1 on the breast cancer data is 97.05882352941177
