## DDA3020 Homework 2
### Instructions:
- Follow the notebook and complete the code cells marked as TODO
- Ensure your code runs successfully until the end of the notebook

In [1]:
from os import path as osp
import numpy as np

# load data
def load_data():

    data_dir = './data'
    train_val_data_path = osp.join(data_dir, 'train_validation_data.npy')
    train_val_label_path = osp.join(data_dir, 'train_validation_label.npy')
    test_data_path = osp.join(data_dir, 'test_data.npy')
    test_label_path = osp.join(data_dir, 'test_label.npy')

    train_val_data = np.load(train_val_data_path)
    train_val_label = np.load(train_val_label_path)
    test_data = np.load(test_data_path)
    test_label = np.load(test_label_path)
    return train_val_data, train_val_label, test_data, test_label


train_validation_data, train_validation_label, test_data, test_label = load_data()

print(f'# ========== data info ============ #')
print(f'train validation data: {train_validation_data.shape}')
print(f'train validation label: {train_validation_label.shape}')
print(f'test data: {test_data.shape}')
print(f'test label: {test_label.shape}')
print(f'# ================================= #')

train validation data: (1000, 100)
train validation label: (1000,)
test data: (400, 100)
test label: (400,)


In [8]:
# data split for K-fold Cross-validation

def train_validation_split(K, train_val_data, train_val_label):
    train_datas = []
    train_labels = []
    val_datas = []
    val_labels = []

    class_0_indices = np.where(train_val_label == 0)[0]
    class_1_indices = np.where(train_val_label == 1)[0]

    np.random.shuffle(class_0_indices)
    np.random.shuffle(class_1_indices)

    num_val_data_per_class = 200 // 2
    num_train_data_per_class = 800 // 2

    for i in range(K):
        val_indices_0 = class_0_indices[i * num_val_data_per_class:(i + 1) * num_val_data_per_class]
        val_indices_1 = class_1_indices[i * num_val_data_per_class:(i + 1) * num_val_data_per_class]
        val_indices = np.concatenate((val_indices_0, val_indices_1))

        train_indices_0 = np.concatenate((class_0_indices[:i * num_val_data_per_class], class_0_indices[(i + 1) * num_val_data_per_class:]))
        train_indices_1 = np.concatenate((class_1_indices[:i * num_val_data_per_class], class_1_indices[(i + 1) * num_val_data_per_class:]))
        train_indices_0 = train_indices_0[:num_train_data_per_class]
        train_indices_1 = train_indices_1[:num_train_data_per_class]
        train_indices = np.concatenate((train_indices_0, train_indices_1))

        val_data = train_val_data[val_indices]
        val_label = train_val_label[val_indices]

        train_data = train_val_data[train_indices]
        train_label = train_val_label[train_indices]

        train_datas.append(train_data)
        train_labels.append(train_label)
        val_datas.append(val_data)
        val_labels.append(val_label)

    return train_datas, train_labels, val_datas, val_labels

In [9]:
# test the train_validation_split function
K = 5
train_datas, train_labels, val_datas, val_labels = train_validation_split(K, train_validation_data, train_validation_label)

# check if the class is balanced in the train and validation data
for i in range(K):
    print(f'fold {i}')
    print(f'train data: {train_datas[i].shape}')
    print(f'train label: {train_labels[i].shape}')
    print(f'validation data: {val_datas[i].shape}')
    print(f'validation label: {val_labels[i].shape}')
    print(f'unique train label: {np.unique(train_labels[i], return_counts=True)}')
    print(f'unique validation label: {np.unique(val_labels[i], return_counts=True)}')
    print(f'=====================')

fold 0
train data: (800, 100)
train label: (800,)
validation data: (200, 100)
validation label: (200,)
unique train label: (array([0., 1.]), array([400, 400]))
unique validation label: (array([0., 1.]), array([100, 100]))
fold 1
train data: (800, 100)
train label: (800,)
validation data: (200, 100)
validation label: (200,)
unique train label: (array([0., 1.]), array([400, 400]))
unique validation label: (array([0., 1.]), array([100, 100]))
fold 2
train data: (800, 100)
train label: (800,)
validation data: (200, 100)
validation label: (200,)
unique train label: (array([0., 1.]), array([400, 400]))
unique validation label: (array([0., 1.]), array([100, 100]))
fold 3
train data: (800, 100)
train label: (800,)
validation data: (200, 100)
validation label: (200,)
unique train label: (array([0., 1.]), array([400, 400]))
unique validation label: (array([0., 1.]), array([100, 100]))
fold 4
train data: (800, 100)
train label: (800,)
validation data: (200, 100)
validation label: (200,)
unique tr

In [10]:
# evaluation metrics

def eva_precision(true_label, pred_label, _class):
    
    # TODO: ==========================
    # calculate the precision of class _class
    # ================================
    TP = 0
    FP = 0
    for i in range(len(true_label)):
        if true_label[i] == _class and pred_label[i] == _class:
            TP += 1
        if true_label[i] != _class and pred_label[i] == _class:
            FP += 1

    if TP + FP == 0:
        precison = 0
    else:
        precison = TP / (TP + FP)

    return precison

def eva_recall(true_label, pred_label, _class):

    # TODO: ==========================
    # calculate the recall of class _class
    # ================================
    TP = 0
    FN = 0
    for i in range(len(true_label)):
        if true_label[i] == _class and pred_label[i] == _class:
            TP += 1
        if true_label[i] == _class and pred_label[i] != _class:
            FN += 1

    if TP + FN == 0:
        recall = 0
    else:
        recall = TP / (TP + FN)

    return recall

def eva_f1(true_label, pred_label, _class):

    # TODO: ==========================
    # calculate the f1 of class _class
    # ================================
    precision = eva_precision(true_label, pred_label, _class)
    recall = eva_recall(true_label, pred_label, _class)

    if precision + recall == 0:
        f1 = 0
    else:
        f1 = 2 * precision * recall / (precision + recall)

    return f1

def eva_accuracy(true_label, pred_label):

    # TODO: ==========================
    # calculate the accuracy
    # ================================
    correct = 0
    for i in range(len(true_label)):
        if true_label[i] == pred_label[i]:
            correct += 1

    accuracy = correct / len(true_label)

    return accuracy

def eva_auroc(true_label, pred_label):

    # TODO: ==========================
    # calculate the auroc
    # ================================
    TP = 0
    FP = 0
    TN = 0
    FN = 0
    for i in range(len(true_label)):
        if true_label[i] == 1 and pred_label[i] == 1:
            TP += 1
        if true_label[i] == 0 and pred_label[i] == 1:
            FP += 1
        if true_label[i] == 0 and pred_label[i] == 0:
            TN += 1
        if true_label[i] == 1 and pred_label[i] == 0:
            FN += 1

    if TP + FN == 0 or FP + TN == 0:
        auroc = 0
    else:
        TPR = TP / (TP + FN)
        FPR = FP / (FP + TN)
        auroc = (1 + TPR - FPR) / 2

    return auroc

def evaluation(true_label, pred_label, _class):

    precision = eva_precision(true_label, pred_label, _class)
    recall = eva_recall(true_label, pred_label, _class)
    f1 = eva_f1(true_label, pred_label, _class)
    accuracy = eva_accuracy(true_label, pred_label)
    auroc = eva_auroc(true_label, pred_label)

    return {'precision': precision, 'recall': recall, 'f1': f1, 'accuracy': accuracy, 'auroc': auroc}
    


In [11]:
# model training and hyper-parameters fine-tuning
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression

K = 5

# hyper-parameter for logistic regression
hyper_parameters_logistic_regression = {

    # TODO: please choose different values to tune the model
    'penalty': 'l1', # ['l1', 'l2']
}

# hyper-parameter for SVM
hyper_parameters_svm = {

    # TODO: please choose different values to tune the model
    'C': 1e-5, # [1e-5, 1e-4, 1e-3, 1e-2, 1e-1, 1]

}

# obtain cross-validation set
train_datas, train_labels, validation_datas, validation_labels = train_validation_split(K, train_validation_data, train_validation_label)


for i, (train_data, train_label, validation_data, validation_label) in enumerate(zip(train_datas, train_labels, validation_datas, validation_labels)):

    print(f'# ======================= {i + 1}-th time validation ======================= #')

    # logistic regression

    print(f'Algorithm: [logistic regression] =========================')
    print(f'hyper-parameter: {hyper_parameters_logistic_regression}')
    lr_model = LogisticRegression(solver='liblinear', **hyper_parameters_logistic_regression).fit(train_data, train_label)

    # performance evaluation on validation set for tuning hyper-parameters
    pred_label = lr_model.predict(validation_data)
    F1_0 = eva_f1(validation_label, pred_label, _class=0)
    print(f'F1 (Val set of Class-0): {F1_0:.4f}')
    F1_1 = eva_f1(validation_label, pred_label, _class=1)
    print(f'F1 (Val set of Class-1): {F1_1:.4f}')

    # SVM

    print(f'Algorithm: [SVM] =========================================')
    print(f'hyper-parameter: {hyper_parameters_svm}')
    svm_model = SVC(kernel='linear', **hyper_parameters_svm).fit(train_data, train_label)

    # performance evaluation on validation set for tuning hyper-parameters
    pred_label = svm_model.predict(validation_data)
    F1_0 = eva_f1(validation_label, pred_label, _class=0)
    print(f'F1 (Val set of Class-0): {F1_0:.4f}')
    F1_1 = eva_f1(validation_label, pred_label, _class=1)
    print(f'F1 (Val set of Class-1): {F1_1:.4f}')


hyper-parameter: {'penalty': 'l1'}
F1 (Val set of Class-0): 0.9652
F1 (Val set of Class-1): 0.9648
hyper-parameter: {'C': 1e-05}
F1 (Val set of Class-0): 0.9604
F1 (Val set of Class-1): 0.9596
hyper-parameter: {'penalty': 'l1'}
F1 (Val set of Class-0): 0.9538
F1 (Val set of Class-1): 0.9561
hyper-parameter: {'C': 1e-05}
F1 (Val set of Class-0): 0.9641
F1 (Val set of Class-1): 0.9659
hyper-parameter: {'penalty': 'l1'}
F1 (Val set of Class-0): 0.9372
F1 (Val set of Class-1): 0.9326
hyper-parameter: {'C': 1e-05}
F1 (Val set of Class-0): 0.9802
F1 (Val set of Class-1): 0.9798
hyper-parameter: {'penalty': 'l1'}
F1 (Val set of Class-0): 0.9293
F1 (Val set of Class-1): 0.9307
hyper-parameter: {'C': 1e-05}
F1 (Val set of Class-0): 0.9375
F1 (Val set of Class-1): 0.9423
hyper-parameter: {'penalty': 'l1'}
F1 (Val set of Class-0): 0.8995
F1 (Val set of Class-1): 0.8901
hyper-parameter: {'C': 1e-05}
F1 (Val set of Class-0): 0.9347
F1 (Val set of Class-1): 0.9353


Editted the following cell based on the given fine tuning for loop (from the previous cell) to find the best hyperparams for each fold for Log Reg and SVM individually

In [13]:
# EDITTED CODE
# model training and hyper-parameters fine-tuning
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression

K = 5

# hyper-parameter for logistic regression
penalties = ['l1', 'l2']

# hyper-parameter for SVM
C_values = [1e-5, 1e-4, 1e-3, 1e-2, 1e-1, 1]

# obtain cross-validation set
train_datas, train_labels, validation_datas, validation_labels = train_validation_split(K, train_validation_data, train_validation_label)

optimal_penalties = []
optimal_C_values = []

for i, (train_data, train_label, validation_data, validation_label) in enumerate(zip(train_datas, train_labels, validation_datas, validation_labels)):
    # print(f'# ======================= {i + 1}-th time validation ======================= #')

    # logistic regression
    best_f1 = 0
    best_penalty = None
    for penalty in penalties:
        # print(f'Algorithm: [logistic regression] =========================')
        # print(f'hyper-parameter: penalty={penalty}')
        lr_model = LogisticRegression(solver='liblinear', penalty=penalty).fit(train_data, train_label)

        # performance evaluation on validation set for tuning hyper-parameters
        pred_label = lr_model.predict(validation_data)
        F1_0 = eva_f1(validation_label, pred_label, _class=0)
        F1_1 = eva_f1(validation_label, pred_label, _class=1)
        avg_f1 = (F1_0 + F1_1) / 2
        # print(f'F1 (Val set of Class-0): {F1_0:.4f}')
        # print(f'F1 (Val set of Class-1): {F1_1:.4f}')
        # print(f'Average F1: {avg_f1:.4f}')

        if avg_f1 > best_f1:
            best_f1 = avg_f1
            best_penalty = penalty

    optimal_penalties.append(best_penalty)
    # print(f'Optimal penalty for fold {i + 1}: {best_penalty}')

    # SVM
    best_f1 = 0
    best_C = None
    for C in C_values:
        # print(f'Algorithm: [SVM] =========================================')
        # print(f'hyper-parameter: C={C}')
        svm_model = SVC(kernel='linear', C=C).fit(train_data, train_label)

        # performance evaluation on validation set for tuning hyper-parameters
        pred_label = svm_model.predict(validation_data)
        F1_0 = eva_f1(validation_label, pred_label, _class=0)
        F1_1 = eva_f1(validation_label, pred_label, _class=1)
        avg_f1 = (F1_0 + F1_1) / 2
        # print(f'F1 (Val set of Class-0): {F1_0:.4f}')
        # print(f'F1 (Val set of Class-1): {F1_1:.4f}')
        # print(f'Average F1: {avg_f1:.4f}')

        if avg_f1 > best_f1:
            best_f1 = avg_f1
            best_C = C

    optimal_C_values.append(best_C)
    # print(f'Optimal C for fold {i + 1}: {best_C}')

print(f'Optimal penalties for logistic regression: {optimal_penalties}')
print(f'Optimal C values for SVM: {optimal_C_values}')


Optimal penalties for logistic regression: ['l1', 'l1', 'l2', 'l2', 'l2']
Optimal C values for SVM: [1e-05, 0.0001, 1e-05, 0.001, 0.001]


note that the optimal values change every time the code is run

In [15]:
# performance evaluation on test set

from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression

K = 5

# hyper-parameter penlty for logistic regression. Hint: len(penalty) = 5
penalty = [

    # TODO: the optimal parameter selection for each split
    'l1','l1','l2','l2','l2'
]


# hyper-parameter C for SVM. Hint: len(C) = 5
C = [

    # TODO: the optimal parameter selection for each split
    1e-5, 0.01, 0.001, 1e-5, 1e-5

]

    
# obtain training data
train_datas, train_labels, _, _ = train_validation_split(K, train_validation_data, train_validation_label)


for i, (train_data, train_label) in enumerate(zip(train_datas, train_labels)):

    print(f'# ======================= {i + 1}-th time validation ======================= #')

    # logistic regression

    print(f'Algorithm: [logistic regression] =========================')
    print(f'hyper-parameter: {penalty[i]}')
    lr_model = LogisticRegression(solver='liblinear', penalty=penalty[i]).fit(train_data, train_label)


    # performance evaluation on test set
    pred_label = lr_model.predict(test_data)
    results_0 = evaluation(test_label, pred_label, _class=0)
    results_1 = evaluation(test_label, pred_label, _class=1)
    print(f'Result Class 0 (Test set): {results_0}')
    print(f'Result Class 1 (Test set): {results_1}')

    # SVM

    print(f'Algorithm: [SVM] =========================================')
    print(f'hyper-parameter: {C[i]}')
    svm_model = SVC(kernel='linear', C=C[i]).fit(train_data, train_label)

    # performance evaluation on test set
    pred_label = svm_model.predict(test_data)
    results_0 = evaluation(test_label, pred_label, _class=0)
    results_1 = evaluation(test_label, pred_label, _class=1)
    print(f'Result Class 0 (Test set): {results_0}')
    print(f'Result Class 1 (Test set): {results_1}')

hyper-parameter: l1
Result Class 0 (Test set): {'precision': 0.9292929292929293, 'recall': 0.92, 'f1': 0.9246231155778895, 'accuracy': 0.925, 'auroc': 0.925}
Result Class 1 (Test set): {'precision': 0.9207920792079208, 'recall': 0.93, 'f1': 0.9253731343283582, 'accuracy': 0.925, 'auroc': 0.925}
hyper-parameter: 1e-05
Result Class 0 (Test set): {'precision': 0.9585492227979274, 'recall': 0.925, 'f1': 0.9414758269720102, 'accuracy': 0.9425, 'auroc': 0.9425}
Result Class 1 (Test set): {'precision': 0.927536231884058, 'recall': 0.96, 'f1': 0.9434889434889435, 'accuracy': 0.9425, 'auroc': 0.9425}
hyper-parameter: l1
Result Class 0 (Test set): {'precision': 0.9068627450980392, 'recall': 0.925, 'f1': 0.9158415841584158, 'accuracy': 0.915, 'auroc': 0.915}
Result Class 1 (Test set): {'precision': 0.923469387755102, 'recall': 0.905, 'f1': 0.9141414141414141, 'accuracy': 0.915, 'auroc': 0.915}
hyper-parameter: 0.01
Result Class 0 (Test set): {'precision': 0.9108910891089109, 'recall': 0.92, 'f1':

### Conclusion
1. Based on the overall performance metrics of both models, SVM seems to be the more suitable model on the dataset.
    - On average, SVM produce higher Precision, Recall, F1 scores for both classes, 0 and 1.
    - Comparing between each model's Accuracy scores, SVM outperforms Logistic Regression: `0.932` > `0.9005`
    - Comparing between each model's AUROC, SVM again outperforms Logistic Regression: `0.932` > `0.9005`