In [1]:
import numpy as np

from helpers import load_train_data

# Load dataset

In [2]:
x_train, y_train = load_train_data("./dataset")

# Split them into training data & validation data for this ablation study

In [3]:
ln = len(x_train)  # or len(y_train) since they are the same

x_train, x_val = x_train[: int(ln * 0.9)], x_train[int(ln * 0.9) :]
y_train, y_val = y_train[: int(ln * 0.9)], y_train[int(ln * 0.9) :]

# Logistic regression

In [4]:
def safe_exp(x):
    return np.exp(np.clip(x, -100, 100))


def logistic_regression(y, tx, initial_w, max_iters, gamma, weight):
    """Logistic regression using gradient descent. (y in {-1, 1})"""

    def stable_sigmoid(t):
        """Numerically stable sigmoid"""
        return np.where(t >= 0, 1 / (1 + safe_exp(-t)), safe_exp(t) / (1 + safe_exp(t)))

    w = initial_w

    for i in range(max_iters):
        # Gradient descent
        pred = tx @ w
        weights = np.where(y == 1, weight, 1)
        gradient = -tx.T @ (y * weights * stable_sigmoid(-y * pred)) / len(y)
        w = w - gamma * gradient

        # compute loss
        train_loss = np.sum(weights * np.log(1 + safe_exp(-y * (tx @ w)))) / len(y)

        # log
        if i % 10 == 0:
            print(f"Current iteration: {i}, train loss: {train_loss}")

    print("Done\n")

    return w, train_loss

# main function

In [5]:
def experiment(
    x_train,
    y_train,
    x_val,
    y_val,
    is_fix_nan: bool,
    is_fix_outliers: bool,
    is_normalize: bool,
    is_remove_corr: bool,
    is_weight: bool,
):
    # 1. fixing nan
    if is_fix_nan:
        x_train = np.nan_to_num(x_train)
        x_val = np.nan_to_num(x_val)

    # 2. removing outliers
    if is_fix_outliers:
        quant = np.quantile(x_train, 0.9, axis=0)
        x_train = np.minimum(x_train, quant)
        x_val = np.minimum(x_val, quant)

    # 3. normalizing dataset
    if is_normalize:
        x_val -= np.min(x_train, axis=0)
        x_train -= np.min(x_train, axis=0)
        quant = np.max(x_train, axis=0)
        x_train /= quant + 0.01
        x_val /= quant + 0.01

    # 4. removing correlated features
    if is_remove_corr:
        arr = np.corrcoef(x_train, rowvar=False)
        bad = []
        good = []
        for i in range(arr.shape[0]):
            ok = True
            for j in range(i):
                if i != j and abs(arr[i][j]) > 0.8:
                    ok = False
            if ok:
                good.append(i)
            else:
                bad.append(i)
        x_train = x_train[:, good]
        x_val = x_val[:, good]

    # 5. setting weight for cross entropy
    if is_weight:
        weight = 10
    else:
        weight = 1

    # logistic regression
    w, _ = logistic_regression(
        y_train, x_train, np.zeros(x_train.shape[1]), 100, 0.1, weight
    )
    y_val_pred = 2 * (x_val @ w > 0) - 1
    print("Accuracy", np.mean(y_val_pred == y_val))

    # f1-score calculation
    ind = y_val == 1
    true_pos = sum(y_val[ind] == y_val_pred[ind])
    false_neg = sum(y_val[ind] != y_val_pred[ind])

    ind = y_val == -1
    # true_neg = sum(y_val[ind] == y_val_pred[ind])
    false_pos = sum(y_val[ind] != y_val_pred[ind])

    pr = true_pos / (true_pos + false_pos)
    rec = true_pos / (true_pos + false_neg)
    f1 = 2 * pr * rec / (pr + rec)

    return f1, pr, rec

In [6]:
print("All of the data preprocessing methods\n")
f1, pr, rec = experiment(x_train, y_train, x_val, y_val, 1, 1, 1, 1, 1)
print(f"Precision: {pr}")
print(f"Recall: {rec}")
print(f"F1-Score: {f1}")
print("----------------------------------------------------")

print("Without Missing Value Replacement\n")
f1, pr, rec = experiment(x_train, y_train, x_val, y_val, 0, 1, 1, 1, 1)
print(f"Precision: {pr}")
print(f"Recall: {rec}")
print(f"F1-Score: {f1}")
print("----------------------------------------------------")

print("without Outliers Clipping\n")
f1, pr, rec = experiment(x_train, y_train, x_val, y_val, 1, 0, 1, 1, 1)
print(f"Precision: {pr}")
print(f"Recall: {rec}")
print(f"F1-Score: {f1}")
print("----------------------------------------------------")

print("Without Normalization\n")
f1, pr, rec = experiment(x_train, y_train, x_val, y_val, 1, 1, 0, 1, 1)
print(f"Precision: {pr}")
print(f"Recall: {rec}")
print(f"F1-Score: {f1}")
print("----------------------------------------------------")

print("Without Correlated Features Removal\n")
f1, pr, rec = experiment(x_train, y_train, x_val, y_val, 1, 1, 1, 0, 1)
print(f"Precision: {pr}")
print(f"Recall: {rec}")
print(f"F1-Score: {f1}")
print("----------------------------------------------------")

print("Without Weighting\n")
f1, pr, rec = experiment(x_train, y_train, x_val, y_val, 1, 1, 1, 1, 0)
print(f"Precision: {pr}")
print(f"Recall: {rec}")
print(f"F1-Score: {f1}")
print("----------------------------------------------------")

All of the data preprocessing methods



  c /= stddev[:, None]
  c /= stddev[None, :]


Current iteration: 0, train loss: 1.222408117238126
Current iteration: 10, train loss: 1.0802942149030645
Current iteration: 20, train loss: 1.0089608618312393
Current iteration: 30, train loss: 0.9686065271638179
Current iteration: 40, train loss: 0.9434353572903723
Current iteration: 50, train loss: 0.9265116495519763
Current iteration: 60, train loss: 0.9144590818552418
Current iteration: 70, train loss: 0.905482360782001
Current iteration: 80, train loss: 0.8985550498147118
Current iteration: 90, train loss: 0.8930542253511482
Done

Accuracy 0.7504418845614677
Precision: 0.2332287766872407
Recall: 0.7904663923182441
F1-Score: 0.3601843894054223
----------------------------------------------------
Without Missing Value Replacement

Current iteration: 0, train loss: nan
Current iteration: 10, train loss: nan
Current iteration: 20, train loss: nan
Current iteration: 30, train loss: nan
Current iteration: 40, train loss: nan
Current iteration: 50, train loss: nan
Current iteration: 60,

  pr = true_pos / (true_pos + false_pos)


Current iteration: 0, train loss: 1.2358608689215391
Current iteration: 10, train loss: 1.1722389572471885
Current iteration: 20, train loss: 1.1269543551894587
Current iteration: 30, train loss: 1.093924200683148
Current iteration: 40, train loss: 1.06919972428536
Current iteration: 50, train loss: 1.0502154278096971
Current iteration: 60, train loss: 1.0352863467166007
Current iteration: 70, train loss: 1.0232862938801757
Current iteration: 80, train loss: 1.013447313473999
Current iteration: 90, train loss: 1.005234881429655
Done

Accuracy 0.6962881696836716
Precision: 0.19401041666666666
Recall: 0.7664609053497943
F1-Score: 0.3096425602660017
----------------------------------------------------
Without Normalization

Current iteration: 0, train loss: 88.23957659631384
Current iteration: 10, train loss: 88.23957659631384
Current iteration: 20, train loss: 88.23957659631384
Current iteration: 30, train loss: 88.23957659631384
Current iteration: 40, train loss: 88.23957659631384
Curre