In [1]:
import numpy as np

In [2]:
import sys, os
sys.path.insert(0, os.path.abspath(os.path.join(os.getcwd(), '..')))

In [3]:
from utils.helpers import load_csv_data, create_csv_submission


In [27]:
x_train, x_test, y_train, train_ids, test_ids= load_csv_data("../dataset/")

In [22]:
ln = len(x_train)

In [28]:
old_x_train = x_train
old_y_train = y_train


In [29]:
x_train = old_x_train[:int(ln*0.9)]
y_train = old_y_train[:int(ln * 0.9)]
x_test = old_x_train[int(ln*0.9):]
y_test = old_y_train[int(ln*0.9):]

In [None]:
def logistic_regression(y, tx, initial_w, max_iters, gamma, weight):
    """Logistic regression using gradient descent. (y in {-1, 1})
    """
    def stable_sigmoid(t):
        """Numerically stable sigmoid"""
        return np.where(t >= 0, 1 / (1 + np.exp(-t)), 
                    np.exp(t) / (1 + np.exp(t)))

    w = initial_w
    for i in range(max_iters):
        pred = tx @ w
        weights = np.where(y == 1, weight, 1)
        gradient = -tx.T @ (y * weights * stable_sigmoid(-y * pred)) / len(y)
        w = w - gamma * gradient
        loss = np.sum(weights * np.log(1 + np.exp(-y * (tx @ w)))) / len(y)
        if i % 10 == 0:
            print(loss,i)

    return w, loss


In [None]:
def experiment(is_fix_nan, is_fix_outliers, is_normalize, is_remove_corr, is_weight):
    # fixing nan
    if is_fix_nan:
        x_train = np.nan_to_num(x_train)
        x_test = np.nan_to_num(x_test)
    # removing outliers
    if is_fix_outliers:
        quant = np.quantile(x_train,0.9,axis=0)
        x_train = np.minimum(x_train, quant)
        x_test = np.minimum(x_test, quant)
    # normalizing dataset
    if is_normalize:
        x_test -= np.min(x_train,axis=0)
        x_train -= np.min(x_train,axis=0)
        quant = np.max(x_train,axis=0)
        x_train /= (quant+0.01)
        x_test /= (quant+0.01)
    # removing correlated features
    if is_remove_corr:
        arr = np.corrcoef(x_train,rowvar=False)
        bad = []
        good = []
        for i in range(arr.shape[0]):
            ok = True
            for j in range(i):
                if i != j and abs(arr[i][j]) > 0.8:
                    ok = False
            if ok:
                good.append(i)
            else:
                bad.append(i)
        x_train = x_train[:, good]
        x_test = x_test[:, good]
    # setting weight for cross entropy
    if is_weight:
        weight = 10
    else:
        weight = 1
    w, loss = logistic_regression(y_train, x_train, np.zeros(x_train.shape[1]), 100, 0.1, weight)
    y_tr = 2 * (x_test @ w > 0) - 1
    print("METRICS", np.mean(y_tr == y_test))
    # f1 calculation
    ind = (y_test == 1)
    tp = sum(y_test[ind] == y_tr[ind])
    fn = sum(y_test[ind] != y_tr[ind])
    ind = (y_test == -1)
    tn = sum(y_test[ind] == y_tr[ind])
    fp = sum(y_test[ind] != y_tr[ind])
    pr = tp / (tp + fp)
    rec = tp / (tp + fn)
    f1 = 2 * pr * rec / (pr + rec)
    print(f1)
    

In [None]:
experiment(1,1,1,1,1)
experiment(0,1,1,1,1)
experiment(1,0,1,1,1)
experiment(1,1,0,1,1)
experiment(1,1,1,0,1)
experiment(1,1,1,0,1)

1.2224081172381267 0
1.0802942149030645 10
1.0089608618312393 20
0.9686065271638179 30
0.9434353572903723 40
0.9265116495519762 50
0.9144590818552418 60
0.905482360782001 70
0.8985550498147118 80
0.8930542253511482 90
METRICS 0.7504418845614677
0.3601843894054223
nan 0
nan 10
nan 20
nan 30
nan 40
nan 50
nan 60
nan 70
nan 80
nan 90
METRICS 0.9111354909489852
nan


  pr = tp / (tp + fp)


1.2358608689215391 0
1.172238957247188 10
1.1269543551894587 20
1.0939242006831476 30
1.0691997242853597 40
1.0502154278096971 50
1.0352863467166007 60
1.0232862938801754 70
1.013447313473999 80
1.0052348814296548 90
METRICS 0.6962881696836716
0.3096425602660017


  loss = np.sum(weights * np.log(1 + np.exp(-y * (tx @ w)))) / len(y)


inf 0


  1 / (1 + np.exp(-t)),
  np.exp(t) / (1 + np.exp(t)))
  np.exp(t) / (1 + np.exp(t)))


inf 10
inf 20
inf 30
inf 50
inf 60
inf 70
inf 80
inf 90
METRICS 0.9111354909489852
nan
1.1977582837497216 0
1.5543156361076977 10
1.181049407121708 20
0.9960156802584094 30
0.9275894417693256 40
0.9019559035642835 50
0.8915926129212914 60
0.8860861433150916 70
0.8821138596375849 80
0.8788498159281175 90
METRICS 0.7462058877308466
0.3575065576299954
0.29619074308735366 0
0.2825697512396516 10
0.27336814026552736 20
0.2666162514759551 30
0.26147139806385383 40
0.25742937420193096 50
0.25417432763609704 60
0.25149965249193423 70
0.24926488771020627 80
0.24737127531030367 90
METRICS 0.9111659657463278
0.0006856359273225916
