In [11]:
from custom_helpers import *

### `Pre-processing of the Data`

In [12]:
def load_build_split(loc, degree, split_ratio=0.8, test=False):
    """Load a CSV data set, build the design matrix and optionally split it.

    The pipeline, in order: load the CSV, fill missing values, separate the
    categorical columns, standardize, add polynomial features up to
    ``degree``, add cross-column features, standardize again, append the
    categorical columns and finally prepend a column of ones (intercept).

    Args:
        loc: path of the CSV file, passed straight to ``load_csv_data``.
        degree: polynomial degree passed to ``poly_features``.
        split_ratio: ratio passed to ``split_data``; only used when
            ``test`` is false.
        test: when true, no split is performed and the ids are returned
            instead of the labels.

    Returns:
        ``(tx, ids)`` when ``test`` is true, otherwise
        ``(tx, y, tx_te, y_te)`` as produced by ``split_data``.
    """
    print("Loading Data...")
    y, x, ids = load_csv_data(loc)

    print("\nReplacing missing values with most frequent ones...")
    x = replace_nan_with_frequent(x)

    print("\nOne-hot encding categorical features...")
    # NOTE(review): 22 is presumably the index of the categorical column --
    # confirm against convert_categorical in custom_helpers.
    x, cat_columns = convert_categorical(x, 22)

    print("\nStandardizing initial features...")
    x = standardize_columns(x)

    print("\nAdding features of degree %d..." % degree)
    x = poly_features(x, degree)

    print("\nAdding cross-column features...")
    # NOTE(review): meaning of the literal 30 is defined by
    # cross_column_features -- confirm before changing it.
    x = cross_column_features(x, 30)

    print("\nStandardizing all features...")
    x = standardize_columns(x)

    # The categorical columns are appended only after the second
    # standardization, so they are not rescaled by it.
    print("\nAdding categorical features...")
    x = np.c_[x, np.array(cat_columns)]

    print("\nAdding intercept terms...")
    tx = np.c_[np.ones(x.shape[0]), x]

    if test:
        return tx, ids

    print("\nSplitting data with ratio of %.1f...\n" % split_ratio)
    tx, tx_te, y, y_te = split_data(tx, y, split_ratio)

    return tx, y, tx_te, y_te

### `Required Functions for the Main Model`

In [13]:
def sigmoid(z):
    """Numerically stable logistic function ``1 / (1 + exp(-z))``.

    The naive formula overflows in ``exp(-z)`` for large negative ``z``.
    Here the exponential is only ever taken of a non-positive number, so it
    cannot overflow:

    * ``z >= 0``: ``1 / (1 + exp(-z))``
    * ``z <  0``: ``exp(z) / (1 + exp(z))``

    Args:
        z: scalar or array-like of real numbers.

    Returns:
        A NumPy scalar for scalar input, otherwise an array shaped like
        ``z`` with values in ``[0, 1]``.
    """
    z = np.asarray(z)
    # exp(-|z|) lies in (0, 1], whatever the magnitude of z.
    decay = np.exp(-np.abs(z))
    result = np.where(z >= 0, 1.0 / (1.0 + decay), decay / (1.0 + decay))
    # Indexing with () turns a 0-d array back into a scalar and leaves
    # higher-dimensional arrays unchanged.
    return result[()]

def reg_logistic_cost(y, tx, w, gamma):
    """Cost for logistic regression with L2 regularization.

    Computes the mean cross-entropy between ``y`` and ``sigmoid(tx @ w)``
    plus the penalty ``gamma * (w . w) / (2 * len(y))``.

    Args:
        y: array of targets, used as ``y`` and ``1 - y`` in the
            cross-entropy.
        tx: feature matrix; ``tx.dot(w)`` must be aligned with ``y``.
        w: 1-D weight vector.
        gamma: regularization strength.

    Returns:
        The regularized cost as a scalar.
    """
    # Small offset keeps log() finite when the sigmoid saturates at 0 or 1.
    epsilon = 1e-8
    sig = sigmoid(tx.dot(w))
    cost = (-y) * np.log(sig + epsilon) - (1 - y) * np.log(1 - sig + epsilon)
    # Use the `gamma` argument, not a notebook-global, for the penalty.
    reg = np.dot(w, w) * gamma / (2 * len(y))
    return np.mean(cost) + reg

def reg_logistic_gradient(y, tx, w, gamma):
    """Gradient for logistic regression with L2 regularization.

    Args:
        y: array of targets aligned with the rows of ``tx``.
        tx: feature matrix.
        w: 1-D weight vector.
        gamma: regularization strength.

    Returns:
        A tuple ``(gradient, err)`` where ``err = sigmoid(tx @ w) - y`` and
        ``gradient = tx.T @ err / n + gamma * w / n`` with ``n = len(err)``.
    """
    err = sigmoid(tx.dot(w)) - y
    n_samples = len(err)
    grad = tx.T.dot(err) / n_samples
    # Use the `gamma` argument, not a notebook-global, for the penalty.
    reg = w * gamma / n_samples
    return grad + reg, err

### `Main Model`

In [14]:
def reg_logistic_regression(y, tx, alpha, initial_w, reg_parameter, max_iters=None):
    """Regularized logistic regression fitted with gradient descent.

    Each iteration takes one gradient step, evaluates the cost at the new
    weights, records both and prints a progress line. Iteration stops as
    soon as the cost fails to improve by at least ``1e-5`` over the
    previous iteration. That includes the cost increasing or becoming
    NaN/inf, in which case the offending value is the last entry of
    ``losses``.

    Args:
        y: targets, forwarded to the cost and gradient functions.
        tx: feature matrix, forwarded likewise.
        alpha: step size.
        initial_w: starting weight vector.
        reg_parameter: regularization strength, forwarded likewise.
        max_iters: optional cap on the number of iterations; ``None``
            (the default) means no cap.

    Returns:
        ``(losses, ws)``: ``losses[i]`` is the cost after step ``i``, and
        ``ws`` holds ``initial_w`` followed by the weights after every
        step, so ``ws[-1]`` is the final weight vector.
    """
    ws = [initial_w]
    losses = []
    w = initial_w
    epsilon = 1e-5
    # Start from +inf so the first iteration can never trigger the stop
    # test, however large the initial cost is.
    loss_prev = float("inf")
    n_iter = 0

    while max_iters is None or n_iter < max_iters:
        # compute gradient and take one descent step
        grad, _ = reg_logistic_gradient(y, tx, w, reg_parameter)
        w = w - alpha * grad
        # cost at the updated weights
        loss = reg_logistic_cost(y, tx, w, reg_parameter)
        ws.append(w)
        losses.append(loss)

        print("GD({bi}/{ti}): loss={l}".format(bi=n_iter, ti="Unkown", l=loss,))
        n_iter += 1

        # Written as "not (improvement >= epsilon)" so that a NaN loss,
        # for which every comparison is False, also ends the loop.
        if not (loss_prev - loss >= epsilon):
            break

        loss_prev = loss

    return losses, ws

### `Test - Different Parameters`

In [15]:
def test_reg_logistic_regresion(tx_tr, y_tr, tx_te, y_te, alpha, reg_parameter):
    initial_w = np.random.randn(tx_tr.shape[1])
    losses, ws = reg_logistic_regression(y_tr, tx_tr, alpha, initial_w, reg_parameter)
    y_pred = predict_labels(ws[-1], tx_te, 1, 0)
    
    return ws[-1], (1 - (abs(y_pred - y_te).sum() / len(y_te)))

In [16]:
# Hyper-parameter grids to search; each list currently holds a single value.
degrees = [9]
alphas = [0.3]
reg_parameters = [0.01]
split_ratio = 0.8

# Best configuration seen so far: float() is 0.0 and list() is [], so the
# first run with a positive accuracy replaces these placeholders.
best_deg = 0
max_acc = float()
w = list()

# The features depend only on `degree`, so the data is rebuilt once per
# degree and reused for every (alpha, reg_parameter) pair.
for degree in degrees:
    tx_tr, y_tr, tx_te, y_te = load_build_split("data/train.csv", degree, split_ratio)
    for alpha in alphas:
        # NOTE(review): `reg_parameter` becomes a notebook-global here; helper
        # functions should receive it as an argument rather than read this name.
        for reg_parameter in reg_parameters:
            curr_w, acc = test_reg_logistic_regresion(tx_tr, y_tr, tx_te, y_te, alpha, reg_parameter)
            print("\nDegree: %1.0f, Alpha: %.3f, Lambda: %.3f >> Accuracy: %.5f"%(degree, alpha, reg_parameter, acc))
            # Keep the weights and degree of the most accurate run; the
            # submission cell reads `w` and `best_deg`.
            if acc > max_acc:
                max_acc = acc
                w = curr_w
                best_deg = degree

Loading Data...

Replacing missing values with most frequent ones...

One-hot encding categorical features...

Standardizing initial features...

Adding features of degree 9...

Adding cross-column features...

Standardizing all features...

Adding categorical features...

Adding intercept terms...

Splitting data with ratio of 0.8...



  from ipykernel import kernelapp as app


GD(0/Unkown): loss=4.516524051875154
GD(1/Unkown): loss=4.373578667343298
GD(2/Unkown): loss=4.245426950576644
GD(3/Unkown): loss=4.128952026703934
GD(4/Unkown): loss=4.022268541792522
GD(5/Unkown): loss=3.9238010232596543
GD(6/Unkown): loss=3.8320905957249884
GD(7/Unkown): loss=3.746770448335244
GD(8/Unkown): loss=3.6672008053795153
GD(9/Unkown): loss=3.5927514971219647
GD(10/Unkown): loss=3.522706513845564
GD(11/Unkown): loss=3.456589107106755
GD(12/Unkown): loss=3.3939172677046217
GD(13/Unkown): loss=3.334237247406657
GD(14/Unkown): loss=3.2772070616175957
GD(15/Unkown): loss=3.2227176064206695
GD(16/Unkown): loss=3.1705824411340084
GD(17/Unkown): loss=3.1206936240735867
GD(18/Unkown): loss=3.0729487325036713
GD(19/Unkown): loss=3.0271085339153045
GD(20/Unkown): loss=2.983031796278539
GD(21/Unkown): loss=2.9406470089293717
GD(22/Unkown): loss=2.8998875943615334
GD(23/Unkown): loss=2.8606571750377623
GD(24/Unkown): loss=2.8228946670645945
GD(25/Unkown): loss=2.786469133326055
GD(26/U

### `Create Submission CSV`

In [None]:
# Build the test-set features with the degree selected by the grid search
# (`best_deg`), then predict with the best weights (`w`) and write the CSV.
# Both names must already exist in the kernel from the grid-search cell.
tx, ids = load_build_split("data/test.csv", best_deg, test=True)
# NOTE(review): predict_labels is called with two arguments here but with
# four (w, tx_te, 1, 0) during validation -- confirm the defaults match.
y_pred = predict_labels(w, tx)
create_csv_submission(ids, y_pred, "results.csv")