# PLab 6 Sample Code for Instructors

In [1]:
# Add import statements here
import numpy as np
import csv
import time
from scipy import stats

In [2]:
# To access files in your Google Drive, run this block and follow the instructions
# from google.colab import drive
# drive.mount('/content/gdrive')

In [3]:
# To test if the above block worked, run this block
# !ls '/content/gdrive/My Drive/'

 ## Find test error

The `find_test_error` function computes the test error of a linear classifier $w$. 

The hypothesis is assumed to be of the form $sign([1, x(N,:)] \cdot w)$.

Inputs:
* `w` is the weight vector
* `X` is the data matrix (without an initial column of 1's)
* `y` are the data labels (plus or minus 1)

Outputs:
* `test_error` is the binary error of $w$ on the data set $(X, y)$ error; this should be between 0 and 1. 

In [4]:
def find_test_error(w, X, y):
    # Find the sigmoid distribution
    sig = np.exp(np.dot(X, w))/(1+np.exp(np.dot(X, w)))
    # Determine binary classification result
    C = 0.5
    binary_class = []
    for s in sig:
        if s >= C:
            binary_class.append(1)
        else:
            binary_class.append(-1)

    # Determine binary classification error
    error_sum = 0
    for i in range(len(y)):
        if y[i] != binary_class[i]:
            error_sum += 1

    test_error = error_sum/len(y)
    return test_error

 ## Logistic Regression

The `logistic_reg`  learn a logistic regression model using gradient descent.

Inputs:
* `X` is the data matrix (without an initial column of 1's)
* `y` are the data labels (plus or minus 1)
* `w_init` is the initial value of the w vector ($d+1$ dimensional)
* `max_its` is the maximum number of iterations to run for
* `eta` is the learning rate

Outputs:
* t is the number of iterations gradient descent ran for
* w is the learned weight vector
* e_in is the in-sample (cross-entropy) error 

In [5]:
def logistic_reg(X, y, w_init, max_its, eta, thresh): # According to TA, “You could edit the function to take in that threshold value.” I can also use global var but this is bad in python

    # Define parameters
    N=len(X)
    w=w_init
    t=0
    while t < max_its:
        # Calculate gradient
        g_t = []
        for n in range(N):
            g_t.append((y[n][0]*X[n])/(1+np.exp(y[n][0]*np.dot(w, X[n]))))

        g_t = -np.mean(g_t, 0)

        # Test termination
        mag_g_t = abs(g_t)
        if all(m < thresh for m in mag_g_t):
            break

        # Update weight
        v_t = -g_t
        w = w+eta*v_t
        t+=1

    # Calculate the cross-entropy in-sample error
    e_in = []
    for n in range(N):
        e_in.append(np.log((1+np.exp(-y[n][0]*np.dot(w, X[n])))))
    e_in = np.mean(e_in)
    return t, w, e_in

## Run and Plot

Run your code and plot figures below

In [6]:
# Read files using csv library (function defined to prevent repeatedness)
def read_csv(file_name):
    set=[]
    # Read file
    with open(file_name, 'r') as f:
        file = list(csv.reader(f))[1:]
        # Convert values
        for l in file:
            row = []
            for n in l:
                try:
                    row.append(int(n))
                except ValueError:
                    try:
                        row.append(float(n))
                    except ValueError:
                        print("Unexpected value")
                        exit(1)
            set.append(row)
    return set

# Split into X and y
def split_X_y(set):
    X = []
    y = []
    for r in set:
        X.append([1]+r[0:-1])
        y.append([r[-1]])
    for i in range(len(y)):
        if y[i] == [0]:
            y[i] = [-1]
    return X, y

# Read training and testing files
train_set = read_csv("cleveland_train.csv")
test_set = read_csv("cleveland_test.csv")

# Get X and y for training
X = np.array(split_X_y(train_set)[0])
y = np.array(split_X_y(train_set)[1])

# Get X and y for testing
X_t = np.array(split_X_y(test_set)[0])
y_t = np.array(split_X_y(test_set)[1])

# Experiment with iterations
# Define input parameters
eta_0 = 0.00001
w_init = np.zeros(len(X[0]))
iterations = [10000, 100000, 1000000]
print("Experimenting with iterations...\n")
for iter in iterations:
    # Start training
    start = time.time()
    t, w, e_in = logistic_reg(X, y, w_init, iter, eta_0, 0.001)
    end = time.time()

    # Start testing
    test_error = find_test_error(w, X_t, y_t)
    training_error = find_test_error(w, X, y)

    # Print out the results
    print('Number of iterations: {}, Training time : {}s, In-sample Cross-Entropy error (Ein): {}, Binary error on the training set (Etrain): {}, Binary error on the test set (Etest): {} \n'.format(t,round(end - start, 5), round(e_in, 5), round(training_error, 5), round(test_error, 5)))



# Experiment with learning rate
# Define parameters
eta_0s=[0.01, 0.1, 1, 4, 5, 6, 7, 7.5, 7.6, 7.65]
w_init = np.zeros(len(X[0]))

# Find z-scores
ZX = np.append(np.ones((len(X), 1)), stats.zscore(X[:,1:]), axis=1)
ZX_t = np.append(np.ones((len(X_t), 1)), stats.zscore(X_t[:,1:]), axis=1)

print("Experimenting with learning rates...\n")
for eta_0 in eta_0s:
    # Start training
    # Using iterative termination condition of inf is equivalent to no iterations-based termination criteria
    # Only terminate when the magnitude of every element of the gradient is less than 10^−6
    start = time.time()
    t, w, e_in = logistic_reg(ZX, y, w_init, float("inf"), eta_0, 0.000001)
    end = time.time()

    # Start testing
    test_error = find_test_error(w, ZX_t, y_t)

    # Print out the results
    print('Learning rate (η0): {}, Number of iterations: {}, Training time : {}s, In-sample cross-Entropy error (Ein): {}, Binary error on the test set (Etest): {} \n'.format(eta_0, t, round(end - start, 5), round(e_in, 5), round(test_error, 5)))

# Other code here:

Experimenting with iterations...

Number of iterations: 10000, Training time : 15.06156s, In-sample Cross-Entropy error (Ein): 0.58471, Binary error on the training set (Etrain): 0.30921, Binary error on the test set (Etest): 0.31724 

Number of iterations: 100000, Training time : 127.60057s, In-sample Cross-Entropy error (Ein): 0.4937, Binary error on the training set (Etrain): 0.22368, Binary error on the test set (Etest): 0.2069 

Number of iterations: 1000000, Training time : 1251.69752s, In-sample Cross-Entropy error (Ein): 0.43535, Binary error on the training set (Etrain): 0.15132, Binary error on the test set (Etest): 0.13103 

Experimenting with learning rates...

Learning rate (η0): 0.01, Number of iterations: 23221, Training time : 28.42808s, In-sample cross-Entropy error (Ein): 0.40738, Binary error on the test set (Etest): 0.10345 

Learning rate (η0): 0.1, Number of iterations: 2318, Training time : 2.82869s, In-sample cross-Entropy error (Ein): 0.40738, Binary error on t