In [1]:
import random
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
from prettytable import PrettyTable
from sklearn.model_selection import train_test_split
from sklearn import metrics as m
# import foo

from prettytable import PrettyTable

%matplotlib inline

GLOB_Y:pd.DataFrame
GLOB_X:pd.DataFrame

# Math

In [2]:
def initialize_with_zeros(dim):
    w = np.zeros((dim,1))
    b = 0.
    return w, b

def sigmoid(z):
    s = 1./(1.+np.exp(-z))
    return s

def propagate(w, b, X, Y):
    m = X.shape[1]
    #print('number of objects = ',len(X))
    
    # FORWARD PROPAGATION (FROM X TO COST)
    A = sigmoid(np.dot(w.T,X)+b)                                 # compute activation
    cost = -(1./m)*np.sum(Y*np.log(A)+(1-Y)*np.log(1-A),axis=1)   # compute cost
    
    # BACKWARD PROPAGATION (TO FIND GRAD)
    dw = (1./m)*np.dot(X,(A-Y).T)
    db = (1./m)*np.sum(A-Y,axis=1)

    grads = {"dw": dw,
            "db": db}
    
    return grads, cost

def corr_val(value, coef, t):
    corr_val_x = value/(1-coef**t)
    return corr_val_x

def predict(w, b, X):
    
    m = X.shape[1]
    Y_prediction = np.zeros((1,m))
    w = w.reshape(X.shape[0], 1)
    
    # Compute vector "A" predicting the probabilities 
    A = sigmoid(np.dot(w.T,X)+b)
    
    for i in range(A.shape[1]):
        
        # Convert probabilities A[0,i] to actual predictions p[0,i]
        if (A[0,i]<=0.5):
            Y_prediction[0][i]=0
        else:
            Y_prediction[0][i]=1
    
    return Y_prediction

# Optimizers

In [3]:
def gd_optimize(X, Y, num_iterations, learning_rate, print_cost = False):

    costs = []

    X_trans = np.asarray(X).T
    Y_trans = np.asarray(Y).T

    # initialize parameters with zeros 
    w, b = initialize_with_zeros(X_trans.shape[0])
    
    for i in range(num_iterations):
        # Cost and gradient calculation 
        grads, cost = propagate(w,b,X_trans,Y_trans)
        
        # Retrieve derivatives from grads
        dw = grads["dw"]
        db = grads["db"]
        
        # update rule
        w -=learning_rate*dw
        b -=learning_rate*db
        
        # Record the costs
        costs.append(cost)
        
        # Print the cost every 100 training iterations
        if print_cost and i % 100 == 0:
            print ("Cost after iteration %i: %f" %(i, cost))
    
    params = {"w": w,
            "b": b}
    
    grads = {"dw": dw,
            "db": db}
    
    return params, grads, costs

# def sgd_optimiz(X, Y, num_iterations, learning_rate, print_cost = False):
    
#     costs = []

#     X_trans = np.asarray(X).T
#     Y_trans = np.asarray(Y).T

#     # initialize parameters with zeros 
#     w, b = initialize_with_zeros(X_trans.shape[0])
    
#     for i in range(num_iterations):
#         # Prepare a random subsample of data
#         sample_len = random.randint(100,200)
#         X_sample = X.sample(sample_len)
#         # print(len(X_sample))
#         Y_sample = GLOB_Y[X_sample.index]
#         X_trans = np.asarray(X_sample).T
#         Y_trans = np.asarray(Y_sample).T
#         # print('Y = ', len(Y_trans))
#         # print('X = ', len(X_trans))
#         # Cost and gradient calculation 
#         grads, cost = propagate(w,b,X_trans,Y_trans)
        
#         # Retrieve derivatives from grads
#         dw = grads["dw"]
#         db = grads["db"]
        
#         # update rule
#         w -=learning_rate*dw
#         b -=learning_rate*db
        
#         # Record the costs
#         costs.append(cost)
        
#         # Print the cost every 100 training iterations
#         if print_cost and i % 100 == 0:
#             print ("Cost after iteration %i: %f" %(i, cost))
    
#     params = {"w": w,
#             "b": b}
    
#     grads = {"dw": dw,
#             "db": db}
    
#     return params, grads, costs
    


In [4]:
def sgd_optimize(x, y, num_iterations,learn_rate, batch_size=200, dtype="float64", random_state=None, print_cost = False):

    costs = []
    dtype_ = np.dtype(dtype)

    x, y = np.array(x, dtype=dtype_).T, np.array(y, dtype=dtype_).T
    n_obs = x.shape[0]

    w, b = initialize_with_zeros(n_obs)
    x = x.T
    bth = 200
    num_iterations = int(num_iterations)

    for i in range(num_iterations):
        start = random.randint(1, 1400)
        stop = random.randint(1+start, start+bth)
        x_batch = x[start:stop]
        x_batch = x_batch.T
        y_batch = y[start:stop]

        grads, cost = propagate(w,b,x_batch,y_batch)
        
        # Retrieve derivatives from grads
        dw = grads["dw"]
        db = grads["db"]
        
        # update rule
        w -=learn_rate*dw
        b -=learn_rate*db  
        # Record the costs
        costs.append(cost)
            
        # Print the cost every 100 training iterations
        if print_cost and i % 100 == 0:
            print ("Cost after iteration %i: %f" %(i, cost))

        


    params = {"w": w,
            "b": b}
    
    grads = {"dw": dw,
            "db": db}
    
    return params, grads, costs

In [5]:
class AdamOptim():
    def __init__(self, zeta=0.01, beta1=0.9, beta2=0.999, epsilon=1e-8):
        self.v_dw, self.v_db = 0,0
        self.m_dw, self.m_db = 0,0
        self.beta1 = beta1
        self.beta2 = beta2
        self.epsilon = epsilon
        self.eta = zeta
    def update(self, t, w, b, dw, db):

        ## momentum beta 1
        # *** weights *** #
        self.m_dw = self.beta1*self.m_dw + (1-self.beta1)*dw
        # *** biases *** #
        self.m_db = self.beta1*self.m_db + (1-self.beta1)*db

        ## rms beta 2
        # *** weights *** #
        self.v_dw = self.beta2*self.v_dw + (1-self.beta2)*np.power( dw, 2)
        # *** biases *** #
        self.v_db = self.beta2*self.v_db + (1-self.beta2)*np.power( db, 2)

        ## bias correction
        m_dw_corr = self.m_dw/(1-self.beta1**t)
        m_db_corr = self.m_db/(1-self.beta1**t)
        v_dw_corr = self.v_dw/(1-self.beta2**t)
        v_db_corr = self.v_db/(1-self.beta2**t)

        ## update weights and biases
        w = w - self.eta*(m_dw_corr/(np.sqrt(v_dw_corr)+self.epsilon))
        b = b - self.eta*(m_db_corr/(np.sqrt(v_db_corr)+self.epsilon))
        return w, b
    def __del__(self):
        pass

In [6]:

def adam_optimize(X, Y, num_iterations, learning_rate, print_cost = False):
    
    costs = []

    X_trans = np.asarray(X).T
    Y_trans = np.asarray(Y).T

    # initialize parameters with zeros 
    w, b = initialize_with_zeros(X_trans.shape[0])
    
    adam = AdamOptim()
    adam.v_dw, adam.v_db = initialize_with_zeros(X_trans.shape[0])
    adam.m_dw, adam.m_db = initialize_with_zeros(X_trans.shape[0])
    adam.eta = learning_rate
    for i in range(num_iterations):
                
        # Cost and gradient calculation 
        grads, cost = propagate(w,b,X_trans,Y_trans)
        
        # Retrieve derivatives from grads
        dw = grads["dw"]
        db = grads["db"]
        
        w,b = adam.update(t=i+1, w=w,b=b, dw=dw, db=db)
        
        # Record the costs
        costs.append(cost)
        
        # Print the cost every 100 training iterations
        if print_cost and i % 100 == 0:
            print ("Cost after iteration %i: %f" %(i, cost))
    del adam
    
    params = {"w": w,
            "b": b}
    
    grads = {"dw": dw,
            "db": db}
    
    return params, grads, costs

In [7]:
# predict

def predict(w, b, X):
    
    m = X.shape[1]
    Y_prediction = np.zeros((1,m))
    w = w.reshape(X.shape[0], 1)
    
    # Compute vector "A" predicting the probabilities 
    A = sigmoid(np.dot(w.T,X)+b)
    
    for i in range(A.shape[1]):
        
        # Convert probabilities A[0,i] to actual predictions p[0,i]
        if (A[0,i]<=0.5):
            Y_prediction[0][i]=0
        else:
            Y_prediction[0][i]=1
    
    return Y_prediction

# model

def model(X_train, Y_train, X_test, Y_test, optim_fun = gd_optimize, num_iterations = 2000, learning_rate = 0.5, print_cost = False):
     
    X_train_trans = np.asarray(X_train).T
    Y_train_trans = np.asarray(Y_train).T
    X_test_trans = np.asarray(X_test).T
    Y_test_trans = np.asarray(Y_test).T

    # Gradient descent
    parameters, grads, costs = optim_fun(X_train, Y_train, num_iterations, learning_rate, print_cost)
    
    # Retrieve parameters w and b from dictionary "parameters"
    w = parameters["w"]
    b = parameters["b"]
    
    # Predict test/train set examples
    Y_prediction_test = predict(w, b, X_test_trans)
    Y_prediction_train = predict(w, b, X_train_trans)

    # Print train/test Errors
    traina = (100 - np.mean(np.abs(Y_prediction_train - Y_train_trans)) * 100)
    testa = (100 - np.mean(np.abs(Y_prediction_test - Y_test_trans)) * 100)

    text = f'train acc = {traina}\ntest acc = {testa}'

    d = {"costs": costs,
         "Y_prediction_test": Y_prediction_test, 
         "Y_prediction_train" : Y_prediction_train, 
         "w" : w, 
         "b" : b,
         "learning_rate" : learning_rate,
         "num_iterations": num_iterations}

     
    
    return d, text

# Work

In [8]:
# Prepare data
df = pd.read_csv("bioresponse.csv")
GLOB_Y=df['Activity'].values

# choosing numeric features
GLOB_X = df.drop(['Activity'], axis=1, inplace=False)
GLOB_X

# Create Train and Test samples
X_train, X_test, y_train, y_test = train_test_split(GLOB_X,GLOB_Y,test_size=0.25, random_state=68)

In [9]:
lern_rates = [0.0001, 0.0005, 0.001, 0.005, 0.01, 0.05]

table = PrettyTable()
gd = []
sgdo = []
sgd = []
adam = []

dic = {
    'gd': [gd,gd_optimize],
    # 'sgd_old': [sgdo,sgd_optimiz],
    'sgd': [sgd,sgd_optimize],
    'adam': [adam,adam_optimize]
}

table.field_names = ['Learning Rate', 'GD model', 'Stochastic GD model', 'ADAM model']

for rate in lern_rates:
    teta = []
    teta = [str(rate)]
    for key in dic.keys():
        # o = dic[key][1]
        z, rez = model(X_train, y_train, X_test, y_test, 
                            num_iterations = 2000, learning_rate = rate, 
                            print_cost = False, optim_fun = dic[key][1])
        dic[key][0].append(z)
        teta.append(rez)
    table.add_row(teta)

print(table)

  cost = -(1./m)*np.sum(Y*np.log(A)+(1-Y)*np.log(1-A),axis=1)   # compute cost
  cost = -(1./m)*np.sum(Y*np.log(A)+(1-Y)*np.log(1-A),axis=1)   # compute cost


+---------------+-------------------------------+-------------------------------+-------------------------------+
| Learning Rate |            GD model           |      Stochastic GD model      |           ADAM model          |
+---------------+-------------------------------+-------------------------------+-------------------------------+
|     0.0001    | train acc = 55.52790615001778 | train acc = 54.67472449342339 | train acc = 81.76324209029505 |
|               | test acc = 55.223880597014926 |  test acc = 54.05117270788913 |  test acc = 76.97228144989339 |
|     0.0005    | train acc = 68.68112335584786 | train acc = 68.57447564877356 | train acc = 87.55776750799858 |
|               |  test acc = 69.5095948827292  |  test acc = 70.25586353944563 |  test acc = 75.69296375266525 |
|     0.001     | train acc = 73.05367934589407 | train acc = 73.37362246711696 |  train acc = 89.4418769996445 |
|               |  test acc = 71.74840085287846 |  test acc = 71.64179104477611 |  test 