In [1]:

from bokeh.plotting import show, figure
from bokeh.io import output_notebook
from bokeh.layouts import column, row, gridplot

import numpy as np
from math import nan, isnan

import scipy.optimize as op
from time import time

In [2]:
# Configure Bokeh so that show() renders figures inline in this notebook.
output_notebook()

In [23]:
def prepare_sets(raw_data, train_fraction = 0.7):
    """Split a raw data matrix into train/test design matrices and targets.

    A column of ones (the bias term) is prepended, the last column is taken
    as the target, and the rows are split in their existing order (no
    shuffling is performed).

    Parameters
    ----------
    raw_data : array-like, shape (examples, columns)
        One example per row, target value in the last column.
    train_fraction : float, optional
        Share of the rows (from the top) that goes to the training set.
        Must lie in [0, 1]; defaults to 0.7.

    Returns
    -------
    train_X, train_y, test_X, test_y : ndarray
        Design matrices (with the leading bias column) and target vectors.

    Raises
    ------
    ValueError
        If train_fraction is outside [0, 1].
    """
    if not 0 <= train_fraction <= 1:
        raise ValueError("train_fraction must be between 0 and 1")

    # np.insert always returns a new array, so the caller's data is untouched
    # without needing an explicit copy first.
    data = np.insert(raw_data, 0, 1, axis=1)

    train_examples = int(train_fraction * data.shape[0])

    features = data[:, :-1]
    target = data[:, -1]

    return (features[:train_examples], target[:train_examples],
            features[train_examples:], target[train_examples:])
    

In [24]:
def normalize_data(X):
    """Standardize the feature columns of X to zero mean and unit variance.

    Column 0 is treated as the bias column: its reported mean is forced to 0
    and its standard deviation to 1, so it passes through unchanged and the
    returned statistics can be applied to other data with (X - mu) / std.

    Parameters
    ----------
    X : array-like, shape (examples, features)
        Design matrix whose first column is the bias term. It is not modified.

    Returns
    -------
    tmp_X : ndarray
        Normalized copy of X (floating point).
    mu_X : ndarray
        Per-column means that were subtracted (0 for column 0).
    std_X : ndarray
        Per-column divisors that were applied (1 for column 0 and for any
        constant column).
    """
    tmp_X = np.array(X, copy=True)
    # In-place float arithmetic below is not allowed on integer arrays.
    if not np.issubdtype(tmp_X.dtype, np.floating):
        tmp_X = tmp_X.astype(np.float64)

    mu_X = tmp_X.mean(axis=0)
    std_X = tmp_X.std(axis=0)

    # Leave the bias column alone.
    mu_X[0] = 0
    std_X[0] = 1

    # A constant column has zero spread; dividing by 1 maps it to zeros
    # instead of producing NaN/inf.
    std_X[std_X == 0] = 1

    tmp_X -= mu_X
    tmp_X /= std_X

    # Note: the previous `tmp_X[0] = 1` overwrote the first *example* (row 0)
    # with ones; the bias *column* is already preserved by mu=0 / std=1 above.
    return tmp_X, mu_X, std_X

In [103]:
def hypothesis(X, Theta):
    """Return the linear model's predictions, i.e. the product X . Theta."""
    predictions = X.dot(Theta)
    return predictions


def lr_cost(Theta, X, y, lambd):
    """Regularized squared-error cost of the linear model.

    Computes 1/(2m) * (sum of squared residuals + lambd * sum of squared
    parameters), where m is the number of rows of X and the first parameter
    (Theta[0]) is excluded from the penalty.
    """
    m, _ = X.shape

    residual = hypothesis(X, Theta) - y
    penalized = Theta[1:]

    squared_error = residual.T.dot(residual)
    penalty = lambd * penalized.T.dot(penalized)

    return 1/(2*m) * (squared_error + penalty)


def lr_gradient(Theta, X, y, lambd):
    """Gradient of the regularized cost with respect to Theta.

    Returns 1/m * (X^T . residual + lambd * Theta'), where Theta' is a copy
    of Theta with its first entry zeroed so that entry is not penalized.
    Theta itself is left unmodified.
    """
    m, _ = X.shape

    residual = hypothesis(X, Theta) - y

    # Work on a copy: the optimizer still owns the original parameter vector.
    penalized = np.copy(Theta)
    penalized[0] = 0

    data_term = X.T.dot(residual)

    return 1/m * (data_term + lambd*penalized)


def train_min(cost, grad, X, y, lambd, method = 'BFGS'):
    """Fit the parameters with scipy.optimize.minimize.

    The search starts from an all-zero parameter vector with one entry per
    column of X; `cost` and `grad` are called as f(Theta, X, y, lambd).

    Returns the optimizer's solution vector and the cost value it reports.
    """
    _, n_params = X.shape

    start = np.zeros(n_params)

    outcome = op.minimize(cost, start,
                          args = (X, y, lambd),
                          method = method,
                          jac = grad)

    return outcome.x, outcome.fun


def train_gd(X, y, lambd, alpha, iterations, get_plt_data = False):
    """Fit the parameters with fixed-step batch gradient descent.

    Starts from an all-zero parameter vector and applies
    Theta -= alpha * gradient for `iterations` steps.

    Returns
    -------
    (Theta, final_cost) when get_plt_data is False, otherwise
    (Theta, iteration_indices, cost_after_each_iteration).
    """
    _, n_params = X.shape

    Theta = np.zeros(n_params)

    # The per-step cost is only tracked when the caller wants to plot it.
    history = np.zeros(iterations) if get_plt_data else None

    for step in range(iterations):
        Theta -= alpha * lr_gradient(Theta, X, y, lambd)

        if history is not None:
            history[step] = lr_cost(Theta, X, y, lambd)

    if history is None:
        return Theta, lr_cost(Theta, X, y, lambd)

    return Theta, np.arange(iterations), history


def normal_eq(X, y):
    """Solve the least-squares fit in closed form via the normal equation.

    Returns pinv(X^T X) . X^T . y; the pseudo-inverse is used for the
    Gram matrix X^T X.
    """
    gram_inverse = np.linalg.pinv(X.T.dot(X))

    return gram_inverse.dot(X.T).dot(y)



def error(X, thetas, answers, normalized = False, mu_train = None, std_train = None):
    """Unregularized squared-error of `thetas` on the data set (X, answers).

    Returns 1/(2m) * sum of squared residuals, where m is the number of rows
    of X.

    Parameters
    ----------
    X : ndarray, shape (examples, features)
        Design matrix. It is never modified by this function.
    thetas : ndarray
        Model parameters.
    answers : ndarray
        Target values for the rows of X.
    normalized : bool, optional
        When True, X is first transformed with (X - mu_train) / std_train,
        i.e. the statistics the model was trained with.
    mu_train, std_train : ndarray, optional
        Required when `normalized` is True.

    Raises
    ------
    ValueError
        If `normalized` is True but mu_train or std_train is missing.
    """
    m, n = X.shape

    if normalized:
        if mu_train is None or std_train is None:
            raise ValueError("mu_train and std_train are required when normalized=True")

        # Build a new array instead of `X -= ...; X /= ...`: the in-place form
        # altered the caller's data, so every repeated call normalized the
        # already-normalized set again.
        X = (X - mu_train) / std_train

    H_y = hypothesis(X, thetas) - answers

    return 1/(2*m) * (H_y.transpose().dot(H_y))

In [104]:
def training_analysis( train_algorithm, 
                       perf_atr, perf_atr_range, 
                       test_X, test_y, X, y, normalized = False, mu_train = None, std_train = None,
                       default_lambd = None, default_alpha = None, default_iterations = None,
                       print_progress = False):
    """Sweep one hyper-parameter and record time, training cost and test error.

    For every value in `perf_atr_range` the model is retrained on (X, y) with
    that value substituted for the hyper-parameter named by `perf_atr`, and
    then evaluated on (test_X, test_y).

    Parameters
    ----------
    train_algorithm : str
        'gd' for train_gd; any other value is passed to train_min as the
        scipy.optimize.minimize method name.
    perf_atr : str
        Hyper-parameter to vary: 'lambd' (any algorithm), or 'alpha' /
        'iterations' (only with 'gd').
    perf_atr_range : iterable
        Values to try for `perf_atr`.
    test_X, test_y, X, y : ndarray
        Test and training sets. None of them is modified.
    normalized : bool, optional
        When True, test_X is transformed with (test_X - mu_train) / std_train
        before evaluation.
    mu_train, std_train : ndarray, optional
        Statistics used when `normalized` is True.
    default_lambd, default_alpha, default_iterations : optional
        Values used for the hyper-parameters that are not being varied.
    print_progress : bool, optional
        Print (cost, elapsed seconds, test error) after every run.

    Returns
    -------
    list of (label, values) pairs, in the order get_plots expects:
    method, the swept parameter, computation time, final cost, test error.

    Raises
    ------
    ValueError
        If `perf_atr` is not supported for `train_algorithm`.
    """
    # Reject combinations that previously fell through every branch and left
    # `thetas` / `cost` unbound (or silently reused from an earlier iteration).
    gd_only_atrs = ('alpha', 'iterations')
    if perf_atr != 'lambd' and not (train_algorithm == 'gd' and perf_atr in gd_only_atrs):
        raise ValueError("unsupported perf_atr %r for train_algorithm %r"
                         % (perf_atr, train_algorithm))

    # Normalize the test set once, into a new array: this is loop-invariant,
    # and keeps the caller's test_X intact across the whole sweep.
    if normalized:
        if mu_train is None or std_train is None:
            raise ValueError("mu_train and std_train are required when normalized=True")
        eval_X = (test_X - mu_train) / std_train
    else:
        eval_X = test_X

    comp_time = []
    final_cost = []
    err = []

    for value in perf_atr_range:

        time_start = time()

        if train_algorithm != 'gd':
            # Only 'lambd' reaches this point for non-gd algorithms.
            thetas, cost = train_min(lr_cost, lr_gradient, X, y, value, method = train_algorithm)
        elif perf_atr == 'lambd':
            thetas, cost = train_gd(X, y, value, default_alpha, default_iterations)
        elif perf_atr == 'alpha':
            thetas, cost = train_gd(X, y, default_lambd, value, default_iterations)
        else:
            thetas, cost = train_gd(X, y, default_lambd, default_alpha, value)

        time_elapsed = time() - time_start

        # eval_X is already in the model's feature space.
        err_measured = error(eval_X, thetas, test_y)

        final_cost.append(cost)
        comp_time.append(time_elapsed)
        err.append(err_measured)

        if print_progress:
            print(cost, time_elapsed, err_measured)

    return [('method', train_algorithm),
            (perf_atr,  perf_atr_range), 
            ('computation_time[s]', comp_time), 
            ('final_cost', final_cost),  
            ('error', err)]


def get_plots(plot_data_list):
    """Build one Bokeh line figure per measured series.

    `plot_data_list` is a list of (label, values) pairs: entry 0 supplies the
    title shared by all figures, entry 1 the shared x axis (label and values),
    and every following entry becomes its own figure with the entry's label
    on the y axis.

    Returns the list of figures, in the order of the input series.
    """
    shared_title = plot_data_list[0][1]
    x_label = plot_data_list[1][0]
    x_values = plot_data_list[1][1]

    plots = []

    for series in plot_data_list[2:]:
        fig = figure(title = shared_title,
                     x_axis_label = x_label,
                     y_axis_label = series[0])

        fig.line(x = x_values, y = series[1])

        plots.append(fig)

    return plots

In [105]:
# # Abalone age dataset (scratch experiment)

# raw_data = np.genfromtxt(r"C:\Users\Dell\Documents\CS\Machine Learning\Datasets\abalone_age.txt", delimiter=',', 
#                          usecols=(1, 2, 3, 4, 5, 6, 7, 8))
# raw_data2 = np.genfromtxt(r"C:\Users\Dell\Documents\CS\Machine Learning\Datasets\abalone_age.txt", delimiter=',', 
#                          usecols=(0), dtype=np.str)
# #I = 0, M = 1, F = 2
# raw_data2[raw_data2 == 'I'] = 0
# raw_data2[raw_data2 == 'M'] = 1
# raw_data2[raw_data2 == 'F'] = 2
# raw_data2 = raw_data2.astype(np.float64)
# raw_data = np.insert(raw_data, 0, raw_data2, axis=1)
# raw_data = np.insert(raw_data, 0, 1, axis=1)

# #raw_data[:, -1] = raw_data[:, -1] - 1

# X = raw_data [:, :-1]
# y = raw_data[:, -1]

# m, n = X.shape

# #X, mu_X, std_X = normalize_data(X)

# # infant = X[np.where(y == 0)]
# # male = X[np.where(y == 1)]
# # female = X[np.where(y == 2)]


# lambd = 1

# # plt = figure()
# # plt.circle(infant[:, 1], infant[:, 2], size = 10)
# # plt.x(male[:, 1], male[:, 2], color = "red", size=10)
# # plt.diamond(male[:, 1], male[:, 2], color = "green", size=10)
# # show(plt)

In [106]:
# Fuel-economy (miles-per-gallon) dataset

# raw_data = np.genfromtxt(r"C:\Users\Dell\Documents\CS\Machine Learning\Datasets\miles_per_gallon.txt", 
#                          usecols=(0, 1, 2, 3, 4, 5, 6, 7))
# raw_data = np.insert(raw_data, 0, 1, axis=1)
# t = np.isnan(raw_data)
# raw_data[np.isnan(raw_data)] = 50

# all_examples = raw_data[:, :].shape[0]

# examples = int(0.7 * all_examples)
# test_examples = all_examples - examples

# X = raw_data[:examples , [0, 2, 3, 4, 5, 6, 7, 8]]
# y = raw_data[:examples, 1]
# y = 282.5/y

# test_X = raw_data[examples: examples + test_examples , [0, 2, 3, 4, 5, 6, 7, 8]]
# test_y = raw_data[examples: examples + test_examples , 1]
# test_y = 282.5/test_y

# m, n = X.shape

# X, mu_X, std_X = normalize_data(X)


# # lambd = 0

# # plt = figure()
# # plt.cross(X[:, 1], y)

In [110]:
# Load the CASP CSV as a float matrix, skipping its single header row.
# NOTE(review): absolute, machine-specific path — the cell cannot run elsewhere
# without editing it.
raw_data = np.genfromtxt(r"C:\Users\Dell\Documents\CS\Machine Learning\Datasets\CASP.csv", 
                         delimiter = ',' , 
                         skip_header =  1)

# Move column 0 to the last position: prepare_sets() takes the last column as
# the target. (presumably column 0 is the value to predict — confirm against the file)
raw_data = np.insert(raw_data[:, 1:], raw_data.shape[1] - 1,raw_data[:, 0], axis = 1)

In [111]:
# Split the rows into training and test sets; prepare_sets() also prepends the bias column.
train_X, train_y, test_X, test_y = prepare_sets(raw_data)
# NOTE(review): normalization is disabled here, so this cell does not define
# mu_train / std_train.
#train_X, mu_train, std_train = normalize_data(train_X)

In [91]:
# Sweep the regularization strength (0.0 .. 0.9) for gradient descent and
# record time, final cost and test error for each value.
# NOTE(review): mu_train / std_train are only defined by the normalize_data
# line that is currently commented out in the split cell — on a fresh kernel
# this call raises NameError. Confirm whether normalization is intended.
plot_list = training_analysis('gd', 'lambd', np.arange(0, 1, 0.1),
                             test_X, test_y, train_X, train_y, True, mu_train, std_train,
                             default_alpha = 1/train_X.shape[0], default_iterations = 10000,
                             print_progress = True)

32.8765179994 16.51408886909485 33.6548970128


KeyboardInterrupt: 

In [None]:
# Display the computation-time, final-cost and error figures side by side.
show(row(get_plots(plot_list)))

In [123]:
# Closed-form least-squares fit on the training set (no regularization).
theta = normal_eq(train_X, train_y)

In [124]:
 # Training-set error of the closed-form fit: 1/(2m) * sum of squared residuals.
 error(train_X, theta, train_y)

13.430579674544898

In [120]:
# Inspect the fitted parameter vector (first entry belongs to the bias column).
theta

array([  1.16823723e+01,   1.04693575e-03,   2.70493201e-03,
         4.59487593e+00,  -1.05431365e-01,  -3.75955833e-06,
        -2.28917231e-02,  -1.46010695e-04,   1.39667961e-02,
        -1.39944474e-01])