## Problem 1. Multicore Programming 

In [1]:
import pickle
import multiprocessing as mp
import numpy as np
import time

f = open('data_files.pl','rb')
data = pickle.load(f,encoding='bytes')
Xtrain,ytrain,Xtest,ytest = data

The original one.

In [2]:
def go_nn(Xtrain, ytrain, Xtest, ytest):
    correct =0
    for i in range(Xtest.shape[0]): ## For all testing instances
        nowXtest = Xtest[i,:]
        ### Find the index of nearest neighbor in training data
        dis_smallest = np.linalg.norm(Xtrain[0,:]-nowXtest) 
        idx = 0
        for j in range(1, Xtrain.shape[0]):
            dis = np.linalg.norm(nowXtest-Xtrain[j,:])
            if dis < dis_smallest:
                dis_smallest = dis
                idx = j
        ### Now idx is the index for the nearest neighbor
        
        ## check whether the predicted label matches the true label
        if ytest[i] == ytrain[idx]:  
            correct += 1
    acc = correct/float(Xtest.shape[0])
    return acc

start_time = time.time()
acc = go_nn(Xtrain, ytrain, Xtest, ytest)
print ("Accuracy %lf Time %lf secs.\n"%(acc, time.time()-start_time))

Accuracy 0.794000 Time 145.745534 secs.



The multiprocessing one.

In [3]:
import function as fn
start_time = time.time()
correct = 0
if __name__ == '__main__':
    pool = mp.Pool(processes = 4)
    tasks = [(Xtrain,ytrain,z) for z in Xtest ]
    results = pool.starmap(fn.f,tasks)
    pool.close()
for i in range(Xtest.shape[0]):
    if ytest[i] == ytrain[results[i]]:
        correct += 1
acc = correct/float(Xtest.shape[0])
print ("Accuracy %lf Time %lf secs.\n"%(acc, time.time()-start_time))

Accuracy 0.794000 Time 36.029794 secs.



## Problem 2 Parallel Gradient Descent

For the dataset, I will use the same data set as problem 1. 

Here is the single process with dense representation.

In [4]:
import math
from sklearn import metrics

def Logi(eta,X,y):
    """
    Gradient Descent with Fixed Step Size 
    """
    #set w0
    p = X.shape[1]
    n = X.shape[0]
    epsilon = 0.001
    w = np.matrix(np.zeros(p)).T
    
    def f1(w,x,y):
        g = 0
        for i in range(n):
            g += (-y[i]/(1+math.exp(y[i]*float(w.T.dot(x[i,].T)))))*x[i,].T
        return g/n+w
    
    g = f1(w,X,y)
    r0 = np.linalg.norm(g)
    for i in range(200):
        if np.linalg.norm(g)<epsilon*r0:
            break 
        g = f1(w,X,y)
        w = w - eta*g
    return w

def predict(w,x):
    s = x@w
    n =s.shape[0]
    y = np.zeros(x.shape[0])
    for i in range(n):
        y[i] = 1/(1+math.exp(-s[i]))
    y = np.sign(y-0.5+10**-15)#to get rid of those exact 0.5
    return y

start_time = time.time()
w = Logi(10**-2,Xtrain,ytrain)
end_time = time.time()

confusion = metrics.confusion_matrix(predict(w,Xtest),ytest)
acc = (confusion[0,0]+confusion[1,1])/sum(sum(confusion))

print ("Accuracy %lf Time %lf secs.\n"%(acc, end_time - start_time))

Accuracy 0.753000 Time 40.469853 secs.



The multiprocesses one.

In [5]:
import math
from sklearn import metrics
import function as fn

def Logi(eta,X,y):
    """
    Gradient Descent with Fixed Step Size 
    """
    
    if __name__ == '__main__':
        
        
        #set w0
        p = X.shape[1]
        n = X.shape[0]
        epsilon = 0.001
        w = np.matrix(np.zeros(p)).T
        
        def f1(w,A):
            data = [(w,a) for a in A]
            results = np.mean(pool.starmap(fn.f2,data),axis=0)
            return results+w
        
        pool = mp.Pool(processes = 4)
        A = np.column_stack((y,X))
        g = f1(w,A)
        r0 = np.linalg.norm(g)
        for i in range(200):
            if np.linalg.norm(g)<epsilon*r0:
                break 
            g = f1(w,A)
            w = w - eta*g
                
        pool.close()#i put it here for not to open and close repeatly which is time consuming.
    return w

def predict(w,x):
    s = x@w
    n =s.shape[0]
    y = np.zeros(x.shape[0])
    for i in range(n):
        y[i] = 1/(1+math.exp(-s[i]))
    y = np.sign(y-0.5+10**-15)#to get rid of those exact 0.5
    return y

start_time = time.time()
w = Logi(10**-2,Xtrain,ytrain)
end_time = time.time()

confusion = metrics.confusion_matrix(predict(w,Xtest),ytest)
acc = (confusion[0,0]+confusion[1,1])/sum(sum(confusion))

print ("Accuracy %lf Time %lf secs.\n"%(acc, end_time - start_time))

Accuracy 0.753000 Time 41.050649 secs.



I tried different methods but the code of the multi one is still took a long time. I check the time from parts by parts. 

data = [(w,a) for a in A]

It takes about 1/3 of the total time which i do not know how to reduce the time of it. Although the multiprogressing reduce some time but the preparation for it is time consuming. The results of this is both method takes about the same time.

For the question in the e-mail, kNN is time consuming in predict part(easy to model but hard to predict) while logistics is time consuming in the regression part (easy to predict but hard to regress).