In [1]:
import numpy as np
import matplotlib.pyplot as plt
import os
import pandas as pd
import scipy.io
from sklearn import linear_model
from sklearn import model_selection
import eknn 
from eknn import exclusive_lasso, EkNN_C,EkNN_R

#### Implementation using sklearn Lasso
- What's is the difference between the normal Lasso dealing with values represented as matrix and Group Lasso

In [2]:
X = np.random.randn(100,10) # Matrix of possible predictors
alphas = np.array([0, 0, 0, 1, 0, 0, 0, -1, 0, 0]) #Two nonzero predictors
y = X @ alphas + 2*np.random.randn(100) # simulated observations y
N_groups = 3
groups = np.random.rand(10,3)

xL2 = np.linalg.pinv(X) @ y # inital value for alphas / re-visit how the value is initiated
(X @ alphas).shape, y.shape

((100,), (100,))

In [3]:
N_groups = 3
groups = np.random.rand( N_groups,10)

# simulate groups vectors
for i in range(len(groups)): # number of groups = number of rows
    # print(len(groups[i,:]))
    for j in range(len(groups[i,:])): # number of obs = number columns
        # print(groups[i,j])
        if groups[i,j] == max(groups[:,j]):
            groups[i,j] = 1
        else:
            groups[i,j]= 0
groups

array([[0., 0., 1., 1., 0., 0., 1., 0., 0., 1.],
       [0., 0., 0., 0., 1., 1., 0., 0., 0., 0.],
       [1., 1., 0., 0., 0., 0., 0., 1., 1., 0.]])

In [4]:
reg = linear_model.LassoCV(cv=10).fit(X, y)

lasso = linear_model.Lasso(random_state=0, max_iter=10000)
lasso.fit(X, y)
lambda_ = np.logspace(-4, -0.5, 30) #

tuned_parameters = [{'alpha': lambda_}]

clf = model_selection.GridSearchCV(lasso, tuned_parameters, cv=10, refit=False) # cross-validation to find best lasso parameter
clf.fit(X, y)

In [5]:
lambda_ = clf.best_params_['alpha']

In [6]:
lambda_ = clf.best_params_['alpha']
XL1 = linear_model.Lasso(alpha=clf.best_params_['alpha'])
XL1.fit(X,y)
xL1 = XL1.coef_
xL1

array([ 0.        ,  0.        , -0.00378443,  0.89499062,  0.        ,
       -0.        , -0.01433763, -0.71441972,  0.19734567,  0.        ])

#### Implementation with `optimize.minimize` function 
- This allows flexible fix for L1,L2-norm but does not seem to force some coeficients to 0

In [6]:
def coef_groups(groups_vect, alphas):
    # tranposed_alphas = np.reshape(alphas, (1,10))
    # print(alphas.shape, tranposed_alphas.shape)
    return groups_vect @ (np.abs(alphas))

coef_vect = coef_groups(groups, alphas)
# groups.shape

def lasso_func(X, y, alphas, lambda_, coef_vect):
    residual = y - (X @ alphas)
    return (np.sum(residual ** 2) + lambda_ * (np.sum(coef_vect ** 2))) # Square root of residuals plus the lasso penalty
lasso_val = (X, y, alphas, lambda_, coef_vect)

In [18]:
from scipy.optimize import minimize

least_squares = lambda alphas: lasso_func( X, y, alphas, lambda_, coef_vect)
result = minimize(least_squares, xL2, method='SLSQP')
optimal_coef_1 = result.x
optimal_coef_1

array([ 0.06002476,  0.74582566, -0.3391788 ,  0.97598061, -0.02585283,
       -0.0350602 , -0.46250137, -1.18781273,  0.29315015,  0.39463826])

#### Implement EkNN-C algorithm 

In [7]:
# simualate class labels 
N_classes = 4
labels = np.random.rand(N_classes,10)

# simulate groups vector
for i in range(len(labels)): # number of groups = number of rows
    for j in range(len(labels[i,:])): # number of obs = number columns
        if labels[i,j] == max(labels[:,j]):
            labels[i,j] = 1
        else:
            labels[i,j]= 0
labels

array([[0., 0., 0., 0., 0., 0., 1., 1., 1., 0.],
       [0., 1., 0., 0., 1., 1., 0., 0., 0., 1.],
       [0., 0., 0., 1., 0., 0., 0., 0., 0., 0.],
       [1., 0., 1., 0., 0., 0., 0., 0., 0., 0.]])

In [20]:
K_NN = 5
CLASS_NUM = 4
coefs = optimal_coef_1
# optimal_coef_1
coefs = coefs.tolist()
coefs.sort(reverse=True)
max_coefs = coefs[:K_NN]
max_coefs

[0.9759806130781281,
 0.7458256626181003,
 0.3946382640834404,
 0.29315014713356635,
 0.060024757991566466]

In [21]:
k_largest_coefs = optimal_coef_1
len(k_largest_coefs)

for j in range(len(optimal_coef_1)):
    if k_largest_coefs[j] not in max_coefs:
        k_largest_coefs[j] = 0
k_largest_coefs

array([0.06002476, 0.74582566, 0.        , 0.97598061, 0.        ,
       0.        , 0.        , 0.        , 0.29315015, 0.39463826])

In [22]:
def class_coefs(class_index):
    c_coefs = np.zeros(10)
    for i in range(len(labels[class_index])):
    # print(i)
        if labels[class_index,i] == 1:
            c_coefs[i] = k_largest_coefs[i]
        else:
            c_coefs[i] = 0
    return c_coefs

c1_coefs = class_coefs(0)
c2_coefs = class_coefs(1)
c3_coefs = class_coefs(2)
c4_coefs = class_coefs(3)

In [23]:
class_coefs_list = [c4_coefs, c2_coefs, c3_coefs, c1_coefs]

In [25]:
pred = 0
coefs_sum = [np.sum(class_coefs_list[i]) for i in range(4)]
for c in range(4):
    if np.sum(class_coefs_list[c]) == max(coefs_sum):
        pred = c
pred+=1
pred

4

#### The EkNN-R algorithm
- Use distance from the test sample to observations with k nearest neighbors largest coeficients as weights in considering the class label.

In [16]:
# calculate distance
# develop observation vector
def create_obs_vect(X, ind):
    cols = X.shape[0]
    res = np.zeros(cols)
    for i in range(cols):
        res[i] = X[i,ind]
    return res

x_vects = []
for i in range(10):
    x_vects.append(create_obs_vect(X,i))
    

In [17]:
# calculate the distance of test sample to k nearest neighbors data points
def calc_distance(y, x, alpha):
    # calculate the distance of test sample to k nearest neighbors data points
    return np.linalg.norm(y - alpha*x, ord=2)

d = [] # array of distance
for i in range(10):
    if k_largest_coefs[i] == 0:
        d.append(0)
    else:
        d.append(calc_distance(y, x_vects[i], k_largest_coefs[i]))

In [18]:
weights = []
for i in range(10):
    if k_largest_coefs[i] == 0:
        weights.append(0)
    elif max(d) != min(d):
        weights.append((max(d) - d[i])/(max(d)-min(d)))
    else:
        weights.append(1)
weights


[0.0075798558575921535,
 0.00985955764875282,
 0,
 0.0811961605488578,
 0.0,
 0,
 0,
 0,
 0.003199274268976004,
 0]

In [19]:
class_coefs_mat = np.zeros((4,10))

In [20]:
for i in range(4):
    class_coefs_mat[i,:] = class_coefs_list[i]

In [21]:
sum_coefs_weights = class_coefs_mat @ weights

In [22]:
l_index = np.where(sum_coefs_weights==max(sum_coefs_weights))[0][0]

- The sum of products of weights and coefieicents coresponding to each class shows that `y` belongs to class 0

#### Implement EkNN-C on LSVT Voice rehabitation dataset 

##### 1. Load in dataset and implement grid search for groups assignment

In [28]:
lsvt_predictors = pd.read_excel('lsvt/LSVT_voice_rehabilitation.xlsx')

In [19]:
lsvt_predictors[:,10:]

NameError: name 'lsvt_predictors' is not defined

In [30]:
lsvt_response = pd.read_excel('lsvt/LSVT_voice_rehabilitation.xlsx',sheet_name=1)

In [33]:
lsvt_response['response'] = [1 if lsvt_response['Binary class 1=acceptable, 2=unacceptable'][i] == 1 else 0 for i in range(len(lsvt_response['Binary class 1=acceptable, 2=unacceptable'])) ]

In [36]:
lsvt_response = lsvt_response.drop('Binary class 1=acceptable, 2=unacceptable', axis=1)

In [37]:
lsvt_response

Unnamed: 0,response
0,1
1,0
2,0
3,1
4,0
...,...
121,0
122,0
123,1
124,0


In [43]:
lasso = linear_model.Lasso(random_state=0, max_iter=10000)
lasso.fit(X,y)

lambda_ = np.logspace(-4, -0.5, 30)

tuned_parameters = [{'alpha': lambda_, }]

clf = model_selection.GridSearchCV(lasso, tuned_parameters, cv=10, refit=False)

In [None]:
def coef_groups(groups_vect, alphas):
    # tranposed_alphas = np.reshape(alphas, (1,10))
    # print(alphas.shape, tranposed_alphas.shape)
    return groups_vect @ (np.abs(alphas))

coef_vect = coef_groups(groups, alphas)
# groups.shape

def lasso_func(X, y, alphas, lambda_, coef_vect):
    residual = y - (X @ alphas)
    return (np.sum(residual ** 2) + lambda_ * (np.sum(coef_vect ** 2))) # Square root of residuals plus the lasso penalty
lasso_val = (X, y, alphas, lambda_, coef_vect)

#### Try out eknn.py

In [7]:
reg =eknn.exclusive_lasso(X, y, groups,lambda_)

In [8]:
res = reg.lasso_optimize(xL2)

In [10]:
res.x

array([-3.22990081e-01, -3.97800516e-01, -2.06637630e-01,  9.26773997e-01,
        2.16819533e-01,  2.41753181e-01, -8.97647924e-02, -9.40681809e-01,
       -2.81250292e-01, -4.84906065e-04])

In [11]:
l_y = eknn.EkNN_C(X, res.x, labels,5)

In [12]:
c = l_y.predict()
c

2

In [13]:
knn_R = eknn.EkNN_R(X,y, res.x, labels, 5)

In [14]:
l_y = knn_R.predict()
l_y

2

#### Grid search for $\lambda$,  $\alpha$, and k

In [16]:
def group_encode(group_num):
    n = X.shape[1]
    group_pop = int(n/group_num)
    group_vect = []
    start = 0
    end = group_pop
    while end <= n:
        temp = np.zeros(X.shape[1])
        if end + group_pop > n:
            temp[start:n] = 1
            group_vect.append(temp)
            break
        else: temp[start:end] = 1
        start = end
        end += group_pop
        group_vect.append(temp)
    return group_vect
res = group_encode(3)
res

[array([1., 1., 1., 0., 0., 0., 0., 0., 0., 0.]),
 array([0., 0., 0., 1., 1., 1., 0., 0., 0., 0.]),
 array([0., 0., 0., 0., 0., 0., 1., 1., 1., 1.])]

In [22]:
from sklearn.metrics import accuracy_score

In [10]:
y_s = [X @ alphas + 2*np.random.randn(100) for _ in range(10)] 

In [14]:
true_labels = []  # true labels of the testing sample as array 

def make_classification(y, group_vect,lambda_,k):
    reg = eknn.exclusive_lasso(X,y,group_vect,lambda_)
    coefs = reg.lasso_optimize(xL2).x
    knn_R = eknn.EkNN_R(X, y, coefs, true_labels,k)
    return knn_R.predict()
    

def classification(lambda_, group_num, k,y_s, true_labels):
    # y: a set of numerous testing data
    group_vect = group_encode(group_num)
    preds = []
    for i in range(len(y_s)):
        preds.append(make_classification(y_s[i], group_vect, lambda_, k))
    return accuracy_score(preds, true_labels)

lambda_  = np.logspace(-4, -0.5, 30)
group_num = np.linspace(2,10,8)
grid_search = model_selection.GridSearchCV({'lambda_':lambda_,
                                            'group_num': group_num,}, 
                                           classification)

In [18]:
reg = eknn.exclusive_lasso(X,y_s[0],groups,lambda_)
reg.lasso_optimize(xL2).x

ValueError: The user-provided objective function must return a scalar value.

In [17]:
preds = classification(lambda_, 3, 5, y_s)

ValueError: The user-provided objective function must return a scalar value.