In [1]:
import numpy as np
import matplotlib.pyplot as plt
import os
import pandas as pd
import scipy.io
from sklearn import linear_model
from sklearn import model_selection
import eknn 
from eknn import exclusive_lasso, EkNN_C,EkNN_R
import eknn_poly
from eknn_poly import polynomial_reg, polynomial_EkNN_C, polynomial_EkNN_R,polynomial_EkNN
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split

## Testing simulated data for seperate classes

#### Simulate data

In [2]:
X = np.random.randn(100,10) # Matrix of possible predictors
alphas = np.array([0, 0, 0, 1, 0, 0, 0, -1, 0, 0]) #Two nonzero predictors
y = X @ alphas + 2*np.random.randn(100) # simulated observations y
N_groups = 3
groups = np.random.rand(10,3)
n = X.shape[1]

xL2 = np.linalg.pinv(X) @ y # inital value for alphas / re-visit how the value is initiated
(X @ alphas).shape, y.shape
X.shape

(100, 10)

In [3]:
# generate more test samples
Y_t = np.zeros((10, y.shape[0]))
for i in range(10):
    Y_t[i,:] = X @ alphas + 2*np.random.randn(100) 
Y = np.transpose(Y_t)

In [4]:
Y.shape

(100, 10)

In [5]:
alphas_0 = np.linalg.pinv(X) @ y
betas_0 = (np.linalg.pinv(X) ** 2) @ y
x0 = np.concatenate((alphas_0, betas_0))
alphas_0, betas_0, x0


(array([ 0.14185058, -0.04530994,  0.4111406 ,  1.07789654, -0.326754  ,
        -0.30831918, -0.00315372, -1.04024458,  0.35303013,  0.0870147 ]),
 array([-0.00384022, -0.0017296 , -0.00093667,  0.00794063,  0.00067934,
        -0.00667005,  0.00136659,  0.00144423,  0.00200513,  0.00657977]),
 array([ 1.41850579e-01, -4.53099424e-02,  4.11140599e-01,  1.07789654e+00,
        -3.26754000e-01, -3.08319185e-01, -3.15371688e-03, -1.04024458e+00,
         3.53030130e-01,  8.70147025e-02, -3.84022326e-03, -1.72960358e-03,
        -9.36670362e-04,  7.94062933e-03,  6.79336172e-04, -6.67005125e-03,
         1.36658985e-03,  1.44422632e-03,  2.00512802e-03,  6.57977259e-03]))

In [6]:
N_groups = 3
groups = np.random.rand( N_groups,10)

# simulate groups vectors
for i in range(len(groups)): # number of groups = number of rows
    # print(len(groups[i,:]))
    for j in range(len(groups[i,:])): # number of obs = number columns
        # print(groups[i,j])
        if groups[i,j] == max(groups[:,j]):
            groups[i,j] = 1
        else:
            groups[i,j]= 0
groups

array([[0., 0., 1., 1., 0., 1., 1., 0., 1., 0.],
       [1., 0., 0., 0., 1., 0., 0., 1., 0., 1.],
       [0., 1., 0., 0., 0., 0., 0., 0., 0., 0.]])

#### Test poly_EkNN_C

In [7]:
reg = polynomial_reg(X, y,groups, 0.1)

In [8]:
coefs = reg.lasso_optimize(x0)

In [9]:
coefficients = coefs.x

In [10]:
coefficients

array([-0.62662856,  0.05684912,  0.23231571,  1.08042277, -0.60776563,
        0.20982551,  1.31673867, -1.09483299, -0.10819392, -0.28329847,
        0.29439946,  0.38037757,  0.11124032,  2.92921439, -0.37120596,
       -0.48219435,  1.60937961,  0.0154832 ,  0.1828989 ,  0.47292418])

In [11]:
# simualate class labels 
N_classes = 4
labels = np.random.rand(N_classes,10)
x_labels = np.zeros(10)

# simulate groups vector
for i in range(len(labels)): # number of groups = number of rows
    for j in range(len(labels[i,:])): # number of obs = number columns
        if labels[i,j] == max(labels[:,j]):
            labels[i,j] = 1
            x_labels[j] = i
        else:
            labels[i,j]= 0
            # x_labels[j] = i
labels

array([[0., 0., 0., 1., 0., 0., 0., 0., 0., 0.],
       [0., 1., 0., 0., 0., 1., 0., 0., 1., 0.],
       [0., 0., 1., 0., 1., 0., 1., 0., 0., 1.],
       [1., 0., 0., 0., 0., 0., 0., 1., 0., 0.]])

In [12]:
x_labels

array([3., 1., 2., 0., 2., 1., 2., 3., 1., 2.])

In [13]:
clf = polynomial_EkNN_C(X, coefficients[:n], coefficients[n:], labels, 3)

In [14]:
pred = clf.classify()


In [15]:
pred

0

#### Test poly_EkNN_R

In [16]:
clf = polynomial_EkNN_R(X, y, coefficients[:n],coefficients[n:], labels, 3)

In [17]:
y.shape

(100,)

In [18]:
clf.distance()

array([ 0.        , 98.25426934,  0.        , 83.95228526,  0.        ,
        0.        , 88.25269118,  0.        ,  0.        ,  0.        ])

In [19]:
clf.weights()

array([0.        , 0.        , 0.        , 0.14556094, 0.        ,
       0.        , 0.10179281, 0.        , 0.        , 0.        ])

In [20]:
pred = clf.classify()

[0.58364657 0.         0.2978578  0.        ] (4,)


In [21]:
pred

0

#### Test poly_EkNN:

In [22]:
clf = polynomial_EkNN(0.1, N_groups, k=3)

In [23]:
X_transposed = np.transpose(X)

In [24]:
clf.fit(X_transposed, x_labels)

In [25]:
preds = clf.predict(Y)

[ 1.46538631e-01 -7.36048028e-02  2.90736960e-01  9.99841297e-01
 -1.92150621e-01  3.88684091e-01  9.74649260e-02 -1.10167759e+00
 -1.11827873e-02  9.41233155e-02  3.45162956e-05  4.06791460e-03
 -2.71322783e-03  9.74395487e-03  7.81272565e-04 -4.33073266e-03
 -2.01082515e-03 -3.77130694e-03  4.05251960e-03  2.88467299e-03]
[-0.31523316  0.4811973  -0.24883597  2.36763387 -0.13873011  1.1480762
  0.60475765 -1.35536856  0.30539178  0.35205589  0.55996388  0.27019888
  1.83834069  1.90856375 -0.0521686  -0.34089559 -0.37002694  2.12870504
 -0.02030181  0.23260138]
[0.39246374 0.         0.         0.        ] (4,)
[-0.01132332  0.2628769  -0.20961236  0.87219249 -0.0685122  -0.28116981
  0.52990891 -0.90989968 -0.30978374 -0.36530085  0.0029883   0.00317773
  0.00199635  0.01209211  0.00111853 -0.00151005  0.00654609  0.00770873
  0.00926665  0.00266254]
[ 0.06071535  0.81193774 -0.39440279  0.37114912 -0.57163839 -1.58097031
  0.35861517 -2.69071036 -0.54647294 -1.10019984 -0.06869835 

In [26]:
Y.shape

(100, 10)

In [27]:
x_labels, pred

(array([3., 1., 2., 0., 2., 1., 2., 3., 1., 2.]), 0)

In [28]:
accuracy_score(preds, x_labels)

0.2

#### LSVT data set test

In [29]:
lsvt_predictors = pd.read_excel('lsvt/LSVT_voice_rehabilitation.xlsx')
lsvt_predictors.shape

(126, 310)

In [30]:
lsvt_response = pd.read_excel('lsvt/LSVT_voice_rehabilitation.xlsx',sheet_name=1)
lsvt_response.shape

(126, 1)

In [31]:
train, test, train_labels, test_labels = train_test_split(lsvt_predictors, lsvt_response, test_size=0.33, random_state=42)

In [32]:
train.reset_index(drop=True, inplace=True)
test.reset_index(drop=True, inplace=True)
train_labels.reset_index(drop=True, inplace=True)
test_labels.reset_index(drop=True, inplace=True)

In [33]:
predictors_lsvt = train.columns
# generate predictor matrix X
# matrix rows = col number
# matrix column = row number 
train_cols, train_rows = train.shape[1], train.shape[0]
X_train = np.zeros((train_cols, train_rows))
for i,col in enumerate(predictors_lsvt):
    for j in range(train_rows):
        X_train[i,j] = train[col][j]

In [34]:
test_cols, test_rows = test.shape[1], test.shape[0]
X_test = np.zeros((test_cols, test_rows))
for i,col in enumerate(predictors_lsvt):
    for j in range(test_rows):
        X_test[i,j] = test[col][j]

In [35]:
lsvt_response.columns, lsvt_response

(Index(['Binary class 1=acceptable, 2=unacceptable'], dtype='object'),
      Binary class 1=acceptable, 2=unacceptable
 0                                            1
 1                                            2
 2                                            2
 3                                            1
 4                                            2
 ..                                         ...
 121                                          2
 122                                          2
 123                                          1
 124                                          2
 125                                          2
 
 [126 rows x 1 columns])

In [36]:
# try to generate a more generic way to generate the label matrix
y_train = np.zeros(train.shape[0])
for i in range(len(train_labels["Binary class 1=acceptable, 2=unacceptable"])):
    if train_labels["Binary class 1=acceptable, 2=unacceptable"][i] == 1:
        y_train[i] = 0
    else:
        y_train[i] = 1

In [37]:
y_test = np.zeros(test_labels.shape[0])
for i in range(len(test_labels["Binary class 1=acceptable, 2=unacceptable"])):
    if test_labels["Binary class 1=acceptable, 2=unacceptable"][i] == 1:
        y_test[i] = 0
    else:
        y_test[i] = 1

In [38]:
X_train_transposed = np.transpose(X_train)

In [39]:
# param_grid = {
#     'lambda_': np.logspace(-4, -0.5, 30),
#     'group_num': [i for i in range(2, 11,1)],
#     'k': [3, 5, 7, 10]
# }

# # Create the custom classifier instance
# clf = eknn_poly.polynomial_EkNN()

# # Use GridSearchCV to find the best hyperparameters
# grid_search = model_selection.GridSearchCV(clf, param_grid, cv=5)  # the scoring parameter can be changed
# grid_search.fit(X_train_transposed, y_train)# fit training data

# # Best parameters and best score
# best_params = grid_search.best_params_
# best_score = grid_search.best_score_

In [41]:
clf = eknn_poly.polynomial_EkNN(0.0001, 2, 3)
clf.fit(X_train_transposed, y_train)

preds = clf.predict(X_test)

[ 4.15998741e-04 -3.24052312e-02  3.81921483e-01 -2.10218906e-01
  4.30458077e-02  3.67942001e-02 -2.34732587e-01 -2.59327055e-02
  5.01909095e-02  2.37169789e-01 -1.71883529e-02  7.25064141e-02
 -1.92760000e-01  4.92950406e-01 -3.68565696e-02  2.13329709e-01
  1.68613286e-03  3.60987873e-01 -1.28438975e-01 -5.24773490e-02
 -2.11726101e-04 -1.21147912e-01  1.23378629e-02 -4.65230559e-02
  3.21625024e-02  5.63893673e-02 -3.87351282e-02 -1.74696113e-01
  2.33691228e-01  3.12654616e-01  1.12734177e-01  3.16200437e-02
 -4.18690851e-02 -3.83569785e-04 -5.01235958e-01 -1.02271554e-01
  5.43501344e-02 -1.24299200e-01  5.30589872e-02  1.33705198e-02
 -2.17544563e-01  4.44880376e-01  4.72997419e-02 -6.38907027e-02
 -4.92516474e-02 -5.21978081e-03 -3.24913276e-01  1.20091279e-01
  1.74941395e-03  4.19572723e-01 -1.86539510e-01  4.35502906e-01
  5.93193872e-04 -6.47348793e-02 -2.99450006e-04 -3.39184089e-01
  7.45536366e-02  2.81194392e-03 -2.44852736e-01  4.99176031e-01
  2.66177185e-01  9.00728

  return (np.sum(residual**2) + self.lambda_ * (np.sum(grouped_alphas ** 2)) + self.lambda_ * (np.sum(grouped_betas ** 2)) )
  return ufunc.reduce(obj, axis, dtype, out, **passkwargs)
  df = fun(x) - f0


[nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan
 nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan
 nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan
 nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan
 nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan
 nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan
 nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan
 nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan
 nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan
 nan nan nan nan nan nan]
[0. 0.] (2,)
[-2.94200002e-03 -1.94271837e-01 -7.55688354e-01  2.15239312e-01
  7.89593186e-02  1.63851588e-01 -1.55328509e-02  1.29711209e-02
 -7.06092440e-02  9.89571514e-02 -2.86571542e-01 -5.18569320e-02
  3.12195661e-01  3.13995121e-01  1.69790331e-01 -7.66888770e-02
  2.84318364e-02  3.33815689e-01 -1.92239900

In [42]:
preds

array([1., 0., 0., 1., 1., 1., 1., 1., 1., 0., 0., 1., 1., 0., 1., 1., 0.,
       1., 1., 0., 0., 1., 1., 1., 1., 1., 1., 1., 1., 0., 0., 0., 1., 0.,
       1., 0., 1., 1., 1., 1., 0., 1.])

In [44]:
accuracy_score(preds, y_test)

0.5238095238095238