In [None]:
import numpy as np
import matplotlib.pyplot as plt
import os
import pandas as pd
import scipy.io
from sklearn import linear_model
from sklearn import model_selection
import eknn 
from eknn import exclusive_lasso, EkNN_C,EkNN_R
from sklearn.metrics import accuracy_score
import numpy as np
from sklearn.model_selection import train_test_split

#### Test on LSVT Voice Rehabilitation dataset

**Pre-processing data set**
- In this section, the data set is loaded as a DataFrame. This DataFrame is then split into training and testing sets.

- The data is then converted into matrices to match the expected input of the functions.

In [None]:
# Load the LSVT predictors from the workbook (no sheet_name given, so the default sheet is read).
lsvt_predictors = pd.read_excel('lsvt/LSVT_voice_rehabilitation.xlsx')
# Display (rows, columns) as a quick sanity check of the load.
lsvt_predictors.shape

In [None]:
# Load the response labels from the same workbook; sheet_name=1 selects the second sheet (zero-based).
lsvt_response = pd.read_excel('lsvt/LSVT_voice_rehabilitation.xlsx',sheet_name=1)
# Display (rows, columns) of the response frame.
lsvt_response.shape

In [None]:
# Put predictors and response side by side (axis=1); the outer join keeps the union of row indices.
data = pd.concat([lsvt_predictors, lsvt_response], axis=1, join='outer')
# Display the combined frame.
data

In [None]:
# Recode the binary response: a value of 1 becomes 0, every other value becomes 1.
response_column = lsvt_response['Binary class 1=acceptable, 2=unacceptable']
data_labels = pd.Series(
    [0 if response_column[row] == 1 else 1 for row in range(len(response_column))]
)

In [None]:
# Hold out 30% of the samples for testing. random_state is fixed (42, the same
# seed the other splits in this notebook use) so that the split, and therefore
# every score computed from it, is reproducible under Restart & Run All.
X_train, X_test, y_train, y_test = train_test_split(
    lsvt_predictors, data_labels, test_size=0.3, random_state=42
)

In [None]:
# Renumber every split from 0 so that positional and label-based access agree;
# drop=True discards the shuffled original index instead of keeping it as a column.
X_train, y_train, X_test, y_test = (
    part.reset_index(drop=True) for part in (X_train, y_train, X_test, y_test)
)


In [None]:
# Column names of the training predictors; used below to build the matrices
# in a fixed column order.
predictors = X_train.columns
# Keep the shape check as the LAST expression so the notebook actually renders
# it (a bare expression earlier in a cell is evaluated and silently discarded).
X_train.shape, data.shape

Once the data is in good shape, we follow the procedure below to predict the classes of the testing set.

##### Procedures for the Exclusive Lasso KNN classifier
- Extract the X and Y matrices, where X _(p×n)_ holds the compressed training predictors and label values, and Y _(p×n)_ holds the compressed testing predictors and label values. The label values are the last row of the two matrices.

- Extract `x_labels` and `y_true_labels`. The `x_labels` vector is the same as `y_train` in usual estimators. The `y_true_labels` vector is passed into the score function. Reformat the labels to `0...m` if the original labels are not formatted in this way. Separate the labels from the predictor matrix for both `Y` and `X`.

- To match the `sklearn` API, transpose `X` before passing it into the `.fit` method.

- Use `GridSearchCV` to find the best parameters. Then pass in `Y` and predict the labels for `Y`. Accuracy is provided by the `score` function of the estimator.

In [None]:
# Generate the training predictor matrix X with shape (n_features, n_samples):
# one row per predictor column, one column per training sample.
# A single vectorised conversion replaces the element-by-element Python loop;
# the values, float dtype and C-contiguous layout match the loop-filled
# np.zeros((n_features, n_samples)) array.
X_train_mat = np.ascontiguousarray(X_train[predictors].to_numpy(dtype=float).T)
train_cols, train_rows = X_train_mat.shape

In [None]:
# Generate the test sample matrix Y with shape (n_features, n_test_samples):
# one row per predictor column, one column per test sample.
# A single vectorised conversion replaces the element-by-element Python loop;
# the values, float dtype and C-contiguous layout match the loop-filled
# np.zeros((n_features, n_test_samples)) array.
X_test_mat = np.ascontiguousarray(X_test[predictors].to_numpy(dtype=float).T)
test_cols, test_rows = X_test_mat.shape

In [None]:
# transpose X_train
# np.transpose swaps the two axes of X_train_mat, giving one row per training
# sample and one column per predictor.
X_transposed = np.transpose(X_train_mat)

In [None]:
# Display the shape of the (untransposed) training matrix.
X_train_mat.shape

In [None]:
# Float label vector for the estimator: entries of y_train equal to 1 map to
# 0.0, every other entry maps to 1.0.
# NOTE(review): y_train comes from data_labels, which is already coded 0/1, so
# this step swaps the two codes. The test labels are built the same way, so the
# two stay consistent -- confirm the swap is intended.
train_labels = np.where(y_train.to_numpy() == 1, 0.0, 1.0)

In [None]:
# Display the encoded training labels.
train_labels

In [None]:
# Extract the test labels as an (n,) float vector: entries of y_test equal to 1
# map to 0.0, every other entry maps to 1.0 (same rule as the training labels).
test_labels = np.where(y_test.to_numpy() == 1, 0.0, 1.0)

`GridSearchCV` from `sklearn` is used to find the best values for the number of groups, $\lambda$ and $k$.

In [None]:
# Candidate hyper-parameters explored by the search.
param_grid = dict(
    lambda_=np.linspace(0.0000001, 1, 30),
    group_num=list(range(2, 11)),
    k=[3, 5, 7, 10],
)

# Untuned instance of the custom classifier; GridSearchCV sets the parameters.
clf = eknn.ExclusiveLassoKNNClassifier()

# 5-fold cross-validated search scored by accuracy (the scoring parameter can be changed).
grid_search = model_selection.GridSearchCV(
    estimator=clf,
    param_grid=param_grid,
    scoring='accuracy',
    cv=5,
)
grid_search.fit(X_transposed, train_labels)  # fit training data

# Best parameter combination and its cross-validated score.
best_params, best_score = grid_search.best_params_, grid_search.best_score_


In [None]:
# Unpack the tuned hyper-parameters into individual names.
lambda_, group_num, k = (
    best_params[name] for name in ('lambda_', 'group_num', 'k')
)

In [None]:
# Display the parameter combination selected by the grid search.
best_params

In [None]:
# Display the shape of the transposed training matrix passed to .fit.
X_transposed.shape

In [None]:
# Refit on the full training set with the hyper-parameters selected by the grid
# search (previously hard-coded values were used, so the search result was
# computed and then ignored).
clf = eknn.ExclusiveLassoKNNClassifier(group_num=group_num, k=k, lambda_=lambda_)
clf.fit(X_transposed, train_labels)
# NOTE(review): the model is fitted on the (n_samples, n_features) matrix but
# scored on X_test_mat, which is (n_features, n_test_samples). Confirm against
# eknn.py that predict/score expect the untransposed layout.
score_lsvt = clf.score(X_test_mat,test_labels)
score_lsvt

In [None]:
# Report the test score as a percentage rounded to two decimals.
print("Accuracy score of classifying LSVT data set:", round(score_lsvt*100,2))

The accuracy of this model is 55.26, which is low compared to the result reported in the paper, 83.25. However, this might be due to the way we group the data points. In this experiment, the data points are split evenly into the number of groups passed in (_refer to the `eknn.py` file to see how the data points are grouped_), which can significantly affect the classification result. To check whether this is the case, I run the classifier on one more data set below.

In [None]:
# Predicted labels for the test matrix from the fitted classifier.
preds= clf.predict(X_test_mat)

In [None]:
# Display predictions next to the true test labels for a visual comparison.
preds, test_labels

The result of the classification is printed along with the true labels of the testing set to clearly show the performance of the proposed algorithm.

#### Test on Sonar dataset

- The Sonar (Connectionist Bench, mines vs. rocks) data set is added in order to see the performance of the algorithm on a new data set.

- Similarly, the data are shaped into matrix format before being passed into the functions.

In [None]:
from ucimlrepo import fetch_ucirepo 
  
# fetch dataset 
# id=151 is the UCI repository identifier passed to fetch_ucirepo.
connectionist_bench_sonar_mines_vs_rocks = fetch_ucirepo(id=151) 
  
# data (as pandas dataframes) 
# NOTE: `data` is rebound here and no longer refers to the LSVT frame built earlier.
data = connectionist_bench_sonar_mines_vs_rocks.data.features 
labels = connectionist_bench_sonar_mines_vs_rocks.data.targets 

In [None]:
# Hold out 33% of the samples for testing; random_state=42 makes the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(data, labels, test_size=0.33, random_state=42)

In [None]:
# Renumber every split from 0 so that positional and label-based access agree;
# drop=True discards the shuffled original index.
X_train, X_test, y_train, y_test = (
    part.reset_index(drop=True) for part in (X_train, X_test, y_train, y_test)
)

In [None]:
# Column names of the Sonar training predictors, used to build the matrices below.
predictors_sonars = X_train.columns

In [None]:
# Generate the training predictor matrix X with shape (n_features, n_samples):
# one row per predictor column, one column per training sample.
# A single vectorised conversion replaces the element-by-element Python loop;
# the values, float dtype and C-contiguous layout match the loop-filled
# np.zeros((n_features, n_samples)) array.
X = np.ascontiguousarray(X_train[predictors_sonars].to_numpy(dtype=float).T)
train_cols, train_rows = X.shape

In [None]:
# Generate the TEST sample matrix with shape (n_features, n_test_samples):
# one row per predictor column, one column per test sample.
# A single vectorised conversion replaces the element-by-element Python loop;
# the values, float dtype and C-contiguous layout match the loop-filled
# np.zeros((n_features, n_test_samples)) array.
X_test_mat = np.ascontiguousarray(X_test[predictors_sonars].to_numpy(dtype=float).T)
test_cols, test_rows = X_test_mat.shape

In [None]:
# Encode the training classes as floats: "R" maps to 0.0, anything else to 1.0.
x_labels = np.where(y_train["class"].to_numpy() == "R", 0.0, 1.0)

In [None]:
# Encode the test classes as floats: "R" maps to 0.0, anything else to 1.0.
test_labels = np.where(y_test["class"].to_numpy() == "R", 0.0, 1.0)

In [None]:
# Flatten the single-column target frames into 1-D arrays of length n.
# Converting with np.asarray first makes the result a plain ndarray regardless
# of how np.reshape dispatches on a DataFrame (which depends on the installed
# numpy/pandas versions), and lets the cell be re-run once y_* are already arrays.
y_test = np.asarray(y_test).reshape(y_test.shape[0])
y_train = np.asarray(y_train).reshape(y_train.shape[0])

In [None]:
# Swap the axes of X so that rows are training samples and columns are predictors.
X_train_transposed = np.transpose(X)

`GridSearchCV` is implemented to find best tuning parameters.

In [None]:
# Candidate hyper-parameters explored by the search.
param_grid = {
    'lambda_': np.logspace(-4, -0.5, 30),
    'group_num': list(range(2, 11)),
    'k': [3, 5, 7, 10]
}

# Create the custom classifier instance
clf = eknn.ExclusiveLassoKNNClassifier()

# Use GridSearchCV to find the best hyperparameters
# (10-fold CV; no scoring argument, so the estimator's own score method is used)
grid_search = model_selection.GridSearchCV(clf, param_grid, cv=10)  # the scoring parameter can be changed
# Fit on the numerically encoded labels (x_labels), i.e. the same targets the
# final model is trained on, rather than the raw "class" values in y_train, so
# that tuning and final fitting use one consistent label encoding.
grid_search.fit(X_train_transposed, x_labels)  # fit training data

# Best parameters and best score
best_params = grid_search.best_params_
best_score = grid_search.best_score_

In [None]:
# Display the parameter combination selected by the grid search.
best_params

In [None]:
# Display the flattened test targets.
y_test

In [None]:
# Display the shape of the matrix passed to .fit.
X_train_transposed.shape # groups_vect has correct dimension

In [None]:
# Refit on the full training set with the hyper-parameters selected by the grid
# search (previously hard-coded values were used, so the search result was
# computed and then ignored).
clf = eknn.ExclusiveLassoKNNClassifier(
    group_num=best_params['group_num'],
    k=best_params['k'],
    lambda_=best_params['lambda_'],
)
clf.fit(X_train_transposed, x_labels)
# NOTE(review): X_test_mat is (n_features, n_test_samples) while the model was
# fitted on (n_samples, n_features). Confirm against eknn.py that predict/score
# expect the untransposed layout.
preds = clf.predict(X_test_mat)
score_sonars = clf.score(X_test_mat, test_labels)
score_sonars

In [None]:
# Report the test score as a percentage rounded to two decimals.
print("Accuracy score of classifying Sonar data set:", round(score_sonars*100,2))

For this dataset, the accuracy score of the classifier is 71.01, which is pretty close to the score of 74.63 in the paper. The small difference might be due to the way we group the data points. More research on how to group the data points might help obtain a similar classification result.

#### Test on vehicle data - data with multiple (more than two) class labels

In [None]:
from ucimlrepo import fetch_ucirepo 
  
# fetch dataset 
# id=149 is the UCI repository identifier passed to fetch_ucirepo.
statlog_vehicle_silhouettes = fetch_ucirepo(id=149) 
  
# data (as pandas dataframes) 
vehicle = statlog_vehicle_silhouettes.data.features 
targets = statlog_vehicle_silhouettes.data.targets  


In [None]:
# vehicle
# Rows that contain at least one missing feature value (kept for inspection).
null_data = vehicle[vehicle.isnull().any(axis=1)]
# Remove row 752 (presumably the row surfaced by null_data -- confirm).
# The drop is no longer in place, so the fetched frame is not mutated, and
# errors="ignore" lets the cell be re-run after the row is already gone
# instead of raising KeyError.
vehicle = vehicle.drop(index=752, errors="ignore")

In [None]:
# Remove the same row (index 752) from the targets so they stay aligned with
# the features. Not in place, and errors="ignore" makes the cell safe to re-run.
targets = targets.drop(index=752, errors="ignore")

In [None]:
# Display both shapes to check that targets and features have the same row count.
targets.shape, vehicle.shape

In [None]:
# List the distinct class values present in the targets.
targets['class'].unique()

In [None]:
# Integer codes for the known vehicle classes; any other value gets code 4.
class_codes = {'van': 0, 'saab': 1, 'bus': 2, 'opel': 3}

# Vectorised encoding. The previous version started from
# pd.Series(len(...)) -- a ONE-element Series holding the length -- and only
# reached the right size because each labels[i] = ... assignment enlarged it.
labels = (
    targets['class']
    .map(class_codes)          # unknown values become NaN ...
    .fillna(4)                 # ... and fall into the catch-all code 4
    .astype('int64')
    .reset_index(drop=True)    # positional 0..n-1 index, as the loop produced
    .rename(None)
)
labels

In [None]:
# Hold out 33% of the samples for testing; random_state=42 makes the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(vehicle, labels, test_size=0.33, random_state=42)

In [None]:
# Renumber every split from 0 so that positional and label-based access agree;
# drop=True discards the shuffled original index.
X_train, X_test, y_train, y_test = (
    part.reset_index(drop=True) for part in (X_train, X_test, y_train, y_test)
)

In [None]:
# Column names of the vehicle training predictors, used to build the matrices below.
predictors_vehicles = X_train.columns
# Display the column index.
predictors_vehicles

In [None]:
# Generate the training predictor matrix X with shape (n_features, n_samples):
# one row per predictor column, one column per training sample.
# A single vectorised conversion replaces the element-by-element Python loop;
# the values, float dtype and C-contiguous layout match the loop-filled
# np.zeros((n_features, n_samples)) array.
X_train_mat = np.ascontiguousarray(X_train[predictors_vehicles].to_numpy(dtype=float).T)
train_cols, train_rows = X_train_mat.shape

In [None]:
# Generate the TEST sample matrix with shape (n_features, n_test_samples):
# one row per predictor column, one column per test sample.
# A single vectorised conversion replaces the element-by-element Python loop;
# the values, float dtype and C-contiguous layout match the loop-filled
# np.zeros((n_features, n_test_samples)) array.
X_test_mat = np.ascontiguousarray(X_test[predictors_vehicles].to_numpy(dtype=float).T)
test_cols, test_rows = X_test_mat.shape

In [None]:
# Swap the axes of X_train_mat so that rows are training samples and columns are predictors.
X_train_transposed = np.transpose(X_train_mat)

In [None]:
# Display the shape of the matrix passed to .fit.
X_train_transposed.shape

In [None]:
# Candidate hyper-parameters explored by the search.
param_grid = dict(
    lambda_=np.logspace(-4, -0.5, 30),
    group_num=list(range(2, 11)),
    k=[3, 5, 7, 10, 11],
)

# Untuned instance of the custom classifier; GridSearchCV sets the parameters.
clf = eknn.ExclusiveLassoKNNClassifier()

# 10-fold cross-validated search (the scoring parameter can be changed).
grid_search_3 = model_selection.GridSearchCV(estimator=clf, param_grid=param_grid, cv=10)
grid_search_3.fit(X_train_transposed, y_train)  # fit training data

# Best parameter combination and its cross-validated score.
best_params, best_score = grid_search_3.best_params_, grid_search_3.best_score_

In [None]:
# Unpack the tuned hyper-parameters into individual names, then display them.
lambda_, group_num, k = (
    best_params[name] for name in ('lambda_', 'group_num', 'k')
)
best_params

In [None]:
# Display the vehicle test predictors.
X_test

In [None]:
# Refit on the full training set with ALL tuned hyper-parameters; k was
# previously hard-coded to 3 even though the grid search selects it.
clf = eknn.ExclusiveLassoKNNClassifier(group_num=group_num, k=k, lambda_=lambda_)
clf.fit(X_train_transposed, y_train)
# NOTE(review): X_test_mat is (n_features, n_test_samples) while the model was
# fitted on (n_samples, n_features). Confirm against eknn.py that score
# expects the untransposed layout.
score_vehicle = clf.score(X_test_mat, y_test)
# Kept so existing references to score_lsvt keep working; note that it now
# holds the vehicle score, exactly as the original assignment did.
score_lsvt = score_vehicle
score_vehicle

#### Utilize unsupervised learning to group data points