In [119]:
import os
import pandas as pd
from sklearn import model_selection,neighbors,linear_model,metrics

In [120]:
# Finding the folder and its files inside and reading the files to Pandas dataframe
files_in_folder = os.listdir('EMG_data/01/')
df1 = pd.read_csv(f'EMG_data/01/{files_in_folder[0]}', sep='\t')
df2 = pd.read_csv(f'EMG_data/01/{files_in_folder[1]}', sep='\t')
s1_data = pd.concat([df1, df2])

In [121]:
# Creating dataframe for features by leaving only channels' columns, thereby dropping 'class' and 'time'
X = s1_data.drop(columns=['class','time'])

# Creating dataframe for labels by leaving only the 'class' column
y = s1_data['class']

# Splitting up the data for training and test set in the usual 80/20 split
X_train, X_test, y_train, y_test = model_selection.train_test_split(X,y,test_size=0.2)

# Normalization was not needed as it did not change significantly the final scores
# Mean+std and min/max values are in the same range per each channel
X.describe()

Unnamed: 0,channel1,channel2,channel3,channel4,channel5,channel6,channel7,channel8
count,121170.0,121170.0,121170.0,121170.0,121170.0,121170.0,121170.0,121170.0
mean,-8e-06,-9e-06,-1e-05,-1.1e-05,-1.4e-05,-1.2e-05,-1e-05,-9e-06
std,0.000135,0.000132,0.00012,0.000151,0.000193,0.000148,0.000121,0.000133
min,-0.00116,-0.00113,-0.00106,-0.00112,-0.00128,-0.00112,-0.00109,-0.00102
25%,-3e-05,-4e-05,-5e-05,-4e-05,-4e-05,-4e-05,-3e-05,-3e-05
50%,-1e-05,-1e-05,-1e-05,-1e-05,-1e-05,-1e-05,-1e-05,-1e-05
75%,1e-05,2e-05,3e-05,2e-05,1e-05,1e-05,1e-05,1e-05
max,0.00127,0.00127,0.00127,0.00127,0.00127,0.00127,0.00127,0.00127


In [122]:
# CONCLUSION:
# Logistic regression produced a low accuracy of ~64% but K-nearest neighbors ~98%.
# LR is linear but KNN supports non-linear solutions but is slower.
# In this case of 8 channels the classification is definitely non-linear, so KNN is better option.

# Logistic regression for multi-class classification
lm = linear_model.LogisticRegression(multi_class='multinomial',solver='newton-cg',max_iter=1000)
lm = lm.fit(X_train, y_train)

# K-nearest neighbors with n-neighbors chosen to give the best accuracy and precision for this exercise
knn = neighbors.KNeighborsClassifier(n_neighbors=1)
knn = knn.fit(X_train, y_train)

# Comparison of accuracies for the two methods
print("Logistic regression accuracy: %f" % lm.score(X_test, y_test))
print("KNN accuracy: %f" % knn.score(X_test, y_test))

# Predicting the labels using KNN model and calculating precision of the classification
y_test_predict = knn.predict(X_test)
print('KNN prediction precision: %f' % round(metrics.precision_score(y_test,y_test_predict,average='weighted'),6))

Logistic regression accuracy: 0.650862
KNN accuracy: 0.981596
KNN prediction precision: 0.981624


In [123]:
# CONCLUSION:
# Both accuracy and precision are over 99% when the same trained model was used on subject 3 data
# I would say that KNN works great in this exercise and logistic regression is just not enough

# Creating features and labels dataframes for subject 3
files_in_folder = os.listdir('EMG_data/03/')
df1 = pd.read_csv(f'EMG_data/03/{files_in_folder[0]}', sep='\t')
df2 = pd.read_csv(f'EMG_data/03/{files_in_folder[1]}', sep='\t')
s3_data = pd.concat([df1, df2])
X_3 = s1_data.drop(columns=['class','time'])
y_3 = s1_data['class']

# Using KNN model created with subject 1 data to predict labels for subject 3
y_3_predict = knn.predict(X_3)

# Finding accuracy and precision of the classification
print('KNN prediction accuracy for subject 3: %f' % round(metrics.accuracy_score(y_3,y_3_predict),6))
print('KNN prediction precision for subject 3: %f' % round(metrics.precision_score(y_3,y_3_predict,average='weighted'),6))

KNN prediction accuracy for subject 3: 0.994074
KNN prediction precision for subject 3: 0.994082
