In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import os
from scipy import stats
import glob
import matplotlib.pyplot as plt
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import LabelEncoder,OneHotEncoder
from matplotlib.pyplot import figure
%matplotlib inline

In [2]:
# Load the training index; later cells read its 'Subject', 'Datafile' and 'Label' columns
train_csv_path = 'files/train.csv'
df_train_csv = pd.read_csv(train_csv_path)

In [3]:
# Distinct subject names from the training index, ordered by how many rows each
# subject has (value_counts sorts by count, largest first)
df_train_csv_subjects = df_train_csv['Subject'].value_counts().index.tolist()

In [4]:
list_of_dfs = [] # To store the dataframes

def parse_subject_files(subject, all_files, train_df=None, collected=None):
    '''
    Load one subject's recordings and tag each with its label from the train file.

    Input: Subject in the train file and all the instances(files) of that subject

    1. Build a 'Datafile' -> 'Label' lookup from the train file (the first row
       wins if a Datafile value is repeated).
    2. Loop through all the files(instances) of the subject. A file is used only
       when '<subject>/<file name>' is listed in the train file; any other file
       is skipped without being read.
    3. Read each listed file without a header row, add its label as a 'Label'
       column and append the dataframe to the output list.

    Parameters
    ----------
    subject : str
        Subject folder name as it appears before the '/' in 'Datafile'.
    all_files : iterable of str
        Paths of the subject's csv files (any platform's path separator).
    train_df : pandas.DataFrame, optional
        Frame with 'Datafile' and 'Label' columns. Defaults to the
        module-level df_train_csv.
    collected : list, optional
        List that receives the labelled dataframes. Defaults to the
        module-level list_of_dfs.

    Returns
    -------
    None
        Results are appended to `collected`.
    '''
    if train_df is None:
        train_df = df_train_csv
    if collected is None:
        collected = list_of_dfs

    print(subject,' : started processing.....')

    # Build the lookup once per call instead of re-scanning the train file for every csv.
    label_by_datafile = {}
    for datafile, label in zip(train_df['Datafile'], train_df['Label']):
        label_by_datafile.setdefault(datafile, label)

    for filename in all_files:
        # os.path.basename copes with the running platform's separator, so the
        # file name is found whether glob returned '/' or '\\' separated paths.
        datafile_key = '/'.join([subject, os.path.basename(filename)])
        if datafile_key not in label_by_datafile:
            # Not part of the training set for this subject: nothing to label.
            continue
        df = pd.read_csv(filename, header=None)
        df['Label'] = label_by_datafile[datafile_key]
        collected.append(df)

    print(subject,' : completed processing')

In [5]:
# Loop through each subject in the train file and call the parse_subject_files definition.
# Subject folders are looked up under the same relative 'files' directory that
# train.csv is read from, rather than under a user-specific absolute path.
data_dir = 'files'
for subject in df_train_csv_subjects:
    # Forward slashes are accepted by glob on every platform; sorting makes the
    # load order (and therefore the row order of the combined frame) deterministic.
    all_files = sorted(glob.glob(data_dir + '/' + subject + '/*.csv'))
    parse_subject_files(subject, all_files)

Subject06  : started processing.....
Subject06  : completed processing
Subject13  : started processing.....
Subject13  : completed processing
Subject19  : started processing.....
Subject19  : completed processing
Subject02  : started processing.....
Subject02  : completed processing
Subject12  : started processing.....
Subject12  : completed processing
Subject03  : started processing.....
Subject03  : completed processing
Subject17  : started processing.....
Subject17  : completed processing
Subject07  : started processing.....
Subject07  : completed processing
Subject09  : started processing.....
Subject09  : completed processing
Subject05  : started processing.....
Subject05  : completed processing
Subject04  : started processing.....
Subject04  : completed processing
Subject18  : started processing.....
Subject18  : completed processing
Subject11  : started processing.....
Subject11  : completed processing
Subject08  : started processing.....
Subject08  : completed processing
Subjec

In [None]:
# Quick inspection: shows the container type that holds the per-file dataframes
type(list_of_dfs)

In [6]:
# Concat all the files in a single frame.
# Fail with a readable message when the loading step matched no files.
if not list_of_dfs:
    raise ValueError('No subject files were loaded; check the data folder and train.csv')
# ignore_index=True gives the combined frame one unique 0..n-1 row index; without
# it every file keeps its own 0..k index and row labels repeat across files.
df_train_main = pd.concat(list_of_dfs, axis=0, ignore_index=True)

In [7]:
# Rename the columns
# A 'ground truth' column named as Label has been added as our y-variable.
# The list must contain exactly one name per column (19 signal columns + Label),
# otherwise the assignment raises.
df_train_main.columns = ['EMG1',
                'EMG2',
                'EMG3',
                'EMG4',
                'Airborne',
                'ACC upper X',
                'ACC upper Y',
                'ACC upper Z',
                'Goniometer X',
                'ACC lower X',
                'ACC lower Y',
                'ACC lower Z',  # was misspelled 'ACC loewr Z'
                'Goniometer Y',
                'Gyro upper X',
                'Gyro upper Y',
                'Gyro upper Z',
                'Gyro lower X',
                'Gyro lower Y',
                'Gyro lower Z',
                'Label']

In [None]:
# (rows, columns) of the combined training frame
df_train_main.shape

In [8]:
# Replace the values in 'Label' with integer class ids. The fitted encoder keeps
# the original values in label_encoder.classes_ for mapping ids back later.
label_encoder = LabelEncoder()
label_encoder.fit(df_train_main['Label'])
df_train_main['Label'] = label_encoder.transform(df_train_main['Label'])

In [9]:
# Outlier filter: keep a row only when every feature column lies within one
# standard deviation of that column's mean.
# 'Label' is left out of the z-score: it is an encoded class id, and scoring it
# would discard rows because of which class they belong to, not because of
# their measurements.
feature_columns = df_train_main.columns.drop('Label')
z_scores = stats.zscore(df_train_main[feature_columns])
abs_z_scores = np.abs(z_scores)
# np.asarray yields a plain positional boolean mask regardless of the type zscore returns.
filtered_entries = np.asarray((abs_z_scores < 1).all(axis=1))
df_train_main = df_train_main[filtered_entries]

In [10]:
# Features: every column except the last one.
x = df_train_main.iloc[:,:-1].values
# Target: the last column ('Label'). Using -1 instead of a hard-coded 19 keeps the
# target tied to the one column that the feature slice above leaves out.
y = df_train_main.iloc[:,-1].values

# OneHotEncoder expects a 2-D input: one row per sample, one column
y = y.reshape(-1,1)

# Expand the class ids into one indicator column per class (dense array)
onehotencoder = OneHotEncoder(categories='auto')
y = onehotencoder.fit_transform(y).toarray()

In [None]:
# (samples, number of one-hot class columns) of the encoded target
y.shape

In [11]:
# Splitting the data into train-test set
from sklearn.model_selection import train_test_split
# A fixed random_state makes the 80/20 split, and every score computed from it,
# identical on each Restart & Run All.
x_train,x_test,y_train,y_test = train_test_split(x,y,test_size=0.20,random_state=0)

# Feature Scaling: the scaler is fitted on the training part only and the same
# transform is then applied to the test part.
from sklearn.preprocessing import StandardScaler
sc = StandardScaler()
x_train = sc.fit_transform(x_train)
x_test = sc.transform(x_test)

In [12]:
# Principal Component Analysis
# Reduces the scaled features to 8 components. The projection is fitted on the
# training part only and then applied unchanged to the test part.
# Note: x_train / x_test are overwritten here, so re-running this cell on its own
# applies PCA again to the already reduced data.
from sklearn.decomposition import PCA
pca = PCA(n_components = 8) #found this to be the optimal number of columns 
x_train = pca.fit_transform(x_train)
x_test = pca.transform(x_test)

In [None]:
# For the original dataframe
# NOTE(review): the heatmap call below is commented out, so this cell currently
# only opens an empty 20x18 inch figure.
figure(num=None, figsize=(20, 18), dpi=80, facecolor='w', edgecolor='k')
#sns.heatmap(df_train_main.corr(), annot=True, fmt=".2%")

In [None]:
# after PCA we found 8 columns which can be selected 
# NOTE(review): the heatmap call below is commented out, so this cell currently
# only opens an empty 8x5 inch figure.
figure(num=None, figsize=(8, 5), dpi=80, facecolor='w', edgecolor='k')
#sns.heatmap(pd.DataFrame(x_train).corr(), annot=True)

In [None]:
# First rows of the training features as they stand at this point (a NumPy array wrapped for display)
pd.DataFrame(x_train).head()

In [None]:
# Histograms of the columns, drawn separately for every value of 'Label'.
# groupby(...).hist opens its own figure per group, so creating a figure
# beforehand would only add empty figures to the output.
# The trailing ';' hides the returned axes objects from the cell output.
df_train_main.groupby('Label').hist(figsize = (8,5));

In [None]:
# Imports for the k-NN experiment (metrics and KNeighborsClassifier are reused by later cells)
import warnings
from pylab import rcParams

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from matplotlib.colors import ListedColormap

from sklearn import metrics, neighbors
from sklearn.metrics import accuracy_score, classification_report
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier

# Silence all warnings from this point on
warnings.filterwarnings("ignore")

# Fit a 3-nearest-neighbour classifier on the training split
knn = KNeighborsClassifier(n_neighbors=3)
knn.fit(x_train, y_train)

# Predict the response for the held-out split
y_pred = knn.predict(x_test)


In [None]:
# Score the k-NN predictions (y_pred) against the held-out targets
knn_accuracy = metrics.accuracy_score(y_test, y_pred)
print("Accuracy:", knn_accuracy)

In [None]:
from sklearn import tree
# Fit a seeded decision tree with default settings, then draw the fitted tree
dt = tree.DecisionTreeClassifier(random_state=0).fit(x_train, y_train)
tree.plot_tree(dt)

In [None]:
from sklearn.tree import DecisionTreeRegressor
# Fit a regression tree on the same targets as the classifiers and compare its
# predictions with the held-out targets. Seeded like the classifier tree.
# Note: this rebinds `dt`, which previously held the decision-tree classifier.
dt = DecisionTreeRegressor(random_state=0).fit(x_train, y_train)
predicted = dt.predict(x_test)
expected = y_test

plt.scatter(expected, predicted)

# Reference diagonal (prediction == target) spanning the range of the plotted
# values. A fixed 0..50 line would stretch the axes far beyond the data.
lo = min(expected.min(), predicted.min())
hi = max(expected.max(), predicted.max())
plt.plot([lo, hi], [lo, hi], '--k');

In [None]:
# NOTE(review): y_pred is still the k-NN prediction from the earlier cell (the
# tree cells store their output in `predicted`), so this repeats the k-NN accuracy.
print("Accuracy:",metrics.accuracy_score(y_test, y_pred))

In [None]:
from sklearn.model_selection import cross_val_score
# 5-fold cross-validation of a default-parameter k-NN classifier on the training split
cross_val = KNeighborsClassifier()
cross_val_score(estimator=cross_val, X=x_train, y=y_train, cv=5)

In [None]:
from sklearn.neural_network import MLPClassifier
# Three hidden layers of 10 units each, up to 1000 training iterations.
# random_state fixes the weight initialisation and batch sampling so the fitted
# network, and the report computed from it, is the same on every run.
mlp = MLPClassifier(hidden_layer_sizes=(10, 10, 10), max_iter=1000, random_state=0)
mlp.fit(x_train, y_train)

In [None]:
# MLP predictions for the held-out split
predictions = mlp.predict(x_test)

In [None]:
# Display the predicted array
predictions

In [None]:
from sklearn.metrics import classification_report, confusion_matrix
# Plain-text precision / recall / F1 summary of the MLP predictions
mlp_report_text = classification_report(y_test, predictions)
print(mlp_report_text)

In [None]:
# Same report as a nested dict, turned into a frame for plotting.
# iloc[:-1] drops the last row of that frame (presumably the 'support' counts —
# confirm against the report layout); .T puts one report entry on each heatmap row.
clf_report = classification_report(y_test,predictions,output_dict=True)
report_frame = pd.DataFrame(clf_report)
sns.heatmap(report_frame.iloc[:-1, :].T, annot=True)