In [1]:
import pandas as pd
import numpy as np
import plotly.express as px
import plotly.figure_factory as ff
import matplotlib.pyplot as plt
import plotly.graph_objects as go

#import models
from sklearn.linear_model import LogisticRegression
from sklearn.linear_model import LassoCV

#import attribute selection methods
from sklearn.feature_selection import VarianceThreshold
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import RFE
from sklearn.feature_selection import SelectFromModel
from sklearn.feature_selection import mutual_info_classif, f_classif
from sklearn.ensemble import AdaBoostRegressor
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.svm import SVR, SVC

#import metrics
from sklearn.metrics import confusion_matrix,accuracy_score,precision_score,recall_score,roc_auc_score,f1_score,matthews_corrcoef, classification_report
from sklearn.model_selection import cross_val_score


#import imbalanced learn strategies
from sklearn.pipeline import Pipeline
from imblearn.over_sampling import RandomOverSampler
from imblearn.under_sampling import RandomUnderSampler
from imblearn.over_sampling import SMOTE
from imblearn.combine import SMOTETomek

In [30]:
#upload recorded activity files
from google.colab import files
uploaded = files.upload()

lay=pd.read_csv('Lay1.csv',sep=';',index_col='index')
stand=pd.read_csv('Standing1.csv',sep=';',index_col='index')
walk=pd.read_csv('Walking1.csv',sep=';',index_col='index')
run=pd.read_csv('Run1.csv',sep=';',index_col='index')
sit=pd.read_csv('Sit1.csv',sep=';',index_col='index')
stairs_up=pd.read_csv('StairsUp1.csv',sep=';',index_col='index')
stairs_down=pd.read_csv('StairsDown1.csv',sep=';',index_col='index')

#rename columns and concat to full dataframe
activities=[lay,stand,walk,run,sit,stairs_up,stairs_down]
names=['lay','stand','walk','run','sit','stairs_up','stairs_down']
named_act=[]

for act,name in zip(activities,names):
    act=act.rename(columns={'x':'x_%s'% name,'y':'y_%s'% name,'z':'z_%s'% name})
    named_act.append(act)

df=pd.concat([named_act[0],named_act[1],named_act[2],named_act[3],named_act[4],named_act[5],named_act[6]],axis=1).dropna(axis=0)

Saving Walking1.csv to Walking1 (4).csv
Saving Standing1.csv to Standing1 (4).csv
Saving StairsUp1.csv to StairsUp1 (4).csv
Saving StairsDown1.csv to StairsDown1 (4).csv
Saving Sit1.csv to Sit1 (4).csv
Saving Run1.csv to Run1 (4).csv
Saving Lay1.csv to Lay1 (4).csv


In [32]:
def plotActivity(activity,name):
  fig = go.Figure()
  fig.add_trace(go.Scatter(x=activity.index, y=activity.x,
                    mode='lines+markers',
                    name='x',))
  fig.add_trace(go.Scatter(x=activity.index, y=activity.y,
                    mode='lines+markers',
                    name='y'))
  fig.add_trace(go.Scatter(x=activity.index, y=activity.z,
                    mode='lines+markers',
                    name='z'))
  fig.update_layout(title=name,width=1000,height=500)
  return fig

In [33]:
for i,j in zip(activities,names):
  plotActivity(i[0:300],j).show()

In [5]:
x_data=df[df.columns[::3]]
y_data=df[df.columns[1::3]]
z_data=df[df.columns[2::3]]

In [6]:
fig = ff.create_distplot([x_data[c] for c in x_data.columns], x_data.columns, bin_size=.1)
fig.show()

In [7]:
fig = ff.create_distplot([y_data[c] for c in y_data.columns], y_data.columns, bin_size=.1)
fig.show()

In [None]:
fig = ff.create_distplot([z_data[c] for c in z_data.columns], z_data.columns, bin_size=.1)
fig.show()

In [46]:
#define stats as attributes
stats=df.describe().iloc[1:]

#create multiple dataframes using sliding window of size 30
frames = []

for i in range(0,len(activities[0]-31)):
  ax=activities[0]['x'][i:(i+30)]
  ay=activities[0]['y'][i:(i+30)]
  az=activities[0]['z'][i:(i+30)]
  xyz=pd.DataFrame(data=[ax,ay,az]).transpose()
  xyz=xyz.describe()[1:]
  frames.append(xyz)

transformed=pd.DataFrame(columns=[['mean_x', 'std_x', 'min_x', '25%_x', '50%_x', '75%_x', 'max_x', 'mean_y', 'std_y', 'min_y',
       '25%_y', '50%_y', '75%_y', 'max_y', 'mean_z', 'std_z', 'min_z', '25%_z', '50%_z', '75%_z',
       'max_z']])

for i in range(0,len(frames)):
  data=pd.DataFrame(pd.concat([frames[i].transpose().iloc[0],frames[i].transpose().iloc[1],frames[i].transpose().iloc[2]],axis=0)).transpose()
  transformed=transformed.append(pd.DataFrame(data.iloc[0].values.reshape(1,-1), columns=list(transformed)), ignore_index=True)

transformed['activity']=names[0]

#create train test data

In [51]:
def getWindows(full,name): #create multiple dataframes using sliding window of size 30
  
  #create windows of size 30 and store in frames
  frames = []
  for i in range(0,len(full-31)): 
    ax=full['x'][i:(i+30)]
    ay=full['y'][i:(i+30)]
    az=full['z'][i:(i+30)]
    xyz=pd.DataFrame(data=[ax,ay,az]).transpose()
    xyz=xyz.describe()[1:]
    frames.append(xyz)

  #calculate dstribution metrics for each window
  transformed=pd.DataFrame(columns=[['mean_x', 'std_x', 'min_x', '25%_x', '50%_x', '75%_x', 'max_x', 'mean_y', 'std_y', 'min_y',
       '25%_y', '50%_y', '75%_y', 'max_y', 'mean_z', 'std_z', 'min_z', '25%_z', '50%_z', '75%_z',
       'max_z']])
  
  for i in range(0,len(frames)):
    data=pd.DataFrame(pd.concat([frames[i].transpose().iloc[0],frames[i].transpose().iloc[1],frames[i].transpose().iloc[2]],axis=0)).transpose()
    transformed=transformed.append(pd.DataFrame(data.iloc[0].values.reshape(1,-1), columns=list(transformed)), ignore_index=True)

  #add name of activity as target variable
  transformed['Activity']=name
  return transformed

In [52]:
df=pd.DataFrame(columns=[['mean_x', 'std_x', 'min_x', '25%_x', '50%_x', '75%_x', 'max_x', 'mean_y', 'std_y', 'min_y',
       '25%_y', '50%_y', '75%_y', 'max_y', 'mean_z', 'std_z', 'min_z', '25%_z', '50%_z', '75%_z',
       'max_z','Activity']])

for i in range(0,len(activities)):
  df=df.append(getWindows(activities[i],names[i]))

df.columns=['mean_x', 'std_x', 'min_x', '25%_x', '50%_x', '75%_x', 'max_x', 'mean_y', 'std_y', 'min_y',
       '25%_y', '50%_y', '75%_y', 'max_y', 'mean_z', 'std_z', 'min_z', '25%_z', '50%_z', '75%_z',
       'max_z','Activity']
df.head()

Unnamed: 0,mean_x,std_x,min_x,25%_x,50%_x,75%_x,max_x,mean_y,std_y,min_y,25%_y,50%_y,75%_y,max_y,mean_z,std_z,min_z,25%_z,50%_z,75%_z,max_z,Activity
0,0.814293,0.013637,0.792389,0.804848,0.814254,0.82468,0.843506,-0.277403,0.013086,-0.306259,-0.286305,-0.278793,-0.266807,-0.256851,-0.503801,0.01055,-0.526703,-0.512489,-0.503639,-0.496456,-0.483292,lay
1,0.814973,0.013342,0.792389,0.805275,0.81459,0.82468,0.843506,-0.276208,0.011947,-0.300705,-0.285412,-0.277587,-0.266807,-0.256851,-0.503339,0.010593,-0.526703,-0.512489,-0.502204,-0.495873,-0.483292,lay
2,0.815312,0.013549,0.792389,0.805275,0.81459,0.826656,0.843506,-0.275565,0.012162,-0.300705,-0.285412,-0.274956,-0.264766,-0.256851,-0.502298,0.010686,-0.526703,-0.510772,-0.50103,-0.49511,-0.483292,lay
3,0.8159,0.013714,0.792389,0.805275,0.815361,0.827858,0.843506,-0.274869,0.012278,-0.300705,-0.285412,-0.272713,-0.263367,-0.256851,-0.502029,0.010628,-0.526703,-0.510772,-0.500519,-0.49511,-0.483292,lay
4,0.816537,0.013153,0.792389,0.806823,0.815361,0.827858,0.843506,-0.274062,0.012281,-0.300705,-0.284004,-0.271606,-0.262882,-0.256851,-0.501382,0.010367,-0.526703,-0.508998,-0.500122,-0.49511,-0.483292,lay


In [54]:
activity_count=pd.DataFrame(df.Activity.value_counts())
fig=px.pie(activity_count,names=activity_count.index,values='Activity',
           title='% Share of Activity Classes in Target Variable',height=400,width=700,
           color_discrete_sequence=px.colors.sequential.RdBu)

fig.show()

In [58]:
#dimensionality reduction using PCA and tSNE
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn import metrics
from sklearn.metrics import pairwise_distances,silhouette_score

df = df.dropna(axis=0)
df2 = df.drop(columns=['Activity'],axis=1).copy()

scaler=StandardScaler()
scaled=scaler.fit_transform(df2)

pca=PCA(n_components=2,random_state=3)
reduced=pca.fit_transform(scaled)
scaled_pca=scaler.fit_transform(reduced)

pca_data=pd.DataFrame(scaled_pca,df.Activity).reset_index()
pca_data
px.scatter(pca_data,x=0,y=1,color='Activity',
           height=600,width=800,title='Two dimensional Principle Component Analysis by Activity')

In [59]:
#calculate silhouette_score for with all features
labels = pca_data.Activity
X = pca_data[[0,1]]
total_ch=metrics.silhouette_score(X, labels, metric='euclidean')

#calculte silhouette_score when dropping one feature
CH_index = pd.DataFrame(columns=['feature dropped','new_silhouette_score'])
for i in pca_data.Activity.unique():
  subset=pca_data[pca_data.Activity != i]
  labels = subset.Activity
  X = subset[[0,1]]
  ch=metrics.silhouette_score(X, labels, metric='euclidean')
  CH_index=CH_index.append({'feature dropped':i,'new_silhouette_score':ch},ignore_index=True)
CH_index

#assess distance
CH_index['change'] =  round((CH_index['new_silhouette_score'] - total_ch),3)
CH_index

Unnamed: 0,feature dropped,new_silhouette_score,change
0,lay,0.37768,-0.054
1,stand,0.432675,0.001
2,walk,0.432661,0.001
3,run,0.485489,0.053
4,sit,0.40207,-0.03
5,stairs_up,0.56755,0.136
6,stairs_down,0.527379,0.095


In [60]:
from sklearn.model_selection import train_test_split

y = df.Activity
x = df.drop(['Activity'],axis=1)

# Divide data into training and validation subsets
X_train, X_valid, y_train, y_valid = train_test_split(x, y, train_size=0.8,random_state=0)

In [61]:
from sklearn.tree import DecisionTreeClassifier

#define model and hyper parameters
tree_model = DecisionTreeClassifier(random_state=0)

accuracies = []

for i in range(1,len(df.columns)):
  pipe = Pipeline(steps=[('feature_selection',SelectKBest(f_classif, k=i)),
                        ('model', tree_model)
                             ])
  pipe.fit(X_train,y_train)
  preds=pipe.predict(X_valid)
  accuracy=accuracy_score(y_valid,preds)
  accuracies.append([i,accuracy])
  
accuracies=pd.DataFrame(accuracies,columns=['n_Features','Accuracy_Score'])
px.line(accuracies,x='n_Features',y='Accuracy_Score')

In [66]:
  print(classification_report(y_valid,preds))

              precision    recall  f1-score   support

         lay       1.00      1.00      1.00       101
         run       1.00      0.98      0.99        58
         sit       1.00      1.00      1.00        86
 stairs_down       0.99      0.97      0.98        72
   stairs_up       0.96      0.96      0.96        69
       stand       0.98      1.00      0.99        80
        walk       1.00      1.00      1.00        88

    accuracy                           0.99       554
   macro avg       0.99      0.99      0.99       554
weighted avg       0.99      0.99      0.99       554



In [68]:
#evaluate - Selected K-Best 
selector=SelectKBest(f_classif, k=16)
selector.fit_transform(X_train,y_train),
anova=pd.DataFrame(selector.scores_,index=X_train.columns).sort_values(by=0,ascending=False).iloc[:16]
anova=anova.reset_index()
anova=anova.rename(columns={'index':'Feature',0:'F-Score'})
fig = px.bar(anova, x='Feature', y='F-Score',
             title='Feature Selection based on ANOVA F-Score (SelectKBest)',
             height=400,width=800)
fig.update_traces(marker_color='lightsalmon')
fig.show() 

In [64]:
px.box(df,y='75%_y',x='Activity')