> # Time windowing experiment

In [0]:
## Initial setup for Google Drive and Colab
import glob
import importlib
from google.colab import drive
# Mount Google Drive at /content/gdrive; force_remount=True is passed so the
# mount is redone on every run of this cell.
drive.mount("/content/gdrive", force_remount=True)
import os
# Drive folder that holds the dataset. The rest of the notebook uses it as the
# base path for both the input CSVs and the generated outputs.
pathtodata = '/content/gdrive/My Drive/Activity Recognition from Single Chest-Mounted Accelerometer/'
# Make the dataset folder the current working directory.
os.chdir(pathtodata)


In [0]:
import pandas as pd
import numpy as np
import os, sys

# Number of observations per second, based on the dataset documentation.
sample_rate = 52  

# Step, in samples, between the starts of consecutive windows when the
# overlapping technique is used: 20% of one second of data, truncated to an int.
sliding_size = int(.2 * sample_rate)  

def mean_crossing_rate(col):
    """Count how many times a signal crosses its own mean.

    Args:
        col: 1-D sequence of numeric samples (NumPy array or array-like).

    Returns:
        The number of adjacent sample pairs whose mean-centred values have
        strictly opposite signs. Despite the name this is a raw count; it is
        not divided by the number of samples.
    """
    col = np.asarray(col, dtype=float)
    if col.size < 2:
        # Fewer than two samples cannot form a pair, so nothing can cross.
        return 0

    # Centre on the mean so samples above/below it become positive/negative.
    normalized = col - col.mean()
    # A crossing is a sign change between consecutive *centred* samples, so
    # both factors of the product must be the centred values.
    return ((normalized[:-1] * normalized[1:]) < 0).sum()
#################################################################


def FS1(window):
    """Feature set 1: the mean of every signal column of one window.

    Args:
        window: DataFrame holding one window. Every column except the last is
            treated as a numeric signal; the last column is the label.

    Returns:
        list: the mean of each signal column, followed by the window label
        (the most frequent label value in the window).
    """
    # Aggregate the signal columns only, so a non-numeric label column can
    # neither break the aggregation nor shift the feature positions.
    signals = window.iloc[:, :-1]
    features = signals.mean().tolist()

    # Select the most frequent label as the label of the window.
    features.append(window.iloc[:, -1].mode()[0])
    return features
#################################################################


def FS2(window):
    """Feature set 2: mean and standard deviation of every signal column.

    Args:
        window: DataFrame holding one window. Every column except the last is
            treated as a numeric signal; the last column is the label.

    Returns:
        list: all column means, then all column standard deviations (pandas
        default, i.e. sample standard deviation), followed by the window
        label (the most frequent label value in the window).
    """
    # Aggregate the signal columns only, so a non-numeric label column can
    # neither break the aggregation nor shift the feature positions.
    signals = window.iloc[:, :-1]
    features = np.hstack([signals.mean(), signals.std()]).tolist()

    # Select the most frequent label as the label of the window.
    features.append(window.iloc[:, -1].mode()[0])
    return features
#################################################################


def FS3(window):
    """Feature set 3: mean, std, min, max and mean-crossing count per signal.

    Args:
        window: DataFrame holding one window. Every column except the last is
            treated as a numeric signal; the last column is the label.

    Returns:
        list: for the signal columns, all means, then all standard
        deviations, all minima, all maxima and all mean-crossing counts (as
        returned by ``mean_crossing_rate``), followed by the window label
        (the most frequent label value in the window).
    """
    # Aggregate the signal columns only, so a non-numeric label column can
    # neither break the aggregation nor shift the feature positions.
    signals = window.iloc[:, :-1]
    crossings = [mean_crossing_rate(series.values) for _, series in signals.items()]

    features = np.hstack([
        signals.mean(),
        signals.std(),
        signals.min(),
        signals.max(),
        crossings,
    ]).tolist()

    # Select the most frequent label as the label of the window.
    features.append(window.iloc[:, -1].mode()[0])
    return features
#################################################################


def windowing_dataset(dataset, win_size, feature_extraction_function, subject_id, overlap=False):
    """Cut one recording into fixed-size windows and extract features from each.

    Args:
        dataset: DataFrame with one row per sample.
        win_size: number of rows in every window.
        feature_extraction_function: callable that receives one window (a
            DataFrame with a fresh 0-based index) and returns a list of values.
        subject_id: value stored in the ``group`` column of every output row.
        overlap: when truthy, consecutive windows start ``sliding_size`` rows
            apart; otherwise they start ``win_size`` rows apart.

    Returns:
        DataFrame with one row per complete window: a leading ``group``
        column followed by the values returned by the feature function.
    """
    stride = sliding_size if overlap else win_size
    n_rows = dataset.shape[0]

    rows = []
    for first in range(0, n_rows, stride):
        last = first + win_size
        if last > n_rows:
            # Skip incomplete trailing windows so all windows are equal in size.
            continue
        chunk = dataset.iloc[first:last, :].reset_index(drop=True)
        rows.append(feature_extraction_function(chunk))

    result = pd.DataFrame(rows)
    result.insert(0, 'group', subject_id)  # used later for subject-wise CV
    return result
#################################################################


def Preprocessing(dataset_path, output_path, overlapping):
    """Window every subject's recording and save one CSV per feature set and window size.

    For each of 12 evenly spaced window lengths from 0.25 s to 3 s and each
    feature set (FS1, FS2, FS3), the files ``<dataset_path>1.csv`` ...
    ``<dataset_path>15.csv`` are windowed with ``windowing_dataset``. The
    per-subject results are stacked and written, tab-separated, to
    ``<output_path>/<folder>/<FSn>/dataset<win_size>.csv`` where ``<folder>``
    is ``Overlapping_windowed`` or ``Non-overlapping_windowed``.

    Args:
        dataset_path: prefix that is concatenated with ``"<k>.csv"``; it must
            therefore already end with a path separator.
        output_path: directory under which the output folders are created.
        overlapping: truthy for overlapping windows, falsy for non-overlapping.
    """
    features_functions = [FS1, FS2, FS3]
    win_sizes = np.linspace(.25, 3, 12, endpoint=True)
    out_folder_name = 'Overlapping_windowed' if overlapping else 'Non-overlapping_windowed'

    # Read every subject file once; the same frames are reused for all
    # window sizes and feature sets instead of being re-read each time.
    subject_frames = {}
    for subject in range(1, 16):
        file_path = dataset_path + "{}.csv".format(subject)
        # NOTE(review): header=1 makes pandas use the file's second line as
        # the header row, so the first two lines never become data rows --
        # confirm this is intended for these files.
        tmp_db = pd.read_csv(file_path, header=1)
        tmp_db.columns = list(range(tmp_db.shape[1]))  # re-index the columns
        subject_frames[subject] = tmp_db

    for win_size in win_sizes:
        print("Start for win size {}".format(win_size))
        datapoints_per_window = int(win_size * sample_rate)

        for feature_function in features_functions:
            print(feature_function.__name__)

            windowed_dataset = [
                windowing_dataset(frame, datapoints_per_window, feature_function, subject,
                                  overlap=overlapping)
                for subject, frame in subject_frames.items()
            ]

            print("Merging!")
            # DataFrame.append no longer exists in pandas 2.x; concat is the
            # supported way to stack the per-subject frames.
            final_dataset = pd.concat(windowed_dataset, ignore_index=True)

            # The feature function's name (FS1/FS2/FS3) is also its folder name.
            out_dir = '{}/{}/{}'.format(output_path, out_folder_name, feature_function.__name__)
            os.makedirs(out_dir, exist_ok=True)
            final_dataset.to_csv('{}/dataset{}.csv'.format(out_dir, win_size), sep='\t',
                                 index=False)
#################################################################


In [0]:
# Build the non-overlapping datasets (overlapping=0 is falsy); the subject
# CSVs are read from, and the windowed CSVs written under, pathtodata.
Preprocessing(dataset_path=pathtodata, output_path=pathtodata, overlapping=0)

In [0]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
from matplotlib.patches import Patch
from sklearn.model_selection import (LeaveOneGroupOut, ShuffleSplit)
import warnings
# Install a global filter that ignores all warnings.
warnings.filterwarnings("ignore")
# Seed NumPy's global random generator -- presumably so the randomised CV
# splits plotted below are repeatable; confirm.
np.random.seed(1)

def plot_group_class(classes, groups):
    """Plot the subject and the class of every sample as two colour bands.

    The figure is saved as ``Class_subject.png`` in the ``Figures`` folder
    under the module-level ``pathtodata``; the folder is created if missing.

    Args:
        classes: per-sample class values, used as colours of the upper band.
        groups: per-sample subject values, used as colours of the lower band.
            Its length sets the number of points drawn in both bands.
    """
    n_samples = len(groups)

    fig, ax = plt.subplots()
    ax.scatter(range(n_samples), [.5] * n_samples, c=groups, marker='_',
               lw=50, cmap=plt.cm.tab20)
    ax.scatter(range(n_samples), [3.5] * n_samples, c=classes, marker='_',
               lw=50, cmap=plt.cm.tab20b)
    ax.set(ylim=[-1, 5], yticks=[.5, 3.5],
           yticklabels=['Subject', 'Class'], xlabel="Sample index")

    ax.legend([Patch(color='navy')],
              ['Dominant class'], loc=(1.003, .94))
    plt.tight_layout()
    fig.subplots_adjust(right=.75)

    output_folder = os.path.join(pathtodata, 'Figures')
    # savefig does not create missing directories, so make sure it exists.
    os.makedirs(output_folder, exist_ok=True)
    plt.savefig('{}/Class_subject.png'.format(output_folder))


def plot_cv_indices(cv, X, y, group, ax, n_splits, lw=10):
    """Draw the train/test membership of every CV split on ``ax``.

    One row is drawn per split produced by ``cv.split``, followed by one row
    coloured by ``y`` and one row coloured by ``group``.

    Args:
        cv: cross-validator object exposing ``split(X=, y=, groups=)``.
        X: samples; only its length is used for plotting.
        y: per-sample class values.
        group: per-sample subject values.
        ax: matplotlib axes to draw on.
        n_splits: number of split rows to label on the y axis; replaced by
            ``cv.get_n_splits(groups=group)`` when ``cv`` is a
            ``LeaveOneGroupOut`` instance.
        lw: line width of the markers.

    Returns:
        The same ``ax`` that was passed in.
    """
    n_samples = len(X)
    sample_axis = range(n_samples)

    for fold, (train_idx, test_idx) in enumerate(cv.split(X=X, y=y, groups=group)):
        # 1 marks test samples, 0 training samples; anything else stays NaN.
        membership = np.full(n_samples, np.nan)
        membership[test_idx] = 1
        membership[train_idx] = 0
        ax.scatter(sample_axis, [fold + .5] * n_samples,
                   c=membership, marker='_', lw=lw, cmap=plt.cm.coolwarm)

    # The class and subject rows go directly below the last split row.
    class_row = fold + 1.5
    subject_row = fold + 2.5
    ax.scatter(sample_axis, [class_row] * n_samples,
               c=y, marker='_', lw=lw, cmap=plt.cm.tab20)
    ax.scatter(sample_axis, [subject_row] * n_samples,
               c=group, marker='_', lw=lw, cmap=plt.cm.tab20)

    if isinstance(cv, LeaveOneGroupOut):
        n_splits = cv.get_n_splits(groups=group)

    row_labels = list(range(n_splits)) + ['class', 'subject']
    ax.set(yticks=np.arange(n_splits + 2) + .5, yticklabels=row_labels,
           xlabel='Sample index', ylabel="CV iteration",
           ylim=[n_splits + 2.2, -.2], xlim=[0, n_samples])

    return ax
#################################################################



def plot_cv(dataset, CVs, n_splits):
    """Plot and save the split layout of several cross-validation schemes.

    Args:
        dataset: path of a tab-separated CSV with a ``group`` column; the
            first column is skipped, the last column is used as the class
            and the columns in between as the features.
        CVs: non-empty sequence of cross-validator classes (not instances).
            ``LeaveOneGroupOut`` is instantiated without arguments, every
            other class with ``n_splits=n_splits``.
        n_splits: number of splits for the cross-validators that accept it.

    Returns:
        True once one ``<CV class name>.png`` per cross-validator has been
        saved in the ``Non-overlapping_windowed`` folder under the
        module-level ``pathtodata``.

    Raises:
        ValueError: if ``CVs`` is empty or None.
    """
    if not CVs:
        raise ValueError('There is no CV to plot.')

    dataset = pd.read_csv(dataset, sep='\t')

    groups = dataset['group'].values.ravel()
    X = dataset.iloc[:, 1:-1].values
    Y = dataset.iloc[:, -1].values

    output_folder = os.path.join(pathtodata, 'Non-overlapping_windowed')

    for cv in CVs:
        # LeaveOneGroupOut takes no n_splits; its split count comes from the groups.
        cur_cv = cv() if cv is LeaveOneGroupOut else cv(n_splits=n_splits)

        fig_name = type(cur_cv).__name__
        fig, ax = plt.subplots(figsize=(8, 5))
        plot_cv_indices(cv=cur_cv, X=X, y=Y, group=groups, ax=ax, n_splits=n_splits)

        ax.legend([Patch(color='r'), Patch(color='b')],
                  ['Testing set', 'Training set'], loc=(1.02, .8))

        plt.tight_layout()
        fig.subplots_adjust(right=.7)
        plt.savefig('{}/{}.png'.format(output_folder, fig_name))

    return True

def test_plot_cv():
  # Only checks that the name plot_cv is bound to a truthy object;
  # plot_cv itself is not called here.
  assert plot_cv
#################################################################


In [0]:
## Code starts here
# Plot the cross-validation splits (LeaveOneGroupOut and ShuffleSplit, 10 splits
# requested) for the non-overlapping FS1 dataset with 0.25 s windows, read from
# Non-overlapping_windowed/FS1/dataset0.25.csv under pathtodata.

cvs = [LeaveOneGroupOut,ShuffleSplit]
import os

plot_cv(dataset=pathtodata+'Non-overlapping_windowed/FS1/dataset0.25.csv',CVs=cvs,n_splits=10)

In [0]:
#################################################################
def plot_csv(csv_file, ax):
    """Plot every classifier's score curve from one results CSV and mark its peak.

    Args:
        csv_file: path of a comma-separated file with a ``window-size``
            column; each remaining column holds one classifier's scores and
            its name (surrounding whitespace ignored) must be one of
            ``NB``, ``KNN``, ``DT`` or ``NCC``.
        ax: axes object whose ``plot`` method is used for drawing.

    Returns:
        The same ``ax`` that was passed in.

    Raises:
        KeyError: if ``window-size`` is missing or a score column has an
            unknown classifier name.
    """
    colors = {'NB': 'green', 'KNN': 'orange', 'DT': 'blue', 'NCC': 'r'}
    results = pd.read_csv(csv_file)

    # Remove the x-axis column so only the score columns remain.
    windows = results.pop('window-size')

    for col in results:
        scores = results[col]
        peak = scores.idxmax()  # index label of the best score
        ax.plot(windows, scores, label=col, c=colors[col.strip()])
        # Every peak shares the label 'peak' so the legend can show it once.
        ax.plot(windows.loc[peak], scores.loc[peak], 'r*', label='peak')
    return ax

#################################################################
def plot_results(path, output_folder=None):
    """Plot every results CSV found in the sub-folders of ``path``.

    Each ``*.csv`` file directly inside a sub-folder of ``path`` is drawn with
    ``plot_csv`` and saved as ``<file name>-non-overlapping.png`` when the
    sub-folder name contains "non" (case-insensitive), otherwise as
    ``<file name>-overlapping.png``.

    Args:
        path: directory whose immediate sub-folders hold the result CSVs.
        output_folder: directory that receives the PNG files. Defaults to
            ``<path>/Figures``; it is created when the first figure is saved.

    Returns:
        True after all folders have been processed.
    """
    if output_folder is None:
        output_folder = os.path.join(path, 'Figures')

    folders = [name for name in os.listdir(path)
               if os.path.isdir(os.path.join(path, name))]

    for folder in folders:
        files = glob.glob('{0}/*.csv'.format(os.path.join(path, folder)))

        # Compare case-insensitively: names such as "Non-overlapping_windowed"
        # start with a capital N.
        suffix = 'non-overlapping' if 'non' in folder.lower() else 'overlapping'

        for file in files:
            files_name = os.path.splitext(os.path.basename(file))[0]
            figure_title = '{}-{}'.format(files_name, suffix)

            fig, ax = plt.subplots()
            plot_csv(file, ax)
            fig.subplots_adjust(right=.84)

            # Keep the first handle of every label (all peaks share the label
            # 'peak') and sort by label so the legend order is always the same.
            handles, labels = ax.get_legend_handles_labels()
            unique = {}
            for handle, label in zip(handles, labels):
                unique.setdefault(label, handle)

            if unique:
                label_list, handle_list = zip(*sorted(unique.items(), key=lambda t: t[0]))
                ax.legend(handle_list, label_list, loc=(1.004, .72))

            ax.set_xlabel('Windows Size (s)')
            ax.set_ylabel('f1_score')
            ax.set_ylim([0.2, 1])

            # savefig does not create missing directories.
            os.makedirs(output_folder, exist_ok=True)
            fig.savefig('{}/{}.png'.format(output_folder, figure_title))

    return True




In [0]:
# NOTE(review): input_path is not assigned anywhere in the visible notebook;
# it must be defined before this cell is run.
plot_results(path=input_path)
def test_plot_results():
    # plot_results returns True when it finishes, so this passes whenever
    # the call completes without raising.
    assert plot_results(input_path)