# TEAM V - Final Project for Data Visualization (IS590DV Fall 2019)
by Dhwani Parekh, Wenyi Shang, Akshat Sharma, Anirudh Sharma, Tre Tomaszewski

## [WISDM Smartphone and Smartwatch Activity and Biometrics Dataset Data Set](https://archive.ics.uci.edu/ml/datasets/WISDM+Smartphone+and+Smartwatch+Activity+and+Biometrics+Dataset+)

From
> Smartphone and Smartwatch-Based Biometrics Using Activities of Daily Living. IEEE Access, 7:133190-133202, Sept. 2019.

<style>table {margin:0;} </style>

| Data File Group | Total Size | Total Files | Instances |
|-:|-:|-:|-:|
|`Phone/Accel`| 250MB | 51  | 4,804,404 |
|`Phone/Gyro` | 205MB | 51  | 3,608,635 |
|`Watch/Accel`| 196MB | 51  | 3,777,048 |
|`Watch/Gyro` | 190MB | 51  | 3,440,344 |
|All          | 1.1GB | 204 | 15,630,426|

In [219]:
%matplotlib ipympl
from pathlib import Path
from IPython.display import display
import ipywidgets
import ipywidgets as widgets
from ipywidgets import interact, interactive
import traitlets

import numpy as np
import pandas as pd

import matplotlib
import matplotlib.pyplot as plt
from mpl_toolkits.mplot3d import Axes3D
import bqplot

# Thanks to `ImportanceOfBeingErnest` from https://stackoverflow.com/questions/47404653/pandas-0-21-0-timestamp-compatibility-issue-with-matplotlib
pd.plotting.register_matplotlib_converters()

In [3]:
# Subject ids are the leading `<id>` part of every `<id>_phone.csv` file name.
# glob() yields files in filesystem order, so sort for a stable dropdown order.
subject_ids = sorted(p.stem.split('_')[0] for p in Path('../data/processed/wisdm/merged_sensors/phone/').glob('*.csv'))

# Each line of the key file has the form `<left> = <right>`; read and parse it once.
_activity_pairs = [
    line.strip().split(' = ')
    for line in Path('../references/wisdm_activity_key.txt').read_text().strip().split('\n')
]
# left -> right (used as the dropdown options: label -> activity code)
activity_name_key = {left: right for left, right in _activity_pairs}
# right -> left (used to turn an activity code back into its label)
activity_letter_key = {right: left for left, right in _activity_pairs}

In [77]:
def get_by_subject_id(subject_id='1600'):
    """Load the merged phone and watch CSVs of one subject.

    The `timestamp` column is dropped after loading (it is not needed and
    parsing it as dates adds a lot of loading time), and an `activity_name`
    column derived from `activity_code` via `activity_letter_key` is inserted
    at column position 2.

    Parameters
    ----------
    subject_id : str or int
        Subject identifier; it is converted to `str` before building the paths.

    Returns
    -------
    dict
        `{'phone': DataFrame, 'watch': DataFrame}`.
    """
    sid = str(subject_id)  # force into a string
    csv_templates = {
        'phone': '../data/processed/wisdm/merged_sensors/phone/{}_phone.csv',
        'watch': '../data/processed/wisdm/merged_sensors/watch/{}_watch.csv',
    }

    # Read both files first, then annotate them, so a missing file fails
    # before any frame is modified.
    device_dict = {}
    for device_name, template in csv_templates.items():
        raw = pd.read_csv(template.format(sid))
        device_dict[device_name] = raw.drop(columns=['timestamp'])

    for frame in device_dict.values():
        names = frame['activity_code'].replace(activity_letter_key)
        frame.insert(2, 'activity_name', names)

    return device_dict

# Stack every per-subject CSV of each device into one frame with a fresh 0..n-1 index.
phone_csv_dir = Path('../data/processed/wisdm/merged_sensors/phone/')
watch_csv_dir = Path('../data/processed/wisdm/merged_sensors/watch/')

phone_frames = [pd.read_csv(csv_path) for csv_path in phone_csv_dir.glob('*.csv')]
all_phone_df = pd.concat(phone_frames, ignore_index=True)

watch_frames = [pd.read_csv(csv_path) for csv_path in watch_csv_dir.glob('*.csv')]
all_watch_df = pd.concat(watch_frames, ignore_index=True)

## Part I: General Data Analytics

In [220]:
plt.ioff()

# One bar per device: total number of rows in the concatenated frames.
device_labels = ['phone', 'watch']
record_totals = [len(all_phone_df), len(all_watch_df)]

fig, ax = plt.subplots()
canvas_layout = fig.canvas.layout
canvas_layout.width = '800px'
canvas_layout.height = '600px'

ax.bar(device_labels, record_totals, align='center', alpha=0.5, color='blue')
ax.set_ylabel('Number of records')
fig.suptitle('Records per device')
fig.show()

Canvas(layout=Layout(height='600px', width='800px'), toolbar=Toolbar(toolitems=[('Home', 'Reset original view'…

In [221]:
plt.ioff()
# Rows per subject for each device, ordered by subject id.
rps_phone_count  = all_phone_df['subject_id'].value_counts().sort_index()
rps_watch_count  = all_watch_df['subject_id'].value_counts().sort_index()

fig, axs = plt.subplots(2,1)

# (axes, counts, title, bar color) for each panel. Every panel takes its tick
# labels from its OWN counts index, so the labels can never drift away from
# the bars if the two devices do not cover exactly the same subjects.
subject_panels = [
    (axs[0], rps_phone_count, 'Records for Phone by Subject', 'blue'),
    (axs[1], rps_watch_count, 'Records for Watch by Subject', 'orange'),
]

for panel_ax, counts, panel_title, bar_color in subject_panels:
    xlabels = [str(i) for i in counts.index]

    panel_ax.set_title(panel_title)
    panel_ax.grid(axis='x', alpha=0.1, zorder=-1)

    panel_ax.bar(xlabels, counts.values, alpha=0.8, color=bar_color, zorder=5)
    panel_ax.xaxis.set_tick_params(labelrotation=40)
    panel_ax.set_xticklabels(xlabels, va='top', ha='right', rotation_mode='anchor')
    panel_ax.set_xlabel('Subject ID', fontsize=12)
    panel_ax.set_ylabel('Number of Records', fontsize=12)

fig.canvas.layout.height='1200px'

fig.subplots_adjust(hspace=0.4)

# Last expression of the cell: the notebook renders the returned widget.
widgets.VBox([fig.canvas])

VBox(children=(Canvas(layout=Layout(height='1200px'), toolbar=Toolbar(toolitems=[('Home', 'Reset original view…

In [225]:
plt.ioff()
# Rows per activity for each device. Counting first and renaming the (few)
# index labels afterwards avoids copying and rewriting the whole frame.
# The watch counts must come from the watch frame, not the phone frame.
rpa_phone_count = all_phone_df['activity_code'].value_counts().rename(index=activity_letter_key)
rpa_watch_count = all_watch_df['activity_code'].value_counts().rename(index=activity_letter_key)

fig, axs = plt.subplots(2,1)

# (axes, counts, title, bar color) for each panel. value_counts() orders by
# frequency, which can differ between devices, so every panel takes its tick
# labels from its OWN counts index.
activity_panels = [
    (axs[0], rpa_phone_count, 'Records for Phone by Activity', 'blue'),
    (axs[1], rpa_watch_count, 'Records for Watch by Activity', 'orange'),
]

for panel_ax, counts, panel_title, bar_color in activity_panels:
    xlabels = [str(i) for i in counts.index]

    panel_ax.set_title(panel_title)
    panel_ax.grid(axis='x', alpha=0.1, zorder=-1)

    panel_ax.bar(xlabels, counts.values, alpha=0.8, color=bar_color, zorder=5)
    panel_ax.xaxis.set_tick_params(labelrotation=40)
    panel_ax.set_xticklabels(xlabels, va='top', ha='right', rotation_mode='anchor')
    panel_ax.set_xlabel('Activity ID', fontsize=12)
    panel_ax.set_ylabel('Number of Records', fontsize=12)

fig.canvas.layout.height='1200px'

fig.subplots_adjust(hspace=0.4)

# Last expression of the cell: the notebook renders the returned widget.
widgets.VBox([fig.canvas])

VBox(children=(Canvas(layout=Layout(height='1200px'), toolbar=Toolbar(toolitems=[('Home', 'Reset original view…

## Part II: Aggregate Analytics
Below is an interactive bqplot figure that displays aggregated/derived data from the dataset. Users choose a subject ID (1600-1650), a device (phone or watch), a coordinate (x_accel, y_accel, z_accel, x_gyro, y_gyro, z_gyro), and an aggregation type (one of five: sum, mean, max, min, count) from the four dropdowns. The selection is rendered as a bar plot with one bar per activity, so users can explore the aggregated results of the dataset.

In [226]:
plt.ion()
@interact(Subject_ID = subject_ids)
def get_subject(Subject_ID):
    """Nested dropdowns: subject -> device -> coordinate -> aggregation.

    Each level is its own `interact`, so changing an outer dropdown rebuilds
    the dropdowns nested inside it. The innermost level draws a bqplot bar
    chart of the chosen aggregate of the chosen coordinate per activity code.
    """
    # Load the subject's CSVs once per subject selection instead of on every
    # change of the inner dropdowns.
    device_frames = get_by_subject_id(Subject_ID)

    @interact(Device = ['phone', 'watch'])
    def get_device(Device):
        by_activity = device_frames.get(Device).groupby('activity_code')

        @interact(Coordinate = ['x_accel', 'y_accel','z_accel','x_gyro','y_gyro','z_gyro'])
        def get_coordinate(Coordinate):

            @interact(Aggregate = ['sum', 'mean','max','min','count'])
            def change_aggregation(Aggregate):
                # The dropdown values are exactly the aggregation names that
                # pandas understands, so dispatch through agg().
                y = by_activity[Coordinate].agg(Aggregate)

                bin_x_sc = bqplot.OrdinalScale()
                bin_x_ax = bqplot.Axis(scale = bin_x_sc,label='activity code')
                bin_y_sc = bqplot.LinearScale()
                bin_y_ax = bqplot.Axis(scale = bin_y_sc,orientation='vertical',label='value')
                bars = bqplot.Bars(x = y.index,y = y,scales = {'x': bin_x_sc, 'y': bin_y_sc})
                fig = bqplot.Figure(marks = [bars], axes = [bin_x_ax, bin_y_ax])
                display(fig)

interactive(children=(Dropdown(description='Subject_ID', options=('1600', '1601', '1602', '1603', '1604', '160…

In [227]:
plt.ioff()
fig = plt.figure()
# This cell plots the raw readings (no cumulative sum is applied below).
fig.suptitle('Scatter Plot of the WISDM Data over Time')
fig.canvas.layout.width = "1000px"
fig.canvas.layout.height = "1000px"

# One 3D axes, created once and cleared on every redraw, so successive
# selections replace each other instead of piling up on the figure.
scatter_ax = fig.add_subplot(111, projection='3d')

t = None   # handle of the most recently drawn scatter collection
cb = None  # unused; retained so any other cell that references it still finds it
cmap = None  # unused; retained for the same reason

subject_dd = widgets.Dropdown(options = subject_ids, description='Subject ID:', value='1600')
device_dd = widgets.Dropdown(options = ['phone', 'watch'], description='Device Type:', value='phone')
activity_dd = widgets.Dropdown(options=activity_name_key, description='Activity:', value='A')
stepsize_sl = widgets.IntSlider(min=1, max=20, value=10)


def update_scatter(subject_id=None, device=None, activity=None, stepsize=None):
    """Redraw the 3D scatter of accelerometer readings for one selection.

    Any argument left as `None` falls back to the CURRENT value of the
    matching widget, so changing one control keeps the other selections.

    Parameters
    ----------
    subject_id : str or int, optional
        Subject whose CSVs are loaded.
    device : str, optional
        `'phone'` or `'watch'`.
    activity : str, optional
        Activity code used to pick the rows to plot.
    stepsize : int, optional
        Every `stepsize`-th row is plotted.
    """
    global t
    Subject_ID = subject_dd.value if subject_id is None else subject_id
    Device = device_dd.value if device is None else device
    Activity = activity_dd.value if activity is None else activity
    StepSize = stepsize_sl.value if stepsize is None else stepsize

    x = get_by_subject_id(Subject_ID).get(Device).groupby('activity_code').get_group(Activity)[['x_accel','y_accel','z_accel','x_gyro','y_gyro','z_gyro']]

    smooth_x = x[::StepSize]

    # Drop the previous selection's points (this also resets titles/labels,
    # which are set again right below).
    scatter_ax.clear()
    # Color runs along the jet colormap from the first plotted row to the last.
    t = scatter_ax.scatter(smooth_x['x_accel'],smooth_x['y_accel'],smooth_x['z_accel'], c = plt.cm.jet(np.linspace(0,1,len(smooth_x))))
    scatter_ax.set_title('Subject #{} `{}` ({}) using {}'.format(Subject_ID, activity_letter_key.get(Activity).capitalize(), Activity, Device.capitalize()))
    scatter_ax.set_xlabel('X-Axis')
    scatter_ax.set_ylabel('Y-Axis')
    scatter_ax.set_zlabel('Z-Axis')
    fig.canvas.draw()
    fig.canvas.flush_events()


# Observers: each forwards the new value of its own widget; update_scatter
# fills in the remaining selections from the other widgets.
def get_subject(change):
    update_scatter(subject_id=change.new)

def get_device(change):
    update_scatter(device=change.new)

def get_activity(change):
    update_scatter(activity=change.new)

def get_stepsize(change):
    update_scatter(stepsize=change.new)

subject_dd.observe(get_subject, names='value')
device_dd.observe(get_device, names='value')
activity_dd.observe(get_activity, names='value')
stepsize_sl.observe(get_stepsize, names='value')

update_scatter()
# Last expression of the cell: the notebook renders the returned widget.
widgets.VBox([widgets.HBox([subject_dd, device_dd, activity_dd]), widgets.HBox([widgets.Label(value='Steps between Points:'), stepsize_sl]), fig.canvas])

VBox(children=(HBox(children=(Dropdown(description='Subject ID:', options=('1600', '1601', '1602', '1603', '16…

Below is an interactive 3D plot showing the cumulative sum of the sensor readings for any activity of any subject. Because the cumulative sum is a series of data points rather than a single value, we draw it as a 3D scatter plot and color the points along a gradient to show how the curve develops over time. Users can select the subject ID, device, and activity they want; the corresponding 3D curve is displayed, and the plot title identifies the selection.

In [228]:
plt.ioff()
fig = plt.figure()
fig.suptitle('Scatter Plot of the WISDM Data Cumulative Sum over Time')
fig.canvas.layout.width = "1000px"
fig.canvas.layout.height = "1000px"

# One 3D axes, created once and cleared on every redraw, so successive
# selections replace each other instead of piling up on the figure.
scatter_ax = fig.add_subplot(111, projection='3d')

t = None   # handle of the most recently drawn scatter collection
cb = None  # unused; retained so any other cell that references it still finds it
cmap = None  # unused; retained for the same reason

subject_dd = widgets.Dropdown(options = subject_ids, description='Subject ID:', value='1600')
device_dd = widgets.Dropdown(options = ['phone', 'watch'], description='Device Type:', value='phone')
activity_dd = widgets.Dropdown(options=activity_name_key, description='Activity:', value='A')
stepsize_sl = widgets.IntSlider(min=1, max=20, value=10)


def update_scatter(subject_id=None, device=None, activity=None, stepsize=None):
    """Redraw the 3D scatter of cumulatively summed readings for one selection.

    Any argument left as `None` falls back to the CURRENT value of the
    matching widget, so changing one control keeps the other selections.

    Parameters
    ----------
    subject_id : str or int, optional
        Subject whose CSVs are loaded.
    device : str, optional
        `'phone'` or `'watch'`.
    activity : str, optional
        Activity code used to pick the rows to plot.
    stepsize : int, optional
        Every `stepsize`-th row of the cumulative sum is plotted.
    """
    global t
    Subject_ID = subject_dd.value if subject_id is None else subject_id
    Device = device_dd.value if device is None else device
    Activity = activity_dd.value if activity is None else activity
    StepSize = stepsize_sl.value if stepsize is None else stepsize

    # Running total down the rows of the selected activity, per column.
    x = get_by_subject_id(Subject_ID).get(Device).groupby('activity_code').get_group(Activity)[['x_accel','y_accel','z_accel','x_gyro','y_gyro','z_gyro']].cumsum(axis = 0)

    smooth_x = x[::StepSize]

    # Drop the previous selection's points (this also resets titles/labels,
    # which are set again right below).
    scatter_ax.clear()
    # Color runs along the jet colormap from the first plotted row to the last.
    t = scatter_ax.scatter(smooth_x['x_accel'],smooth_x['y_accel'],smooth_x['z_accel'], c = plt.cm.jet(np.linspace(0,1,len(smooth_x))))
    scatter_ax.set_title('Subject #{} `{}` ({}) using {}'.format(Subject_ID, activity_letter_key.get(Activity).capitalize(), Activity, Device.capitalize()))
    scatter_ax.set_xlabel('X-Axis')
    scatter_ax.set_ylabel('Y-Axis')
    scatter_ax.set_zlabel('Z-Axis')
    fig.canvas.draw()
    fig.canvas.flush_events()


# Observers: each forwards the new value of its own widget; update_scatter
# fills in the remaining selections from the other widgets.
def get_subject(change):
    update_scatter(subject_id=change.new)

def get_device(change):
    update_scatter(device=change.new)

def get_activity(change):
    update_scatter(activity=change.new)

def get_stepsize(change):
    update_scatter(stepsize=change.new)

subject_dd.observe(get_subject, names='value')
device_dd.observe(get_device, names='value')
activity_dd.observe(get_activity, names='value')
stepsize_sl.observe(get_stepsize, names='value')

update_scatter()
# Last expression of the cell: the notebook renders the returned widget.
widgets.VBox([widgets.HBox([subject_dd, device_dd, activity_dd]), widgets.HBox([widgets.Label(value='Steps between Points:'), stepsize_sl]), fig.canvas])

VBox(children=(HBox(children=(Dropdown(description='Subject ID:', options=('1600', '1601', '1602', '1603', '16…

## Part III: Adjusted Data

In [229]:
# Start from a clean slate: close every open figure and keep interactive
# drawing off while the new figure is being set up.
plt.close('all')
plt.ioff()

fig = plt.figure()
canvas_layout = fig.canvas.layout
canvas_layout.width = '1000px'
canvas_layout.height = '1000px'

class AdjustedPlot():
    """Interactive 3D scatter of offset-adjusted, cumulatively summed accelerations.

    The plot shows `x_accel`, `z_accel` and `y_accel` (in that axis order) of
    one subject / device / activity after subtracting a constant from
    `y_accel` and taking the running sum of every column. Three dropdowns
    pick the data; a slider moves a window of `view_span` rows that defines
    the axis limits.

    Public attributes used by the surrounding cell: `subject_dropdown`,
    `device_dropdown`, `activity_dropdown`, `view_slider`, `output`.
    """

    # Constant subtracted from y_accel before summing.
    # NOTE(review): presumably standard gravity in m/s^2 - confirm.
    Y_ACCEL_OFFSET = 9.807
    # Columns kept for plotting.
    POINT_COLUMNS = ['x_accel', 'y_accel', 'z_accel']

    def __init__(self, fig_):
        """Build the initial plot on `fig_` and create the control widgets."""
        self.fig = fig_
        self.fig.suptitle('Horizontal (Top-Down) Instantaneous Accelerations over Time')

        # Data Initialization
        self.view_span = 200
        self.subject_id = '1600'
        self.device = 'phone'
        self.activity_code = 'A'

        self.output = widgets.Output()
        self.update_data()

        # Initial Plot
        self.ax = self.fig.add_subplot(111, projection='3d')
        self.lines = self._scatter_points()
        self.ax.set_xlabel('X-Axis')
        self.ax.set_ylabel('Z-Axis')
        self.ax.set_zlabel('Y-Axis')
        self.update_view()

        # Widgets
        self.subject_dropdown = widgets.Dropdown(options=subject_ids, value=self.subject_id, description='Subject ID:')
        self.device_dropdown = widgets.Dropdown(options=['phone', 'watch'], value=self.device, description='Device Type:')
        self.activity_dropdown = widgets.Dropdown(options=activity_name_key, value=self.activity_code, description='Activity:')
        # The slider starts at frame 0 so it agrees with the initial update_view().
        self.view_slider = widgets.IntSlider(
            min=0,
            max=self._max_view_start(len(self.df), self.view_span),
            value=0,
            description='View Frame:',
        )

        # Observers
        self.subject_dropdown.observe(self.on_change_subject(), names='value')
        self.device_dropdown.observe(self.on_change_device(), names='value')
        self.activity_dropdown.observe(self.on_change_activity(), names='value')
        self.view_slider.observe(self.on_change_view(), names='value')

    @staticmethod
    def _max_view_start(n_rows, view_span):
        """Return the largest window start for `n_rows` rows, never below 0."""
        return max(0, n_rows - view_span)

    def _scatter_points(self):
        """Scatter the current `self.df` on `self.ax` and return the collection."""
        # One color per row, running along the jet colormap.
        colors = plt.cm.jet(np.linspace(0, 1, len(self.df)))
        return self.ax.scatter(self.df.x_accel, self.df.z_accel, self.df.y_accel, c=colors)

    def _reload(self, **selection):
        """Apply a new selection, then redraw.

        The view limits are set BEFORE the canvas is drawn so the drawn frame
        already shows the new window.
        """
        self.update_data(**selection)
        self.update_plot()
        self.update_view()
        self.update_canvas()

    def update_data(self, subject_id=None, device=None, activity_code=None):
        """Reload `self.df` for the given (or previously stored) selection.

        A falsy argument keeps the stored value; if the stored value is falsy
        too, the defaults '1600' / 'phone' / 'A' are used.
        """
        if subject_id or not self.subject_id:
            self.subject_id = subject_id or '1600'

        if device or not self.device:
            self.device = device or 'phone'

        if activity_code or not self.activity_code:
            self.activity_code = activity_code or 'A'

        self.subject_dfs = get_by_subject_id(self.subject_id)
        device_df = self.subject_dfs[self.device]
        in_activity = device_df['activity_code'] == self.activity_code
        # reset_index returns a fresh frame, so the column assignment below
        # does not write into a slice of the loaded data.
        selected = device_df.loc[in_activity, self.POINT_COLUMNS].reset_index(drop=True)
        selected['y_accel'] = selected['y_accel'] - self.Y_ACCEL_OFFSET
        self.df = selected.cumsum(axis=0)

    def update_canvas(self):
        """Redraw the figure canvas."""
        self.ax.relim()
        self.fig.canvas.draw()
        self.fig.canvas.flush_events()

    def update_plot(self):
        """Replace the plotted points with the current `self.df`."""
        # Re-create the collection instead of mutating it: the number of
        # points (and therefore the color array) changes with the selection.
        self.lines.remove()
        self.lines = self._scatter_points()

        # Move the slider to the start first so shrinking `max` can never
        # leave the current value out of range.
        self.view_slider.value = 0
        self.view_slider.max = self._max_view_start(len(self.df), self.view_span)

    def update_view(self, start=0):
        """Set the axis limits to the extent of rows `start .. start + view_span`."""
        part = self.df.iloc[start: start + self.view_span]
        if part.empty:
            # Nothing to frame (e.g. no rows for this selection); min()/max()
            # would be NaN, which is not a valid axis limit.
            return
        self.ax.set_xlim((part.x_accel.min(), part.x_accel.max()))
        self.ax.set_ylim((part.z_accel.min(), part.z_accel.max()))
        self.ax.set_zlim((part.y_accel.min(), part.y_accel.max()))

    def on_change_subject(self):
        """Return the observer for the subject dropdown."""
        def callback(change):
            self._reload(subject_id=change.new)
        return callback

    def on_change_device(self):
        """Return the observer for the device dropdown."""
        def callback(change):
            self._reload(device=change.new)
            # Show the first two rows of the new data in the output widget.
            self._p(self.df.iloc[:2])
        return callback

    def on_change_activity(self):
        """Return the observer for the activity dropdown."""
        print('Change Activity Initialized.')
        def callback(change):
            self._reload(activity_code=change.new)
        return callback

    def on_change_view(self):
        """Return the observer for the view slider."""
        print('Change View Initialized.')
        def callback(change):
            self.update_view(change.new)
            self.update_canvas()
        return callback

    def _p(self, *args):
        """Print `args` into the output widget."""
        with self.output:
            print(*args)


ajp = AdjustedPlot(fig)

plt.ion()

# Layout: the three dropdowns in a row, the view slider and the canvas stacked
# beneath them, and the debug output widget to the right of that column.
selector_row = widgets.HBox([ajp.subject_dropdown, ajp.device_dropdown, ajp.activity_dropdown])
plot_column = widgets.VBox([selector_row, ajp.view_slider, fig.canvas])
widgets.HBox([plot_column, ajp.output])

Change Activity Initialized.
Change View Initialized.


HBox(children=(VBox(children=(HBox(children=(Dropdown(description='Subject ID:', options=('1600', '1601', '160…