# Quantify some metrics from all data files

Assess total duration of all data, sampling rate for each of the files, and unique activity labels.

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import matplotlib as mpl
import scipy.signal as signal
import datetime

%matplotlib notebook

## List all files and initialize variables

In [3]:
files = !ls *xq*.csv

In [4]:
duration = []
notes = []
activity_start = []
fs = []

## Loop through each file and quantify total duration, activitiy notes, and sampling rate

### Note: First 3 files have notes in the 'notes' column, whereas all other files have it in the 'fw_version' column

In [8]:
# first 3 files have notes in the ' notes' column

for i in files[0:3]:
    data = pd.read_csv(i, usecols = ['#time', 'seq', 'note'])

    # calculate total recording duration
    data['#time'] = [datetime.datetime.fromtimestamp(s/1000.0) for s in data['#time']]
    duration.append(data['#time'].iloc[-1] - data['#time'].iloc[0]) # calculate duration of recordings
    
    # calculate sampling rate
    seq_int = data['seq'].diff() # change in seq number between consecutive samples
    seq_breaks = np.where(seq_int < 1) # indices where change was neg
    seq_breaks = seq_breaks[0]-1
    if len(seq_breaks) == 0: # if no sequence breaks, fs = total samples / total seconds
        temp_fs = len(data['#time']) / datetime.timedelta.total_seconds(data['#time'].iloc[-1] - data['#time'].iloc[0])
    else: # if there is a sequence break, fs = samples until first break / total seconds until first break
        temp_fs = (seq_breaks[0]) / datetime.timedelta.total_seconds(data['#time'].iloc[seq_breaks[0]] - data['#time'].iloc[0])
    fs.append(temp_fs)
    
    # look for activity notes
    temp = list(data['note'].unique())
    temp = [x for x in temp if str(x) != 'nan']    
    notes.append(temp)
    
    del temp 
    del temp_fs    

In [11]:
# remaining files have notes in the 'fw_version' column

for i in files[4:]:
    data = pd.read_csv(i, usecols = ['#time', 'seq', 'fw_version'])

    # calculate total recording duration
    data['#time'] = [datetime.datetime.fromtimestamp(s/1000.0) for s in data['#time']]
    duration.append(data['#time'].iloc[-1] - data['#time'].iloc[0]) # calculate duration of recordings
    
    # calculate sampling rate
    seq_int = data['seq'].diff() # change in seq number between consecutive samples
    seq_breaks = np.where(seq_int < 1) # indices where change was neg
    seq_breaks = seq_breaks[0]-1
    if len(seq_breaks) == 0: # if there is a sequence break, fs = samples until first break / total seconds until first break
        temp_fs = len(data['#time']) / datetime.timedelta.total_seconds(data['#time'].iloc[-1] - data['#time'].iloc[0])
    else:
        temp_fs = (seq_breaks[0]) / datetime.timedelta.total_seconds(data['#time'].iloc[seq_breaks[0]] - data['#time'].iloc[0])
    temp_fs=int(temp_fs)
    fs.append(temp_fs)
    
    # look for activity notes
    temp = list(data['fw_version'].unique())
    temp = [x for x in temp if str(x) != 'nan']    
    notes.append(temp)
    del temp 
    del temp_fs

In [10]:
# total duration of all recordings

total_duration = datetime.timedelta()

for b in np.arange(0,len(duration)):
    total_duration = total_duration + duration[b]

total_duration

Timedelta('51 days 12:07:39.173000')