In [1]:
import os
import numpy as np
import pandas as pd
import pickle
from tqdm import tqdm

In [2]:
# Raw unlabeled WISDM input file; the parsing cell below expects six
# comma-separated fields per line (user, activity, timestamp, x, y, z).
data_path = '../dataset/WISDM_at_v2.0/WISDM_at_v2.0_unlabeled_raw.txt'
# A sample continues the current series only if its timestamp is less than this
# far after the previous one. Timestamps are later parsed with unit='ms', so
# this is presumably milliseconds.
max_interval = 1000
# Minimum number of samples a raw series must contain to be written to the cache.
min_timestamp = 10
# Minimum number of rows a resampled frame must contain to be kept.
min_step = 10
# Frequency string passed to DataFrame.resample() when regularising each series.
freq = '50ms'

In [3]:
cache_dir = 'unlabeled'

# Create the cache directory on a fresh checkout; the os.listdir() probe below
# would otherwise raise FileNotFoundError before anything could be written.
os.makedirs(cache_dir, exist_ok=True)


def _flush_series(series, series_count, out_dir, min_len):
    """Pickle `series` as `<out_dir>/<series_count>.pkl` when it is long enough.

    Args:
        series: list of (timestamp, activity, x_acc, y_acc, z_acc) tuples, or None.
        series_count: index used to name the output file.
        out_dir: directory the pickle is written to.
        min_len: series with fewer items than this are dropped.

    Returns:
        The index to use for the next series: `series_count + 1` if a file was
        written, otherwise `series_count` unchanged.
    """
    if not series or len(series) < min_len:
        return series_count
    with open(os.path.join(out_dir, f'{series_count}.pkl'), 'wb') as cache_file:
        pickle.dump(series, cache_file)
    return series_count + 1


if os.listdir(cache_dir):
    print(f'{cache_dir}/* already exists')
else:
    print(f'writing {cache_dir}')
    with open(data_path, 'r') as file:
        prev_user = None
        prev_timestamp = None
        series = None
        series_count = 0
        bad_lines = 0
        for line in tqdm(file):
            # Strip the line ending and the trailing ';' record terminator
            # explicitly. Blindly dropping the last two characters truncates
            # the final field when a line lacks the ';' or the newline (e.g.
            # the last line of the file).
            cols = line.strip().rstrip(';').split(',')

            if len(cols) != 6: continue

            user, activity, timestamp, x_acc, y_acc, z_acc = cols
            try:
                timestamp = int(timestamp)
                x_acc = float(x_acc)
                y_acc = float(y_acc)
                z_acc = float(z_acc)
            except ValueError:
                # Treat unparsable numbers like any other malformed line
                # instead of aborting the whole run; the count is reported below.
                bad_lines += 1
                continue

            if timestamp < 0: continue

            item = (timestamp, activity, x_acc, y_acc, z_acc)
            # A sample extends the current series only when it comes from the
            # same user and lies in [prev_timestamp, prev_timestamp + max_interval).
            continues_series = (
                prev_user == user
                and 0 <= timestamp - prev_timestamp < max_interval
            )
            if continues_series:
                # Repeated timestamps are dropped so each series has unique times.
                if prev_timestamp != timestamp:
                    series.append(item)
            else:
                series_count = _flush_series(series, series_count, cache_dir, min_timestamp)
                series = [item]
            prev_user = user
            prev_timestamp = timestamp

        # Flush whatever series was still open when the file ended.
        series_count = _flush_series(series, series_count, cache_dir, min_timestamp)

    if bad_lines:
        print(f'skipped {bad_lines} lines with unparsable numeric fields')

unlabeled/* already exists


In [4]:
if 0:
    series_lens = []
    for fname in tqdm(os.listdir(cache_dir)):
        with open(os.path.join(cache_dir, fname), 'rb') as file:
            series = pickle.load(file)
        series_lens.append(len(series))

    series_lens = np.asarray(series_lens)

    import matplotlib.pyplot as plt
    %matplotlib inline
    plt.hist(series_lens[np.where(series_lens < 100)], 100)
    plt.show()
    plt.hist(series_lens[np.where(series_lens < 1000)], 100)
    plt.show()
    plt.hist(series_lens, 100)
    plt.show()

100%|█████████████████████████████████████████████████████████████████████████████| 3721/3721 [00:21<00:00, 171.26it/s]


In [6]:
cache_dir = 'unlabeled'
cache_dir_df = 'unlabeled_df'

# Create the output directory on a fresh checkout; the os.listdir() probe
# below would otherwise raise FileNotFoundError.
os.makedirs(cache_dir_df, exist_ok=True)

if os.listdir(cache_dir_df):
    print(f'{cache_dir_df}/* already exists')
else:
    print(f'writing {cache_dir_df}')
    series_count = 0
    # Visit the raw series in numeric file order so output indices are stable.
    for fname in tqdm(sorted(os.listdir(cache_dir), key=lambda k: int(os.path.splitext(k)[0]))):
        # NOTE(review): pickle.load can execute arbitrary code; these files are
        # presumably the ones written by the raw-series caching cell — do not
        # point cache_dir at untrusted data.
        with open(os.path.join(cache_dir, fname), 'rb') as file:
            series = pickle.load(file)

        df = pd.DataFrame(series)

        try:
            # Column 0 holds the timestamp; use it as a datetime index.
            df.index = pd.to_datetime(df[0], unit='ms')

            # Column 1 (activity label): take the nearest sample per bin.
            df_type = df[[1]].resample(freq).nearest()

            # Columns 2-4 (x/y/z): average per bin, linearly fill empty bins.
            df_acc = df[[2, 3, 4]].resample(freq).mean().interpolate(method='linear')

            df = df_type.join(df_acc)
        except Exception as exc:
            # Report and skip. Falling through here would pickle the raw,
            # un-resampled frame into the output cache.
            print(f'skipping {fname}: {exc!r}')
            continue

        if len(df) < min_step: continue

        stem = os.path.splitext(fname)[0]
        with open(os.path.join(cache_dir_df, f'{series_count}_{stem}.pkl'), 'wb') as cache_file:
            pickle.dump(df, cache_file)
        series_count += 1

unlabeled_df/* already exists


In [7]:
def preview(path):
    """Print a cached raw series and its resampled form for inspection.

    Loads the pickle `path` from `cache_dir`, prints its first ten samples,
    then resamples it at `freq` and prints the resulting frame. Unlike the
    frames written to the resampled cache, the timestamp column (0) is kept
    next to the label column.

    Args:
        path: file name of a pickled series inside `cache_dir`.

    Returns:
        None. If the resampled frame has fewer than `min_step` rows, only its
        length is printed. If resampling fails, the error and the raw frame
        are printed instead.
    """
    with open(os.path.join(cache_dir, path), 'rb') as file:
        series = pickle.load(file)
    print(series[:10])
    df = pd.DataFrame(series)

    try:
        # Column 0 holds the timestamp; use it as a datetime index.
        df.index = pd.to_datetime(df[0], unit='ms')

        # Timestamp and label: nearest sample per bin.
        df_type = df[[0, 1]].resample(freq).nearest()

        # x/y/z columns: average per bin, linearly fill empty bins.
        df_acc = df[[2, 3, 4]].resample(freq).mean().interpolate(method='linear')

        resampled = df_type.join(df_acc)
    except Exception as exc:
        # Show why resampling failed and the raw frame once, rather than
        # swallowing every exception and printing the frame twice.
        print(f'resampling failed: {exc!r}')
        print(df)
        return

    if len(resampled) < min_step:
        print(len(resampled))
        return

    print(resampled)

In [8]:
# Disabled by default; flip the guard to inspect the first cached series.
if False:
    preview('0.pkl')

[(1377283015208, 'NoLabel', 0.06810174, -0.027240695, 9.833891), (1377283015258, 'NoLabel', 0.10896278, -0.06810174, 9.997335), (1377283015308, 'NoLabel', 0.10896278, -0.14982383, 9.997335), (1377283015358, 'NoLabel', 0.06810174, -0.027240695, 10.147159), (1377283015408, 'NoLabel', 0.06810174, -0.14982383, 10.065437), (1377283015458, 'NoLabel', 0.06810174, -0.06810174, 9.915613), (1377283015508, 'NoLabel', 0.027240695, -0.06810174, 9.915613), (1377283015558, 'NoLabel', 0.027240695, -0.06810174, 9.997335), (1377283015608, 'NoLabel', 0.027240695, -0.027240695, 9.997335), (1377283015658, 'NoLabel', 0.027240695, -0.06810174, 10.024576)]
                                     0        1         2         3          4
0                                                                             
2013-08-23 18:36:55.200  1377283015208  NoLabel  0.068102 -0.027241   9.833891
2013-08-23 18:36:55.250  1377283015258  NoLabel  0.108963 -0.068102   9.997335
2013-08-23 18:36:55.300  1377283015308  NoL