In [34]:
import numpy as np
import pickle
import os
from haversine import haversine
from math import floor
from collections import defaultdict
import random
import gzip
from tqdm import tqdm 
import pandas as pd
import time
from tqdm import tqdm

In [35]:
# Cleaned endomondo dataset.... (simply cleaned for cases with abnormal readings..)
data_path = "endomondoHR_proper.json"

# Attribute embedding features....
attrFeatures = ['userId', 'sport', 'gender']
    
# Percentage of data splits....
trainValidTestSplit = [0.8, 0.1, 0.1]
targetAtts = ["sport"]
    
# Time sequnce inputs.... (contextual features...)
inputAtts = ["distance", "altitude", "time_elapsed", "heart_rate"]

# splits file
trainValidTestFN = "endomondoHR_proper_temporal_dataset.pkl"

In [36]:
def parse(path):
    if 'gz' in path:
        f = gzip.open(path, 'rb')
        for l in f.readlines():
            yield(eval(l.decode('ascii')))
    else:
        f = open(path, 'rb')
        for l in f.readlines():
            yield(eval(l))

def process(line):
    return eval(line)

In [37]:
with open("endomondoHR_proper_temporal_dataset.pkl", "rb") as f:
    trainingSet, validationSet, testSet, contextMap = pickle.load(f)
    print("train/valid/test set size = {}/{}/{}".format(len(trainingSet), len(validationSet), len(testSet)))
    print("******Dataset split loaded******") 

train/valid/test set size = 132755/16604/17058
******Dataset split loaded******


In [38]:
print(trainingSet)

[325462471, 326542063, 326542030, 330817777, 343482936, 349767861, 351555391, 357257906, 358164671, 358700141, 359283997, 360298984, 361662390, 361714712, 366415571, 366977747, 367471127, 369236357, 371302193, 372368431, 380802496, 381185226, 56411015, 56410980, 56410972, 55956264, 56410962, 56410955, 56410946, 56410923, 56410908, 56410900, 56410894, 56410883, 56410873, 56410843, 56410795, 56410786, 56410758, 55952464, 56410731, 57180613, 57180618, 57180620, 59163507, 59947257, 60759631, 62570224, 64120929, 64787535, 65898908, 66146488, 67839082, 67981538, 68425997, 79997223, 80738689, 81461230, 82864947, 83187428, 86531348, 87587102, 87887639, 89840069, 92105680, 92617449, 96040265, 96326794, 97651556, 98366546, 99738274, 100175531, 101014954, 106731407, 106731396, 106731385, 108018046, 108312891, 111108255, 111530739, 112377412, 113417873, 113417863, 115784007, 116231713, 116910273, 117412273, 151285227, 151755675, 152420540, 153373575, 156440479, 161130128, 161130120, 165521275, 167

In [39]:
count = 1000
data = []
with open('endomondoHR_proper.json') as f:
    for i,l in enumerate(f):
        # if i == count:
        #     break
        data.append(eval(l))

In [40]:
for d in data:
    key = 'url'
    del d[key]
    key = 'speed'
    if key in d:
        del d[key]

In [41]:
print(data[0].keys())

dict_keys(['longitude', 'altitude', 'latitude', 'sport', 'id', 'heart_rate', 'gender', 'timestamp', 'userId'])


In [42]:
isDerived = ['time_elapsed', 'distance', 'derived_speed', 'delta_hr', 'since_begin', 'since_last']
isNominal = ['gender', 'sport']
isSequence = ['altitude', 'heart_rate', 'latitude', 'longitude'] + isDerived

In [43]:
idxMap = defaultdict(int)

for idx,d in enumerate(data):
    idxMap[d['id']] = idx

contextMap2 = {}

for wid in contextMap:
    context = contextMap[wid]
    contextMap2[idxMap[wid]] = (context[0], context[1], [idxMap[wid] for wid in context[2]])

contextMap = contextMap2

In [45]:
trimmed_workout_len = 450

In [46]:
for idx,d in tqdm(enumerate(data), position = 0,leave = True):
    for k in isDerived:
        if k == 'time_elapsed':
            timestamps = d['timestamp']
            initialTime = timestamps[0]
            d[k] = [x-initialTime for x in timestamps]
        
        elif k == 'delta_hr':
            heart_rate = d['heart_rate']

            d[k] = [0] + [heart_rate[i+1]-heart_rate[i] for i in range(len(heart_rate)-1)]
        
        elif k == 'distance':
            lats = d['latitude']
            longs = d['longitude']
            indices = range(1,len(lats))
            distances = [0]
            distances.extend([haversine([lats[i-1],longs[i-1]], [lats[i],longs[i]]) for i in indices])

            d[k] = distances

            timestamps = d['timestamp']
            indices = range(1,len(timestamps))
            times = [0]
            times.extend([timestamps[i]-timestamps[i-1] for i in indices])

            speed = [0]
            for i in indices:
                try:
                    curr_speed = 3600*distances[i]/times[i]
                    speed.append(curr_speed)
                except:
                    speed.append(speed[i-1])
            
            d['derived_speed'] = speed
        
        elif k == 'since_last':
            if idx in contextMap:
                total_time = contextMap[idx][0]
            else:
                total_time = 0
            
            d[k] = np.ones(trimmed_workout_len) * total_time
        
        elif k == 'since_begin':
            if idx in contextMap:
                total_time = contextMap[idx][1]
            else:
                total_time = 0
            
            d[k] = np.ones(trimmed_workout_len) * total_time

1000it [00:04, 229.69it/s]


In [47]:
def buildEncoder(classLabels):
    # Constructs a dictionary that maps each class label to a list 
    # where one entry in the list is 1 and the remainder are 0
    encodingLength = len(classLabels)
    encoder = {}
    mapper = {}
    for i, label in enumerate(classLabels):
        encoding = [0] * encodingLength
        encoding[i] = 1
        encoder[label] = encoding
        mapper[label] = i
    return encoder, mapper

In [48]:
def computeMeanStd(varSums, numDataPoints, attributes):
    #print("Computing variable means and standard deviations")
    
    # assume each data point has 500 time step?! is it correct?
    numSequencePoints = numDataPoints * 500 
    
    variableMeans = {}
    for att in varSums:
        variableMeans[att] = varSums[att] / numSequencePoints
    
    varResidualSums = defaultdict(float)
    
    for numDataPoints, currData in enumerate(data):
        # loop each continuous attribute
        for att in attributes:
            dataPointArray = np.array(currData[att])
            # add to the variable running sum of squared residuals
            diff = np.subtract(dataPointArray, variableMeans[att])
            sq = np.square(diff)
            varResidualSums[att] += np.sum(sq)

    variableStds = {}
    for att in varResidualSums:
        variableStds[att] = np.sqrt(varResidualSums[att] / numSequencePoints)
        
    return variableMeans,variableStds

In [49]:
variableSums = defaultdict(float)
classLabels = defaultdict(set)

for d in tqdm(data):
    att = 'userId'
    user = d[att]

    classLabels[att].add(user)

    for att in isNominal:
        val = d[att]
        classLabels[att].add(val)
    
    for att in isSequence:
        variableSums[att] += sum(d[att])
    
oneHotEncoders = {}
oneHotMap = {}
encodingLengths = {}

for att in isNominal:
    oneHotEncoders[att],oneHotMap[att] = buildEncoder(classLabels[att])
    encodingLengths[att] = len(classLabels[att])

att = 'userId'
oneHotEncoders[att], oneHotMap[att] = buildEncoder(classLabels[att]) 
encodingLengths[att] = 1

for att in isSequence:
    encodingLengths[att] = 1

numDataPoints = len(data)
variableMeans, variableStds = computeMeanStd(variableSums,numDataPoints,isSequence)
    
    

100%|██████████| 1000/1000 [15:11<00:00,  1.10it/s]


In [50]:
def median_smoothing(seq,context_size):
    if context_size==1:
        return seq
    
    seq_len = len(seq)
    smoothed_seq = []
    for i in range(seq_len):
        cont_diff = (context_size - 1) / 2
        context_min = int(max(0, i-cont_diff))
        context_max = int(min(seq_len, i+cont_diff))
        median_val = np.median(seq[context_min:context_max])
        smoothed_seq.append(median_val)

    return smoothed_seq

def scaleData(data,att,zMultiple=2):
    mean,std = variableMeans[att], variableStds[att]
    diff = [d-mean for d in data]
    zScore = [d/std for d in diff]

    return [x*zMultiple for x in zScore]

In [51]:
targets = ['heart_rate', 'derived_speed', 'delta_hr']

for idx, d in tqdm(enumerate(data),position = 0, leave = True):
    for tAtt in targets:
        tar_data = median_smoothing(d[tAtt],1)
        tar_data = scaleData(tar_data,tAtt)

        data[idx]["tar_"+tAtt] = tar_data

    for att in isSequence:
        in_data = d[att]
        data[idx][att] = scaleData(in_data,att)

0it [00:00, ?it/s]

1000it [00:06, 144.36it/s]


In [52]:
print(oneHotMap)

{'gender': {'male': 0, 'unknown': 1, 'female': 2}, 'sport': {'run': 0, 'tennis': 1, 'rowing': 2, 'bike (transport)': 3, 'kayaking': 4, 'orienteering': 5, 'mountain bike': 6, 'bike': 7}, 'userId': {854368: 0, 4419649: 1, 430859: 2, 3905196: 3, 9815069: 4, 279317: 5, 4007546: 6, 10921915: 7, 11889307: 8, 6479229: 9, 4969375: 10}}
