In [1]:
# Colab-only step: mount Google Drive at /content/drive, where the CSV paths used below live.
from google.colab import drive
drive.mount('/content/drive')

import ast
import random
from math import sqrt

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from geopy.distance import geodesic


Mounted at /content/drive


In [4]:
# Read data

%%time

train_df = pd.read_csv('/content/drive/MyDrive/P3 Fitrec Dataset/Generated datsets fresh/TRAIN_DATA.csv')
test_df = pd.read_csv('/content/drive/MyDrive/P3 Fitrec Dataset/Generated datsets fresh/TEST_DATA.csv')

print(train_df.shape)
print(test_df.shape)
display(train_df.head())

(53053, 28)
(13264, 28)


Unnamed: 0,id,userId,gender,sport,duration,calories,distance,avg_heart_rate,longitude,latitude,...,validate,avg_alti,change_alti,max_alti,min_alti,diff_alti,avg_speed,Cluster,route,route_id
0,321096209,9422215,male,run,4562,707.0,12.32541,142.808,"[-51.2151208, -51.2150449, -51.2148496, -51.21...","[-23.3441176, -23.3441733, -23.3443335, -23.34...",...,True,557.1204,264.0,582.0,544.2,37.8,9.792181,5,"('run', '5')",4
1,235150366,2060912,male,run,1101,266.0,3.26,155.146,"[-1.1467345, -1.1467678, -1.1467934, -1.146828...","[52.8835047, 52.8834938, 52.8834532, 52.883402...",...,True,53.1904,43.0,76.4,47.2,29.2,10.61489,0,"('run', '0')",1
2,355803233,7038373,male,run,3149,697.65,11.109125,140.65,"[13.419533, 13.419299, 13.419221, 13.419105, 1...","[49.757715, 49.757574, 49.757353, 49.757227, 4...",...,True,263.499296,358.674,324.0,230.0,94.0,12.553901,5,"('run', '5')",4
3,339538932,3714939,male,bike,7386,807.0,62.04549,113.87,"[175.6174659356475, 175.61758738942444, 175.61...","[-40.3581553325057, -40.35807637497783, -40.35...",...,True,18.242,227.2,76.0,-1.6,77.6,29.617781,0,"('bike', '0')",6
4,643461337,3275003,male,bike,3431,102.0,21.03097,199.892,"[-2.6546424441039562, -2.6548170391470194, -2....","[52.793660620227456, 52.79367495328188, 52.793...",...,True,86.1496,285.4,181.6,46.4,135.2,21.859656,3,"('bike', '3')",7


CPU times: user 31.3 s, sys: 4.7 s, total: 36 s
Wall time: 1min 10s


# 3. Find workout records that return to their starting point at the end

In [5]:
# Distance between two points described by latitude, longitude and altitude
def geodis(lat_0, lon_0, alt_0, lat_1, lon_1, alt_1):
    '''
    Combine the geodesic (surface) distance with the altitude difference.

    lat_0, lon_0, alt_0: latitude, longitude and altitude of the first point
    lat_1, lon_1, alt_1: latitude, longitude and altitude of the second point

    The surface distance is taken in kilometres from geopy's geodesic(); both
    altitudes are divided by 1000 before being subtracted (i.e. they are treated
    as metres), and the two components are combined with Pythagoras.

    Returns the resulting distance as a float.
    '''
    surface_km = geodesic((lat_0, lon_0), (lat_1, lon_1)).km
    climb_km = alt_0/1000 - alt_1/1000
    return sqrt(surface_km**2 + climb_km**2)

# Define function to check if a workout record has returned to start point at the end


def isback(df_row, num_to_check):
    '''
    Tell whether a workout route ends close to where it started.

    df_row: a row of dataframe; reads `sport` and the `latitude`, `longitude`
        and `altitude` fields, each holding the string form of a Python list
    num_to_check: number of points to check at each end of the route (>= 1)

    1. We take num_to_check points at the beginning of the workout route and
    num_to_check points at the end of it. On routes with fewer than
    2 * num_to_check points the two windows are shrunk to half the route, so
    that no point is ever compared with itself.

    2. We compute the distance (see geodis) between each point at the beginning
    and each point at the end.

    3. As soon as one distance is smaller than the threshold (0.02 for 'run',
    0.04 for every other sport) we return 1, otherwise we return 0. A route
    with fewer than two points returns 0.

    Raises ValueError if num_to_check is smaller than 1.
    '''
    if num_to_check < 1:
        raise ValueError('num_to_check must be at least 1')

    # Threshold is compared against the value returned by geodis
    thres = 0.02 if df_row.sport == 'run' else 0.04

    # Parse every column once. ast.literal_eval accepts only Python literals, so
    # unlike eval() it cannot execute code that found its way into the CSV.
    lat = ast.literal_eval(df_row.latitude)
    lon = ast.literal_eval(df_row.longitude)
    alt = ast.literal_eval(df_row.altitude)

    # Keep the head and tail windows disjoint on short routes
    n_points = min(len(lat), len(lon), len(alt))
    window = min(num_to_check, n_points // 2)
    if window == 0:
        return 0

    heads = list(zip(lat[:window], lon[:window], alt[:window]))
    tails = list(zip(lat[-window:], lon[-window:], alt[-window:]))

    # Same pair order as a full scan, but stop at the first pair that is close enough
    for lat_h, lon_h, alt_h in heads:
        for lat_t, lon_t, alt_t in tails:
            if geodis(lat_h, lon_h, alt_h, lat_t, lon_t, alt_t) < thres:
                return 1
    return 0

In [6]:
%%time

train_df['isback'] = train_df.apply(lambda x: isback(x, 5), axis=1)
test_df['isback'] = test_df.apply(lambda x: isback(x, 5), axis=1)

print(train_df.isback.value_counts())

0    31032
1    22021
Name: isback, dtype: int64
CPU times: user 11min 16s, sys: 1.64 s, total: 11min 18s
Wall time: 11min 35s


# 4. Extend workout routes

In [7]:
# Keep only the workouts flagged as returning to their starting point (isback == 1).
# .copy() gives independent frames, so the columns added to them later do not
# touch train_df / test_df.
adjust_train_df = train_df.loc[train_df['isback'] == 1].copy()
adjust_test_df = test_df.loc[test_df['isback'] == 1].copy()

In [8]:
# Define function to create new sequential features

def update_sequence(df_row, max_extend_point):
    '''
    Extend one workout by replaying a noisy copy of its first points after its
    last point, while thinning the original sequence by the same number of points.

    df_row: a row of dataframe; the fields latitude, longitude, altitude,
        heart_rate, derived_speed, derived_distance and timestamp are read and
        each must hold the string form of a Python list
    max_extend_point: exclusive upper bound of the number of points to extend
        (must be greater than 30)

    1. We randomly draw the number of points to extend from [30, max_extend_point):
    ext_len

    2. We extract that many points from the start of the workout sequence:
    lat_head, lon_head, alt_head, distance_head

    3. We generate Gaussian noise and add it to the latitude and longitude of the
    extracted points:
    lat_head_noise, lon_head_noise

    4. We re-calculate the distances between the extracted points from the noisy
    coordinates (the last extracted distance is kept as it was)

    5. Because we extend the sequence by ext_len points, we also randomly drop
    ext_len interior points from (at most) the first 499 points of the original
    sequence; the first and last of those points are never dropped

    6. For every dropped point, its distance is added to the closest remaining
    previous point and the speed of that previous point is re-calculated

    7. For altitude and distance, the extracted points are appended AFTER the
    thinned original sequence:
    tmp_alt + alt_head, tmp_distance + distance_head

    8. For speed and heart rate, we keep only the thinned original sequence:
    tmp_heart, tmp_speed

    9. We also return the index of the last point of the thinned original sequence:
    complete_idx

    Returns the tuple (altitude, distance, heart_rate, speed, complete_idx,
    distance_sum). The first four items are str() renderings of plain Python
    lists, complete_idx is an int and distance_sum is the sum of the extended
    distance sequence.
    '''

    # Only this many leading points of the original sequence are used
    max_points = 499

    # Parse every column exactly once. ast.literal_eval accepts only Python
    # literals, so unlike eval() it cannot execute code coming from the CSV.
    lat_all = ast.literal_eval(df_row.latitude)
    lon_all = ast.literal_eval(df_row.longitude)
    alt_all = ast.literal_eval(df_row.altitude)
    heart_all = ast.literal_eval(df_row.heart_rate)
    speed_all = ast.literal_eval(df_row.derived_speed)
    distance_all = ast.literal_eval(df_row.derived_distance)
    timestamp_all = ast.literal_eval(df_row.timestamp)

    # Randomly draw the number of points to extend
    ext_len = random.randrange(30, max_extend_point)

    # Extract number of points from start of workout sequence
    lat_head = lat_all[:ext_len]
    lon_head = lon_all[:ext_len]
    alt_head = alt_all[:ext_len]
    distance_head = distance_all[:ext_len]

    # Standard deviation of the Gaussian noise: 1e-8 of the mean coordinate.
    # The longitude scale is derived from the longitudes (it used to be
    # computed from the latitudes by mistake).
    max_noise_lat = np.absolute(np.array(lat_head).mean()/100000000.)
    max_noise_lon = np.absolute(np.array(lon_head).mean()/100000000.)

    noise_lat = np.random.normal(0, max_noise_lat, ext_len)
    noise_lon = np.random.normal(0, max_noise_lon, ext_len)

    # Add Gaussian noise to latitude and longitude of extended route
    lat_head_noise = np.add(lat_head, noise_lat)
    lon_head_noise = np.add(lon_head, noise_lon)

    # Update distance array based on new latitude and longitude with noise;
    # the last entry has no following extracted point, so it keeps its old value
    dis_tail = distance_head[-1]
    distance_head = np.array([geodis(lat_head_noise[idx], lon_head_noise[idx], alt_head[idx],
                                     lat_head_noise[idx+1], lon_head_noise[idx+1], alt_head[idx+1])
                              for idx in range(len(distance_head)-1)])
    distance_head = np.append(distance_head, dis_tail)

    # Sample indices to drop from original route.
    # We don't want to touch the head and tail point, which also guarantees that
    # the neighbour searches below always terminate. For sequences with at least
    # 499 points this is range(1, 498).
    seq_len = min(max_points, len(lat_all))
    drop_indices = random.sample(range(1, seq_len - 1), ext_len)

    # One row per point, one column per feature
    tmp_df = pd.DataFrame(data=[lat_all[:max_points],
                                lon_all[:max_points],
                                alt_all[:max_points],
                                heart_all[:max_points],
                                speed_all[:max_points],
                                distance_all[:max_points],
                                timestamp_all[:max_points]]).T
    tmp_df.columns = ['latitude',
                      'longitude',
                      'altitude',
                      'heart_rate',
                      'derived_speed',
                      'derived_distance',
                      'timestamp']

    # Adjust distance and speed due to dropped points
    for idx in drop_indices:

        # Find idx of previous row in case the row is already deleted
        prev_idx = idx-1
        while prev_idx not in tmp_df.index:
            prev_idx -= 1

        # Find idx of next row in case the row is already deleted
        next_idx = idx+1
        while next_idx not in tmp_df.index:
            next_idx += 1

        # idx point will be deleted, we add idx point distance to the distance at previous point
        tmp_df.loc[prev_idx, 'derived_distance'] += tmp_df.loc[idx, 'derived_distance']
        # Re-calculate speed based on new distance for previous point: distance
        # divided by the timestamp difference scaled by 1/3600
        # (presumably seconds -> hours; confirm against the timestamp unit)
        elapsed = (tmp_df.loc[next_idx, 'timestamp'] - tmp_df.loc[prev_idx, 'timestamp'])/3600
        tmp_df.loc[prev_idx, 'derived_speed'] = tmp_df.loc[prev_idx, 'derived_distance'] / elapsed
        # Drop row at idx point
        tmp_df.drop([idx], inplace=True)

    # Get reduced feature arrays
    tmp_lat = tmp_df['latitude'].to_numpy()
    tmp_lon = tmp_df['longitude'].to_numpy()
    tmp_alt = tmp_df['altitude'].to_numpy()
    tmp_heart = tmp_df['heart_rate'].to_numpy()
    tmp_speed = tmp_df['derived_speed'].to_numpy()
    # Explicit copy: the last element is overwritten below and to_numpy() may
    # hand back a read-only view of the frame's data
    tmp_distance = tmp_df['derived_distance'].to_numpy(copy=True)

    # Store idx where original workout completes
    complete_idx = tmp_lat.shape[0]-1

    # Update distance between last point of original workout route to first point of extended route
    tmp_distance[-1] = geodis(tmp_lat[-1], tmp_lon[-1], tmp_alt[-1],
                              lat_head_noise[0], lon_head_noise[0], alt_head[0])

    # Extend altitude sequence
    tmp_alt = np.append(tmp_alt, alt_head)

    # Extend distance sequence
    tmp_distance = np.append(tmp_distance, distance_head)

    # Total distance
    tmp_distance_sum = np.sum(tmp_distance)

    # .tolist() converts NumPy scalars to plain Python numbers, so the strings
    # never contain reprs such as "np.float64(1.0)"
    return (str(tmp_alt.tolist()), str(tmp_distance.tolist()), str(tmp_heart.tolist()),
            str(tmp_speed.tolist()), complete_idx, tmp_distance_sum)

In [9]:
# Create new sequential features for training set

%%time

adjust_train_df['altitude_adjusted'], \
    adjust_train_df['distance_adjusted'], \
    adjust_train_df['heart_rate_adjusted'], \
    adjust_train_df['speed_adjusted'], \
    adjust_train_df['complete_idx'], \
    adjust_train_df['distance_adjusted_sum'] = zip(
        *adjust_train_df.apply(lambda x: update_sequence(x, 100), axis=1))

CPU times: user 31min 6s, sys: 16 s, total: 31min 22s
Wall time: 31min 41s


In [10]:
# Sanity check: True would mean at least one extended training route has a NaN total distance
np.isnan(adjust_train_df['distance_adjusted_sum'].to_numpy()).any()

False

In [11]:
# Create new sequential features for test set

%%time

adjust_test_df['altitude_adjusted'], \
    adjust_test_df['distance_adjusted'], \
    adjust_test_df['heart_rate_adjusted'], \
    adjust_test_df['speed_adjusted'], \
    adjust_test_df['complete_idx'], \
    adjust_test_df['distance_adjusted_sum'] = zip(
        *adjust_test_df.apply(lambda x: update_sequence(x, 100), axis=1))

CPU times: user 7min 40s, sys: 3.96 s, total: 7min 44s
Wall time: 7min 45s


In [12]:
# Sanity check: True would mean at least one extended test route has a NaN total distance
np.isnan(adjust_test_df['distance_adjusted_sum'].to_numpy()).any()

False

# 5. Concatenate datasets

In [13]:
# Concatenate the subset that returned to starting point and the subset that didn't return to starting point for training data

%%time

non_adjust_train_df = train_df[train_df.isback != 1].copy()
non_adjust_train_df['altitude_adjusted'] = non_adjust_train_df.altitude
non_adjust_train_df['distance_adjusted'] = non_adjust_train_df.derived_distance

# Randomly chop off 0-9 points at tail
non_adjust_train_df['complete_idx'] = non_adjust_train_df.apply(lambda x: random.randrange(490, 499), axis=1)
non_adjust_train_df['speed_adjusted'] = non_adjust_train_df.apply(lambda x: str(eval(x.derived_speed)[:x.complete_idx]), axis=1)
non_adjust_train_df['heart_rate_adjusted'] = non_adjust_train_df.apply(lambda x: str(eval(x.heart_rate)[:x.complete_idx]), axis=1)
non_adjust_train_df['distance_adjusted_sum'] = non_adjust_train_df.distance

train_df = pd.concat([non_adjust_train_df, adjust_train_df], ignore_index=True)

CPU times: user 1min 11s, sys: 435 ms, total: 1min 12s
Wall time: 1min 12s


In [14]:
# Concatenate the subset that returned to starting point and the subset that didn't return to starting point for test data

%%time

non_adjust_test_df = test_df[test_df.isback != 1].copy()
non_adjust_test_df['altitude_adjusted'] = non_adjust_test_df.altitude
non_adjust_test_df['distance_adjusted'] = non_adjust_test_df.derived_distance

# Randomly chop off 0-9 points at tail
non_adjust_test_df['complete_idx'] = non_adjust_test_df.apply(lambda x: random.randrange(490, 499), axis=1)
non_adjust_test_df['speed_adjusted'] = non_adjust_test_df.apply(lambda x: str(eval(x.derived_speed)[:x.complete_idx]), axis=1)
non_adjust_test_df['heart_rate_adjusted'] = non_adjust_test_df.apply(lambda x: str(eval(x.heart_rate)[:x.complete_idx]), axis=1)
non_adjust_test_df['distance_adjusted_sum'] = non_adjust_test_df.distance

test_df = pd.concat([non_adjust_test_df, adjust_test_df], ignore_index=True)

CPU times: user 17.9 s, sys: 128 ms, total: 18.1 s
Wall time: 18.2 s


In [15]:
# Row counts after re-joining the two subsets, one per line: train first, then test
print(len(train_df), len(test_df), sep='\n')

53053
13264


In [16]:
# Persist both adjusted splits without the row index (index=False is the documented boolean form)
# https://drive.google.com/file/d/1gJTFdyq_yfRK1-PD3Y_S-UkDJIrseBVu/view?usp=sharing
train_df.to_csv('/content/drive/MyDrive/P3 Fitrec Dataset/Generated datsets fresh/TrainData_adjusted.csv', index=False)
# https://drive.google.com/file/d/1-4WQdpOogRog7Jg_sP5SrOb802UTqpGc/view?usp=sharing
test_df.to_csv('/content/drive/MyDrive/P3 Fitrec Dataset/Generated datsets fresh/TestData_adjusted.csv', index=False)

In [19]:
# Preview the first rows of the re-joined training set, including the new *_adjusted columns
train_df.head()

Unnamed: 0,id,userId,gender,sport,duration,calories,distance,avg_heart_rate,longitude,latitude,...,Cluster,route,route_id,isback,altitude_adjusted,distance_adjusted,complete_idx,speed_adjusted,heart_rate_adjusted,distance_adjusted_sum
0,235150366,2060912,male,run,1101,266.0,3.26,155.146,"[-1.1467345, -1.1467678, -1.1467934, -1.146828...","[52.8835047, 52.8834938, 52.8834532, 52.883402...",...,0,"('run', '0')",1,0,"[72.2, 72.2, 72.0, 72.0, 72.0, 72.0, 72.0, 72....","[0.0025427584834721174, 0.004835919853031221, ...",497,"[4.576965270249811, 8.123506808293383, 8.83475...","[84, 84, 84, 84, 91, 97, 104, 104, 104, 114, 1...",3.26
1,355803233,7038373,male,run,3149,697.65,11.109125,140.65,"[13.419533, 13.419299, 13.419221, 13.419105, 1...","[49.757715, 49.757574, 49.757353, 49.757227, 4...",...,5,"('run', '5')",4,0,"[251.674, 251.0, 250.0, 250.0, 250.0, 250.0, 2...","[0.023003297430670828, 0.025232499492122442, 0...",490,"[16.562374150082995, 14.480830064118322, 13.74...","[88, 88, 88, 88, 88, 88, 88, 123, 173, 188, 15...",11.109125
2,339538932,3714939,male,bike,7386,807.0,62.04549,113.87,"[175.6174659356475, 175.61758738942444, 175.61...","[-40.3581553325057, -40.35807637497783, -40.35...",...,0,"('bike', '0')",6,0,"[26.4, 24.2, 23.2, 23.2, 23.2, 23.2, 23.2, 23....","[0.01370920815625349, 0.021231172720794157, 0....",490,"[12.33828734062814, 12.738703632476494, 19.811...","[79, 81, 83, 94, 99, 100, 108, 119, 115, 109, ...",62.04549
3,643461337,3275003,male,bike,3431,102.0,21.03097,199.892,"[-2.6546424441039562, -2.6548170391470194, -2....","[52.793660620227456, 52.79367495328188, 52.793...",...,3,"('bike', '3')",7,0,"[68.2, 68.2, 68.4, 68.4, 68.8, 69.6, 69.8, 70....","[0.011850857189731395, 0.0487571934559195, 0.0...",491,"[42.66308588303302, 25.075128063044314, 15.527...","[158, 158, 159, 159, 161, 162, 163, 163, 169, ...",21.03097
4,236276882,2369854,male,run,3071,658.0,9.69188,153.306,"[6.691067, 6.6909392, 6.6908031, 6.6906923, 6....","[49.1992962, 49.1991181, 49.1989493, 49.198776...",...,0,"('run', '0')",1,0,"[64.8, 65.0, 64.8, 64.4, 65.2, 64.8, 64.8, 65....","[0.021880497505619087, 0.021222938946147168, 0...",498,"[13.128298503371452, 12.799595473696934, 12.05...","[143, 143, 144, 139, 151, 153, 150, 150, 149, ...",9.69188


In [21]:
# (rows, columns) of the re-joined training set
train_df.shape

(53053, 35)

In [23]:
# (rows, columns) of the re-joined test set
test_df.shape

(13264, 35)

In [22]:
# Column names of the re-joined training set
train_df.columns

Index(['id', 'userId', 'gender', 'sport', 'duration', 'calories', 'distance',
       'avg_heart_rate', 'longitude', 'latitude', 'altitude', 'timestamp',
       'heart_rate', 'speed', 'url', 'derived_distance', 'derived_speed',
       'time_elapsed', 'validate', 'avg_alti', 'change_alti', 'max_alti',
       'min_alti', 'diff_alti', 'avg_speed', 'Cluster', 'route', 'route_id',
       'isback', 'altitude_adjusted', 'distance_adjusted', 'complete_idx',
       'speed_adjusted', 'heart_rate_adjusted', 'distance_adjusted_sum'],
      dtype='object')

In [17]:
# Sanity check: True would mean at least one training row has a NaN total distance
np.isnan(train_df['distance_adjusted_sum'].to_numpy()).any()

False

In [18]:
# Sanity check: True would mean at least one test row has a NaN total distance
np.isnan(test_df['distance_adjusted_sum'].to_numpy()).any()

False