## Initial cleanup
#### Incoming dataframe may look different for other file types

In [1]:
# Library imports

import pandas as pd
import numpy as np
import re

In [2]:
# load the raw source records from the parquet file

df = pd.read_parquet(path='./data/source-data.parquet')

In [8]:
# preview the first five rows of the dataframe

df.head(n=5)

Unnamed: 0,Client MAC Address,MAC Address,time,VLAN_ID,tonnage,activeTime,date,month,wday,geometry,type
0,00:00:00:57:b4:49,a8:9f:ec:fc:30:86,2019-11-22T17:00:00Z,822,0.245171,22.896,2019-11-22,Nov,Fri,"c(-75.160178, 39.942893)",outdoor
2,00:04:4b:25:23:3b,a8:9f:ec:da:ad:52,2019-11-11T02:00:00Z,1226,0.485443,125.977,2019-11-11,Nov,Mon,"c(-75.159325, 39.939919)",outdoor
1,00:04:4b:25:23:3b,00:0d:67:7b:aa:08,2019-11-23T05:00:00Z,843,0.002167,3.0,2019-11-23,Nov,Sat,"c(-75.157825, 39.944691)",outdoor
3,00:08:22:00:99:48,a8:9f:ec:da:b3:fa,2019-11-19T16:00:00Z,612,0.003054,3.724,2019-11-19,Nov,Tue,"c(-75.16045, 39.941096)",outdoor
5,00:08:22:00:99:48,a8:9f:ec:fc:30:b6,2019-11-19T16:00:00Z,1224,0.003543,8.515,2019-11-19,Nov,Tue,"c(-75.156092, 39.941646)",outdoor


In [7]:
# order the rows by client MAC address and then by time, so that consecutive
# observations of the same client sit next to each other

df = df.sort_values(by=["Client MAC Address", "time"], ascending=True)

In [9]:
# The geometry column holds the coordinates as text of the form "c(<lon>, <lat>)".
# Pull the two values out with regular expressions instead of taking on a
# geopandas dependency for such a simple parsing job.

# patterns: the first captures the value before the comma (longitude),
# the second captures the value after the comma (latitude)
lon_match = re.compile(r'c\((.*),\s+?.*\)')
lat_match = re.compile(r'c\(.*,\s+?(.*)\)')

# extract each coordinate (still as text at this point) into its own column;
# expand=False returns a Series for the single capture group
df['lat'] = df['geometry'].str.extract(lat_match, expand=False)
df['lon'] = df['geometry'].str.extract(lon_match, expand=False)

In [10]:
# convert the extracted lat/long text to numeric values
# (errors='raise' is the default: a value that cannot be parsed raises an error)

df['lon'] = pd.to_numeric(df['lon'], errors='raise')
df['lat'] = pd.to_numeric(df['lat'], errors='raise')

## Adding the geometry calculations

#### Step 1: Use the Haversine calculator for distance between points

This uses the ‘haversine’ formula to calculate the great-circle distance between two points – 
that is, the shortest distance over the earth’s surface – 
giving an ‘as-the-crow-flies’ distance between the points

#### Step 2: Use basic math to infer directional changes

Use the longitude difference (longitude2 - longitude1) to infer East/West movement.
Apply the same logic to the latitude difference (latitude2 - latitude1) to infer North/South movement.

#### Step 3: Clean-up excess columns

Remove columns that are not needed

#### Notes:

These calculations could potentially be made simpler (or movingpandas could be used to infer the movement directly).
Note that the distance calculation can take upwards of 1200 s (about 20 minutes) to execute.


In [11]:
def haversine_np(lon1, lat1, lon2, lat2):
    """Calculate the haversine (great-circle) distance between two points, in feet.

    Parameters
    ----------
    lon1, lat1 : longitude and latitude of the first point(s), in degrees.
    lon2, lat2 : longitude and latitude of the second point(s), in degrees.
        Scalars, numpy arrays and pandas Series are all accepted; the
        computation is element-wise.

    Returns
    -------
    The distance in feet, with the same shape as the inputs. NaN inputs
    produce NaN outputs.
    """
    lon1, lat1, lon2, lat2 = map(np.radians, [lon1, lat1, lon2, lat2])

    dlon = lon2 - lon1
    dlat = lat2 - lat1

    a = np.sin(dlat/2.0)**2 + np.cos(lat1)*np.cos(lat2)*np.sin(dlon/2.0)**2

    # floating-point rounding can push sqrt(a) marginally above 1 for nearly
    # antipodal points, which would make arcsin return NaN; cap it at 1
    c = 2 * np.arcsin(np.minimum(1.0, np.sqrt(a)))
    # 6367 is the Earth radius in km used by this notebook; * 1000 converts
    # km to metres and * 3.28084 converts metres to feet
    ft = 6367 * c * 1000 * 3.28084
    return ft

def distance(x):
    """Distance in feet between each row of `x` and the row before it.

    Parameters
    ----------
    x : DataFrame with numeric 'lat' and 'lon' columns (degrees), ordered so
        that consecutive rows are consecutive observations.

    Returns
    -------
    Series aligned with `x.index`. The first row has no predecessor, so its
    distance is 0; rows with a missing coordinate also get 0.
    """
    coords = x[['lon', 'lat']]
    previous = coords.shift()
    # haversine_np takes (lon, lat) for each point - keep that order here
    return haversine_np(coords['lon'], coords['lat'],
                        previous['lon'], previous['lat']).fillna(0)

In [12]:
# Apply the distance formula to the dataframe.
# Fetch each client's previous position with a grouped shift and run the
# haversine formula once over whole columns; this avoids calling a Python
# function once per client, which is very slow when there are many clients.
# The first observation of each client has no previous position, so its
# distance is filled with 0.

previous_position = df.groupby('Client MAC Address')[['lon', 'lat']].shift()
df['distance'] = haversine_np(
    df['lon'], df['lat'],
    previous_position['lon'], previous_position['lat'],
).fillna(0)

In [13]:
# Calculating directional changes between consecutive observations of a client.
# Longitude is the east-west coordinate and latitude is the north-south
# coordinate, so EastDiff comes from 'lon' and NorthDiff from 'lat'.
df['EastDiff'] = df.groupby('Client MAC Address')['lon'].diff()
df['NorthDiff'] = df.groupby('Client MAC Address')['lat'].diff()

# the first observation of each client has no previous row, so diff() gives
# NaN there; fill it with 0 so the row is treated as stationary
df['EastDiff'] = df['EastDiff'].fillna(0)
df['NorthDiff'] = df['NorthDiff'].fillna(0)



In [14]:
# Outline movement direction for the client (compassDirection)
# and movement as perceived by the AP (compassDirection_AP).
# Please note that the movement perceived by the AP is the exact
# opposite of the directional movement as perceived by the client.
# The labels are built with vectorised column operations instead of
# per-element / per-row apply calls.

# applying directions from the client's perspective:
# positive diff -> East / North, zero -> stationary, anything else -> West / South
df['EastWest'] = np.select(
    [df['EastDiff'] > 0, df['EastDiff'] == 0],
    ["East", "stationary"],
    default="West",
)
df['NorthSouth'] = np.select(
    [df['NorthDiff'] > 0, df['NorthDiff'] == 0],
    ["North", "stationary"],
    default="South",
)
df['compassDirection'] = df['NorthSouth'] + '-' + df['EastWest']

# applying directions from AP's perspective: flip each label, keep "stationary"
df['EastWest_toAP'] = np.select(
    [df['EastWest'] == "West", df['EastWest'] == "stationary"],
    ["East", "stationary"],
    default="West",
)
df['NorthSouth_toAP'] = np.select(
    [df['NorthSouth'] == "North", df['NorthSouth'] == "stationary"],
    ["South", "stationary"],
    default="North",
)
df['compassDirection_AP'] = df['NorthSouth_toAP'] + '-' + df['EastWest_toAP']

In [15]:
# discard the raw geometry text and the intermediate helper columns;
# none of them are needed in the final table

df = df.drop(
    labels=[
        "geometry",
        "EastDiff", "NorthDiff",
        "EastWest", "NorthSouth",
        "EastWest_toAP", "NorthSouth_toAP",
    ],
    axis="columns",
)

In [16]:
# Saving the file: write the enriched dataframe to a parquet file
# using snappy compression

df.to_parquet('./data/df_newEnrichedrows1.parquet', compression = 'snappy')