In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from geopy import distance

In [None]:
# Data preprocessing

In [None]:
# Converts a raw file holding one JSON object per line into a JSON array file.
# With test=True only the first `threshold` records are converted.
def convertToJson(inputFile, outputFile, test=False, threshold=1000):
    """Rewrite a one-JSON-object-per-line file as a single JSON array.

    Parameters
    ----------
    inputFile : str
        Path of the raw file; each non-blank line is treated as one record.
    outputFile : str
        Path of the JSON array file to write (overwritten if it exists).
    test : bool
        When True, stop after `threshold` records.
    threshold : int
        Maximum number of records kept when `test` is True.

    Returns
    -------
    None
        The result is written to `outputFile`; the number of records is printed.
    """
    # Read the records into memory first so the input is fully consumed before
    # the output file is opened for writing.
    records = []
    with open(inputFile, 'r') as inFile:
        for line in inFile:
            if test and len(records) >= threshold:
                break
            record = line.strip()
            if record:  # skip blank lines, they are not records
                records.append(record)
    print(len(records))

    # Separate records with commas *between* lines instead of patching every
    # '}' in the text: nested objects stay intact, the last record gets no
    # trailing comma, and an empty input yields a valid empty array.
    with open(outputFile, 'w') as outFile:
        outFile.write('[\n')
        if records:
            outFile.write(',\n'.join(records))
            outFile.write('\n')
        outFile.write(']\n')


In [None]:
# Loads the JSON file at path jsonData into a pandas DataFrame.
def createDataFrame(jsonData):
    """Open `jsonData` for reading and return its contents parsed by `pd.read_json`."""
    with open(jsonData, 'r') as handle:
        return pd.read_json(handle)


In [None]:
# Driver data demo

In [None]:
# Convert both raw dumps to JSON array files (test mode, threshold=10000).
# convertToJson writes its result to disk and returns nothing, so its return
# value is deliberately not bound to a name.
fileName1 = 'rio_bq_2019000000000000'
outputFileName1 = fileName1 + '.json'
convertToJson(fileName1, outputFileName1, test=True, threshold=10000)

fileName2 = 'rio_bq_2019000000000001'
outputFileName2 = fileName2 + '.json'
convertToJson(fileName2, outputFileName2, test=True, threshold=10000)

# Load each converted file into its own DataFrame.
df1 = createDataFrame(outputFileName1)
df2 = createDataFrame(outputFileName2)

# Stack the two frames into one with a fresh 0..n-1 index.
frames = [df1, df2]
df = pd.concat(frames, ignore_index=True)

In [None]:
# Time data manipulation

In [None]:

def convertDateTime(df):
    """Parse the 'localtime' column with `pd.to_datetime`.

    The column is replaced on the frame that was passed in, and that same
    frame object is returned.
    """
    parsedTimes = pd.to_datetime(df['localtime'])
    df['localtime'] = parsedTimes
    return df

df = convertDateTime(df)

# Bar chart of the 10 most frequent client MAC addresses.
# NOTE: the variable is called `top100` for historical reasons but holds only
# the top 10 entries; the name is kept so other cells that use it still work.
fig, ax = plt.subplots()
top100 = df['ClientMacAddr'].value_counts().head(10)
top100.plot.bar(ax=ax)
ax.set_title('10 most frequent client MAC addresses')
ax.set_xlabel('ClientMacAddr')
ax.set_ylabel('Number of records');

In [None]:
# Generate time series data for desired mac addresses

In [None]:
from datetime import timedelta

def createInitialAddresses(df, startTime):
    """Return up to 20 of the most frequent MAC addresses in the first hour.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain a 'localtime' column comparable with `startTime` and a
        'ClientMacAddr' column.
    startTime : datetime
        Start of the one-hour window [startTime, startTime + 1h).

    Returns
    -------
    list
        Addresses ordered by descending record count within the window.
    """
    endTime = startTime + timedelta(hours=1)
    inWindow = (df['localtime'] >= startTime) & (df['localtime'] < endTime)
    # Count only the records inside the window; counting the whole frame would
    # make `startTime` irrelevant.
    addrCounts = df.loc[inWindow, 'ClientMacAddr'].value_counts()
    return addrCounts.index[:20].tolist()

def generateMacAddrLocationTimeSeries(df, addrList, starTime, endTime,
                                      stepMinutes=20, windowMinutes=10, spanMinutes=160):
    """Sample one record per address in regularly spaced time windows.

    Windows start at `starTime + k * stepMinutes` for every offset below
    `spanMinutes`, and each window is `windowMinutes` long. For every address
    the first record found in a window is stored, or None if there is none.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain 'localtime' and 'ClientMacAddr' columns.
    addrList : iterable
        MAC addresses to track.
    starTime : datetime
        Start of the observation period. The misspelled name is kept so
        existing keyword callers keep working.
    endTime : datetime
        Exclusive end of the observation period; later records are ignored.
    stepMinutes, windowMinutes, spanMinutes : int
        Window spacing, window length and total span; the defaults reproduce
        the previously hard-coded 20 / 10 / 160.

    Returns
    -------
    dict
        Maps each address to a list with one entry per window: a pandas
        Series (the matching row) or None.
    """
    # Bind the argument locally: the body used to read a *global* `startTime`,
    # silently ignoring the value that was passed in.
    startTime = starTime
    timeFrame = df[(df['localtime'] >= startTime) & (df['localtime'] < endTime)]
    macAddrLocation = {addr: [] for addr in addrList}

    for offset in range(0, spanMinutes, stepMinutes):
        frameStartTime = startTime + timedelta(minutes=offset)
        frameEndTime = frameStartTime + timedelta(minutes=windowMinutes)
        # Slice the period-restricted frame so `endTime` is actually honored.
        inFrame = (timeFrame['localtime'] >= frameStartTime) & (timeFrame['localtime'] < frameEndTime)
        frame = timeFrame[inFrame]
        for addr, samples in macAddrLocation.items():
            records = frame.loc[frame['ClientMacAddr'] == addr]
            samples.append(None if records.empty else records.iloc[0])
    return macAddrLocation
        
import datetime
# knnGroupSoloClassifier(df, addrList)

# Observation period: 2019-08-01 03:00 to 06:00, both in UTC.
startTime = datetime.datetime(2019, 8, 1, 3, tzinfo=datetime.timezone.utc)
endTime = datetime.datetime(2019, 8, 1, 6, tzinfo=datetime.timezone.utc)

# Pick the addresses to follow, then build their per-window location samples.
addrList = createInitialAddresses(df, startTime)
macAddrLocation = generateMacAddrLocationTimeSeries(df, addrList, startTime, endTime)
macAddrLocation

In [None]:
# Compute distances from neighbor to neighbor

In [None]:
def computeNeighborDistance(df, addrList, starTime, endTime,
                            origin=(-22.81152051717558, -43.24944357396987)):
    """Compute the distance from a fixed origin to every record in sampled windows.

    Records are sampled in 10-minute windows starting every 20 minutes, for
    offsets 0..140 minutes from `starTime`. For each sampled record the
    distance between `origin` and the record's ('lat', 'lng') is stored as a
    string under the record's 'ClientMacAddr'; later records for the same
    address overwrite earlier ones.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain 'localtime', 'ClientMacAddr', 'lat' and 'lng' columns.
    addrList : iterable
        Addresses used as keys of the result.
    starTime : datetime
        Start of the observation period. The misspelled name is kept so
        existing keyword callers keep working.
    endTime : datetime
        Exclusive end of the observation period; later records are ignored.
    origin : tuple
        (lat, lng) reference point; defaults to the previously hard-coded one.

    Returns
    -------
    dict
        Maps each address in `addrList` to a dict {ClientMacAddr: str(distance)}.
    """
    # NOTE(review): the reference point does not depend on the address, so
    # every address in addrList receives the same distances. Presumably the
    # origin was meant to be that address's own position; confirm.
    # Bind the argument locally: the body used to read a *global* `startTime`.
    startTime = starTime
    distances = {addr: dict() for addr in addrList}
    if not distances:
        return distances

    timeFrame = df[(df['localtime'] >= startTime) & (df['localtime'] < endTime)]

    for offset in range(0, 160, 20):
        frameStartTime = startTime + timedelta(minutes=offset)
        frameEndTime = frameStartTime + timedelta(minutes=10)
        inFrame = (timeFrame['localtime'] >= frameStartTime) & (timeFrame['localtime'] < frameEndTime)
        frame = timeFrame[inFrame]
        # The result is independent of the address, so compute it once per
        # window. `geodesic` replaces `vincenty`, which geopy 2.0 removed.
        frameDistances = {
            mac: str(distance.geodesic(origin, (lat, lng)))
            for mac, lat, lng in zip(frame['ClientMacAddr'], frame['lat'], frame['lng'])
        }
        for perAddr in distances.values():
            perAddr.update(frameDistances)

    return distances
        
    
# Observation period: three hours starting 2019-08-01 03:00 UTC.
startTime = datetime.datetime(2019, 8, 1, 3, tzinfo=datetime.timezone.utc)
endTime = startTime + datetime.timedelta(hours=3)

# Compute the distances for a single address of interest.
addrList = ['3c:a8:2a:78:83:94']
distances = computeNeighborDistance(df, addrList, startTime, endTime)

distances

In [None]:
# Number of distinct ClientMacAddr keys recorded for this address.
len(distances['3c:a8:2a:78:83:94'])