In [1]:
from google.colab import drive
drive.mount('/content/drive')


Mounted at /content/drive


In [2]:
!pip install shap

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting shap
  Downloading shap-0.41.0-cp39-cp39-manylinux_2_12_x86_64.manylinux2010_x86_64.whl (572 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m572.4/572.4 KB[0m [31m29.4 MB/s[0m eta [36m0:00:00[0m
Collecting slicer==0.0.7
  Downloading slicer-0.0.7-py3-none-any.whl (14 kB)
Installing collected packages: slicer, shap
Successfully installed shap-0.41.0 slicer-0.0.7


In [3]:
import pandas as pd
import seaborn as sns
import numpy as np
import shap
from sklearn.model_selection import train_test_split, cross_val_score, StratifiedKFold, KFold, RandomizedSearchCV
from sklearn.metrics import accuracy_score, make_scorer, classification_report, confusion_matrix

import matplotlib.pyplot as plt
from matplotlib.colors import ListedColormap
from sklearn.preprocessing import StandardScaler
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from sklearn.naive_bayes import GaussianNB
import xgboost as xgb
from xgboost import XGBClassifier

# Umwandlung JSON-Request zu CSV
Dieser Code muss nur ausgeführt werden, wenn die Erhebung von Referenzdaten statt mit dem Beacon Backend Projekt, mit hook.ubeac.io erfasst wurden. In diesem Codesegment werden die JSON-Files verarbeitet und die Informationen so aufbereitet, dass diese zu denselben CSV-Files führen wie bei dem Beacon Backend Projekt.

In [79]:
import glob, os

path = "/content/drive/MyDrive/Colab Notebooks/IndoorTrackingML/DATA/"
whitelistMajorMinor = [100029571,100029586,100656666,1000411111,1000422222,1000454480,1230329194]

os.chdir(path+"JSON/")
dictionary_list = []

for jsonFile in glob.glob("*.json"):
  filename = os.path.splitext(jsonFile)[0]

  df = pd.read_json(path+"/JSON/"+filename+".json",)

  #Iterates through all gateway traces
  for traceIndex in range(len(df)):
    currentTrace = df.iloc[traceIndex]
    
    #Splits data string as it is not castable as json string
    #Two step is required to filter out , signs
    traceEntries = currentTrace.content.split('"')[3::2]
    for beaconIndex in range(len(traceEntries)):
      beaconData = traceEntries[beaconIndex].split(',')

      if len(beaconData[4]) >= 56: 
        #Extracts beacon data out of hex value
        beaconType = beaconData[4][6:18]
        uuid = beaconData[4][18:50]
        major = int(beaconData[4][50:54],16)
        minor = int(beaconData[4][54:58],16)
        majorminor = int(str(major)+str(minor))
        timestamp = beaconData[5]

        #Checks if current iteration should be skipped as signal is not of beacons
        if majorminor not in whitelistMajorMinor:
          continue
        #General signal power strength set for beacons
        measuredPower = -60

        #Specific power was set for beacon with below majorminor value
        if majorminor == 100656666:
          measuredPower = -75

        #Id of the tag of the Beacon
        tagId = beaconData[1]
        gateway = beaconData[2]
        rssi = int(beaconData[3])
        
        #Calculates distance with given formula
        distance = (10 ** ((measuredPower - rssi) / (10 * 3)))

        #Creates beacon entry as dictionary entry for efficent dataframe transformation
        beacon = {'_id':tagId, 'distance':distance, 'gateway':gateway, 
                'major':major, 'majorminor':majorminor, 'minor':minor, 
                'rssi':rssi, 'timestamp':timestamp,'type':beaconType,
                'uuid':uuid}
        #Adds beacon to dictionary
        dictionary_list.append(beacon)
#Transforms dicitonary to dataframe (more efficent then concat)
transformedDf = pd.DataFrame.from_dict(dictionary_list)

#Cleaning of dublicates / multi entries which were recived with same timestamp
transformedDf[["distance","rssi"]] = transformedDf.groupby(["timestamp","majorminor","gateway","uuid"])[["distance","rssi"]].transform('median')
transformedDf = transformedDf.drop_duplicates().sort_values(by=["timestamp","gateway"]).reset_index(drop=True)

transformedDf.to_csv(r''+path+"/CSV/CombinedGatewayData.csv", index=False)

# Datenanalyse und -aufbereitung
In diesem Codesegment werden die Daten analysiert und aufbereitet

In [82]:
import glob, os

path = "/content/drive/MyDrive/Colab Notebooks/IndoorTrackingML/DATA/CSV/"

#List of timestamps in which the measurements were taken at each location
timestampList = [1679153940,1679154030,1679154120,1679154211,1679154301,1679154391,1679154482,
1679154571,1679154661,1679154751,1679154841,1679154931,1679155023]

#All tags in which the measure points are placed index equal to measuring point order  
tagValues = ["Room1","Room1","Room1","Room1","Room1","OOB","OOB","Room2","Room2","Room2","Room2","Room2"]
#All specific tags in which the measure points are placed index equal to measuring point order
specificTagValues = ["M1","M2","M3","M4","M5","M6","M7","M8","M9","M10","M11","M12"]

os.chdir(path)

frame = list()

for csvFile in glob.glob("*.csv"):
  df = pd.read_csv(path+csvFile)

  #This iterates through all data which are within the valid timestamp
  #-1 is required as the index is calculated +1 to get the lower and upper boundary
  #of each timemeasurement
  for index in range(len(timestampList)-1):
    selection = df.loc[(df.timestamp >= timestampList[index]) & 
                       (df.timestamp < timestampList[index+1])].copy()
    selection[["tag","specificTag"]] = [tagValues[index],specificTagValues[index]]
    frame.append(selection)
  
masterDf = pd.concat(frame).reset_index(drop=True)
masterDf.head(10)

Unnamed: 0,_id,distance,gateway,major,majorminor,minor,rssi,timestamp,type,uuid,tag,specificTag
0,C630A073DD08,0.502664,E06B09BAC79F,10002,100029586,9586,-51.0,1679153940,1AFF4C000215,0112233445566778899AABBCCDDEEFF0,Room1,M1
1,DC0D300F720A,1.84785,E06B09BAC79F,12303,1230329194,29194,-68.0,1679153940,1AFF4C000215,D546DF97475747EFBE093E2DCBDD0C77,Room1,M1
2,04EE03C47286,0.681292,E06B09BAC79F,10004,1000454480,54480,-55.0,1679153940,1AFF4C000215,FDA50693A4E24FB1AFCFC6EB07647825,Room1,M1
3,DC0D300F7252,0.429866,E06B09BAC79F,10065,100656666,6666,-64.0,1679153940,1AFF4C000215,FDA50693A4E24FB1AFCFC6EB07647825,Room1,M1
4,C4FE51F6A878,0.769494,E06B09BAC79F,10002,100029571,9571,-56.5,1679153940,1AFF4C000215,0112233445566778899AABBCCDDEEFF0,Room1,M1
5,087CBF000028,1.165914,E06B09BAC79F,10004,1000411111,11111,-62.0,1679153940,1AFF4C000215,FDA50693A4E24FB1AFCFC6EB07647825,Room1,M1
6,F0F8F2044F23,1.467799,E06B09BAC79F,10004,1000422222,22222,-65.0,1679153940,1AFF4C000215,FDA50693A4E24FB1AFCFC6EB07647825,Room1,M1
7,04EE03C47286,0.867821,E06B09BAC79F,10004,1000454480,54480,-58.0,1679153941,1AFF4C000215,FDA50693A4E24FB1AFCFC6EB07647825,Room1,M1
8,C630A073DD08,0.572725,E06B09BAC79F,10002,100029586,9586,-52.5,1679153941,1AFF4C000215,0112233445566778899AABBCCDDEEFF0,Room1,M1
9,DC0D300F720A,2.154435,E06B09BAC79F,12303,1230329194,29194,-70.0,1679153941,1AFF4C000215,D546DF97475747EFBE093E2DCBDD0C77,Room1,M1


In [84]:
###TODO: Overwork this logic 
#The feature extention with adding the gateway name as suffix to columns should be 
#added to this solution as well as the previous already used


#Columns in CSV Files
#_id,distance,gateway,major,majorminor,minor,rssi,timestamp,beaconType,uuid
features = ["distance","major","majorminor","minor","rssi","gateway"]
joinParameter = ["_id","type","uuid","tag","specificTag","timestamp"]

#Dinamical adapting on number of gateways used for experiment
gateways = masterDf.gateway.unique()

#Creating new Dataframes
adaptedMasterDf = pd.DataFrame()
tempDf = pd.DataFrame()

#Preparing Df for easy use
timeGatewayDf = masterDf.loc[:,["timestamp", "gateway"]].drop_duplicates()

#Repeat until every Timestamp and Gateway have combined / adapted to oneanother
while len(timeGatewayDf) > 0:
  currentLowestTimestamp = timeGatewayDf.timestamp.min()
  
  for gateway in gateways:
    timestamp = timeGatewayDf.loc[timeGatewayDf.gateway == gateway].timestamp.min()
    #Gets all beacon entries of current gateway with specified timestamp
    beaconMeasurementsOfGatewayDf = masterDf.loc[(masterDf.gateway == gateway) & (masterDf.timestamp == timestamp)].copy()
    #Sets the lowest timestamp to simplify join of dataframes
    beaconMeasurementsOfGatewayDf.timestamp = currentLowestTimestamp
    #Columns which could be used as feature are expanded with Name of Gateway as suffix
    beaconMeasurementsOfGatewayDf.columns = [col + '_' + gateway if col in features else col for col in beaconMeasurementsOfGatewayDf.columns]
    
    #If timestamp is not in range (ex. post not transmitted) current for iteration is aborted
    #as it has to be used for next iteration and combined with fitting post of other gateway
    if(timestamp > (currentLowestTimestamp + requestIntervalInSeconds)):
      beaconMeasurementsOfGatewayDf.drop(beaconMeasurementsOfGatewayDf.index, inplace=True)
    else:
      timeGatewayDf.drop(timeGatewayDf[(timeGatewayDf.gateway == gateway) & (timeGatewayDf.timestamp == timestamp)].index, inplace=True)
    
    #Checks if temp Dataframe already has entries and has to be merged or a copy is "dumped" in
    if len(tempDf) > 0:
      tempDf = pd.merge(tempDf,beaconMeasurementsOfGatewayDf, how='outer', on=joinParameter).copy()
    else:
      tempDf = beaconMeasurementsOfGatewayDf.copy()
  adaptedMasterDf = pd.concat([adaptedMasterDf, tempDf])
  tempDf.drop(tempDf.index , inplace=True)
adaptedMasterDf.reset_index(drop=True, inplace=True)

NameError: ignored

In [None]:
sns.lmplot(x='rssi_E06B09BAC79F', y='rssi_F3499FDED02E', data=adaptedMasterDf, hue='tag', fit_reg=False)
plt.show()

for tag in adaptedMasterDf.tag.unique():
  df = adaptedMasterDf.loc[adaptedMasterDf.tag == tag]
  sns.lmplot(x='rssi_E06B09BAC79F', y='rssi_F3499FDED02E', data=df, hue='specificTag', fit_reg=False)
  plt.show()