In [1]:
import os
import csv
import glob
import json
import pickle
import collections
import numpy as np
import pandas as pd
from tqdm import tqdm
from pathlib import Path
from typing import List, Tuple, Any

import multiprocessing
from multiprocessing import Pool

import dask
from dask.distributed import wait
from dask.distributed import Client, wait, LocalCluster

In [2]:
# Start a Dask client with one worker per CPU core reported by
# multiprocessing.cpu_count(), each worker limited to a single thread.
# NOTE(review): no cell visible in this notebook submits work to this client —
# confirm it is still needed.
client = Client(n_workers=multiprocessing.cpu_count(), threads_per_worker=1)
# Bare expression so the notebook renders the client summary as the cell output.
client

0,1
Client  Scheduler: tcp://127.0.0.1:46721  Dashboard: http://127.0.0.1:8787/status,Cluster  Workers: 4  Cores: 4  Memory: 7.23 GiB


In [3]:
# Floor label -> integer floor index. Basements are negative, and both the
# "F<n>" and "<n>F" spellings of floor n map to n - 1 (so "F1" / "1F" is 0).
floor_map = {"B2": -2, "B1": -1}
floor_map.update({f"F{level}": level - 1 for level in range(1, 10)})
floor_map.update({f"{level}F": level - 1 for level in range(1, 10)})


# Scalar settings. NOTE(review): none of these four is referenced by the code
# visible in this notebook.
minCount = 1
rssiFillerValue = -999.0
dtFillerValue = 1000.0
freqFillerValue = 0

# File locations, relative to the working directory.
outputDir = '.'
sampleCsvPath = 'sample_submission.csv'
buildingBssidPklFilePath = "buildingBssids.pkl"

In [4]:
def input_dir() -> Path:
    """Return the directory the data folders are resolved against (the working directory)."""
    data_root = Path('.')
    return data_root

def generate_target_buildings() -> List[str]:
    """Return the sorted, de-duplicated site ids named in the sample submission.

    Reads the CSV at ``sampleCsvPath`` and keeps the text before the first
    ``_`` of every ``site_path_timestamp`` value.

    Returns:
        The distinct site ids in ascending order.
    """
    ssubm = pd.read_csv(sampleCsvPath)
    # Vectorised split: only the first field is needed, so there is no reason
    # to build a pd.Series for every row through apply().
    site_ids = ssubm["site_path_timestamp"].str.split("_").str[0]
    return sorted(site_ids.unique().tolist())  # type: ignore

def extract_wps_wifis(file: Path) -> Tuple[List[List[Any]], List[List[Any]]]:
    """Parse one tab-separated path file into waypoint and wifi records.

    Args:
        file: path of the trace file to read.

    Returns:
        ``(wps, wifis)``, both ordered by ascending timestamp:

        * ``wps`` holds ``[timestamp (int), x (float), y (float)]`` for every
          ``TYPE_WAYPOINT`` row.
        * ``wifis`` holds every ``TYPE_WIFI`` row as read, except that column 4
          (the signal value) is converted to ``int``. The timestamp in
          column 0 is left as a string.

    Raises:
        ValueError: if a timestamp, coordinate or signal value is not numeric.
        IndexError: if a waypoint or wifi row has fewer columns than are read.
    """
    wps = []
    wifis = []
    # newline="" is what the csv module asks for when it is handed a file object.
    with open(file, newline="") as f:
        for row in csv.reader(f, delimiter="\t", doublequote=True):
            # Blank lines are yielded as [] and one-column lines carry no
            # record type, so neither can be a waypoint or wifi record.
            if len(row) < 2:
                continue
            if row[1] == "TYPE_WAYPOINT":
                wps.append([int(row[0]), float(row[2]), float(row[3])])
            elif row[1] == "TYPE_WIFI":
                # wifi signal value
                row[4] = int(row[4])  # type: ignore
                wifis.append(row)
    wps.sort(key=lambda wp: wp[0])  # timestamp
    # Order numerically, like the waypoints: comparing the raw strings would
    # put "10000" ahead of "900".
    wifis.sort(key=lambda rec: int(rec[0]))
    return wps, wifis

In [5]:
def generateFloorBssids_train(building : str):
    """
    Collect, for one building of the train set, the distinct wifi bssids
    seen in each of its floor folders.

    building : name of the folder under train/ to scan

    returns :  a dict with keys = name of each entry found under
               train/{building}, values = list(set(bssids read from the
               *.txt files directly inside that entry))
    """
    floorRoot = input_dir() / 'train' / building
    bssidsPerFloor = {}
    for floorDir in sorted(floorRoot.glob('*')):
        # Column 3 of every wifi record is the bssid.
        seenBssids = [
            record[3]
            for pathFile in floorDir.glob("*.txt")
            for record in extract_wps_wifis(pathFile)[1]
        ]
        bssidsPerFloor[f"{floorDir.name}"] = list(set(seenBssids))
    return bssidsPerFloor

In [6]:
def generateIntersectingBssids(floorBssids):
    """
    given a dict with floor -> bssid list mapping,
    returns : list of bssids which occur in more than 1 floor

    Each such bssid is listed exactly once, in unspecified order. A bssid
    repeated inside a single floor's list still counts as one floor only.
    An empty mapping, or a mapping with a single floor, yields [].
    """
    # One counting pass over all floors instead of intersecting every ordered
    # pair of floors, which also repeated each shared bssid once per pair.
    floorsPerBssid = collections.Counter()
    for bssids in floorBssids.values():
        floorsPerBssid.update(set(bssids))
    return [bssid for bssid, floorCount in floorsPerBssid.items() if floorCount > 1]

In [7]:
def printBssidsInfo(floorBssids):
    """
    given a dict with floor -> bssid list mapping,
    prints two lines: the total number of bssids summed over all floors
    (repeats included) and the number of distinct bssids among them.
    Returns nothing.
    """
    totalCount = sum(len(bssids) for bssids in floorBssids.values())
    distinctBssids = set()
    for bssids in floorBssids.values():
        distinctBssids.update(bssids)

    print(f"Totally, There are {totalCount} bssids")
    print(f"There are {len(distinctBssids)} unique bssids")

In [8]:
def generateUniqueFloorBssids(floorBssids, commonBssids):
    """
    floorBssids : a dict with floor -> bssid list mapping
    commonBssids : list of bssids which are present in more than one floor

    returns : a dict with the same keys as floorBssids, where each value is
              the list of that floor's distinct bssids that do not appear in
              commonBssids (order unspecified)
    """
    # Built once up front; it does not change from floor to floor.
    sharedBssids = set(commonBssids)
    return {
        floor: list(set(bssids) - sharedBssids)
        for floor, bssids in floorBssids.items()
    }

In [9]:
def generateFloorUniqueBssidData(buildingsList):
    """
    For every building in buildingsList, work out which bssids are seen on
    exactly one floor, print a short summary, and finally write the whole
    building -> floor -> bssid list mapping to "floorUniqueBssidData.json"
    in the working directory.

    Returns nothing.
    """
    divider = '-----------------------------------'
    perBuilding = {}
    for building in buildingsList:
        # Reading the train files is the slow part of this loop.
        allFloorBssids = generateFloorBssids_train(building)
        sharedBssids = generateIntersectingBssids(allFloorBssids)
        floorOnlyBssids = generateUniqueFloorBssids(allFloorBssids, sharedBssids)

        print(building)
        print("building unique bssids information")
        printBssidsInfo(allFloorBssids)
        print("Floor unique bssids information")
        printBssidsInfo(floorOnlyBssids)
        print(divider)

        perBuilding[building] = floorOnlyBssids

    with open("floorUniqueBssidData.json", "w") as jsonFile:
        json.dump(perBuilding, jsonFile)

In [10]:
class wifiAPFloorMapping:
    """Accumulates, for one building, per-floor wifi signal statistics of path files.

    Every bssid is assigned to the floor whose unique-bssid list contains it,
    or to the bucket 'common' when no list does. For each path file passed to
    ``updatePathFileToOutput`` the count, mean and median signal value of the
    records assigned to each floor are appended to ``outputData``.
    """

    def __init__(self, building, floorUniqueBssidMapPath):
        """
        building : name of the building folder under train/
        floorUniqueBssidMapPath : path of the JSON file holding the
            building -> floor -> bssid list mapping
        """
        self.building = building
        # Names of the entries under train/<building>, followed by 'common'.
        self.floorList = self.getBuildingFloorList()
        # {floor: [bssid, ...]} for this building, or None if the JSON has no entry for it.
        self.uniqueBssidsMap = self.getFloorUniqueBssids(floorUniqueBssidMapPath)
        # Column-oriented results; every list gains one item per processed path file.
        self.outputData = self.generateOutputData()

    def getFloorUniqueBssids(self, floorUniqueBssidMapPath):
        """Load the JSON mapping and return this building's entry (None when absent)."""
        with open(floorUniqueBssidMapPath, "r") as infile:
            floorUniqueBssidMap = json.load(infile)
        return floorUniqueBssidMap.get(self.building, None)

    def getBuildingFloorList(self):
        """Return the sorted entry names under train/<building>, plus 'common' at the end."""
        buildingPath = input_dir() / 'train' / self.building
        floorNames = [entry.name for entry in sorted(buildingPath.glob('*'))]
        return floorNames + ['common']

    def generateOutputData(self):
        """Return the empty result container: 'pathName' plus count/mean/median lists per floor."""
        outputData = {'pathName': []}
        for floor in self.floorList:
            for statistic in ('count', 'mean', 'median'):
                outputData[f"{floor}_{statistic}"] = []
        return outputData

    def _buildBssidLookup(self):
        """Invert ``uniqueBssidsMap`` into a {bssid: floor} dict (empty when the map is None)."""
        lookup = {}
        for floor, floorAPList in (self.uniqueBssidsMap or {}).items():
            for bssid in floorAPList:
                # setdefault keeps the first floor, in map order, that lists the bssid.
                lookup.setdefault(bssid, floor)
        return lookup

    def findMappingFloor(self, wifiAp):
        """Return the floor whose unique-bssid list contains ``wifiAp``, else 'common'.

        The reverse lookup is built on the first call and cached on the
        instance, so each call is a dict lookup rather than a scan of every
        floor list. Changes made to ``uniqueBssidsMap`` after the first call
        are not picked up.
        """
        lookup = getattr(self, '_bssidToFloor', None)
        if lookup is None:
            lookup = self._bssidToFloor = self._buildBssidLookup()
        return lookup.get(wifiAp, 'common')

    def getPathFileWiFiData(self, file):
        """Return a DataFrame with columns bssid, rssi, mappedFloor for the wifi records of ``file``."""
        _, wifiRows = extract_wps_wifis(file)
        allColumns = ['timestamp', 'type', 'ssid', 'bssid', 'rssi', 'freq', 'last_ts']
        # Keep only the two columns that are used; copy() so the new column
        # below is added to an independent frame.
        wifiData = pd.DataFrame(wifiRows, columns=allColumns)[['bssid', 'rssi']].copy()
        wifiData['mappedFloor'] = wifiData['bssid'].apply(self.findMappingFloor)
        return wifiData

    def updatePathFileToOutput(self, file):
        """Append the per-floor count / mean / median signal value of ``file`` to ``outputData``.

        file : path object of the path file; the text before the first '.'
            of its name is recorded as the path name.
        """
        # add path file to output
        self.outputData['pathName'].append(file.name.split('.')[0])

        # create wifi data of pathfile
        wifiData = self.getPathFileWiFiData(file)

        # shortlist wifi bssids based on each floor list
        for floor in self.floorList:
            rssiData = wifiData[wifiData['mappedFloor'] == floor]['rssi'].values
            if len(rssiData) > 0:
                mean, median = np.mean(rssiData), np.median(rssiData)
            else:
                # No record mapped to this floor: the statistics are recorded as 0.0.
                mean, median = 0.0, 0.0

            self.outputData[f"{floor}_count"].append(len(rssiData))
            self.outputData[f"{floor}_mean"].append(mean)
            self.outputData[f"{floor}_median"].append(median)

In [11]:
# Site ids taken from the sample submission file.
buildingsList = generate_target_buildings()

```python
%%time
building_path = input_dir() / 'test'
# output placeholder
testPathFloorPredictions = { 'building' : [], 'pathName' : [], 'predFloor':[] }

for building in tqdm(buildingsList):
    buildingTestPathFiles = ssubm_df[ssubm_df[0] == building][1].unique()
    #print(f"There are {len(buildingTestPathFiles)} test path files in building")
    
    temp = wifiAPFloorMapping(building, 'floorUniqueBssidData.json')
    for testPathFile in buildingTestPathFiles:
        temp.updatePathFileToOutput(building_path / f"{testPathFile}.txt")  
    
    outputDf = pd.DataFrame(temp.outputData)
    totalColumns = list(set(temp.floorList) - set(['common']))
    columnsOfInterest = [f"{x}_count" for x in totalColumns]
    
    # taking the maximum count as floor prediction
    outputDf['predFloor'] = pd.Series([totalColumns[idx] for idx in outputDf[columnsOfInterest].values.argmax(axis=1)]).map(floor_map)
    
    # write to output variable
    testPathFloorPredictions['pathName'].extend(outputDf['pathName'].values.tolist())
    testPathFloorPredictions['predFloor'].extend(outputDf['predFloor'].values.tolist())
    testPathFloorPredictions['building'].extend([building] * outputDf.shape[0])    
```

```python
testPathFloorPredictions = pd.DataFrame(testPathFloorPredictions)
# Saved under the file name that the submission cell loads with pd.read_csv
# ('testPathFloor_Mapping.csv'); the previous name was never read back.
testPathFloorPredictions.to_csv('testPathFloor_Mapping.csv',index=False)
testPathFloorPredictions.head(3)
```

In [12]:
def pathFloorMapping(inputPath, csvMap):
    """Return the predicted floor stored for ``inputPath`` in ``csvMap``.

    Args:
        inputPath: path id to look up in the ``pathName`` column.
        csvMap: DataFrame with ``pathName`` and ``predFloor`` columns.

    Returns:
        The matching ``predFloor`` value converted to ``int``.

    Raises:
        TypeError: if ``csvMap`` holds no row, or more than one row, for
            ``inputPath`` (the exception type ``int()`` raised for that
            case when it was applied to the selection directly).
    """
    matches = csvMap.loc[csvMap['pathName'] == inputPath, 'predFloor']
    if len(matches) != 1:
        raise TypeError(
            f"expected exactly one predFloor row for path {inputPath!r}, found {len(matches)}"
        )
    # Convert the scalar, not the one-element Series: pandas deprecates int(Series).
    return int(matches.iloc[0])

In [13]:
# read submission csv file and apply our floor mappings to it
ssubm = pd.read_csv(sampleCsvPath)
# Vectorised split into columns 0 / 1 / 2 (site id / path id / timestamp).
ssubm_df = ssubm["site_path_timestamp"].str.split("_", expand=True)
ownCsv = pd.read_csv('testPathFloor_Mapping.csv')
# Resolve each distinct path once, then broadcast to its rows; pathFloorMapping
# still raises if a path has no single prediction.
floorByPath = {pathName: pathFloorMapping(pathName, csvMap=ownCsv) for pathName in ssubm_df[1].unique()}
ssubm_df['floor'] = ssubm_df[1].map(floorByPath)
ssubm_df['site_path_timestamp'] = ssubm_df[0].astype(str) + '_' + ssubm_df[1].astype(str) + '_' + ssubm_df[2].astype(str)
ssubm_df.to_csv('testFloorPredictionsSubmssion.csv',index=False)

In [14]:
# Show the first three rows of ssubm_df as the cell output.
ssubm_df.head(3)

Unnamed: 0,0,1,2,floor,site_path_timestamp
0,5a0546857ecc773753327266,046cfa46be49fc10834815c6,9,0,5a0546857ecc773753327266_046cfa46be49fc1083481...
1,5a0546857ecc773753327266,046cfa46be49fc10834815c6,9017,0,5a0546857ecc773753327266_046cfa46be49fc1083481...
2,5a0546857ecc773753327266,046cfa46be49fc10834815c6,15326,0,5a0546857ecc773753327266_046cfa46be49fc1083481...


In [15]:
inputCsv = pd.read_csv('referencePublicNotebooks/99%_floorPredictionss_submission.csv')
# Compare row by row, by position. The length check comes first because
# comparing two Series of different length raises instead of returning False.
matchingRows = (
    len(inputCsv) == len(ssubm_df)
    and bool((inputCsv['site_path_timestamp'].to_numpy() == ssubm_df['site_path_timestamp'].to_numpy()).all())
)

if matchingRows:
    # Only the floor column is replaced; every other column of the reference file is kept.
    inputCsv['floor'] = ssubm_df['floor'].to_numpy()
    inputCsv.to_csv('outputSubmission.csv',index=False)
else:
    # Say so explicitly instead of silently producing no output file.
    print("site_path_timestamp rows differ from the reference submission; outputSubmission.csv was not written")