## Libraries import

In [1]:
import pandas as pd
import numpy as np
import os
import glob

# functions and libraires from competition github
from dataclasses import dataclass
from io_f import read_data_file
from visualize_f import visualize_heatmap, visualize_trajectory

In [2]:
TRAIN_FOLDER = 'train'
TEST_FOLDER = 'test'

## helper functions

In [3]:
def findFilesSpecificFormat(searchDirectory, fileExtension):
    fileExtension = f"*.{fileExtension}"
    matchingFileFormatList = [file
                             for path, subdir, files in os.walk(searchDirectory)
                             for file in glob.glob(os.path.join(path, fileExtension))]
    return matchingFileFormatList

In [4]:
def getSiteTestPathFiles(siteID):
    tempdf = testSitePathTimestampData[testSitePathTimestampData.loc[:,0]==siteID]
    pathFilesNames = np.unique(tempdf.loc[:,1].values).tolist()
    sitePathFilesList = [f"{TEST_FOLDER}/{x}.txt" for x in pathFilesNames]
    return sitePathFilesList

In [5]:
def getUniqueWiFiAPList(data):
    wifiData = data.wifi
    return np.unique(wifiData[:,2]).tolist()

In [6]:
def getWifiAPList(pathFileList):
    pathWiFiAPList = [] 
    
    for pathFile in pathFileList:    
        # extract pathFile data
        pathFileData = read_data_file(pathFile)
    
        try:
            # compute unique WiFi access points list
            pathUniqueWiFiAPList = getUniqueWiFiAPList(pathFileData)
        except:
            print(f"{pathFile} has issue, output shape = {pathFileData.wifi.shape}")

        #print(f"There are {len(pathUniqueWiFiAPList)} unique WiFi APs in  {pathFile} pathfile")
        pathWiFiAPList.append(pathUniqueWiFiAPList)
    return pathWiFiAPList

In [7]:
def getSiteTestDataWiFiInfo(testSitePathList):
    siteTestDataWiFiInfo = {}
    siteTestDataTotalWiFiAPList  = []
    temp = getWifiAPList(testSitePathList)
    for idx, pathFile in enumerate(testSitePathList):    
        # get path name from file name
        pathName = pathFile.split("/")[1].replace(".txt", "")
        siteTestDataWiFiInfo[pathName] = temp[idx]
        siteTestDataTotalWiFiAPList.extend(temp[idx])
    del(temp)

    print(f"There are {len(siteTestDataTotalWiFiAPList)} wifi APs in samplesite")
    print(f"There are {len(list(set(siteTestDataTotalWiFiAPList)))} unique wifi APs in samplesite")  
    return siteTestDataWiFiInfo, siteTestDataTotalWiFiAPList

In [8]:
def getSiteTrainDataWiFiInfo(site, siteFloorList):
    siteTrainDataWiFiInfo = {}
    siteTrainDataTotalWiFiAPList = []
    
    # iterate through floors in site
    for floor in siteFloorList:
        tempList = os.listdir(f"{TRAIN_FOLDER}/{site}/{floor}")
        floorPathFileList = [f"{TRAIN_FOLDER}/{site}/{floor}/{x}" for x in tempList]
        print(f"There are {len(floorPathFileList)} Path files at {site}/{floor} Floor Train data")

        temp = getWifiAPList(floorPathFileList)
        siteTrainDataWiFiInfo[floor] = []
        for pathIBeaconList in temp:
            siteTrainDataWiFiInfo[floor].extend(pathIBeaconList)
            siteTrainDataTotalWiFiAPList.extend(pathIBeaconList)
        #print(f"Floor {floor} done")
    return siteTrainDataWiFiInfo, siteTrainDataTotalWiFiAPList

In [9]:
def getSiteWiFiDF(siteTestDataWiFiInfo, siteTrainDataWiFiInfo):
    site_df = pd.DataFrame()
    site_df["pathFile"] = siteTestDataWiFiInfo.keys()

    for floor in siteTrainDataWiFiInfo.keys():
        floorWiFiAPSet = set(siteTrainDataWiFiInfo[floor])
    
        WiFiMatchCount = [] 
        for _,key in enumerate(siteTestDataWiFiInfo.keys()):
            testPathWiFiAPSet = set(siteTestDataWiFiInfo[key])
            matchlist = list(floorWiFiAPSet.intersection(testPathWiFiAPSet))
            # print(f"{floor} - {key} matchcount = {len(matchlist)}")
            WiFiMatchCount.append(len(matchlist))
        
        site_df[floor] = WiFiMatchCount
    return site_df

## Parse sampleSubmissionCSV

In [10]:
sampleSubmissionCSV = pd.read_csv('sample_submission.csv')
testSitePathTimestampData = sampleSubmissionCSV['site_path_timestamp'].str.split('_', expand=True)

```python
print(sampleSubmissionCSV.head(3))
print(testSitePathTimestampData.head(3))
print("testSitePathTimestampData.columns = ", testSitePathTimestampData.columns)
```

In [11]:
# testSites listed as separate folders under train/
trainSitesList = os.listdir('train/')
uniqueTestSitesList = testSitePathTimestampData.loc[:,0].unique().tolist()
print(f"Number of train sites {len(trainSitesList)}")
print(f"Number of test sites {len(uniqueTestSitesList)}")

Number of train sites 204
Number of test sites 24


## Compute WiFi unique access points for one Test site

In [12]:
sampleTestSite = uniqueTestSitesList[0]
sampleTestSitePathList = getSiteTestPathFiles(sampleTestSite) 
print(f"There are {len(sampleTestSitePathList)} PathFiles in site {sampleTestSite}")
print(f"sample file is {sampleTestSitePathList[0]}")

There are 29 PathFiles in site 5a0546857ecc773753327266
sample file is test/046cfa46be49fc10834815c6.txt


```python
# To check if resulting files are present
print([os.path.isfile(x) for x in sampleTestSitePathList])
```

In [13]:
siteTestDataWiFiInfo, siteTestDataTotalWiFiAPList = getSiteTestDataWiFiInfo(sampleTestSitePathList)

There are 12948 wifi APs in samplesite
There are 2435 unique wifi APs in samplesite


## Train file WiFi APs list

In [14]:
floorsInSite = os.listdir(f"{TRAIN_FOLDER}/{sampleTestSite}")
floorsInSite.sort()
print(f"Floors at {sampleTestSite} Train Data = {floorsInSite}")

Floors at 5a0546857ecc773753327266 Train Data = ['B1', 'F1', 'F2', 'F3', 'F4']


In [15]:
siteTrainDataWiFiInfo, siteTrainDataTotalWiFiAPList = getSiteTrainDataWiFiInfo(sampleTestSite, floorsInSite)

There are 109 Path files at 5a0546857ecc773753327266/B1 Floor Train data
There are 131 Path files at 5a0546857ecc773753327266/F1 Floor Train data
There are 110 Path files at 5a0546857ecc773753327266/F2 Floor Train data
There are 78 Path files at 5a0546857ecc773753327266/F3 Floor Train data
train/5a0546857ecc773753327266/F3/5d8f0954b6e29d0006fb8c0d.txt has issue, output shape = (0,)
train/5a0546857ecc773753327266/F3/5d8f0955b6e29d0006fb8c0f.txt has issue, output shape = (0,)
There are 86 Path files at 5a0546857ecc773753327266/F4 Floor Train data


In [16]:
site_df = getSiteWiFiDF(siteTestDataWiFiInfo, siteTrainDataWiFiInfo)
print(site_df.head(3))
site_df.to_csv(f"{sampleTestSite}_WiFiOutput.csv")

                   pathFile   B1   F1   F2   F3   F4
0  046cfa46be49fc10834815c6  213  666  635  509  502
1  05d052dde78384b0c543d89c   62  436  417  311  303
2  0c06cc9f21d172618d74c6c8  260   90   29   20   19


In [17]:
print(f"There are {len(siteTrainDataTotalWiFiAPList)} Wifi APs in siteTrainData")
print(f"There are {len(siteTestDataTotalWiFiAPList)} Wifi APs in siteTestData")

There are 196282 Wifi APs in siteTrainData
There are 12948 Wifi APs in siteTestData


In [18]:
set(siteTestDataTotalWiFiAPList) - set(siteTrainDataTotalWiFiAPList)

{'03fa48b34077acca3e2ee12119818aff3961537b',
 '070b4ece157c69e694b61cfaf938d6da55e1fb9e',
 '0808e70dba3efac5b7848c717becf732e9ef7796',
 '0a9f31684c792f921b11500b38a95706b39a5fa7',
 '10eddac2d4471d6d5c49bcf095320b19a6d4bdab',
 '1e0fcf079317539437831981bee82b48b1ba1b89',
 '2da3d98cfc0309d453e09c4393e9457178ab14ef',
 '33f20983b8fae7222236a28f2fcdbfd0a64d3dae',
 '3b779dc1191bba796ae24168216743da16859c79',
 '3e83518fabdba187f3d68e47a4751cce50e936fc',
 '3eaaaafb58a6d01abb1689cb026adce5ac984caf',
 '506664c4bcc5d3c999a2431be0e7268cfd9636a4',
 '587ff76f98133eeb3511ecf035a486ad17f0222d',
 '598f4d9c4f6bd25f6c0506d3735dcecc0539d77f',
 '65b6be9211330264ecd5cb06f299a7417cb08f46',
 '669b8c1073728feae04468bf7884248e1134448c',
 '6de735923b07f3293ca6f7177b0bd48d5caa48bf',
 '771204188303e784dec04d86a91f20fb29a27bae',
 '7b5432c3a5502b81dd93b01010c34ca608befeaa',
 '8345511a25e86b3bf3b0074f6ad2da416d36f867',
 '872d90bee8ef74997d3e4e448c94a997c24c2d2e',
 '8ec25a67ccd966845be3c84a0f12860c616810df',
 '92704954