## Libraries import

In [1]:
import pandas as pd
import numpy as np
import os
import glob

# functions and libraires from competition github
from dataclasses import dataclass
from io_f import read_data_file
from visualize_f import visualize_heatmap, visualize_trajectory

In [2]:
TRAIN_FOLDER = 'train'
TEST_FOLDER = 'test'

## helper functions

In [3]:
def findFilesSpecificFormat(searchDirectory, fileExtension):
    fileExtension = f"*.{fileExtension}"
    matchingFileFormatList = [file
                             for path, subdir, files in os.walk(searchDirectory)
                             for file in glob.glob(os.path.join(path, fileExtension))]
    return matchingFileFormatList

In [4]:
def getSiteTestPathFiles(siteID):
    tempdf = testSitePathTimestampData[testSitePathTimestampData.loc[:,0]==siteID]
    pathFilesNames = np.unique(tempdf.loc[:,1].values).tolist()
    sitePathFilesList = [f"{TEST_FOLDER}/{x}.txt" for x in pathFilesNames]
    return sitePathFilesList

In [5]:
def getUniqueIBeaconAPList(data):
    wifiData = data.ibeacon
    return np.unique(wifiData[:,1]).tolist()

In [6]:
def getIBeaconList(pathFileList):
    pathIBeaconAPList = [] 
    
    for pathFile in pathFileList:    
        # extract pathFile data
        pathFileData = read_data_file(pathFile)
    
        try:
            # compute unique WiFi access points list
            temp = getUniqueIBeaconAPList(pathFileData)
        except:
            print(f"{pathFile} has issue, output shape = {pathFileData.ibeacon.shape}")

        #print(f"There are {len(temp)} unique WiFi APs in  {pathFile} pathfile")
        pathIBeaconAPList.append(temp)
    return pathIBeaconAPList

In [7]:
def getSiteTestDataIBeaconInfo(testSitePathList):
    siteTestDataIBeaconInfo = {}
    siteTestDataTotalIBeaconAPList  = []
    temp = getIBeaconList(testSitePathList)
    for idx, pathFile in enumerate(testSitePathList):    
        # get path name from file name
        pathName = pathFile.split("/")[1].replace(".txt", "")
        siteTestDataIBeaconInfo[pathName] = temp[idx]
        siteTestDataTotalIBeaconAPList.extend(temp[idx])
    del(temp)

    print(f"There are {len(siteTestDataTotalIBeaconAPList)} wifi APs in samplesite")
    print(f"There are {len(list(set(siteTestDataTotalIBeaconAPList)))} unique wifi APs in samplesite")  
    return siteTestDataIBeaconInfo, siteTestDataTotalIBeaconAPList

In [8]:
def getSiteTrainDataIBeaconInfo(site, siteFloorList):
    siteTrainDataIBeaconInfo = {}
    siteTrainDataTotalIBeaconAPList = []
    
    # iterate through floors in site
    for floor in siteFloorList:
        tempList = os.listdir(f"{TRAIN_FOLDER}/{site}/{floor}")
        floorPathFileList = [f"{TRAIN_FOLDER}/{site}/{floor}/{x}" for x in tempList]
        print(f"There are {len(floorPathFileList)} Path files at {site}/{floor} Floor Train data")

        temp = getIBeaconList(floorPathFileList)
        siteTrainDataIBeaconInfo[floor] = []
        for pathIBeaconList in temp:
            siteTrainDataIBeaconInfo[floor].extend(pathIBeaconList)
            siteTrainDataTotalIBeaconAPList.extend(pathIBeaconList)
        #print(f"Floor {floor} done")
    return siteTrainDataIBeaconInfo, siteTrainDataTotalIBeaconAPList

In [9]:
def getSiteIBeaconDF(siteTestDataIBeaconInfo, siteTrainDataIBeaconInfo):
    site_df = pd.DataFrame()
    site_df["pathFile"] = siteTestDataIBeaconInfo.keys()

    for floor in siteTrainDataIBeaconInfo.keys():
        floorIBeaconAPSet = set(siteTrainDataIBeaconInfo[floor])
    
        IBeaconMatchCount = [] 
        for _,key in enumerate(siteTestDataIBeaconInfo.keys()):
            testPathIBeaconAPSet = set(siteTestDataIBeaconInfo[key])
            matchlist = list(floorIBeaconAPSet.intersection(testPathIBeaconAPSet))
            # print(f"{floor} - {key} matchcount = {len(matchlist)}")
            IBeaconMatchCount.append(len(matchlist))
        
        site_df[floor] = IBeaconMatchCount
    return site_df

## Parse sampleSubmissionCSV

In [10]:
sampleSubmissionCSV = pd.read_csv('sample_submission.csv')
testSitePathTimestampData = sampleSubmissionCSV['site_path_timestamp'].str.split('_', expand=True)

```python
print(sampleSubmissionCSV.head(2))
print(testSitePathTimestampData.head(2))
print("testSitePathTimestampData.columns = ", testSitePathTimestampData.columns)
```

In [11]:
# testSites listed as separate folders under train/
trainSitesList = os.listdir('train/')
uniqueTestSitesList = testSitePathTimestampData.loc[:,0].unique().tolist()

print(f"Number of train sites {len(trainSitesList)}")
print(f"Number of test sites {len(uniqueTestSitesList)}")

Number of train sites 204
Number of test sites 24


## Compute iBeacon unique access points for one Test site

In [12]:
sampleTestSite = uniqueTestSitesList[0]
sampleTestSitePathList = getSiteTestPathFiles(sampleTestSite) 
print(f"There are {len(sampleTestSitePathList)} PathFiles in site {sampleTestSite}")
print(f"sample file is {sampleTestSitePathList[0]}")

There are 29 PathFiles in site 5a0546857ecc773753327266
sample file is test/046cfa46be49fc10834815c6.txt


```python
# To check if resulting files are present
print([os.path.isfile(x) for x in sampleTestSitePathList])
```

In [13]:
siteTestDataIBeaconInfo, siteTestDataTotalIBeaconAPList = getSiteTestDataIBeaconInfo(sampleTestSitePathList)

There are 112 wifi APs in samplesite
There are 34 unique wifi APs in samplesite


## Train file IBeacon APs list

In [14]:
floorsInSite = os.listdir(f"{TRAIN_FOLDER}/{sampleTestSite}")
floorsInSite.sort()
print(f"Floors at {sampleTestSite} Train Data = {floorsInSite}")

Floors at 5a0546857ecc773753327266 Train Data = ['B1', 'F1', 'F2', 'F3', 'F4']


In [15]:
siteTrainDataIBeaconInfo, siteTrainDataTotalIBeaconAPList = getSiteTrainDataIBeaconInfo(sampleTestSite, floorsInSite)

There are 109 Path files at 5a0546857ecc773753327266/B1 Floor Train data
There are 131 Path files at 5a0546857ecc773753327266/F1 Floor Train data
train/5a0546857ecc773753327266/F1/5e15b393f4c3420006d522ed.txt has issue, output shape = (0,)
There are 110 Path files at 5a0546857ecc773753327266/F2 Floor Train data
There are 78 Path files at 5a0546857ecc773753327266/F3 Floor Train data
There are 86 Path files at 5a0546857ecc773753327266/F4 Floor Train data


In [16]:
site_df = getSiteIBeaconDF(siteTestDataIBeaconInfo, siteTrainDataIBeaconInfo)
print(site_df.head(3))
site_df.to_csv(f"{sampleTestSite}_IBeaconOutput.csv")

                   pathFile  B1  F1  F2  F3  F4
0  046cfa46be49fc10834815c6   2   2   2   2   2
1  05d052dde78384b0c543d89c   2   2   2   2   2
2  0c06cc9f21d172618d74c6c8   7   2   2   2   2


In [17]:
print(f"There are {len(siteTrainDataTotalIBeaconAPList)} Ibeacon APs in siteTrainData")
print(f"There are {len(siteTestDataTotalIBeaconAPList)} Ibeacon APs in siteTestData")

There are 1575 Ibeacon APs in siteTrainData
There are 112 Ibeacon APs in siteTestData


In [18]:
set(siteTestDataTotalIBeaconAPList) - set(siteTrainDataTotalIBeaconAPList)

{'b1607a1d8a0371d61472032c7562886c35337be6_b6589fc6ab0dc82cf12099d1c2d40ab994e8410c_b6589fc6ab0dc82cf12099d1c2d40ab994e8410c'}