## Libraries import

In [1]:
import pandas as pd
import numpy as np
import os
import glob

# functions and libraires from competition github
from dataclasses import dataclass
from io_f import read_data_file
from visualize_f import visualize_heatmap, visualize_trajectory

In [2]:
TRAIN_FOLDER = 'train'
TEST_FOLDER = 'test'

## helper functions

In [3]:
def findFilesSpecificFormat(searchDirectory, fileExtension):
    fileExtension = f"*.{fileExtension}"
    matchingFileFormatList = [file
                             for path, subdir, files in os.walk(searchDirectory)
                             for file in glob.glob(os.path.join(path, fileExtension))]
    return matchingFileFormatList

In [4]:
def getSiteTestPathFiles(siteID):
    tempdf = testSitePathTimestampData[testSitePathTimestampData.loc[:,0]==siteID]
    pathFilesNames = np.unique(tempdf.loc[:,1].values).tolist()
    sitePathFilesList = [f"{TEST_FOLDER}/{x}.txt" for x in pathFilesNames]
    return sitePathFilesList

In [5]:
def getUniqueWiFiAPList(data):
    wifiData = data.wifi
    return np.unique(wifiData[:,2]).tolist()

In [6]:
def getWifiAPList(pathFileList):
    pathWiFiAPList = [] 
    
    for pathFile in pathFileList:    
        # extract pathFile data
        pathFileData = read_data_file(pathFile)
    
        try:
            # compute unique WiFi access points list
            pathUniqueWiFiAPList = getUniqueWiFiAPList(pathFileData)
        except:
            print(f"{pathFile} has issue, output shape = {pathFileData.wifi.shape}")

        #print(f"There are {len(pathUniqueWiFiAPList)} unique WiFi APs in  {pathFile} pathfile")
        pathWiFiAPList.append(pathUniqueWiFiAPList)
    return pathWiFiAPList

## Parse sampleSubmissionCSV

In [7]:
sampleSubmissionCSV = pd.read_csv('sample_submission.csv')
sampleSubmissionCSV.head(3)

Unnamed: 0,site_path_timestamp,floor,x,y
0,5a0546857ecc773753327266_046cfa46be49fc1083481...,0,75.0,75.0
1,5a0546857ecc773753327266_046cfa46be49fc1083481...,0,75.0,75.0
2,5a0546857ecc773753327266_046cfa46be49fc1083481...,0,75.0,75.0


In [8]:
testSitePathTimestampData = sampleSubmissionCSV['site_path_timestamp'].str.split('_', expand=True)
testSitePathTimestampData.head(3)

Unnamed: 0,0,1,2
0,5a0546857ecc773753327266,046cfa46be49fc10834815c6,9
1,5a0546857ecc773753327266,046cfa46be49fc10834815c6,9017
2,5a0546857ecc773753327266,046cfa46be49fc10834815c6,15326


In [9]:
print("testSitePathTimestampData.columns = ", testSitePathTimestampData.columns)

testSitePathTimestampData.columns =  RangeIndex(start=0, stop=3, step=1)


In [10]:
# testSites listed as separate folders under train/
trainSitesList = os.listdir('train/')
print(f"Number of train sites {len(trainSitesList)}")

uniqueTestSitesList = testSitePathTimestampData.loc[:,0].unique().tolist()
print(f"Number of test sites {len(uniqueTestSitesList)}")

Number of train sites 204
Number of test sites 24


## Compute WiFi unique access points for one Test site

In [11]:
sampleTestSite = uniqueTestSitesList[0]
sampleTestSitePathList = getSiteTestPathFiles(sampleTestSite) 
print(f"There are {len(sampleTestSitePathList)} PathFiles in site {sampleTestSite}")
print(f"sample file is {sampleTestSitePathList[0]}")

There are 29 PathFiles in site 5a0546857ecc773753327266
sample file is test/046cfa46be49fc10834815c6.txt


```python
# To check if resulting files are present
print([os.path.isfile(x) for x in sampleTestSitePathList])
```

In [12]:
siteTestDataWiFiInfo = {}
siteTotalWiFiAPList  = []
temp = getWifiAPList(sampleTestSitePathList)
for idx, pathFile in enumerate(sampleTestSitePathList):    
    # get path name from file name
    pathName = pathFile.split("/")[1].replace(".txt", "")
    siteTestDataWiFiInfo[pathName] = temp[idx]
    siteTotalWiFiAPList.extend(temp[idx])
del(temp)

print(f"There are {len(siteTotalWiFiAPList)} wifi APs in samplesite")
siteTestDataUniqueWiFiAPList = list(set(siteTotalWiFiAPList))
print(f"There are {len(siteTestDataUniqueWiFiAPList)} unique wifi APs in samplesite")

There are 12948 wifi APs in samplesite
There are 2435 unique wifi APs in samplesite


## Train file WiFi APs list

In [13]:
floorsInSite = os.listdir(f"{TRAIN_FOLDER}/{sampleTestSite}")
floorsInSite.sort()
print(f"Floors at {sampleTestSite} Train Data = {floorsInSite}")

Floors at 5a0546857ecc773753327266 Train Data = ['B1', 'F1', 'F2', 'F3', 'F4']


In [14]:
siteTrainDataFloorWifiInfo = {}
# iterate through floors in site
for floor in floorsInSite:
    tempList = os.listdir(f"{TRAIN_FOLDER}/{sampleTestSite}/{floor}")
    floorPathFileList = [f"{TRAIN_FOLDER}/{sampleTestSite}/{floor}/{x}" for x in tempList]
    print(f"There are {len(floorPathFileList)} Path files at {sampleTestSite}/{floor} Floor Train data")
    
    temp = getWifiAPList(floorPathFileList)
    siteTrainDataFloorWifiInfo[floor] = []
    for pathWifiList in temp:
        siteTrainDataFloorWifiInfo[floor].extend(pathWifiList)
    print(f"Floor {floor} done")

There are 109 Path files at 5a0546857ecc773753327266/B1 Floor Train data
Floor B1 done
There are 131 Path files at 5a0546857ecc773753327266/F1 Floor Train data
Floor F1 done
There are 110 Path files at 5a0546857ecc773753327266/F2 Floor Train data
Floor F2 done
There are 78 Path files at 5a0546857ecc773753327266/F3 Floor Train data
train/5a0546857ecc773753327266/F3/5d8f0954b6e29d0006fb8c0d.txt has issue, output shape = (0,)
train/5a0546857ecc773753327266/F3/5d8f0955b6e29d0006fb8c0f.txt has issue, output shape = (0,)
Floor F3 done
There are 86 Path files at 5a0546857ecc773753327266/F4 Floor Train data
Floor F4 done


In [15]:
for floor in floorsInSite:
    print(f"There are {len(siteTrainDataFloorWifiInfo[floor])} total wifi APs in {floor} floor")
    print(f"There are {len(list(set(siteTrainDataFloorWifiInfo[floor])))} unique wifi APs in {floor} floor")

There are 21390 total wifi APs in B1 floor
There are 1374 unique wifi APs in B1 floor
There are 60908 total wifi APs in F1 floor
There are 2052 unique wifi APs in F1 floor
There are 53733 total wifi APs in F2 floor
There are 1965 unique wifi APs in F2 floor
There are 29399 total wifi APs in F3 floor
There are 1534 unique wifi APs in F3 floor
There are 30852 total wifi APs in F4 floor
There are 1555 unique wifi APs in F4 floor


```python
print(siteTrainDataFloorWifiInfo.keys())
print(siteTestDataWiFiInfo.keys())
print(len(siteTestDataWiFiInfo.keys()))
```

In [16]:
site_df = pd.DataFrame()
site_df["pathFile"] = siteTestDataWiFiInfo.keys()
site_df.head(3)

Unnamed: 0,pathFile
0,046cfa46be49fc10834815c6
1,05d052dde78384b0c543d89c
2,0c06cc9f21d172618d74c6c8


In [17]:
for floor in floorsInSite:
    floorWiFiSet = set(siteTrainDataFloorWifiInfo[floor])
    
    wifimatchcount = [] 
    for _,key in enumerate(siteTestDataWiFiInfo.keys()):
        testPathWiFiSet = set(siteTestDataWiFiInfo[key])
        matchlist = list(floorWiFiSet.intersection(testPathWiFiSet))
        # print(f"{floor} - {key} matchcount = {len(matchlist)}")
        wifimatchcount.append(len(matchlist))
    site_df[floor] = wifimatchcount

B1 - 046cfa46be49fc10834815c6 matchcount = 213
B1 - 05d052dde78384b0c543d89c matchcount = 62
B1 - 0c06cc9f21d172618d74c6c8 matchcount = 260
B1 - 146035943a1482883ed98570 matchcount = 189
B1 - 1ef2771dfea25d508142ba06 matchcount = 86
B1 - 3506b3b626f494b0f0b934ca matchcount = 283
B1 - 3e0aebb66ef39150bbc27c24 matchcount = 273
B1 - 3e1d46017fbfcc8136bd1e9b matchcount = 170
B1 - 412da1891c4780f6f0f7f4bc matchcount = 130
B1 - 6d89334316127640cff99800 matchcount = 264
B1 - 72963a8c7eb520c56f88a536 matchcount = 299
B1 - 7d2d723a30cce824aad9915b matchcount = 308
B1 - 947e17f82dbddfbdb4cb2447 matchcount = 278
B1 - 986924433ab01afa81a59e7a matchcount = 234
B1 - a3c5aadf824a220327e9cdc6 matchcount = 79
B1 - ac1a2e8f11ab64d729199969 matchcount = 303
B1 - bb84ab5e77fc9f5fdbd52827 matchcount = 114
B1 - bd0f2c8626679895f3338ca5 matchcount = 267
B1 - bd5921cd1008382e2f537b53 matchcount = 239
B1 - ce80c08d743f15e6586741f6 matchcount = 91
B1 - d592885af4e6e380c376dc55 matchcount = 138
B1 - d6e092450892

In [18]:
site_df.to_csv(f"{sampleTestSite}_output.csv")