## Libraries import

In [1]:
import pandas as pd
import numpy as np
import os
import glob

# functions and libraires from competition github
from dataclasses import dataclass
from io_f import read_data_file
from visualize_f import visualize_heatmap, visualize_trajectory

In [2]:
TRAIN_FOLDER = 'train'
TEST_FOLDER = 'test'

## helper functions

In [3]:
def findFilesSpecificFormat(searchDirectory, fileExtension):
    fileExtension = f"*.{fileExtension}"
    matchingFileFormatList = [file
                             for path, subdir, files in os.walk(searchDirectory)
                             for file in glob.glob(os.path.join(path, fileExtension))]
    return matchingFileFormatList

In [4]:
def getSiteTestPathFiles(siteID):
    tempdf = testSitePathTimestampData[testSitePathTimestampData.loc[:,0]==siteID]
    pathFilesNames = np.unique(tempdf.loc[:,1].values).tolist()
    sitePathFilesList = [f"{TEST_FOLDER}/{x}.txt" for x in pathFilesNames]
    return sitePathFilesList

In [5]:
def getUniqueWiFiAPList(data):
    wifiData = data.wifi
    return np.unique(wifiData[:,2]).tolist()

## Parse sampleSubmissionCSV

In [6]:
sampleSubmissionCSV = pd.read_csv('sample_submission.csv')
sampleSubmissionCSV.head(3)

Unnamed: 0,site_path_timestamp,floor,x,y
0,5a0546857ecc773753327266_046cfa46be49fc1083481...,0,75.0,75.0
1,5a0546857ecc773753327266_046cfa46be49fc1083481...,0,75.0,75.0
2,5a0546857ecc773753327266_046cfa46be49fc1083481...,0,75.0,75.0


In [7]:
testSitePathTimestampData = sampleSubmissionCSV['site_path_timestamp'].str.split('_', expand=True)
testSitePathTimestampData.head(3)

Unnamed: 0,0,1,2
0,5a0546857ecc773753327266,046cfa46be49fc10834815c6,9
1,5a0546857ecc773753327266,046cfa46be49fc10834815c6,9017
2,5a0546857ecc773753327266,046cfa46be49fc10834815c6,15326


In [8]:
print("testSitePathTimestampData.columns = ", testSitePathTimestampData.columns)

testSitePathTimestampData.columns =  RangeIndex(start=0, stop=3, step=1)


In [9]:
# testSites listed as separate folders under train/
trainSitesList = os.listdir('train/')
print(f"Number of train sites {len(trainSitesList)}")

uniqueTestSitesList = testSitePathTimestampData.loc[:,0].unique().tolist()
print(f"Number of test sites {len(uniqueTestSitesList)}")

Number of train sites 204
Number of test sites 24


## Compute WiFi unique access points for one Test site

In [10]:
sampleTestSite = uniqueTestSitesList[0]
#print(f"sampleTestSite is {sampleTestSite}")
sampleTestSitePathList = getSiteTestPathFiles(sampleTestSite) 
print(f"There are {len(sampleTestSitePathList)} PathFiles in site {sampleTestSite}")
print(f"sample file is {sampleTestSitePathList[0]}")

There are 29 PathFiles in site 5a0546857ecc773753327266
sample file is test/046cfa46be49fc10834815c6.txt


```python
# To check if resulting files are present
print([os.path.isfile(x) for x in sampleTestSitePathList])
```

In [11]:
siteTestDataWiFiInfo = {}

for idx, pathFile in enumerate(sampleTestSitePathList):    
    # get path name from file name
    pathName = pathFile.split("/")[1].replace(".txt", "")
    
    # extract pathFile data
    pathFileData = read_data_file(pathFile)
    
    # compute unique WiFi access points list
    pathUniqueWiFiAPList = getUniqueWiFiAPList(pathFileData)
    print(f"There are {len(pathUniqueWiFiAPList)} unique WiFi APs in  {pathFile} pathfile")
    
    siteTestDataWiFiInfo[pathName] = { "pathFileData": pathFileData, 
                               "pathUniqueWiFiAPList": pathUniqueWiFiAPList }

There are 668 unique WiFi APs in  test/046cfa46be49fc10834815c6.txt pathfile
There are 436 unique WiFi APs in  test/05d052dde78384b0c543d89c.txt pathfile
There are 260 unique WiFi APs in  test/0c06cc9f21d172618d74c6c8.txt pathfile
There are 190 unique WiFi APs in  test/146035943a1482883ed98570.txt pathfile
There are 562 unique WiFi APs in  test/1ef2771dfea25d508142ba06.txt pathfile
There are 625 unique WiFi APs in  test/3506b3b626f494b0f0b934ca.txt pathfile
There are 563 unique WiFi APs in  test/3e0aebb66ef39150bbc27c24.txt pathfile
There are 557 unique WiFi APs in  test/3e1d46017fbfcc8136bd1e9b.txt pathfile
There are 605 unique WiFi APs in  test/412da1891c4780f6f0f7f4bc.txt pathfile
There are 265 unique WiFi APs in  test/6d89334316127640cff99800.txt pathfile
There are 300 unique WiFi APs in  test/72963a8c7eb520c56f88a536.txt pathfile
There are 308 unique WiFi APs in  test/7d2d723a30cce824aad9915b.txt pathfile
There are 481 unique WiFi APs in  test/947e17f82dbddfbdb4cb2447.txt pathfile

In [12]:
siteTotalWiFiAPList = []
for k,v in siteTestDataWiFiInfo.items():
    siteTotalWiFiAPList.extend(v["pathUniqueWiFiAPList"])

print(f"There are {len(siteTotalWiFiAPList)} wifi APs in samplesite")
siteTestDataUniqueWiFiAPList = list(set(siteTotalWiFiAPList))
print(f"There are {len(siteTestDataUniqueWiFiAPList)} unique wifi APs in samplesite")

There are 12948 wifi APs in samplesite
There are 2435 unique wifi APs in samplesite


## Train file WiFi APs list

In [13]:
floorsInSite = os.listdir(f"{TRAIN_FOLDER}/{sampleTestSite}")
print(f"Floors at {sampleTestSite} Train Data = {floorsInSite}")


siteTrainDataFloorWifiInfo = {}

# iterate through floors in site
for floor in floorsInSite:
    tempList = os.listdir(f"{TRAIN_FOLDER}/{sampleTestSite}/{floor}")
    floorPathFileList = [f"{TRAIN_FOLDER}/{sampleTestSite}/{floor}/{x}" for x in tempList]
    print(f"There are {len(floorPathFileList)} Path files at {sampleTestSite}/{floor} Floor Train data")
    
    siteTrainDataFloorWifiInfo[floor] = {}
    siteTrainDataFloorWifiInfo[floor]["total"] = []
    
    for idx, pathFile in enumerate(floorPathFileList): 
    
        # get path name from file name
        pathName = pathFile.split("/")[3].replace(".txt", "")

        # extract pathFile data
        pathFileData = read_data_file(pathFile)

        try:
            # compute unique WiFi access points list
            pathUniqueWiFiAPList = getUniqueWiFiAPList(pathFileData)
        except:
            print(pathFileData.wifi.shape)

        siteTrainDataFloorWifiInfo[floor][pathName] = pathFileData
        siteTrainDataFloorWifiInfo[floor]["total"].extend(pathUniqueWiFiAPList)
    print(f"Floor {floor} done")

Floors at 5a0546857ecc773753327266 Train Data = ['F4', 'F1', 'F2', 'F3', 'B1']
There are 86 Path files at 5a0546857ecc773753327266/F4 Floor Train data
Floor F4 done
There are 131 Path files at 5a0546857ecc773753327266/F1 Floor Train data
Floor F1 done
There are 110 Path files at 5a0546857ecc773753327266/F2 Floor Train data
Floor F2 done
There are 78 Path files at 5a0546857ecc773753327266/F3 Floor Train data
(0,)
(0,)
Floor F3 done
There are 109 Path files at 5a0546857ecc773753327266/B1 Floor Train data
Floor B1 done


In [14]:
for floor in floorsInSite:
    print(f"There are {len(siteTrainDataFloorWifiInfo[floor]['total'])} total wifi APs in {floor} floor")
    siteTrainDataFloorWifiInfo[floor]["unique"] = list(set(siteTrainDataFloorWifiInfo[floor]["total"]))
    print(f"There are {len(siteTrainDataFloorWifiInfo[floor]['unique'])} unique wifi APs in {floor} floor")

There are 30852 total wifi APs in F4 floor
There are 1555 unique wifi APs in F4 floor
There are 60908 total wifi APs in F1 floor
There are 2052 unique wifi APs in F1 floor
There are 53733 total wifi APs in F2 floor
There are 1965 unique wifi APs in F2 floor
There are 29399 total wifi APs in F3 floor
There are 1534 unique wifi APs in F3 floor
There are 21390 total wifi APs in B1 floor
There are 1374 unique wifi APs in B1 floor


In [15]:
siteTrainDataFloorWifiInfo.keys()

dict_keys(['F4', 'F1', 'F2', 'F3', 'B1'])

In [19]:
siteTestDataWiFiInfo.keys()

dict_keys(['046cfa46be49fc10834815c6', '05d052dde78384b0c543d89c', '0c06cc9f21d172618d74c6c8', '146035943a1482883ed98570', '1ef2771dfea25d508142ba06', '3506b3b626f494b0f0b934ca', '3e0aebb66ef39150bbc27c24', '3e1d46017fbfcc8136bd1e9b', '412da1891c4780f6f0f7f4bc', '6d89334316127640cff99800', '72963a8c7eb520c56f88a536', '7d2d723a30cce824aad9915b', '947e17f82dbddfbdb4cb2447', '986924433ab01afa81a59e7a', 'a3c5aadf824a220327e9cdc6', 'ac1a2e8f11ab64d729199969', 'bb84ab5e77fc9f5fdbd52827', 'bd0f2c8626679895f3338ca5', 'bd5921cd1008382e2f537b53', 'ce80c08d743f15e6586741f6', 'd592885af4e6e380c376dc55', 'd6e09245089299ec8e5173e7', 'dd4cbd69218f610f27cf33c8', 'de9cef0d69383e47bd74a64c', 'e91afaba9603500e8e28a454', 'ea777463a91a1293fc79fc52', 'ec9aee8ee72b1902bd83edf5', 'ed77f28aeaf89b317bc380fa', 'ffcd9524c80c0fa5bb859eaf'])

In [20]:
siteTestDataWiFiInfo["046cfa46be49fc10834815c6"].keys()

dict_keys(['pathFileData', 'pathUniqueWiFiAPList'])

In [21]:
len(siteTestDataWiFiInfo.keys())

29

In [22]:
site_df = pd.DataFrame()
site_df["pathFile"] = siteTestDataWiFiInfo.keys()

In [23]:
site_df.head()

Unnamed: 0,pathFile
0,046cfa46be49fc10834815c6
1,05d052dde78384b0c543d89c
2,0c06cc9f21d172618d74c6c8
3,146035943a1482883ed98570
4,1ef2771dfea25d508142ba06


In [26]:
for floor in floorsInSite:
    floorWiFiSet = set(siteTrainDataFloorWifiInfo[floor]['unique'])
    
    wifimatchcount = [] 
    for _,key in enumerate(siteTestDataWiFiInfo.keys()):
        testPathWiFiSet = set(siteTestDataWiFiInfo[key]["pathUniqueWiFiAPList"])
        matchlist = list(floorWiFiSet.intersection(testPathWiFiSet))
        print(f"{floor} - {key} matchcount = {len(matchlist)}")
        wifimatchcount.append(len(matchlist))
    site_df[floor] = wifimatchcount

F4 - 046cfa46be49fc10834815c6 matchcount = 502
F4 - 05d052dde78384b0c543d89c matchcount = 303
F4 - 0c06cc9f21d172618d74c6c8 matchcount = 19
F4 - 146035943a1482883ed98570 matchcount = 9
F4 - 1ef2771dfea25d508142ba06 matchcount = 423
F4 - 3506b3b626f494b0f0b934ca matchcount = 447
F4 - 3e0aebb66ef39150bbc27c24 matchcount = 377
F4 - 3e1d46017fbfcc8136bd1e9b matchcount = 431
F4 - 412da1891c4780f6f0f7f4bc matchcount = 448
F4 - 6d89334316127640cff99800 matchcount = 46
F4 - 72963a8c7eb520c56f88a536 matchcount = 82
F4 - 7d2d723a30cce824aad9915b matchcount = 97
F4 - 947e17f82dbddfbdb4cb2447 matchcount = 343
F4 - 986924433ab01afa81a59e7a matchcount = 403
F4 - a3c5aadf824a220327e9cdc6 matchcount = 392
F4 - ac1a2e8f11ab64d729199969 matchcount = 90
F4 - bb84ab5e77fc9f5fdbd52827 matchcount = 456
F4 - bd0f2c8626679895f3338ca5 matchcount = 5
F4 - bd5921cd1008382e2f537b53 matchcount = 62
F4 - ce80c08d743f15e6586741f6 matchcount = 407
F4 - d592885af4e6e380c376dc55 matchcount = 16
F4 - d6e09245089299ec8e5

In [27]:
site_df.to_csv(f"{sampleTestSite}_output.csv")