### Wifi features

This is the code used to generate the wifi features available in [this dataset](https://www.kaggle.com/devinanzelmo/indoor-navigation-and-location-wifi-features). Using these features can get a score below 14. For an example notebook using them, see [this notebook](https://www.kaggle.com/devinanzelmo/wifi-features-lightgbm-starter). They use only waypoint, wifi and timestamp data to generate the solution. See this [forum post](https://www.kaggle.com/c/indoor-location-navigation/discussion/215445) for an outline of this solution method and of ways to improve it.

There are `break` statements inserted into loops, which need to be removed to get the full feature generation to run. Right now, data is written to the current working directory. This takes 2-4 hours to run, depending on the hard drive etc. There is a lot of room for speeding up feature generation.

**Update:** I added one line that creates a column for the path filename; this allows for GroupKFold cross-validation.


In [1]:
import pandas as pd
import numpy as np
import glob
import os
import gc
import json 
pd.options.mode.chained_assignment = None  # default='warn'

In [2]:
# Root of the competition data; the train/ folder and sample_submission.csv
# are read relative to it.
base_path = '..'

# pull out all the buildings actually used in the test set, given current method we don't need the other ones
ssubm = pd.read_csv(os.path.join(base_path, 'sample_submission.csv'))

# only 24 of the total buildings are used in the test set,
# this allows us to greatly reduce the initial size of the dataset

# Each key has the form "<site>_<path>_<timestamp>"; a vectorised split gives
# one column per part (0 = site, 1 = path, 2 = timestamp) without building a
# Series per row.
ssubm_df = ssubm["site_path_timestamp"].str.split("_", expand=True)
used_buildings = sorted(ssubm_df[0].unique())

# dictionary used to map the floor codes to the values used in the submission file.
floor_map = {"B2":-2, "B1":-1, "F1":0, "F2": 1, "F3":2, "F4":3, "F5":4, "F6":5, "F7":6,"F8":7, "F9":8,
             "1F":0, "2F":1, "3F":2, "4F":3, "5F":4, "6F":5, "7F":6, "8F": 7, "9F":8}

In [3]:
# Preview the split submission keys: one column per "_"-separated part of
# site_path_timestamp (0 = site, 1 = path, 2 = timestamp).
ssubm_df.head(3)

Unnamed: 0,0,1,2
0,5a0546857ecc773753327266,046cfa46be49fc10834815c6,9
1,5a0546857ecc773753327266,046cfa46be49fc10834815c6,9017
2,5a0546857ecc773753327266,046cfa46be49fc10834815c6,15326


```python
%%time
# get only the wifi bssid that occur over 1000 times(this number can be experimented with)
# these will be the only ones used when constructing features
bssid = dict()

for building in used_buildings:
    #break
    folders = sorted(glob.glob(os.path.join(base_path,'train/'+building+'/*')))
    print(building)
    wifi = list()
    for folder in folders:
        floor = floor_map[folder.split('/')[-1]]
        files = glob.glob(os.path.join(folder, "*.txt"))
        for file in files:
            with open(file) as f:
                txt = f.readlines()
                for e, line in enumerate(txt):
                    tmp = line.strip().split()
                    if tmp[1] == "TYPE_WIFI":
                        wifi.append(tmp)
    df = pd.DataFrame(wifi)
    #top_bssid = df[3].value_counts().iloc[:500].index.tolist()
    value_counts = df[3].value_counts()
    top_bssid = value_counts[value_counts > 1000].index.tolist()
    print(len(top_bssid))
    bssid[building] = top_bssid
    del df
    del wifi
    gc.collect()
```

In [3]:
# Cache of the per-building bssid lists. Reuse the file when it exists;
# otherwise create it from the `bssid` dict built in the previous cell, so a
# fresh run does not fail on a missing file.
bssid_cache = "bssid_1000.json"
if os.path.exists(bssid_cache):
    with open(bssid_cache) as f:
        bssid = json.load(f)
else:
    with open(bssid_cache, "w") as f:
        json.dump(bssid, f)

In [4]:
# Walk through the feature generation on one example: the first building,
# its first floor folder and the first trace file found there.
building = used_buildings[0]
folders = sorted(glob.glob(os.path.join(base_path, 'train', building, '*')))
dfs = list()
# Fixed column order of a feature row: the sorted bssids of this building,
# followed by one "<bssid>_dt" column per bssid.
index = sorted(bssid[building])
dt_index = [f"{x}_dt" for x in index]
print(f"building = {building}")
folder = folders[0]
print(f"folder = {folder}")
# basename() rather than split('/') so the floor folder name is extracted
# correctly whatever path separator glob returns.
floor = floor_map[os.path.basename(folder)]
print(f"floor = {floor}")
files = glob.glob(os.path.join(folder, "*.txt"))
file = files[0]
print(f"file = {file}")

building = 5a0546857ecc773753327266
folder = ../train/5a0546857ecc773753327266/B1
floor = -1
file = ../train/5a0546857ecc773753327266/B1/5e157323a280850006f3d017.txt


In [5]:
# Read the example trace file and collect its waypoint and wifi rows.
wifi = list()
waypoint = list()
with open(file) as f:
    txt = f.readlines()

# Route every tokenised line to the list that collects its record type
# (the second token); all other record types are ignored.
collectors = {"TYPE_WAYPOINT": waypoint, "TYPE_WIFI": wifi}
for raw in txt:
    fields = raw.strip().split()
    target = collectors.get(fields[1])
    if target is not None:
        target.append(fields)

# One row per wifi line, one column per whitespace-separated token.
df = pd.DataFrame(np.array(wifi))
print(df.shape)
df.head(3)

(3432, 7)


Unnamed: 0,0,1,2,3,4,5,6
0,1578462804152,TYPE_WIFI,da39a3ee5e6b4b0d3255bfef95601890afd80709,c08ad78a45798cfe176a42b35c7381ae602711c5,-42,5825,1578462803569
1,1578462804152,TYPE_WIFI,7182afc4e5c212133d5d7d76eb3df6c24618302b,4d89139ca69acc0a8a762672a822411a769ac266,-43,5825,1578462803585
2,1578462804152,TYPE_WIFI,d839a45ebe64ab48b60a407d837fb01d3c0dfef9,30f85a5e14351468a6dd13718a9da3b0d7b73685,-43,5825,1578462803583


In [6]:
# Distinct values in column 5 of the wifi rows
# (presumably the channel frequency in MHz — TODO confirm).
df[5].unique()

array(['5825', '2472', '2462', '5745', '2437', '2412', '2447', '2427',
       '2442', '5765', '2432', '2457', '5785', '2422', '5805', '2417',
       '2452', '5300', '5260', '5180', '5280', '5220'], dtype=object)

In [7]:
# Shape and contents of the waypoint rows parsed from this file; each row is
# [timestamp, record type, value used as x, value used as y].
print(np.array(waypoint).shape)
print(np.array(waypoint))

(5, 4)
[['1578462802213' 'TYPE_WAYPOINT' '223.09842' '154.77519']
 ['1578462809093' 'TYPE_WAYPOINT' '224.45633' '160.10567']
 ['1578462820081' 'TYPE_WAYPOINT' '232.462' '164.41673']
 ['1578462830942' 'TYPE_WAYPOINT' '225.08406' '165.7717']
 ['1578462843572' 'TYPE_WAYPOINT' '231.4029' '158.41515']]


In [8]:
# generate a feature, and label for each wifi block
for gid, g in df.groupby(0):
    dists = list()
    for e, k in enumerate(waypoint):
        dist = abs(int(gid) - int(k[0]))
        dists.append(dist)
    nearest_wp_index = np.argmin(dists)
    g = g.drop_duplicates(subset=3)  
    break

In [9]:
# First rows of the de-duplicated wifi block kept from the loop above.
g.head(3)

Unnamed: 0,0,1,2,3,4,5,6
0,1578462804152,TYPE_WIFI,da39a3ee5e6b4b0d3255bfef95601890afd80709,c08ad78a45798cfe176a42b35c7381ae602711c5,-42,5825,1578462803569
1,1578462804152,TYPE_WIFI,7182afc4e5c212133d5d7d76eb3df6c24618302b,4d89139ca69acc0a8a762672a822411a769ac266,-43,5825,1578462803585
2,1578462804152,TYPE_WIFI,d839a45ebe64ab48b60a407d837fb01d3c0dfef9,30f85a5e14351468a6dd13718a9da3b0d7b73685,-43,5825,1578462803583


In [63]:
def getDtFeature(df, dt_columns=None):
    """Return a one-row frame with the per-bssid time offset, in seconds.

    For every row of *df* the offset is ``(column 0 - column 6) / 1000``
    (both look like millisecond timestamps in the sample rows — presumably
    scan time and last-seen time; TODO confirm). The offsets are keyed by
    ``"<column 3>_dt"`` and laid out in the order given by *dt_columns*.

    Args:
        df: wifi rows of a single block, addressed by position; the values
            in column 3 must be unique. The frame is not modified.
        dt_columns: ordered column labels of the result. Defaults to the
            notebook-level ``dt_index``.

    Returns:
        A DataFrame with a single row labelled 4 and one column per entry of
        *dt_columns*; entries without a matching row in *df* are 100.0.
    """
    if dt_columns is None:
        dt_columns = dt_index

    # Build a fresh frame instead of assigning into a slice of the caller's
    # frame, which only worked silently because the chained-assignment
    # warning is switched off.
    offset_seconds = (df.iloc[:, 0].astype(float) - df.iloc[:, 6].astype(float)) / 1000.0
    dtFeat = pd.DataFrame({3: df.iloc[:, 3].astype(str) + "_dt", 4: offset_seconds})
    return dtFeat.set_index(3).reindex(dt_columns).fillna(100.0).T

In [68]:
def getWiFiIDFeature(df, bssid_index=None):
    """Return a one-row frame with the column-4 reading of every known bssid.

    The rows of *df* are keyed by their bssid (column 3) and laid out in the
    order given by *bssid_index*. Column 4 holds the reading (presumably the
    signal strength in dBm — TODO confirm); it is parsed to integers so the
    row is numeric throughout rather than a mix of strings and the fill value.

    Args:
        df: wifi rows of a single block, addressed by position; the values
            in column 3 must be unique. The frame is not modified.
        bssid_index: ordered column labels of the result. Defaults to the
            notebook-level ``index``.

    Returns:
        A DataFrame with a single row labelled 4 and one integer column per
        entry of *bssid_index*; bssids without a row in *df* are -999.
    """
    if bssid_index is None:
        bssid_index = index

    readings = pd.DataFrame({3: df.iloc[:, 3], 4: df.iloc[:, 4].astype(float)})
    # Reindexing introduces NaN for unseen bssids, so fill before casting to int.
    return readings.set_index(3).reindex(bssid_index).fillna(-999).astype(int).T

In [72]:
# Build the feature row for this wifi block: the per-bssid columns followed
# by the matching "<bssid>_dt" columns, side by side in a single row.
wiFiIDFeat = getWiFiIDFeature(g)
dtFeat = getDtFeature(g)
feat = pd.concat([wiFiIDFeat, dtFeat], axis=1)
print(feat.shape)

(1, 1882)


In [73]:
# (rows, columns) of the combined feature row.
feat.shape

(1, 1882)

In [74]:
# Feature row before the label columns are added.
feat.head()

3,000840e5c600de293cea57f13326f273c86c3988,00ad587dcb9c7ce3788b92e22777a22ee0efea31,00af060fc145ee6a6a50475efa57b91cbf54237f,00bcc61bdea4d52d050822d66952dd707c2fcdf3,00f0904087c01d922d6ebf3005607dfdeaf6687b,011e20ebf721a1c6dfec42e8ed1e2ac566073a2a,01d2f676abab6ec03ec5dc696bfd49d66e392ea1,01e25e4a25acd32baf5137b3031151f751fadbb4,026c2f057932da75680b21ecdbd23bf9cb9350f3,028a310e23177c3747d37971678dd964ee28ce17,...,fd179c5e4fd5e33493ae290adbbda2950ecf0427_dt,fd1a502adb446e835797a88fad8e79d1e0bf4b4a_dt,fd977a3af7be241a9ed0213acb3aa75e5dc00253_dt,fdb1ad87bd6fb08014267f2586faeed0edc7412b_dt,fdc189e5a19850397f37201f4acc378cfddcf0d6_dt,fdc19f011587b75c11a6c30d8ca06d90107b6bde_dt,fdf37fa13679f581bdfaae3b99e368633e0a144b_dt,fdfe926caf5f49a88a9bcab8d025e887f422128b_dt,fe3211f90e4ab1f500e10fe175ae6142f4b13130_dt,ffa41c79865d7fb336f586e0dec8b080db1027fb_dt
4,-999,-999,-999,-999,-999,-999,-999,-999,-999,-999,...,100.0,100.0,100.0,100.0,100.0,100.0,100.0,100.0,100.0,100.0


In [75]:
# Label the row with the coordinates of the waypoint nearest in time to this
# wifi block, and with the floor of the folder the file came from.
feat["x"] = float(waypoint[nearest_wp_index][2])
feat["y"] = float(waypoint[nearest_wp_index][3])
feat["f"] = floor
# File name without directory or extension; useful for crossvalidation.
# basename() rather than split('/') so it does not depend on the path separator.
feat["path"] = os.path.basename(file).split('.')[0]
dfs.append(feat)

In [76]:
# Feature row including the x, y, f and path columns.
feat.head()

3,000840e5c600de293cea57f13326f273c86c3988,00ad587dcb9c7ce3788b92e22777a22ee0efea31,00af060fc145ee6a6a50475efa57b91cbf54237f,00bcc61bdea4d52d050822d66952dd707c2fcdf3,00f0904087c01d922d6ebf3005607dfdeaf6687b,011e20ebf721a1c6dfec42e8ed1e2ac566073a2a,01d2f676abab6ec03ec5dc696bfd49d66e392ea1,01e25e4a25acd32baf5137b3031151f751fadbb4,026c2f057932da75680b21ecdbd23bf9cb9350f3,028a310e23177c3747d37971678dd964ee28ce17,...,fdc189e5a19850397f37201f4acc378cfddcf0d6_dt,fdc19f011587b75c11a6c30d8ca06d90107b6bde_dt,fdf37fa13679f581bdfaae3b99e368633e0a144b_dt,fdfe926caf5f49a88a9bcab8d025e887f422128b_dt,fe3211f90e4ab1f500e10fe175ae6142f4b13130_dt,ffa41c79865d7fb336f586e0dec8b080db1027fb_dt,x,y,f,path
4,-999,-999,-999,-999,-999,-999,-999,-999,-999,-999,...,100.0,100.0,100.0,100.0,100.0,100.0,223.09842,154.77519,-1,5e157323a280850006f3d017
