### Wifi features

This is the code used to generate the wifi features available in [this dataset](https://www.kaggle.com/devinanzelmo/indoor-navigation-and-location-wifi-features). Using these features can get a score below 14. For an example notebook using them see [this notebook](https://www.kaggle.com/devinanzelmo/wifi-features-lightgbm-starter). It uses only waypoint, wifi and timestamp data to generate the solution. See this [forum post](https://www.kaggle.com/c/indoor-location-navigation/discussion/215445) for an outline of this solution method, and methods of improvement.

There are `break` statements inserted into some loops; they need to be removed to get this to run. Right now data is written to the current working directory. This takes 2-4 hours to run, depending on hard drive speed etc. There is a lot of room for speeding up feature generation.

**Update:** I added one line that creates a column for the path filename; this allows for GroupKFold cross-validation.


In [1]:
import os
import gc
import glob
import json 
import numpy as np
import pandas as pd
pd.options.mode.chained_assignment = None  # default='warn'

In [2]:
# Root directory that contains the `train/` and `test/` folders.
base_path = '.'

# pull out all the buildings actually used in the test set, given current method we don't need the other ones
ssubm = pd.read_csv('sample_submission.csv')
# "site_path_timestamp" is split on "_" into columns 0, 1 and 2. The vectorised
# str.split(expand=True) avoids building one pd.Series per row, which is what
# apply(lambda x: pd.Series(...)) did.
ssubm_df = ssubm["site_path_timestamp"].str.split("_", expand=True)
used_buildings = sorted(ssubm_df[0].dropna().unique().tolist())

# dictionary used to map the floor codes to the values used in the submission file. 
floor_map = {"B2":-2, "B1":-1, "F1":0, "F2": 1, "F3":2, "F4":3, "F5":4, "F6":5, "F7":6,"F8":7, "F9":8,
             "1F":0, "2F":1, "3F":2, "4F":3, "5F":4, "6F":5, "7F":6, "8F": 7, "9F":8}

In [3]:
# A bssid is kept as a feature only if it occurs more than this many times in a
# building; the value is also embedded in the cached json / csv file names.
minWiFiCount = 1000

# parameters related to wiFi RSSI based feature
# (directory name derived from minWiFiCount so the two cannot drift apart)
wiFiIDTargetDir = f"referencePublicNotebooks/waypt_WiFiID{minWiFiCount}Feat"
# value written for a bssid that was not observed
wiFiIDMinRSSIValue = -999

# parameters related to wiFi last seen timestamp based feature
wiFiDtTargetDir = f"referencePublicNotebooks/waypt_WiFiDt{minWiFiCount}Feat"
# value written for a bssid that was not observed
wiFiDtMinValue = 1000.0

In [4]:
def getTimeStampDf(df, currentTimeStamp):
    """Return the latest wifi row per wifi id seen strictly before a timestamp.

    Parameters
    ----------
    df : pandas.DataFrame
        Wifi rows with integer column labels. Column 0 must be comparable with
        ``currentTimeStamp``, column 3 identifies the wifi id and column 6 must
        be convertible to int64. Rows are expected to be sorted by column 0 so
        that ``keep='last'`` selects the most recent row of each wifi id.
    currentTimeStamp : int
        Only rows whose column 0 is strictly smaller than this are considered.

    Returns
    -------
    pandas.DataFrame or None
        One row per distinct value of column 3, with column 6 replaced by
        ``(currentTimeStamp - column 6) / 1000.0`` (seconds, assuming the
        timestamps are in milliseconds). ``None`` when no row precedes
        ``currentTimeStamp``. ``df`` itself is never modified.
    """
    timestampdf = df[df[0] < currentTimeStamp]

    # no wifi stamps are available before the waypoint
    if timestampdf.empty:
        return None

    # Explicit copy: the frame is derived from a filtered selection, and writing
    # a column into it would otherwise depend on SettingWithCopyWarning being
    # silenced globally.
    uniquedf = timestampdf.drop_duplicates(subset=[3], keep='last').copy()
    uniquedf[6] = (currentTimeStamp - uniquedf[6].astype(np.int64)).astype(float) / 1000.0
    return uniquedf

In [5]:
def getWiFiIDFeature(uniquedf, bssidIndex=None, missingValue=None):
    """Build the one-row, per-wifi-id feature from columns 3 and 4.

    Parameters
    ----------
    uniquedf : pandas.DataFrame
        Frame with at most one row per wifi id; column 3 holds the wifi id and
        column 4 the value to expose (the RSSI, going by the callers' naming).
    bssidIndex : list, optional
        Wifi ids that become the output columns, in that order. Defaults to the
        module-level ``index`` that the calling loop sets per building.
    missingValue : optional
        Value used for wifi ids absent from ``uniquedf``. Defaults to the
        module-level ``wiFiIDMinRSSIValue``.

    Returns
    -------
    pandas.DataFrame
        A single row whose columns are ``bssidIndex``.
    """
    # Fall back to the notebook globals so existing one-argument calls keep working.
    if bssidIndex is None:
        bssidIndex = index
    if missingValue is None:
        missingValue = wiFiIDMinRSSIValue

    wifiIDFeature = uniquedf.iloc[:, 3:5].set_index(3)
    # reindex introduces NaN for every wifi id that was not observed
    wifiIDFeature = wifiIDFeature.reindex(bssidIndex).replace(np.nan, missingValue)
    return wifiIDFeature.T

In [6]:
def getWiFiDtFeature(uniquedf, bssidIndex=None, missingValue=None):
    """Build the one-row, per-wifi-id feature from columns 3 and 6.

    Parameters
    ----------
    uniquedf : pandas.DataFrame
        Frame with at most one row per wifi id; column 3 holds the wifi id and
        column 6 the value to expose (the elapsed time computed by
        ``getTimeStampDf`` in the calling loop).
    bssidIndex : list, optional
        Wifi ids that become the output columns, in that order. Defaults to the
        module-level ``index`` that the calling loop sets per building.
    missingValue : optional
        Value used for wifi ids absent from ``uniquedf``. Defaults to the
        module-level ``wiFiDtMinValue``.

    Returns
    -------
    pandas.DataFrame
        A single row whose columns are ``bssidIndex``.
    """
    # Fall back to the notebook globals so existing one-argument calls keep working.
    if bssidIndex is None:
        bssidIndex = index
    if missingValue is None:
        missingValue = wiFiDtMinValue

    wifiDtFeature = uniquedf.iloc[:, [3, 6]].set_index(3)
    # reindex introduces NaN for every wifi id that was not observed
    wifiDtFeature = wifiDtFeature.reindex(bssidIndex).replace(np.nan, missingValue)
    return wifiDtFeature.T

```python
# get only the wifi bssid that occur over minWiFiCount times (this number can be experimented with)
# these will be the only ones used when constructing features
bssid = dict()

for building in used_buildings:
    # Deliberate short-circuit: the scan below is slow and its result is loaded
    # from a json file further down. Remove this `break` to recompute it.
    break
    folders = sorted(glob.glob(os.path.join(base_path, 'train', building, '*')))
    print(building)
    # Only the bssid (field 3) of each wifi row is kept; that is all the count
    # needs, so the full rows are not held in memory.
    wifi = list()
    for folder in folders:
        for file in glob.glob(os.path.join(folder, "*.txt")):
            with open(file) as f:
                for line in f:
                    tmp = line.strip().split()
                    # skip blank / truncated lines instead of raising IndexError
                    if len(tmp) > 3 and tmp[1] == "TYPE_WIFI":
                        wifi.append(tmp[3])
    # dtype=object keeps an empty building from producing a float Series
    value_counts = pd.Series(wifi, dtype=object).value_counts()
    top_bssid = value_counts[value_counts > minWiFiCount].index.tolist()
    print(len(top_bssid))
    bssid[building] = top_bssid
    del wifi
    gc.collect()
```

In [7]:
# To cache a freshly computed `bssid` dictionary, dump it first:
#with open("bssid_1000.json", "w") as f:
#    json.dump(bssid, f)

# Load the precomputed bssid lists (keyed by building, see their use below).
bssidJsonPath = f"referencePublicNotebooks/bssid_{minWiFiCount}.json"
with open(bssidJsonPath) as bssidFile:
    bssid = json.load(bssidFile)

In [8]:
%%time

# generate all the training data
# For every building: walk each floor folder and path file, and for every
# waypoint build two one-row features from the wifi rows seen before it
# (one from column 4, one from the elapsed time in column 6). One csv per
# building and feature kind is written at the end.
for building in used_buildings:
    folders = sorted(glob.glob(os.path.join(base_path, 'train', building, '*')))
    wifiIDdfs = list()
    wifiDtdfs = list()
    # NOTE: getWiFiIDFeature / getWiFiDtFeature read this module-level name,
    # so it must be rebound before any feature of the building is built.
    index = sorted(bssid[building])
    print(building)

    # each floor
    for folder in folders:
        # basename() rather than split('/') so the floor code is found
        # whatever path separator glob returns
        floor = floor_map[os.path.basename(folder)]
        files = glob.glob(os.path.join(folder, "*.txt"))
        print(f"{floor} has {len(files)} path files")

        # each path file
        for file in files:
            wifi = list()
            waypoint = list()
            pathName = os.path.basename(file).split('.')[0]

            # parse file
            with open(file) as f:
                for line in f:
                    line = line.strip().split()
                    # blank / truncated lines have no record type field
                    if len(line) < 2:
                        continue
                    if line[1] == "TYPE_WAYPOINT":
                        waypoint.append(line)
                    elif line[1] == "TYPE_WIFI":
                        wifi.append(line)

            # nothing can be generated without both wifi rows and waypoints
            # (previously such a file crashed the whole run)
            if not wifi or not waypoint:
                continue

            # wifi data time sorted; a stable sort keeps the file order of rows
            # sharing a timestamp, so drop_duplicates(keep='last') in
            # getTimeStampDf is deterministic
            df = pd.DataFrame(np.array(wifi))
            df[0] = df[0].astype(np.int64)
            df.sort_values(by=[0], inplace=True, kind="mergesort")

            # waypoint data time sorted, numerically rather than as strings
            waypoint.sort(key=lambda fields: int(fields[0]))

            for wayPt in waypoint:
                currentTimeStamp = int(wayPt[0])
                timeDf = getTimeStampDf(df, currentTimeStamp)
                if timeDf is None:
                    # no wifi row precedes this waypoint
                    continue

                # wifi rssi based feature
                idFeature = getWiFiIDFeature(timeDf)
                idFeature["x"] = float(wayPt[2])
                idFeature["y"] = float(wayPt[3])
                idFeature["f"] = floor
                idFeature["path"] = pathName # useful for crossvalidation
                wifiIDdfs.append(idFeature)

                # wifi last seen time based feature
                dtFeature = getWiFiDtFeature(timeDf)
                wifiDtdfs.append(dtFeature)

    # pd.concat raises on an empty list, so skip buildings without any feature
    if not wifiIDdfs:
        print(f"{building}: no features generated")
        continue

    # wifi rssi feature writing
    os.makedirs(os.path.join(wiFiIDTargetDir, "train"), exist_ok=True)
    building_wiFiIDdf = pd.concat(wifiIDdfs)
    print(building_wiFiIDdf.shape)
    building_wiFiIDdf.to_csv(f"{wiFiIDTargetDir}/train/{building}_{minWiFiCount}_train.csv", index=False)

    # wifi dt feature writing
    os.makedirs(os.path.join(wiFiDtTargetDir, "train"), exist_ok=True)
    building_wiFiDtdf = pd.concat(wifiDtdfs)
    print(building_wiFiDtdf.shape)
    building_wiFiDtdf.to_csv(f"{wiFiDtTargetDir}/train/{building}_{minWiFiCount}_train.csv", index=False)

5a0546857ecc773753327266
-1 has 109 path files
0 has 131 path files
1 has 110 path files
2 has 78 path files
3 has 86 path files
(1974, 945)
(1974, 941)
5c3c44b80379370013e0fd2b
-1 has 69 path files
0 has 109 path files
1 has 100 path files
2 has 39 path files
3 has 47 path files
4 has 21 path files
(1306, 871)
(1306, 867)
5d27075f03f801723c2e360f
-1 has 234 path files
0 has 284 path files
1 has 240 path files
2 has 213 path files
3 has 144 path files
4 has 17 path files
5 has 3 path files
6 has 6 path files
(6137, 1809)
(6137, 1805)
5d27096c03f801723c31e5e0
-1 has 29 path files
0 has 65 path files
1 has 73 path files
2 has 52 path files
3 has 32 path files
4 has 50 path files
5 has 50 path files
(1574, 315)
(1574, 311)
5d27097f03f801723c320d97
-1 has 30 path files
-2 has 29 path files
0 has 17 path files
1 has 110 path files
2 has 80 path files
3 has 82 path files
4 has 56 path files
(3367, 400)
(3367, 396)
5d27099f03f801723c32511d
-1 has 13 path files
0 has 20 path files
1 has 62 pat

In [9]:
# Sanity check on the last building processed: the feature columns must be
# exactly the sorted bssid list. The RSSI frame carries four extra trailing
# columns (x, y, f, path), hence the [:-4] slices.
expectedColumns = sorted(bssid[building])
a = building_wiFiIDdf.columns.tolist()
b = building_wiFiDtdf.columns.tolist()

print(len(a), len(b), len(bssid[building]))

# columns that are not known bssids (both sets should be empty)
print(set(a[:-4]).difference(expectedColumns))
print(set(b).difference(expectedColumns))

# same content and same order
print(a[:-4] == expectedColumns)
print(b == expectedColumns)

566 562 562
set()
set()
True
True


```python
# Generate the features for the test set
# For every requested site_path_timestamp the wifi rows whose timestamp
# (column 0) is closest to the requested time are turned into one feature row.

ssubm_building_g = ssubm_df.groupby(0)
feature_dict = dict()

for gid0, g0 in ssubm_building_g:
    # Deliberate short-circuit (see the notebook introduction); remove this
    # `break` to actually build the test features.
    break
    index = sorted(bssid[g0.iloc[0,0]])
    feats = list()
    print(gid0)
    for gid,g in g0.groupby(1):

        # get all wifi time locations, 
        wifi = list()
        with open(os.path.join(base_path, 'test', g.iloc[0,1] + '.txt')) as f:
            for line in f:
                line = line.strip().split()
                # blank / truncated lines have no record type field
                if len(line) > 1 and line[1] == "TYPE_WIFI":
                    wifi.append(line)

        wifi_df = pd.DataFrame(wifi)
        if wifi:
            # Explicit int64: plain `int` can be 32-bit on some platforms and
            # would overflow on 13-digit timestamps. Computed once per path
            # instead of once per requested timepoint.
            wifi_times = wifi_df[0].astype(np.int64)
            wifi_points = np.sort(wifi_times.unique())

        for timepoint in g.iloc[:,2].tolist():

            if wifi:
                # argmin returns the first minimum, i.e. the earlier block on a tie
                min_delta_idx = np.abs(wifi_points - int(timepoint)).argmin()
                wifi_block_timestamp = wifi_points[min_delta_idx]

                wifi_block = wifi_df[wifi_times == wifi_block_timestamp].drop_duplicates(subset=3)
                feat = wifi_block.set_index(3)[4].reindex(index).fillna(wiFiIDMinRSSIValue)
            else:
                # path without any wifi row: every bssid is reported as missing
                feat = pd.Series(wiFiIDMinRSSIValue, index=index, dtype=object, name=4)

            feat['site_path_timestamp'] = g.iloc[0,0] + "_" + g.iloc[0,1] + "_" + timepoint
            feats.append(feat)
    feature_df = pd.concat(feats, axis=1).T
    feature_df.to_csv(f"{gid0}_{minWiFiCount}_test.csv")
    feature_dict[gid0] = feature_df
```  