# wifi features
Huge thanks to the great [wifi features](https://www.kaggle.com/devinanzelmo/wifi-features) notebook by [Devin Anzelmo](https://www.kaggle.com/devinanzelmo). I learned a lot from the notebook.
I've made a small change to the notebook, and it now runs faster: ~40 min instead of 2-4 hours. Hope this helps some Kagglers!

In case you find a bug, please leave a comment here :)

## Library imports

In [1]:
import collections
import csv
import glob
import multiprocessing
import os
from multiprocessing import Pool
from pathlib import Path
from typing import List, Tuple, Any

import numpy as np
import pandas as pd

## Configuration parameters

In [2]:
# Maps a floor folder name to an integer floor index: "F1"/"1F" map to 0,
# higher floors count upwards and "B1"/"B2" map to -1/-2.
# generate_train_for_timestamp indexes this dict with the name of the folder a
# trace file lives in, so a folder name missing here raises KeyError.
floor_map = {"B2": -2, "B1": -1, "F1": 0, "F2": 1, "F3": 2, "F4": 3, "F5": 4, "F6": 5, "F7": 6, "F8": 7, "F9": 8,
             "1F": 0, "2F": 1, "3F": 2, "4F": 3, "5F": 4, "6F": 5, "7F": 6, "8F": 7, "9F": 8}

In [3]:
# Count threshold passed as `n` to top_bssids_for_building; top_bssids keeps only
# BSSIDs whose occurrence count is strictly greater than this value.
minCount = 1
# Fill values generate_train_for_timestamp uses for the 'rssi', 'dt' and
# 'channel' columns of BSSIDs that have no value after reindexing.
rssiFillerValue = -999.0
dtFillerValue   = 1000.0
freqFillerValue = 0
# Directory into which generate_one writes `<building>_train.csv`.
outputDir = 'referencePublicNotebooks/wiFiFeatures'
# CSV read by generate_target_buildings and generate_test; both use its
# "site_path_timestamp" column.
sampleCsvPath = 'sample_submission.csv'

## Helper functions

In [4]:
def input_dir() -> Path:
    """Return the root directory under which the `train/` and `test/` data live.

    The current working directory is used. The Kaggle dataset location is kept
    in the comment below for reference.
    """
    # Kaggle location: Path('/kaggle/input/indoor-location-navigation/')
    data_root = Path('.')
    return data_root

def extract_wps_wifis(file: Path) -> Tuple[List[List[Any]], List[List[Any]]]:
    """Read one tab-separated trace file and return its waypoint and Wi-Fi rows.

    Args:
        file: Path of the trace file to parse.

    Returns:
        ``(wps, wifis)``. Each element of ``wps`` is a ``TYPE_WAYPOINT`` row with
        columns 2 and 3 (x, y) converted to ``float``. Each element of ``wifis``
        is a ``TYPE_WIFI`` row with column 4 (signal value) converted to ``int``.
        All other columns stay strings. Both lists are ordered by the numeric
        value of the timestamp in column 0.

    Raises:
        ValueError: If a timestamp, coordinate or signal value is not numeric.
    """
    wps = []
    wifis = []
    # Explicit encoding keeps parsing independent of the platform default;
    # newline="" is what the csv module expects from its input file.
    with open(file, encoding="utf-8", newline="") as f:
        for row in csv.reader(f, delimiter="\t", doublequote=True):
            # Blank lines and one-field lines carry no record type.
            if len(row) < 2:
                continue
            record_type = row[1]
            if record_type == "TYPE_WAYPOINT":
                row[2] = float(row[2])  # x
                row[3] = float(row[3])  # y
                wps.append(row)
            elif record_type == "TYPE_WIFI":
                row[4] = int(row[4])  # wifi signal value
                wifis.append(row)
    # Sort numerically: comparing the timestamps as strings would put "10"
    # before "9" whenever the digit counts differ.
    wps.sort(key=lambda r: int(r[0]))
    wifis.sort(key=lambda r: int(r[0]))
    return wps, wifis


def top_bssids(bssids: List[str], n: int) -> List[str]:
    """Return the BSSIDs that occur more than ``n`` times, sorted ascending.

    Args:
        bssids: BSSID strings, one entry per observation (duplicates expected).
        n: Count threshold; a BSSID is kept only if its count is strictly
            greater than ``n``.

    Returns:
        Sorted list of the retained BSSIDs; empty when ``bssids`` is empty.
    """
    counts = collections.Counter(bssids)
    return sorted(bssid for bssid, count in counts.items() if count > n)


def top_bssids_for_building(input_dir: Path, building: str, n: int) -> List[str]:
    """Collect the frequently seen BSSIDs across all training traces of a building.

    Every ``*.txt`` file inside each sub-folder of ``<input_dir>/train/<building>``
    is parsed, the BSSID (column 3) of every Wi-Fi row is gathered, and the
    combined list is filtered through ``top_bssids`` with threshold ``n``.
    """
    floor_pattern = os.path.join(input_dir, 'train/' + building + '/*')
    seen_bssids = []
    for floor_dir in sorted(glob.glob(floor_pattern)):
        for trace_file in glob.glob(os.path.join(floor_dir, "*.txt")):
            wifi_rows = extract_wps_wifis(Path(trace_file))[1]
            seen_bssids += [wifi_row[3] for wifi_row in wifi_rows]

    return top_bssids(seen_bssids, n)


def nearest_waypoint(timestamp: int, wps: np.ndarray) -> List[float]:
    """Return the (x, y) position at ``timestamp``, linearly interpolated between waypoints.

    Despite the name, this does not pick the closest waypoint: x and y are
    interpolated over the waypoint timestamps with ``np.interp``. Timestamps
    outside the waypoint range yield the first / last waypoint position.

    Args:
        timestamp: Time at which the position is wanted.
        wps: 2-D array of waypoint rows; column 0 is the timestamp, columns 2
            and 3 are x and y. Values must be convertible to int64 / float.

    Returns:
        ``[x, y]`` as a two-element list.
    """
    way_pt_timestamps = wps[:, 0].astype(np.int64)
    # np.interp silently returns nonsense unless the x-coordinates are
    # increasing, so order the waypoints by time here rather than trusting the
    # caller.
    order = np.argsort(way_pt_timestamps, kind="stable")
    way_pt_timestamps = way_pt_timestamps[order]
    waypoint_x = wps[:, 2].astype(float)[order]
    waypoint_y = wps[:, 3].astype(float)[order]

    interpolated_x = np.interp(timestamp, way_pt_timestamps, waypoint_x)
    interpolated_y = np.interp(timestamp, way_pt_timestamps, waypoint_y)
    return [interpolated_x, interpolated_y]
    


# Note: the result can contain exactly identical rows, because two Wi-Fi scan
# groups with the same readings can be assigned the same waypoint position.
def generate_train_for_building(building_path: Path, bssids: List[str]) -> pd.DataFrame:
    """Build the training table for every trace file of one building.

    Args:
        building_path: Directory whose sub-folders each hold ``*.txt`` trace files.
        bssids: BSSID list forwarded to ``generate_train_for_path``.

    Returns:
        A DataFrame with one row per entry returned by
        ``generate_train_for_path``; columns are unnamed (integer labels).
        Empty when no trace file is found.
    """
    rows = []
    for folder in sorted(building_path.glob('*')):
        # Sort the files as well as the folders so the row order does not depend
        # on the order in which the filesystem lists them.
        for file in sorted(folder.glob("*.txt")):
            rows.extend(generate_train_for_path(file, bssids))
    print(len(rows))
    return pd.DataFrame(rows)


def generate_train_for_path(path_file: Path, bssids: List[str]) -> List[Any]:
    """Turn one trace file into training rows, one per distinct Wi-Fi timestamp.

    The floor label is the name of the file's parent folder and the path id is
    the file name without extension. A ``dt`` column is added to the Wi-Fi
    rows as ``(timestamp - last_timestamp) / 1000.0``. Each timestamp group is
    then handed to ``generate_train_for_timestamp``.
    """
    floor = str(path_file.parent.name)
    path = path_file.stem

    waypoint_rows, wifi_rows = extract_wps_wifis(path_file)
    wps = np.array(waypoint_rows)

    wifi_columns = ['timestamp', 'type', 'ssid', 'bssid', 'rssi', 'channel', 'last_timestamp']
    wifis_df = pd.DataFrame(wifi_rows, columns=wifi_columns)

    # Timestamp feature: gap between the scan time and the last-seen time.
    scan_time = wifis_df['timestamp'].astype(float)
    last_seen_time = wifis_df['last_timestamp'].astype(float)
    wifis_df['dt'] = (scan_time - last_seen_time) / 1000.0

    return [
        generate_train_for_timestamp(int(group_key), group, wps, floor, path, bssids)
        for group_key, group in wifis_df.groupby('timestamp')
    ]


def generate_train_for_timestamp(timestamp: int, wifi_group: pd.DataFrame, wps: np.ndarray, floor: str, path: str, bssids: List[str]) -> List[Any]:
    """Build one flat training row for a single Wi-Fi scan timestamp.

    Args:
        timestamp: Timestamp of the scan group.
        wifi_group: Wi-Fi rows sharing that timestamp; must contain the columns
            ``bssid``, ``rssi``, ``dt`` and ``channel``.
        wps: Waypoint array passed on to ``nearest_waypoint``.
        floor: Floor name; must be a key of ``floor_map``.
        path: Path identifier stored as the last element of the row.
        bssids: BSSIDs that define the feature order.

    Returns:
        A list laid out as: rssi for every BSSID, then dt for every BSSID, then
        channel for every BSSID (column-major flattening), followed by x, y,
        the mapped floor index and ``path``.

    Raises:
        KeyError: If ``floor`` is not present in ``floor_map``.
    """
    waypoint = nearest_waypoint(timestamp, wps)
    # Keep one reading per BSSID so that the reindex below has a unique index.
    wifi_group = wifi_group.drop_duplicates(subset='bssid')

    # Align the three features on the building-wide BSSID list and replace the
    # gaps with the configured filler values.
    features = wifi_group.loc[:, ['bssid', 'rssi', 'dt', 'channel']]
    features = features.set_index('bssid').reindex(bssids)
    features = features.fillna(
        {'rssi': rssiFillerValue, 'dt': dtFillerValue, 'channel': freqFillerValue})
    row = features.values.flatten('F').tolist()

    # Append target values - x, y, floor and path.
    row.extend([waypoint[0], waypoint[1], floor_map[floor], path])
    return row


def generate_target_buildings() -> List[str]:
    """Return the sorted distinct site ids found in the sample submission.

    The site id is the part of each ``site_path_timestamp`` value before the
    first underscore.
    """
    submission = pd.read_csv(sampleCsvPath)
    key_parts = submission["site_path_timestamp"].apply(
        lambda key: pd.Series(key.split("_")))
    site_ids = key_parts[0]  # type: ignore
    return sorted(site_ids.unique().tolist())


def generate_one(building: str):
    """Generate the training feature CSV for one building.

    Writes ``<outputDir>/<building>_train.csv`` and prints progress markers and
    the shape of the generated table.

    Args:
        building: Name of the building folder under ``<input_dir>/train``.
    """
    print(f"start:{building}")
    # Create the output directory up front: otherwise a missing directory only
    # surfaces in to_csv, after all the feature extraction has been done.
    os.makedirs(outputDir, exist_ok=True)
    building_path = input_dir() / 'train' / building
    bssids = top_bssids_for_building(input_dir(), building, minCount)
    train_df = generate_train_for_building(building_path, bssids)
    print(train_df.shape)
    train_df.to_csv(f'{outputDir}/{building}_train.csv', index=False)
    print(f"end:{building}")


def generate_train():
    """Run ``generate_one`` over the target buildings in a process pool."""
    num_cores = multiprocessing.cpu_count()
    print(f"num_cores={num_cores}")
    # NOTE(review): only the first target building is processed ([0:1]); this
    # looks like a debugging limit - confirm before a full run.
    buildings = generate_target_buildings()[0:1]
    # The context manager shuts the worker processes down once map() has
    # returned, instead of leaving the pool open.
    with Pool(num_cores) as pool:
        pool.map(generate_one, buildings)


def generate_test_one(building_df: pd.DataFrame) -> None:
    """Build and save the signal-value feature table for one building's test rows.

    The site id is read from the first column of ``building_df``, rows are
    grouped by the ``path`` column, and the requested timestamps are taken from
    the third column. For each requested timestamp the Wi-Fi scan block with
    the closest timestamp is selected, and its signal values (column 4) are
    laid out over the building's BSSID list, with -999 for missing BSSIDs.
    The result is written to ``<building>_test.csv`` in the working directory.

    NOTE(review): unlike generate_train_for_timestamp, only signal values are
    emitted here (no dt / channel features), and the file is not written under
    outputDir - confirm whether this difference from the training output is
    intended.
    """
    building = building_df.iloc[0, 0]
    print(f"start: {building}")
    bssids = top_bssids_for_building(input_dir(), building, minCount) # type: ignore
    feats = []
    # group by path
    for path, path_df in building_df.groupby('path'):
        # Waypoints are discarded; only the Wi-Fi rows of the test trace are used.
        _, wifis = extract_wps_wifis(input_dir() / 'test' / f'{path}.txt')

        wifi_df = pd.DataFrame(wifis)
        # Distinct scan timestamps (column 0) of this trace, one per row.
        wifi_points = pd.DataFrame(wifi_df.groupby(0).count().index.tolist())
        for timepoint in path_df.iloc[:, 2].tolist():
            # Pick the scan timestamp with the smallest absolute time difference.
            deltas = (wifi_points.astype(int) - int(timepoint)).abs()
            min_delta_idx = deltas.values.argmin()
            wifi_block_timestamp = wifi_points.iloc[min_delta_idx].values[0]

            # Rows of that scan, one per BSSID (column 3).
            wifi_block = wifi_df[wifi_df[0] ==
                                 wifi_block_timestamp].drop_duplicates(subset=3)
            # Signal value per BSSID, aligned to the building's BSSID list.
            feat = wifi_block.set_index(3)[4].reindex(bssids).fillna(-999)

            feat['site_path_timestamp'] = f'{building}_{path}_{timepoint}'
            feats.append(feat)
    # Each Series becomes one row after the transpose.
    feature_df = pd.concat(feats, axis=1).T
    feature_df.to_csv(f"{building}_test.csv", index=False)
    print(f'end: {building}')


def generate_test():
    """Run ``generate_test_one`` for every site of the sample submission in a process pool.

    The ``site_path_timestamp`` column is split on underscores into ``site``,
    ``path`` and ``timestamp`` columns, and one DataFrame per site is sent to
    the workers.
    """
    sub_df = pd.read_csv(sampleCsvPath)
    sub_df = sub_df["site_path_timestamp"].apply(
        lambda x: pd.Series(x.split("_")))
    sub_df.columns = ['site', 'path', 'timestamp']

    building_dfs = [building_df for _, building_df in sub_df.groupby('site')]

    num_cores = multiprocessing.cpu_count()
    print(f"num_cores={num_cores}")
    # The context manager shuts the worker processes down once map() has
    # returned, instead of leaving the pool open.
    with Pool(num_cores) as pool:
        pool.map(generate_test_one, building_dfs)

```python
# Scratch setup: take the first target building and its first floor folder.
buildingsList = generate_target_buildings()
building = buildingsList[0]
folders = sorted(Path(f"{input_dir()}/train/{building}").glob('*'))
folder = folders[0]

print(f"building = {building}, floor = {folder.stem}")

# BSSID list for the building (count threshold 1), plus prefixed copies of
# the names for the dt and channel features.
bssids = top_bssids_for_building(input_dir(), building, 1) 
dtBssids = ["dt_"+x for x in bssids]
channelBssids = ["channel_"+x for x in bssids]
print(len(bssids))
```

```python
# Scratch walk-through of generate_train_for_path for the first trace file of
# the selected floor. Note that the signal column is named 'value' here,
# whereas generate_train_for_path names it 'rssi'.
#for path_file in sorted(files):
files = folder.glob("*.txt")
path_file = sorted(files)[0]
print(path_file.stem)
floor = str(path_file.parent.name)
wps, wifis = extract_wps_wifis(path_file)
wps = np.array(wps)
wifis_df = pd.DataFrame(wifis, columns=[
                        'timestamp', 'type', 'ssid', 'bssid', 'value', 'channel', 'last_timestamp'])

#wifis_df['dt_bssid'] = 'dt_' + wifis_df['bssid'].astype(str)
#wifis_df['channel_bssid'] = 'channel_' + wifis_df['bssid'].astype(str)

# adding timestamp feature
wifis_df['dt'] = (wifis_df['timestamp'].astype(float) - wifis_df['last_timestamp'].astype(float)) / 1000.0

    
# Only the first timestamp group is processed; the loop breaks right after it,
# leaving `wifi_group` and `waypoint` set for the cells that follow.
for timestamp, wifi_group in wifis_df.groupby('timestamp'):
    timestamp = int(timestamp)
    path = path_file.stem
    #row = generate_train_for_timestamp(timestamp, wifi_group, wps, floor, path, bssids)
    waypoint = nearest_waypoint(timestamp, wps)
    wifi_group = wifi_group.drop_duplicates(subset='bssid')
    break
    #tmp = wifi_group.iloc[:, 3:5]  # bssid and value
    #row = tmp.set_index('bssid').reindex(bssids).replace(np.nan, -999).T

#break
```    

```python
# Scratch walk-through of the feature steps in generate_train_for_timestamp,
# applied to the `wifi_group` left over from the previous cell.
wifi_group.head(3)
tmp = wifi_group.loc[:,['bssid', 'value', 'dt','channel']]
tmp.head(3)

# Align on the building's BSSID list; BSSIDs absent from the scan become NaN.
row = tmp.set_index('bssid').reindex(bssids)
row.head(3)

# Replace the NaN entries with the configured filler values.
row.fillna({'value':rssiFillerValue, 'dt':dtFillerValue, 'channel':freqFillerValue}, inplace=True)
row.head(3)

# Column-major flattening: all 'value' entries, then all 'dt', then all 'channel'.
features = row.values.flatten('F').tolist()
print(len(features))

# f1 = row['value'].notnull().index.values
# f2 = row['dt'].notnull().index.values
# f3 = row['channel'].notnull().index.values

# np.array_equal(f1,f2)
# np.array_equal(f1,f3)
```

```python
# Scratch version of an alternative layout that builds one row per feature
# with prefixed column names.
# NOTE(review): this cell reads 'dt_bssid' and 'channel_bssid' columns, whose
# creation is commented out in the earlier scratch cell - presumably it raises
# KeyError unless those lines are re-enabled; confirm before running.
tmp = wifi_group.loc[:, ['bssid','value']]  # bssid and value
print(tmp.shape)
print(tmp.head(2))

dtTmp = wifi_group.loc[:, ['dt_bssid','dt']]  # prefixed bssid and dt
print(dtTmp.shape)
print(dtTmp.head(3))

channelTmp = wifi_group.loc[:, ['channel_bssid','channel']]  # prefixed bssid and channel
print(channelTmp.shape)
print(channelTmp.head(3))


# Reindex each feature onto its name list, fill the gaps and transpose to a single row.
row = tmp.set_index('bssid').reindex(bssids).replace(np.nan, -999).T
dtRow = dtTmp.set_index('dt_bssid').reindex(dtBssids).replace(np.nan, 1000.0).T
channelRow = channelTmp.set_index('channel_bssid').reindex(channelBssids).replace(np.nan, 0).T

# Append the dt and channel columns to the signal-value row.
row[dtRow.columns] = dtRow.values
row[channelRow.columns] = channelRow.values

# Check that the filled-in positions line up across the three features.
np.array_equal(np.where(row.values > -500)[1], np.where(dtRow.values < 500.0)[1]) 
np.array_equal(np.where(row.values > -500)[1], np.where(channelRow.astype(int).values > 100)[1]) 
```

In [5]:
%%time
generate_train()

num_cores=4
start:5a0546857ecc773753327266
9296
(9296, 10174)
end:5a0546857ecc773753327266
CPU times: user 2.18 s, sys: 65.6 ms, total: 2.24 s
Wall time: 3min 34s


In [6]:
#generate_test()