In [1]:
import pandas as pd
import numpy as np
import os
import glob
from tqdm.notebook import tqdm
import json
import gc

In [None]:
# One-off environment setup: install matplotlib with the same interpreter that
# runs this kernel (sys.executable), so the package lands in the kernel's env.
# NOTE(review): none of the cells visible here import matplotlib — confirm this
# cell is still needed.
import sys
!{sys.executable} -m pip install matplotlib

In [2]:
# Root folder of the project data; the cells below read from <data_path>/input
# and write to <data_path>/output.
data_path = 'data2'

# Floor-folder label -> integer level. "F1"/"1F" map to 0, each floor above adds
# one, and basements are negative. Both the "F<n>" and "<n>F" spellings are
# covered for n = 1..9.
floor_map = {"B2": -2, "B1": -1}
floor_map.update({"F%d" % level: level - 1 for level in range(1, 10)})
floor_map.update({"%dF" % level: level - 1 for level in range(1, 10)})

In [3]:
# Parse the sample submission to find out which sites need features.
# Every `site_path_timestamp` id is split on '_' into three parts. The
# vectorised `.str.split(expand=True)` replaces a row-wise `apply` that built
# one pd.Series per row, which is far slower on a large submission file.
ssubm_df = pd.read_csv(os.path.join(data_path, 'input', 'sample_submission.csv'))
ssubm_df = ssubm_df['site_path_timestamp'].str.split('_', expand=True)
ssubm_df.columns = ['site', 'path', 'timestamp']

# Site ids ordered by how many submission rows reference them (most first).
used_sites = ssubm_df['site'].value_counts().index.tolist()

In [4]:
def read_path(file_path):
    """Read one trace file and label each WiFi observation with a waypoint.

    The file is expected at ``<...>/<site>/<floor>/<path>.<ext>``; site, floor
    and path id are taken from those three trailing path components.

    Parameters
    ----------
    file_path : str
        Path to a tab-separated trace file. Lines that are blank or start with
        '#' are ignored.

    Returns
    -------
    pandas.DataFrame
        One row per ``TYPE_WIFI`` record with columns
        ``['sys_ts', 'bssid', 'rssi', 'site', 'floor', 'path', 'x', 'y']``.
        ``x``/``y`` are copied (as the strings found in the file) from the
        ``TYPE_WAYPOINT`` record whose timestamp is closest to the WiFi
        record's ``sys_ts``; ties go to the waypoint that appears first.
        An empty DataFrame is returned when the file has no WiFi records or
        no waypoint records.

    Raises
    ------
    KeyError
        If the floor folder name is not a key of the module-level ``floor_map``.
    """
    # Use os.path instead of splitting on '/', so paths produced by
    # os.path.join/glob are parsed correctly whatever the OS separator is.
    floor_dir, file_name = os.path.split(os.path.normpath(file_path))
    site_dir, floor_name = os.path.split(floor_dir)
    site = os.path.basename(site_dir)
    floor = floor_map[floor_name]
    path = os.path.splitext(file_name)[0]

    waypoints = []  # (timestamp, x, y)
    wifis = []      # (sys_ts, bssid, rssi)

    with open(file_path, 'r', encoding='utf-8') as file:
        for raw_line in file:
            raw_line = raw_line.strip()
            if not raw_line or raw_line[0] == '#':
                continue

            fields = raw_line.split('\t')
            if len(fields) < 2:
                # No record type present: skip instead of failing on fields[1].
                continue

            record_type = fields[1]
            # Only the fields that are actually used are kept, so a record with
            # extra trailing fields does not break the column layout.
            if record_type == 'TYPE_WIFI' and len(fields) >= 5:
                wifis.append((fields[0], fields[3], fields[4]))
            elif record_type == 'TYPE_WAYPOINT' and len(fields) >= 4:
                waypoints.append((fields[0], fields[2], fields[3]))

    if not wifis or not waypoints:
        return pd.DataFrame()

    wifi_df = pd.DataFrame(wifis, columns=['sys_ts', 'bssid', 'rssi'])
    wifi_df['site'] = site
    wifi_df['floor'] = floor
    wifi_df['path'] = path

    # Nearest-waypoint lookup, done once per distinct scan timestamp. This
    # avoids iterating a groupby keyed by a one-element list, whose group keys
    # are 1-tuples on pandas >= 2 and therefore cannot be passed to int().
    sys_ts = [wifi[0] for wifi in wifis]
    scan_ts = list(dict.fromkeys(sys_ts))
    scan_ts_int = np.array([int(ts) for ts in scan_ts], dtype=np.int64)
    wp_ts_int = np.array([int(wp[0]) for wp in waypoints], dtype=np.int64)

    # argmin returns the first minimum, i.e. the earliest-listed waypoint on ties.
    nearest_idx = np.abs(scan_ts_int[:, None] - wp_ts_int[None, :]).argmin(axis=1)
    nearest_by_ts = dict(zip(scan_ts, nearest_idx.tolist()))

    wifi_df['x'] = [waypoints[nearest_by_ts[ts]][1] for ts in sys_ts]
    wifi_df['y'] = [waypoints[nearest_by_ts[ts]][2] for ts in sys_ts]

    return wifi_df

In [5]:
def read_site(site_path):
    """Read every trace file of one site and stack the per-path WiFi tables.

    Parameters
    ----------
    site_path : str
        Site folder; trace files are looked up as ``<site_path>/*/*``.

    Returns
    -------
    pandas.DataFrame
        Row-wise concatenation of the non-empty frames returned by
        ``read_path`` (original per-path row indices are kept, so index
        values repeat across paths). An empty DataFrame is returned when the
        folder holds no trace files or none of them yields any rows.
    """
    # normpath drops a trailing separator so the folder name is never empty.
    site = os.path.basename(os.path.normpath(site_path))

    # Sorted so the row order does not depend on filesystem listing order.
    train_paths = sorted(glob.glob(os.path.join(site_path, '*', '*')))

    paths_dfs = []
    for train_path in tqdm(train_paths, desc=site[:10] + ' read paths'):
        path_df = read_path(train_path)
        # Skip the empty placeholder frames read_path returns for unusable files.
        if not path_df.empty:
            paths_dfs.append(path_df)

    # pd.concat raises ValueError on an empty list, so handle that case here.
    if not paths_dfs:
        return pd.DataFrame()

    return pd.concat(paths_dfs)

In [6]:
def create_site_feature(site_df, min_count=1000):
    """Turn a site's long WiFi table into one wide feature row per scan.

    Parameters
    ----------
    site_df : pandas.DataFrame
        Long table with at least the columns ``sys_ts``, ``bssid``, ``rssi``,
        ``x``, ``y``, ``floor`` and ``path``.
    min_count : int, default 1000
        A bssid becomes a feature column only if it occurs in more than
        ``min_count`` rows of ``site_df``.

    Returns
    -------
    (pandas.DataFrame, list)
        The feature frame and the list of bssids used as feature columns
        (ordered by descending row count). The frame has one row per distinct
        ``sys_ts``: one column per selected bssid holding that scan's rssi
        (first occurrence when a bssid repeats within a scan, -999 when the
        bssid is absent), followed by ``x``, ``y``, ``floor``, ``path`` and
        ``sys_ts`` taken from the scan's first row.
    """
    meta_cols = ['x', 'y', 'floor', 'path', 'sys_ts']

    bssid_count = site_df['bssid'].value_counts()
    bssids = bssid_count[bssid_count > min_count].index.tolist()

    feature_rows = []

    for _, group_df in tqdm(site_df.groupby('sys_ts', group_keys=False)):
        scan = group_df.drop_duplicates(subset='bssid')

        # Select columns by name rather than by position so the result does
        # not depend on the column order of site_df.
        feat = (
            scan[['bssid', 'rssi']]
            .set_index('bssid')
            .reindex(bssids)
            .replace(np.nan, -999)
            .T
        )

        # Assign the metadata one column at a time from labelled lookups.
        # Assigning a Series to several columns at once makes pandas index the
        # Series by integer position, a fallback deprecated in pandas 2.1.
        first_obs = scan.iloc[0]
        for col in meta_cols:
            feat[col] = first_obs[col]

        feature_rows.append(feat)

    if not feature_rows:
        # pd.concat raises ValueError on an empty list; return an empty frame
        # with the expected columns instead.
        gc.collect()
        return pd.DataFrame(columns=bssids + meta_cols), bssids

    feature_df = pd.concat(feature_rows)
    gc.collect()
    return feature_df, bssids

In [75]:
# Exploration: load all training paths of a single, hard-coded site.
site_df = read_site(os.path.join(data_path, 'input', 'train', '5d27075f03f801723c2e360f'))

5d27075f03 read paths:   0%|          | 0/1141 [00:00<?, ?it/s]

In [76]:
# Inspect the stacked per-path WiFi rows for this site (index values repeat
# because each path keeps its own row numbering).
site_df

Unnamed: 0,sys_ts,bssid,rssi,site,floor,path,x,y
0,1571392393290,2edc5b7a5e108c03585d0413af3f8d0a0eff6e62,-48,5d27075f03f801723c2e360f,1,5da98de2df065a00069be6d4,81.21368,106.87596
1,1571392393290,294a5d6e08480ef62f25b39f7447ad64df650619,-49,5d27075f03f801723c2e360f,1,5da98de2df065a00069be6d4,81.21368,106.87596
2,1571392393290,82880d2839e20429b96e088ce8535a1fc36cb963,-49,5d27075f03f801723c2e360f,1,5da98de2df065a00069be6d4,81.21368,106.87596
3,1571392393290,38b2ef79b6d1c1a2a3f338e5060e760b3e18a0ca,-50,5d27075f03f801723c2e360f,1,5da98de2df065a00069be6d4,81.21368,106.87596
4,1571392393290,5c13e738721357f640cfa9ac17091fcf8146e5e3,-51,5d27075f03f801723c2e360f,1,5da98de2df065a00069be6d4,81.21368,106.87596
...,...,...,...,...,...,...,...,...
874,1571309327235,f9d650188e96137bd4c7dc3a96b7783f19881635,-89,5d27075f03f801723c2e360f,5,5da84747ae6cfc0006ca8268,54.08458,74.89516
875,1571309327235,2919faa14cdf7d1c40e10bbee47205e138f6d46f,-90,5d27075f03f801723c2e360f,5,5da84747ae6cfc0006ca8268,54.08458,74.89516
876,1571309327235,07244f79daa5ec8938ae36ba341262aa5ee11df4,-91,5d27075f03f801723c2e360f,5,5da84747ae6cfc0006ca8268,54.08458,74.89516
877,1571309327235,38f8bc0074544eb010dbca5fe45025a47478071c,-91,5d27075f03f801723c2e360f,5,5da84747ae6cfc0006ca8268,54.08458,74.89516


In [77]:
# Quick check of the feature builder on the first 1000 rows only.
# NOTE(review): create_site_feature keeps only bssids seen more than 1000 times,
# which a 1000-row sample cannot satisfy, yet the saved output below shows bssid
# columns — presumably it was produced with a different threshold; confirm.
feauter_df, bssids = create_site_feature(site_df.head(1000))

  0%|          | 0/2 [00:00<?, ?it/s]

In [78]:
# Inspect the wide feature rows: one row per scan timestamp, one column per bssid.
feauter_df

bssid,47f130e620fd5908a353b66b8537c5cd81f82318,fe035a849d728f6e37a8ffb93964655bd108aabe,44af9f2cd5d5ce0f4c8b6416b15d52a552b90735,c3fca718dc87e82937da9a9ebff0ebd156d7db33,8f95dae562bfb3c35cae53e8360d82bef990ddf9,dad3a56e96bd9fc9a70f65c15d529a80f0c3b567,e054f20200e3a114d462beabd0a8fcd3dcb76f74,6e15fa474f4da69c8825fb05b4dde834a7d6bac2,42a39d402d6526260d68f8db3191580769b16590,86146ed74de90b80a1374a0f9d5f411e3cfa4e4f,...,53c1a2f1936fdf3d50a7644cae4afbbece689c34,b0b9d3ee626d27f001962da6814d2e3aee0b6070,57e16aeb1473e1e6fadfcaa14a2d47713505978d,f9978a67201a56412361f693b8d4fec314ff3fd1,37240291e1dd4139b6e28640bc9078c21561ad67,x,y,floor,path,sys_ts
rssi,-69,-83,-72,-85,-85,-71,-81,-76,-72,-84,...,-90,-89,-91,-85,-88,81.21368,106.87596,1,5da98de2df065a00069be6d4,1571392393290
rssi,-69,-83,-69,-86,-85,-68,-82,-76,-72,-84,...,-999,-999,-999,-999,-999,80.9252,103.64109,1,5da98de2df065a00069be6d4,1571392395691


In [7]:
# Build and save the WiFi feature table and the bssid list for each site.

# NOTE(review): the loop starts at the 23rd site — presumably to resume an
# interrupted run; set to 0 to process every site in `used_sites`.
FIRST_SITE_IDX = 22

features_dir = os.path.join(data_path, 'output', 'features', 'train')
bssid_dir = os.path.join(data_path, 'output', 'features', 'bssid')
# to_csv() and open() do not create missing folders, so create them up front.
os.makedirs(features_dir, exist_ok=True)
os.makedirs(bssid_dir, exist_ok=True)

for site in tqdm(used_sites[FIRST_SITE_IDX:]):
    site_path = os.path.join(data_path, 'input', 'train', site)

    site_df = read_site(site_path)

    feauter_df, bssids = create_site_feature(site_df)

    # index=False: the frame's row labels are not written to the CSV.
    feauter_df.to_csv(os.path.join(features_dir, site+'.csv'), index=False)
    with open(os.path.join(bssid_dir, site+'.json'), 'w') as f:
        json.dump(bssids, f)

    # Drop the per-site objects before the next iteration to release memory.
    del site_path
    del site_df
    del feauter_df
    del bssids
    gc.collect()

  0%|          | 0/2 [00:00<?, ?it/s]

5d27075f03 read paths:   0%|          | 0/1141 [00:00<?, ?it/s]

  0%|          | 0/23665 [00:00<?, ?it/s]

5c3c44b803 read paths:   0%|          | 0/385 [00:00<?, ?it/s]

  0%|          | 0/9736 [00:00<?, ?it/s]