In [1]:
import pandas as pd
import numpy as np
import os
import glob
from tqdm.notebook import tqdm
import json
import gc

In [2]:
# Root directory; the other cells build their 'input' and 'output' paths from it.
data_path = 'data2'

# Floor label -> integer level: 'F1'/'1F' is level 0, basements are negative.
# Both spellings of a floor ('F3' and '3F') map to the same level.
floor_map = {
    "B2": -2,
    "B1": -1,
    **{f"F{level}": level - 1 for level in range(1, 10)},
    **{f"{level}F": level - 1 for level in range(1, 10)},
}

In [3]:
# Load the sample submission and split its composite key into three columns.
# Every 'site_path_timestamp' value is split on '_' into site, path and
# timestamp. The vectorised ``str.split(..., expand=True)`` produces one column
# per part directly instead of building a pd.Series for every row via apply().
ssubm_df = pd.read_csv(os.path.join(data_path, 'input', 'sample_submission.csv'))
ssubm_df = ssubm_df['site_path_timestamp'].str.split('_', expand=True)
ssubm_df.columns = ['site', 'path', 'timestamp']

In [4]:
def read_path(file_path):
    """Read the WiFi rows of one path file into a DataFrame.

    The file is read as tab-separated text. Lines starting with '#' are
    header/comment lines; the site id is taken from the first header field
    that starts with 'SiteID:'. Every other line whose second field is
    'TYPE_WIFI' is kept as one WiFi reading.

    Parameters
    ----------
    file_path : str
        Path of the file to read. The file name without its extension is
        used as the path id.

    Returns
    -------
    pandas.DataFrame
        Columns ['sys_ts', 'bssid', 'rssi', 'site', 'path'], all values kept
        as the strings found in the file (``site`` is None when no
        'SiteID:' header field exists). An empty, column-less DataFrame is
        returned when the file contains no WiFi rows.

    Raises
    ------
    ValueError
        Raised by pandas when the collected WiFi rows do not have exactly
        seven fields.
    """
    # os.path copes with the separator produced by os.path.join on any
    # platform, unlike a hard-coded split on '/'.
    path = os.path.splitext(os.path.basename(file_path))[0]

    site_tag = 'SiteID:'
    site = None
    wifis = list()

    # Stream the file instead of loading every line into memory first.
    with open(file_path, 'r', encoding='utf-8') as file:
        for raw_line in file:
            line_data = raw_line.strip()
            if not line_data:
                continue

            if line_data[0] == '#':
                # Look the site id up by its tag rather than by a fixed
                # line/field position, so short or reordered headers do not
                # raise IndexError.
                if site is None:
                    for field in line_data.split('\t'):
                        if field.startswith(site_tag):
                            site = field[len(site_tag):]
                            break
                continue

            fields = line_data.split('\t')

            # A line without a second field cannot be a sensor record; skip
            # it instead of failing on fields[1].
            if len(fields) > 1 and fields[1] == 'TYPE_WIFI':
                wifis.append(fields)

    if len(wifis) == 0:
        return pd.DataFrame()

    wifi_df = pd.DataFrame(wifis)
    wifi_df.columns = ['sys_ts', 'type', 'ssid', 'bssid', 'rssi', 'freq', 'last_ts']
    # Keep only the columns used for feature building (same order as before).
    wifi_df = wifi_df[['sys_ts', 'bssid', 'rssi']].copy()

    wifi_df['site'] = site
    wifi_df['path'] = path

    return wifi_df

In [5]:
def read_site(test_paths, site):
    """Read every path file of one site and stack the results.

    Parameters
    ----------
    test_paths : iterable of str
        File paths, each handed to ``read_path``.
    site : str
        Site id; only its first 10 characters are used, as the label of the
        progress bar.

    Returns
    -------
    pandas.DataFrame
        ``pd.concat`` of the per-file frames in the order of ``test_paths``;
        each frame keeps the index it was returned with.
    """
    progress = tqdm(test_paths, desc=site[:10] + ' read paths')
    return pd.concat([read_path(path_file) for path_file in progress])

In [6]:
def create_site_feature(site_df, bssids):
    """Turn the long WiFi table of one site into one feature row per scan.

    A scan is the set of rows sharing the same ('sys_ts', 'path') pair.
    Grouping on 'sys_ts' alone would merge scans of different paths that
    happen to carry the same timestamp and credit them to a single path.

    Parameters
    ----------
    site_df : pandas.DataFrame
        Must provide the columns 'sys_ts', 'bssid', 'rssi' and 'path'
        (other columns are ignored). May be empty.
    bssids : list
        BSSIDs that become the feature columns, in this order.

    Returns
    -------
    pandas.DataFrame
        One row per scan, ordered by 'sys_ts' then 'path'. Columns are the
        entries of ``bssids`` followed by 'path' and 'sys_ts'. A cell holds
        the rssi of the first reading of that BSSID within the scan, or
        -999 when the BSSID was not seen. Every row is labelled 'rssi' in
        the index. For an empty ``site_df`` an empty frame with those
        columns is returned.
    """
    if site_df.empty:
        # Nothing to group; keep the output schema so callers can still save it.
        return pd.DataFrame(columns=list(bssids) + ['path', 'sys_ts'])

    feature_dfs = list()

    # 'sys_ts' first keeps the rows in timestamp order.
    scans = site_df.groupby(['sys_ts', 'path'])

    for (sys_ts, path), group_df in tqdm(scans):
        # Keep the first reading of each BSSID within the scan.
        scan_df = group_df.drop_duplicates(subset='bssid')
        # Select by name so the result does not depend on column positions.
        feat = scan_df.set_index('bssid')[['rssi']].reindex(bssids).replace(np.nan, -999).T
        # Assign the scalars one by one; assigning a label-indexed Series to
        # several columns relies on deprecated positional Series indexing.
        feat['path'] = path
        feat['sys_ts'] = sys_ts

        feature_dfs.append(feat)

    feature_df = pd.concat(feature_dfs)
    gc.collect()
    return feature_df

In [7]:
# Sanity check: parse a single test file with read_path and display the result.
file_path = os.path.join(data_path, 'input', 'test', '00ff0c9a71cc37a2ebdd0f05.txt')
wifi_df = read_path(file_path)
wifi_df

Unnamed: 0,sys_ts,bssid,rssi,site,path
0,0000000001180,889bfa434d66eed8c386ccbc90f445932c43f8dd,-58,5da1389e4db8ce0c98bd0547,00ff0c9a71cc37a2ebdd0f05
1,0000000001180,29c7d9e757292e7b2b3d00dc4dae7514531b20b4,-63,5da1389e4db8ce0c98bd0547,00ff0c9a71cc37a2ebdd0f05
2,0000000001180,98d67fadac518296992afddd24e97a2855af9472,-64,5da1389e4db8ce0c98bd0547,00ff0c9a71cc37a2ebdd0f05
3,0000000001180,11567178cc5ca582a37c4733207c77739e1bf5fd,-64,5da1389e4db8ce0c98bd0547,00ff0c9a71cc37a2ebdd0f05
4,0000000001180,bd400fbef9b9b15143e93f8ad2efb07c076e2f5b,-66,5da1389e4db8ce0c98bd0547,00ff0c9a71cc37a2ebdd0f05
...,...,...,...,...,...
2726,0000000086716,b1bf92177ec7aefb36d71ed7efbc6f97f5f31a98,-89,5da1389e4db8ce0c98bd0547,00ff0c9a71cc37a2ebdd0f05
2727,0000000086716,ea3cf52d4c93e6cdc26478871bc11493b474bd23,-89,5da1389e4db8ce0c98bd0547,00ff0c9a71cc37a2ebdd0f05
2728,0000000086716,68240ad4bdd59cd3b623d81cedceca3f10b0a701,-89,5da1389e4db8ce0c98bd0547,00ff0c9a71cc37a2ebdd0f05
2729,0000000086716,9770ecfe259938435eb4e3751883ffd797c527b7,-89,5da1389e4db8ce0c98bd0547,00ff0c9a71cc37a2ebdd0f05


In [8]:
# Build and save one feature CSV per site listed in the sample submission.

# Create the output directory up front: otherwise to_csv only fails after the
# first site has been fully read and processed.
features_dir = os.path.join(data_path, 'output', 'features', 'test')
os.makedirs(features_dir, exist_ok=True)

# Grouping by the plain column name makes the group key the site id itself.
for site, group_df in tqdm(ssubm_df.groupby('site')):
    # BSSID list of this site, used as the feature columns
    # (presumably written by an earlier step -- verify against that notebook).
    with open(os.path.join(data_path, 'output', 'bssid', site + '.json'), encoding='utf-8') as f:
        bssids = json.load(f)

    # Distinct paths of the site, most frequent in the submission first.
    paths = group_df['path'].value_counts().index.tolist()

    test_paths = [os.path.join(data_path, 'input', 'test', path + '.txt') for path in paths]

    site_df = read_site(test_paths, site)
    feauter_df = create_site_feature(site_df, bssids)

    feauter_df.to_csv(os.path.join(features_dir, site + '.csv'), index=False)
    gc.collect()

  0%|          | 0/24 [00:00<?, ?it/s]

5a0546857e read paths:   0%|          | 0/29 [00:00<?, ?it/s]

  0%|          | 0/1048 [00:00<?, ?it/s]

5c3c44b803 read paths:   0%|          | 0/3 [00:00<?, ?it/s]

  0%|          | 0/98 [00:00<?, ?it/s]

5d27075f03 read paths:   0%|          | 0/5 [00:00<?, ?it/s]

  0%|          | 0/201 [00:00<?, ?it/s]

5d27096c03 read paths:   0%|          | 0/60 [00:00<?, ?it/s]

  0%|          | 0/3211 [00:00<?, ?it/s]

5d27097f03 read paths:   0%|          | 0/17 [00:00<?, ?it/s]

  0%|          | 0/802 [00:00<?, ?it/s]

5d27099f03 read paths:   0%|          | 0/5 [00:00<?, ?it/s]

  0%|          | 0/227 [00:00<?, ?it/s]

5d2709a003 read paths:   0%|          | 0/20 [00:00<?, ?it/s]

  0%|          | 0/996 [00:00<?, ?it/s]

5d2709b303 read paths:   0%|          | 0/31 [00:00<?, ?it/s]

  0%|          | 0/2607 [00:00<?, ?it/s]

5d2709bb03 read paths:   0%|          | 0/34 [00:00<?, ?it/s]

  0%|          | 0/3085 [00:00<?, ?it/s]

5d2709c303 read paths:   0%|          | 0/31 [00:00<?, ?it/s]

  0%|          | 0/1312 [00:00<?, ?it/s]

5d2709d403 read paths:   0%|          | 0/51 [00:00<?, ?it/s]

  0%|          | 0/4582 [00:00<?, ?it/s]

5d2709e003 read paths:   0%|          | 0/31 [00:00<?, ?it/s]

  0%|          | 0/2113 [00:00<?, ?it/s]

5da138274d read paths:   0%|          | 0/8 [00:00<?, ?it/s]

  0%|          | 0/385 [00:00<?, ?it/s]

5da1382d4d read paths:   0%|          | 0/11 [00:00<?, ?it/s]

  0%|          | 0/1146 [00:00<?, ?it/s]

5da138314d read paths:   0%|          | 0/17 [00:00<?, ?it/s]

  0%|          | 0/962 [00:00<?, ?it/s]

5da138364d read paths:   0%|          | 0/10 [00:00<?, ?it/s]

  0%|          | 0/668 [00:00<?, ?it/s]

5da1383b4d read paths:   0%|          | 0/26 [00:00<?, ?it/s]

  0%|          | 0/1456 [00:00<?, ?it/s]

5da138754d read paths:   0%|          | 0/23 [00:00<?, ?it/s]

  0%|          | 0/1935 [00:00<?, ?it/s]

5da138764d read paths:   0%|          | 0/36 [00:00<?, ?it/s]

  0%|          | 0/1711 [00:00<?, ?it/s]

5da1389e4d read paths:   0%|          | 0/13 [00:00<?, ?it/s]

  0%|          | 0/514 [00:00<?, ?it/s]

5da138b74d read paths:   0%|          | 0/29 [00:00<?, ?it/s]

  0%|          | 0/1231 [00:00<?, ?it/s]

5da958dd46 read paths:   0%|          | 0/51 [00:00<?, ?it/s]

  0%|          | 0/3279 [00:00<?, ?it/s]

5dbc1d84c1 read paths:   0%|          | 0/50 [00:00<?, ?it/s]

  0%|          | 0/2087 [00:00<?, ?it/s]

5dc8cea765 read paths:   0%|          | 0/35 [00:00<?, ?it/s]

  0%|          | 0/1662 [00:00<?, ?it/s]