## Library imports

In [1]:
# basic imports
import os
import gc
import math
import glob
import random
import itertools
import numpy as np
import pandas as pd
from tqdm.notebook import tqdm

# basic plotting library
import matplotlib.pyplot as plt

# interactive plots
import plotly.express as px
import plotly.graph_objects as go
from plotly.subplots import make_subplots
from plotly.offline import iplot

import warnings  
warnings.filterwarnings('ignore')

## Config parameters

In [2]:
class CFG:
    """Notebook-wide settings, read as class attributes (``CFG.NAME``).

    Only ``BUILDING_SITES_RANGE`` is consumed by the cells visible in this
    part of the notebook (as the start/stop of a slice over the building
    list); the remaining values are not referenced in the visible cells.
    """

    # ---- pipeline parameters ----
    SEED = 42
    TRAIN = True
    LR_FIND = False
    GENERATE_OOF = True
    N_FOLDS = 5
    N_EPOCHS = 2
    TEST_BATCH_SIZE = 32
    TRAIN_BATCH_SIZE = 16
    NUM_WORKERS = 4
    DATA_FRAC = 1.0
    FOLD_TO_TRAIN = [0, 1, 2, 3, 4]

    # ---- model parameters ----
    MODEL_ARCH = 'MLP'
    MODEL_NAME = 'mlp_vtest'
    WGT_PATH = ''
    WGT_MODEL = ''
    PRINT_N_EPOCH = 2

    # ---- scheduler variables ----
    MAX_LR = 1e-2
    MIN_LR = 1e-5
    # Other values listed by the author:
    # 'ReduceLROnPlateau', 'None', 'OneCycleLR', 'CosineAnnealingLR'
    SCHEDULER = 'CosineAnnealingWarmRestarts'
    T_0 = 10    # CosineAnnealingWarmRestarts
    T_MULT = 2  # CosineAnnealingWarmRestarts
    T_MAX = 5   # CosineAnnealingLR

    # ---- optimizer variables ----
    OPTIMIZER = 'Adam'
    WEIGHT_DECAY = 1e-6
    GRD_ACC_STEPS = 1
    MAX_GRD_NORM = 1000

    # ---- features parameters ----
    USE_FREQ_FEATS = True
    # [start, stop) positions applied to the sorted building list.
    BUILDING_SITES_RANGE = [0, 1]

In [3]:
# Floor label -> integer floor index. Basement labels are negative and both
# spellings of an above-ground floor ("F3" and "3F") share the same index.
floor_map = {"B2": -2, "B1": -1}
floor_map.update({f"F{level}": level - 1 for level in range(1, 10)})
floor_map.update({f"{level}F": level - 1 for level in range(1, 10)})

minCount = 1

# Filler values, one per feature kind (presumably placeholders for missing
# readings -- confirm against the feature-generation code).
rssiFillerValue = -999.0
dtFillerValue = 1000.0
freqFillerValue = 0

# Input / output locations, relative to the notebook's working directory.
featureInputDir = 'referencePublicNotebooks/wiFiFeatures'
modelOutputDir = 'modelSaveDir'
sampleCsvPath = 'sample_submission.csv'
npySaveDir = 'referencePublicNotebooks/wiFi_npy_features'


In [4]:
# Collect the per-building training feature files in a stable (sorted) order.
# The pattern is "*_train.csv" rather than "*.csv" because this notebook later
# writes "<building>_npyTrain.csv" into the same directory; with "*.csv" that
# file sorts ahead of "<building>_train.csv" and would be selected as the
# first building when the notebook is re-run.
buildingsList = sorted(glob.glob(f"{featureInputDir}/*_train.csv"))

# Keep only the configured [start, stop) window of buildings.
buildingsList = buildingsList[CFG.BUILDING_SITES_RANGE[0]: CFG.BUILDING_SITES_RANGE[1]]

# Fail with an explicit message instead of an IndexError further down.
if not buildingsList:
    raise FileNotFoundError(
        f"no '*_train.csv' feature files selected from '{featureInputDir}' "
        f"with BUILDING_SITES_RANGE={CFG.BUILDING_SITES_RANGE}"
    )

print(os.path.basename(buildingsList[0]))

5a0546857ecc773753327266_train.csv


## Helper functions

In [5]:
def getBuildingName(buildingCsvPath):
    """Return the building id encoded in a training-feature CSV path.

    The id is the file name with the exact ``_train.csv`` suffix removed.
    ``str.rstrip`` is deliberately not used: it strips any trailing run of the
    given *characters*, so an id ending in e.g. ``a`` or ``c`` would lose its
    last character(s).

    Args:
        buildingCsvPath: path to a ``<buildingId>_train.csv`` file.

    Returns:
        The building id. If the file name does not end with ``_train.csv``
        the file name is returned unchanged.
    """
    suffix = '_train.csv'
    fileName = os.path.basename(buildingCsvPath)
    if fileName.endswith(suffix):
        return fileName[:-len(suffix)]
    return fileName

In [6]:
# Load the training feature table for one building from a fixed path
# (the path is spelled out here, not derived from buildingsList or featureInputDir).
train_csv = pd.read_csv('referencePublicNotebooks/wiFiFeatures/5a0546857ecc773753327266_train.csv')

In [7]:
# Summarise the loaded frame: row/column counts, dtype breakdown and memory use.
train_csv.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 9296 entries, 0 to 9295
Columns: 10175 entries, 0 to 10174
dtypes: float64(6782), int64(3392), object(1)
memory usage: 721.6+ MB


In [8]:
# Five columns are not features (one leading and four trailing columns are
# excluded when a row's feature vector is sliced); the rest is treated as
# three equally sized consecutive groups.
numFeatures = int((len(train_csv.columns) - 5) / 3)

# Building id taken from the first selected feature file.
buildingName = getBuildingName(buildingsList[0])

In [9]:
# Display the extracted building id as a quick sanity check.
buildingName

'5a0546857ecc773753327266'

In [10]:
# Export every row of train_csv as its own scaled float32 feature vector
# "<npySaveDir>/<buildingName>_<pathName>_<timeStamp>.npy".
#
# Column positions used: 0 -> timestamp, last -> path name, 1 .. -5 -> features
# (three consecutive groups of `numFeatures` values each).

# np.save does not create missing directories.
os.makedirs(npySaveDir, exist_ok=True)

# Divisor for each of the three feature groups, in column order. The first two
# equal rssiFillerValue and dtFillerValue, so a filler entry scales to 1.0
# (presumably the groups are RSSI, time delta and frequency -- confirm).
rssiScale = -999.0
dtScale = 1000.0
freqScale = 5000.0

# Rows are converted and scaled a chunk at a time: pulling single rows out of a
# mixed-dtype frame with iloc builds an object-dtype row on every iteration,
# while a chunk keeps the float32 working buffer small.
chunkRows = 1000

for chunkStart in range(0, train_csv.shape[0], chunkRows):
    chunk = train_csv.iloc[chunkStart:chunkStart + chunkRows]

    # copy=True guarantees a private, writable buffer for the in-place scaling.
    chunkData = chunk.iloc[:, 1:-4].to_numpy(dtype=np.float32, copy=True)

    # scale output
    chunkData[:, 0:numFeatures] /= rssiScale
    chunkData[:, numFeatures:2 * numFeatures] /= dtScale
    chunkData[:, 2 * numFeatures:3 * numFeatures] /= freqScale

    timeStamps = chunk.iloc[:, 0].tolist()
    pathNames = chunk.iloc[:, -1].tolist()

    for offset, (timeStamp, pathName) in enumerate(zip(timeStamps, pathNames)):
        i = chunkStart + offset
        rowData = np.ascontiguousarray(chunkData[offset])

        # write to file
        np.save(f"{npySaveDir}/{buildingName}_{pathName}_{timeStamp}.npy", rowData)

        if i % 1000 == 0:
            print(f"{i+1} rows saved")

1 rows saved
1001 rows saved
2001 rows saved
3001 rows saved
4001 rows saved
5001 rows saved
6001 rows saved
7001 rows saved
8001 rows saved
9001 rows saved


In [11]:
# Keep only the first column (timestamp) and the last four columns; the wide
# block of feature columns in between is dropped.
tempCsv = train_csv.iloc[:, [0, *range(-4, 0)]]

In [12]:
# Preview the first three rows of the reduced table.
tempCsv.head(3)

Unnamed: 0,0,10171,10172,10173,10174
0,1578467411228,106.034371,162.169371,-1,5e1580adf4c3420006d520d4
1,1578467413209,104.20165,162.448659,-1,5e1580adf4c3420006d520d4
2,1578467415183,102.375404,162.72696,-1,5e1580adf4c3420006d520d4


In [13]:
# Save the reduced table next to the source features. The directory comes from
# featureInputDir instead of a second hard-coded copy of the same path, so the
# output follows the configured input location.
# NOTE: the file lands in the feature input directory, so any "*.csv" glob over
# that directory will match it as well.
tempCsv.to_csv(f"{featureInputDir}/{buildingName}_npyTrain.csv")