- change model architecture
- remove postprocessing from this notebook (it is implemented in a separate notebook)
- add dropout

In [1]:
# credit: https://www.kaggle.com/kokitanisaka/lstm-by-keras-with-unified-wi-fi-feats
# NOTE(review): the version is not pinned; the recorded run below ended up installing 0.1.0.
# NOTE(review): the package is never imported in the visible cells (set_seed only sets the
# TF_DETERMINISTIC_OPS environment variable) -- confirm whether this install is still needed.
!pip install tensorflow-determinism

Collecting tensorflow-determinism
  Downloading tensorflow-determinism-0.3.0.tar.gz (12 kB)
  Downloading tensorflow-determinism-0.2.0.tar.gz (10 kB)
  Downloading tensorflow-determinism-0.1.0.tar.gz (7.2 kB)
Building wheels for collected packages: tensorflow-determinism
  Building wheel for tensorflow-determinism (setup.py) ... [?25l- \ done
[?25h  Created wheel for tensorflow-determinism: filename=tensorflow_determinism-0.1.0-py3-none-any.whl size=5255 sha256=f9eb5c91a975ae5a2ca6d47c562d5ab57f3fd9ccbeab80784853c78e70f85b0f
  Stored in directory: /root/.cache/pip/wheels/ce/06/d4/df757adc4c81f705a74a0317c27cf19919ccd25ae2a6ecd2c5
Successfully built tensorflow-determinism
Installing collected packages: tensorflow-determinism
Successfully installed tensorflow-determinism-0.1.0


In [2]:
import numpy as np
import pandas as pd
import scipy.stats as stats
from pathlib import Path
import glob
import pickle

import random
import os

from tqdm.notebook import tqdm

from sklearn.model_selection import StratifiedKFold
from sklearn.preprocessing import StandardScaler, LabelEncoder, MinMaxScaler, QuantileTransformer

import tensorflow as tf
import tensorflow.keras.layers as L
import tensorflow.keras.models as M
import tensorflow.keras.backend as K
import tensorflow_addons as tfa
from tensorflow_addons.layers import WeightNormalization
from tensorflow.keras.callbacks import ReduceLROnPlateau, ModelCheckpoint, EarlyStopping

from tensorflow.keras.layers import *
from tensorflow.keras.models import *
from tensorflow.keras.layers import *
from tensorflow.keras.optimizers import *
from tensorflow.keras.metrics import *
from tensorflow.keras.utils import *
from tensorflow.keras.callbacks import *

import multiprocessing
import scipy.interpolate
import scipy.sparse
from tqdm import tqdm

from sklearn.decomposition import PCA

from scipy.spatial.distance import cdist

import warnings
warnings.filterwarnings("ignore")

In [3]:
# options
N_SPLITS, SEED = 5, 2021  # number of CV folds / seed passed to set_seed() and the fold splitter
NUM_FEATS = 30  # number of features that we use. there are 100 feats but we don't need to use all of them
base_path = '/kaggle'  # root directory under which the input datasets are read

In [4]:
def set_seed(seed=42):
    """Seed Python, NumPy and TensorFlow and install a single-threaded TF1-compat session.

    Args:
        seed: integer applied to ``random``, ``numpy.random`` and ``tf.random``;
            it is also exported (as a string) through ``PYTHONHASHSEED``.

    Side effects:
        Sets the ``PYTHONHASHSEED`` and ``TF_DETERMINISTIC_OPS`` environment
        variables and replaces the Keras backend session with one whose
        intra-op and inter-op thread pools are limited to one thread each.
    """
    # Environment switches.
    os.environ['PYTHONHASHSEED'] = str(seed)
    os.environ['TF_DETERMINISTIC_OPS'] = '1'

    # Library-level random number generators.
    random.seed(seed)
    np.random.seed(seed)
    tf.random.set_seed(seed)

    # One thread per pool, bound to the current default graph.
    single_thread_config = tf.compat.v1.ConfigProto(
        intra_op_parallelism_threads=1,
        inter_op_parallelism_threads=1,
    )
    tf.compat.v1.keras.backend.set_session(
        tf.compat.v1.Session(
            graph=tf.compat.v1.get_default_graph(),
            config=single_thread_config,
        )
    )
    
def comp_metric(xhat, yhat, fhat, x, y, f):
    """Mean per-sample error: planar distance plus 15 times the absolute floor difference.

    Args:
        xhat, yhat, fhat: predicted x, y and floor arrays.
        x, y, f: ground-truth x, y and floor arrays of the same length.

    Returns:
        Sum of the per-sample errors divided by ``xhat.shape[0]``.
    """
    planar_error = np.sqrt(np.power(xhat - x, 2) + np.power(yhat - y, 2))
    floor_penalty = 15 * np.abs(fhat - f)
    per_sample = planar_error + floor_penalty
    n_samples = xhat.shape[0]
    return per_sample.sum() / n_samples

def xy_comp_metric(xhat, yhat, x, y):
    """Mean Euclidean distance between predicted and true (x, y) positions.

    Args:
        xhat, yhat: predicted coordinate arrays.
        x, y: ground-truth coordinate arrays of the same length.

    Returns:
        Sum of the per-sample distances divided by ``xhat.shape[0]``.
    """
    squared_dx = np.power(xhat - x, 2)
    squared_dy = np.power(yhat - y, 2)
    distances = np.sqrt(squared_dx + squared_dy)
    return distances.sum() / xhat.shape[0]

def xy_loss(true, pred):
    """Keras loss: batch mean of the Euclidean distance between columns 0 and 1 of pred and true."""
    delta_x = pred[:, 0] - true[:, 0]
    delta_y = pred[:, 1] - true[:, 1]
    distance = K.sqrt(K.square(delta_x) + K.square(delta_y))
    return K.mean(distance)

In [5]:
# Directory holding the per-site feature CSVs ("*_train.csv" / "*_test.csv").
feature_dir = f"{base_path}/input/indooruniteddataset" #indoor-mod-united-dataset"  


def _sorted_matches(pattern):
    """Return the sorted list of files in feature_dir matching the glob pattern."""
    return sorted(glob.glob(os.path.join(feature_dir, pattern)))


train_files = _sorted_matches('*_train.csv')
test_files = _sorted_matches('*_test.csv')

# The sample submission supplies the row keys that the final predictions are re-indexed to.
subm = pd.read_csv(f'{base_path}/input/indoor-location-navigation/sample_submission.csv', index_col=0)

# Site id = text before the first "_" of every "site_path_timestamp" key.
submission_keys = pd.DataFrame(subm.index)["site_path_timestamp"]
all_sites = sorted({key.split("_")[0] for key in submission_keys})

In [6]:
# training target features
BSSID_FEATS = [f'bssid_{i}' for i in range(NUM_FEATS)]
RSSI_FEATS  = [f'rssi_{i}' for i in range(NUM_FEATS)]

In [7]:
def _read_site_frames(csv_paths):
    """Read every per-site CSV, tag its rows with the site id and stack the frames.

    The site id is the part of the file name before the first "_". It is taken
    from the base name, so the result does not depend on how deep `base_path`
    is (the previous `split("/")[4]` only worked for a 3-level directory prefix).

    Args:
        csv_paths: iterable of CSV file paths.

    Returns:
        One DataFrame with all rows, a fresh 0..n-1 index and a `site_id` column.
    """
    frames = []
    for csv_path in tqdm(csv_paths):
        frame = pd.read_csv(csv_path)
        frame["site_id"] = os.path.basename(csv_path).split("_")[0]
        frames.append(frame)
    return pd.concat(frames).reset_index(drop=True)


data = _read_site_frames(train_files)
test_data = _read_site_frames(test_files)

100%|██████████| 48/48 [00:22<00:00,  2.17it/s]
100%|██████████| 24/24 [00:01<00:00, 21.97it/s]


In [8]:
# Rows whose site also appears in the submission file ("normal") versus rows from
# sites that exist only in the training data ("train only").
site_in_submission = data.site_id.isin(all_sites).to_numpy()
normal_index = data.index[site_in_submission]
train_only_index = data.index[~site_in_submission]
assert len(set(train_only_index).intersection(normal_index)) == 0

In [9]:
# Number of leading columns scanned for BSSID values.
# NOTE(review): assumes the first 60 columns of both frames are the bssid_* columns -- confirm.
wifi_feat_nums = 60


def _distinct_values(frame, n_cols):
    """Return the set of distinct values found in the first `n_cols` columns of `frame`."""
    found = set()
    for col_pos in range(n_cols):
        found.update(frame.iloc[:, col_pos].values.tolist())
    return found


wifi_bssids_train = _distinct_values(data, wifi_feat_nums)
print(f'BSSID TYPES: {len(wifi_bssids_train)}')

wifi_bssids_test = list(_distinct_values(test_data, wifi_feat_nums))
print(f'BSSID TYPES: {len(wifi_bssids_test)}')

# Union of both vocabularies. Previously the test list was appended to the train
# list, so ids present in both were counted twice and wifi_bssids_size (the
# embedding input_dim) was larger than the number of classes the LabelEncoder
# can actually produce.
wifi_bssids = list(wifi_bssids_train.union(wifi_bssids_test))
wifi_bssids_size = len(wifi_bssids)

BSSID TYPES: 54067
BSSID TYPES: 25654


In [10]:
# Disabled option: attach floor predictions from another notebook to the test set.
#floor_pred = pd.read_csv('../input/lstm-prediction-dataset/submission.csv') 
#test_data['floor'] = floor_pred['floor'].values

# Keep the raw floor numbers of the "normal" rows for scoring, because the
# `floor` column is replaced by one-hot columns right below.
original_floor_values = data.loc[normal_index, "floor"].to_numpy()

data = pd.get_dummies(data, columns=['floor'])
#test_data = pd.get_dummies(test_data, columns=['floor'])

# Names of the one-hot floor columns (every column whose name contains "floor").
floor_columns = [name for name in data.columns if "floor" in name]

In [11]:
# preprocess
ss = StandardScaler()
ss.fit(data.loc[:,RSSI_FEATS])

le = LabelEncoder()
le.fit(wifi_bssids)

le_site = LabelEncoder()
le_site.fit(data['site_id'])

LabelEncoder()

In [12]:
# Apply the fitted scaler / encoders to both frames.
#
# Whole-column assignment (`frame[col] = ...`) is used instead of
# `frame.loc[:, col] = ...`: on recent pandas the `.loc[:, col]` form writes into
# the existing column and tries to keep its dtype, which leaves the encoded
# BSSID / site ids as `object` columns and the scaled RSSI values in integer
# columns. Replacing the column gives the encoded/scaled dtype on every version.
for frame in (data, test_data):
    frame[RSSI_FEATS] = ss.transform(frame[RSSI_FEATS])

    for col in BSSID_FEATS:
        frame[col] = le.transform(frame[col])

    frame['site_id'] = le_site.transform(frame['site_id'])

In [13]:
# # pca
# ALL_RSSI_FEATS = [i for i in data.columns if "rssi" in i]
# num = 5
# pca_cols = ["pca-rssi"+str(i+1) for i in range(num)]
# pca = PCA(n_components=num,random_state=42)
# pca_train = pca.fit_transform(data[ALL_RSSI_FEATS])
# pca_test = pca.transform(test_data[ALL_RSSI_FEATS])
# pca_train = pd.DataFrame(pca_train, columns=pca_cols)
# pca_test = pd.DataFrame(pca_test, columns=pca_cols)

# data = pd.concat([data, pca_train],axis=1)
# test_data = pd.concat([test_data, pca_test],axis=1)

In [14]:
# Number of distinct encoded site ids; used as the site embedding's input_dim.
site_count = data['site_id'].nunique(dropna=False)
data = data.reset_index(drop=True)

In [15]:
# Seed the Python / NumPy / TensorFlow RNGs (and install the single-threaded session)
# before any model is created in the cells below.
set_seed(SEED)

In [16]:
# Separate rows of train-only sites from rows of sites that also appear in the
# submission; from here on `data` holds only the latter, re-indexed from 0.
is_train_only_row = data.index.isin(train_only_index)
is_normal_row = data.index.isin(normal_index)

train_only_data = data.loc[is_train_only_row].copy().reset_index(drop=True)
data = data.loc[is_normal_row].copy().reset_index(drop=True)
data.shape, train_only_data.shape

((258125, 136), (98658, 136))

# model

In [17]:
# Show which GPU device name TensorFlow reports for this session.
print(tf.test.gpu_device_name())

/device:GPU:0


In [18]:
# Embedding widths used by create_model() for BSSID ids and site ids.
bssid_embed_dim, site_embed_dim = 64, 2
# Arguments passed to model.fit in the training loop.
batch_size, epochs = 256, 40

In [19]:
def conv_layer(x, n_channels=32, kernel_size=3, pool_size=2, padding='same'):
    """Apply Conv1D -> BatchNormalization -> ReLU -> MaxPool1D to a tensor.

    Args:
        x: input tensor for the Conv1D layer.
        n_channels: number of convolution filters.
        kernel_size: convolution kernel size.
        pool_size: max-pooling window.
        padding: padding mode of the convolution (the pooling layer always uses 'same').

    Returns:
        The pooled output tensor.

    Note: only referenced from the commented-out CNN variant in create_model().
    """
    stages = (
        Conv1D(filters=n_channels, kernel_size=kernel_size, padding=padding),
        BatchNormalization(),
        ReLU(),
        MaxPool1D(pool_size=pool_size, padding='same'),
    )
    for stage in stages:
        x = stage(x)
    return x

def create_model(input_data):
    """Build and compile the three-input network predicting (x, y) and the floor.

    Args:
        input_data: sequence whose first two items are the BSSID and RSSI
            feature tables. Only ``input_data[0].shape[1]`` and
            ``input_data[1].shape[1]`` are read, to size the input layers.

    Returns:
        A compiled Keras model with inputs ``[bssid_ids, rssi_values, site_id]``
        and outputs ``['xy', 'floor']``.

    Relies on the notebook globals ``wifi_bssids_size``, ``bssid_embed_dim``,
    ``site_count``, ``site_embed_dim``, ``NUM_FEATS``, ``floor_columns`` and
    ``xy_loss``.
    """

    # bssid feats ------
    input_dim = input_data[0].shape[1]

    input_embd_layer = L.Input(shape=(input_dim,))
    x1 = L.Embedding(wifi_bssids_size, bssid_embed_dim)(input_embd_layer)
    x1 = L.Flatten()(x1)

    # rssi feats -------
    input_dim = input_data[1].shape[1]

    ## mlp
    # Explicit shape tuple (previously a bare int passed positionally).
    input_layer = L.Input(shape=(input_dim,))
    x2 = L.BatchNormalization()(input_layer)
    x2 = L.Dropout(0.3)(x2) # add
    x2 = L.Dense(NUM_FEATS * 64, activation='relu')(x2)

    ## cnn (disabled alternative to the mlp branch)
    #input_layer = Input(shape=(input_dim, 1, 1))
    #x2 = Reshape((input_dim, 1))(input_layer)
    #x2 = conv_layer(x2, 16, 5, 2)
    #x2 = conv_layer(x2, 16, 5, 2)
    #x2 = Reshape((-1,))(x2) #x = GlobalMaxPool1D()(x)

    # site -------------
    input_site_layer = L.Input(shape=(1,))
    x3 = L.Embedding(site_count, site_embed_dim)(input_site_layer)
    x3 = L.Flatten()(x3)

    # main stream ------
    x = L.Concatenate(axis=1)([x1, x3, x2])

    x = L.BatchNormalization()(x)
    x = L.Dropout(0.3)(x)
    x = L.Dense(256, activation='relu')(x)

    # The dense features become a sequence of length 1 for the recurrent layers.
    x = L.Reshape((1, -1))(x)
    x = L.BatchNormalization()(x)
    x = L.Bidirectional(L.LSTM(128, dropout=0.3, recurrent_dropout=0.3, return_sequences=True, activation='elu'))(x)
    x = L.Bidirectional(L.LSTM(16, return_sequences=False, activation='elu'))(x)

    # Two heads: position regression and floor classification over the one-hot floor columns.
    output_layer_1 = L.Dense(2, name='xy')(x)
    output_layer_2 = L.Dense(len(floor_columns), activation='softmax', name='floor')(x)

    model = M.Model([input_embd_layer, input_layer, input_site_layer],
                    [output_layer_1, output_layer_2])

    # `learning_rate` replaces the deprecated `lr` keyword (same value).
    model.compile(optimizer=tf.optimizers.Adam(learning_rate=0.002),
                  loss={'xy': xy_loss, 'floor': tf.keras.losses.CategoricalCrossentropy()},
                  metrics={'xy': xy_loss, 'floor': 'accuracy'})

    return model

In [20]:
score_df = pd.DataFrame()
predictions = list()

# Out-of-fold predictions for the "normal" rows and accumulators for the test set.
oof_xy = list()
oof_x, oof_y, oof_f = np.zeros(data.shape[0]), np.zeros(data.shape[0]), np.zeros(data.shape[0])
preds_x, preds_y = 0, 0
preds_f_arr = np.zeros((test_data.shape[0], N_SPLITS))

FEATURE_COLS = BSSID_FEATS + RSSI_FEATS + ['site_id']  #+ pca_cols

# Floor number behind each one-hot column, parsed from the "floor_<value>" names.
# This replaces the hard-coded `argmax - 2`, which was only right when the
# columns start at floor -2 and are contiguous; in that case the result is identical.
floor_labels = np.array([int(float(col.rsplit('_', 1)[-1])) for col in floor_columns])


def _model_inputs(frame):
    """Split a feature frame into the [bssid, rssi, site] inputs the model expects."""
    return [frame.loc[:, BSSID_FEATS], frame.loc[:, RSSI_FEATS], frame.loc[:, 'site_id']]


test_inputs = _model_inputs(test_data)

# Folds are stratified on the `path` column (used as both X and y of the splitter).
splitter = StratifiedKFold(n_splits=N_SPLITS, shuffle=True, random_state=SEED)
for fold, (trn_idx, val_idx) in enumerate(splitter.split(data.loc[:, 'path'], data.loc[:, 'path'])):

    # Training set = this fold's training rows plus all rows of train-only sites.
    X_train = pd.concat([data.loc[trn_idx, FEATURE_COLS],
                         train_only_data[FEATURE_COLS]], axis=0).reset_index(drop=True)
    y_trainx = pd.concat([data.loc[trn_idx, 'x'], train_only_data['x']], axis=0).reset_index(drop=True)
    y_trainy = pd.concat([data.loc[trn_idx, 'y'], train_only_data['y']], axis=0).reset_index(drop=True)
    y_trainf = pd.concat([data.loc[trn_idx][floor_columns], train_only_data[floor_columns]], axis=0).reset_index(drop=True)
    y_train = [pd.concat([y_trainx, y_trainy], axis=1), y_trainf]

    # Validation uses rows of "normal" sites only.
    X_valid = data.loc[val_idx, FEATURE_COLS]
    y_validx = data.loc[val_idx, 'x']
    y_validy = data.loc[val_idx, 'y']
    y_validf = data.loc[val_idx][floor_columns]
    y_valid = [pd.concat([y_validx, y_validy], axis=1), y_validf]

    train_inputs = _model_inputs(X_train)
    valid_inputs = _model_inputs(X_valid)
    weights_path = f'RNN_{SEED}_{fold}.hdf5'

    model = create_model(train_inputs)
    model.fit(train_inputs, y_train,
              validation_data=(valid_inputs, y_valid),
              batch_size=batch_size, epochs=epochs,
              callbacks=[
                  ReduceLROnPlateau(monitor='val_xy_loss', factor=0.1, patience=3, verbose=1, min_delta=1e-4, mode='min'),
                  ModelCheckpoint(weights_path, monitor='val_xy_loss', verbose=1, save_best_only=True, save_weights_only=True, mode='min'),
                  EarlyStopping(monitor='val_xy_loss', min_delta=1e-4, patience=5, mode='min', baseline=None, restore_best_weights=True),
              ])

    # Score and predict with the best checkpoint of this fold.
    model.load_weights(weights_path)
    val_pred = model.predict(valid_inputs)

    oof_x[val_idx] = val_pred[0][:, 0]
    oof_y[val_idx] = val_pred[0][:, 1]
    oof_f[val_idx] = floor_labels[np.argmax(val_pred[1], axis=1)]

    pred = model.predict(test_inputs)
    preds_x += pred[0][:, 0]
    preds_y += pred[0][:, 1]
    preds_f_arr[:, fold] = floor_labels[np.argmax(pred[1], axis=1)]

    score = comp_metric(oof_x[val_idx], oof_y[val_idx], oof_f[val_idx],
                        y_validx.to_numpy(), y_validy.to_numpy(), original_floor_values[val_idx])
    print(f"fold {fold}: overall metrics {score}")
    score2 = xy_comp_metric(oof_x[val_idx], oof_y[val_idx],
                            y_validx.to_numpy(), y_validy.to_numpy())
    print(f"fold {fold}: root mean xy position error {score2}")
    oof_xy.append(score2)

    K.clear_session()

# Average the per-fold test coordinates.
preds_x /= (fold + 1)
preds_y /= (fold + 1)

print("*+"*40)
score = comp_metric(oof_x, oof_y, oof_f, data["x"].to_numpy(), data["y"].to_numpy(), original_floor_values)
print(f"overall metrics {score}")
score2 = xy_comp_metric(oof_x, oof_y, data["x"].to_numpy(), data["y"].to_numpy())
print(f"root mean xy position error {score2}")
print(oof_xy)
print("*+"*40)

# The test floor is the most frequent prediction across folds.
preds_f_mode = stats.mode(preds_f_arr, axis=1)
preds_f = preds_f_mode[0].astype(int).reshape(-1)
test_preds = pd.DataFrame(np.stack((preds_f, preds_x, preds_y))).T
test_preds.columns = subm.columns
test_preds.index = test_data["site_path_timestamp"]
test_preds["floor"] = test_preds["floor"].astype(int)
predictions.append(test_preds)

Epoch 1/40

Epoch 00001: val_xy_loss improved from inf to 13.67450, saving model to RNN_2021_0.hdf5
Epoch 2/40

Epoch 00002: val_xy_loss improved from 13.67450 to 8.74545, saving model to RNN_2021_0.hdf5
Epoch 3/40

Epoch 00003: val_xy_loss improved from 8.74545 to 8.28319, saving model to RNN_2021_0.hdf5
Epoch 4/40

Epoch 00004: val_xy_loss improved from 8.28319 to 7.56105, saving model to RNN_2021_0.hdf5
Epoch 5/40

Epoch 00005: val_xy_loss improved from 7.56105 to 7.02734, saving model to RNN_2021_0.hdf5
Epoch 6/40

Epoch 00006: val_xy_loss did not improve from 7.02734
Epoch 7/40

Epoch 00007: val_xy_loss improved from 7.02734 to 6.91871, saving model to RNN_2021_0.hdf5
Epoch 8/40

Epoch 00008: val_xy_loss improved from 6.91871 to 6.85925, saving model to RNN_2021_0.hdf5
Epoch 9/40

Epoch 00009: val_xy_loss did not improve from 6.85925
Epoch 10/40

Epoch 00010: val_xy_loss improved from 6.85925 to 6.09279, saving model to RNN_2021_0.hdf5
Epoch 11/40

Epoch 00011: val_xy_loss did not

In [21]:
# Stack the collected prediction frames and order the rows like the sample submission.
all_preds = pd.concat(predictions).reindex(subm.index)

################################################################
# Disabled option: overwrite the floor column with predictions from another notebook.
#floor_pred = pd.read_csv('../input/lstm-prediction-dataset/submission.csv') 
#all_preds['floor'] = floor_pred['floor'].values

all_preds.to_csv('submission_bef_pp.csv')