In [44]:
import numpy as np
import pandas as pd
import pickle
from torch.utils.data import Dataset
import time
import torch
from fastai.text.all import *
from sklearn.preprocessing import MinMaxScaler

# functions

In [45]:
def load_pickle(filename):
    """Load and return the object stored in the pickle file `filename`.

    Args:
        filename: path of the pickle file to read.

    Returns:
        The unpickled Python object.

    Raises:
        OSError: if the file cannot be opened.
        pickle.UnpicklingError (or other exceptions): if the content cannot be unpickled.
    """
    # SECURITY: unpickling can execute arbitrary code; only load files from a trusted source.
    # The `with` block guarantees the handle is closed even when unpickling fails.
    with open(filename, 'rb') as infile:
        return pickle.load(infile)

def save_pickle(obj, filename):
    """Serialize `obj` with pickle and write it to `filename` (overwriting any existing file).

    Args:
        obj: any picklable Python object.
        filename: destination path.

    Raises:
        OSError: if the file cannot be opened for writing.
        pickle.PicklingError (or other exceptions): if `obj` cannot be pickled.
    """
    # The `with` block guarantees the handle is flushed and closed even when pickling fails.
    with open(filename, 'wb') as outfile:
        pickle.dump(obj, outfile)
    
    
    
def comp_metric(xhat, yhat, fhat, x, y, f):
    """Average per-element error over the first two tensor dimensions.

    For each element the error is the Euclidean distance between (xhat, yhat)
    and (x, y) plus 15 times the absolute difference between fhat and f.
    The summed error is divided by xhat.shape[0] and then by xhat.shape[1].
    """
    dx = xhat - x
    dy = yhat - y
    planar_error = torch.sqrt(torch.pow(dx, 2) + torch.pow(dy, 2))
    f_error = torch.abs(fhat - f)
    per_element = planar_error + 15 * f_error
    n_first, n_second = xhat.shape[0], xhat.shape[1]
    return per_element.sum() / n_first / n_second

def comp_metric2(xhat, yhat, fhat, x, y, f):
    """Same per-element error as `comp_metric`, averaged over the first dimension only.

    The error is the Euclidean distance between (xhat, yhat) and (x, y) plus
    15 times |fhat - f|; the sum is divided by xhat.shape[0].
    """
    squared_distance = torch.pow(xhat - x, 2) + torch.pow(yhat - y, 2)
    per_element = torch.sqrt(squared_distance) + 15 * torch.abs(fhat - f)
    total = per_element.sum()
    return total / xhat.shape[0]


def loss_fn(outputs, labels):
    """Training loss: `comp_metric` evaluated over every position of the sequence.

    Along the last axis, channel 0 is passed as the `f` value, channel 1 as `x`
    and channel 2 as `y`, for both predictions and labels.
    """
    fhat, xhat, yhat = (outputs[:, :, channel] for channel in range(3))
    f, x, y = (labels[:, :, channel] for channel in range(3))
    return comp_metric(xhat, yhat, fhat, x, y, f)

def metric_fn(outputs, labels):
    """Evaluation metric: `comp_metric2` computed on the last sequence position only.

    Along the last axis, channel 0 is passed as the `f` value, channel 1 as `x`
    and channel 2 as `y`, for both predictions and labels.
    """
    fhat, xhat, yhat = (outputs[:, -1, channel] for channel in range(3))
    f, x, y = (labels[:, -1, channel] for channel in range(3))
    return comp_metric2(xhat, yhat, fhat, x, y, f)



class StopAt(Callback):
    """Cancel the fit once epoch `stop` has finished.

    Epochs are counted from 0, so `stop=1` lets epochs 0 and 1 run and cancels
    training right before epoch 2 would start.
    """

    def __init__(self, stop):
        self.stop = stop
        super().__init__()

    def before_epoch(self):
        # Index of the first epoch that must NOT run.
        first_skipped_epoch = self.stop + 1
        if self.epoch == first_skipped_epoch:
            raise CancelFitException()

# useful info

In [46]:
# Use the GPU when PyTorch can see one, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"
print(f"Using {device} device")

Using cuda device


# data

In [47]:
%%time
# Load the preprocessed inputs from the Kaggle input directory.
# `data` is indexed by key; later cells read data[key][0] as the feature array
# and data[key][1] as the target array.
# NOTE(review): pickle files can execute code on load — only use trusted inputs.
data = load_pickle('../input/indoor-location-rnn-data-v2/data.pickle')
#test_data = load_pickle('../input/indoor-location-rnn-test-data-v2/test-data.pickle')
# `path_data` maps a path id to the keys of `data` belonging to that path
# (see train_test_split, which iterates it that way).
path_data = load_pickle('../input/indoor-location-rnn-data-v2/path_data.pickle')

CPU times: user 3.1 s, sys: 971 ms, total: 4.07 s
Wall time: 4.09 s


In [1]:
%%time
# normalize X
# Pool the last feature column (RSSI) of every sample into a single (N, 1) array
# and fit one MinMaxScaler on it so that all samples share the same scaling.
rssi_list = np.hstack([data[key][0][:, -1] for key in data]).reshape(-1, 1)

scaler = MinMaxScaler()
scaler.fit(rssi_list)

# Replace every entry with float32 copies whose last feature column is rescaled.
for key in data:
    features = data[key][0].astype('float32')
    targets = data[key][1].astype('float32')
    features[:, -1:] = scaler.transform(features[:, -1:])
    data[key] = (features, targets)
    
# for key in test_data:
#     X = test_data[key][0].astype('float32')
#     X[:, -1:] = scaler.transform(X[:, -1:])
#     test_data[key] = (X, test_data[key][1].astype('float32'))

NameError: name 'data' is not defined

In [49]:
# Fixed sequence length: MyDataset truncates or pads every sample to this many rows.
# The commented-out lines below summarise the per-sample sequence lengths;
# presumably 174 was chosen from that summary — TODO confirm.
# seq_lens = []
# for key in train_data:
#     seq_lens.append(train_data[key][0].shape[0])
# pd.Series(seq_lens).describe()
seq_len = 174

In [50]:
# BSSID id reserved for padding rows in MyDataset; the embedding table in the
# training cell is sized n_bssids + 1 so that this id is a valid index.
# The commented-out lines below compute the largest BSSID id in the data;
# presumably 63114 is derived from that maximum — TODO confirm.
# bssids = set()
# for key in train_data:
#     X = train_data[key][0]
#     for wifi in X:
#         bssids.add(wifi[1])
# max(bssids)
n_bssids = 63114

In [51]:
def train_test_split(state=0, mod=10):
    """Split the global `data` into train/validation dicts by path id.

    Every path whose id satisfies `path % mod == state` goes to the validation
    set; all other paths go to the training set. Samples are re-keyed with
    consecutive integers starting at 0 inside each returned dict.

    Args:
        state: remainder selecting the validation fold; must be < mod.
        mod: number of folds.

    Returns:
        (train_data, val_data) dictionaries mapping int index -> sample.
    """
    assert state < mod
    train_data, val_data = {}, {}
    for path in path_data:
        # Pick the destination once per path, then append all of its samples.
        target = val_data if path % mod == state else train_data
        for key in path_data[path]:
            target[len(target)] = data[key]
    return train_data, val_data

In [52]:
# sample_data = {} # data of first building
# for key in data:
#     if key == 26507:
#         break
#     sample_data[key] = data[key]
    
# train_data, val_data = train_test_split(sample_data)

In [53]:
class MyDataset(Dataset):
    """Dataset that pads or truncates every sample to a fixed number of rows.

    `data` is indexable by integer and yields `(X, Y)` tuples. According to the
    original notes, X is a 2-D array with columns [floor, bssid, rssi] and Y
    holds 3 values [floor, longitude, latitude].

    Args:
        data: mapping/sequence of `(X, Y)` samples.
        padding_len: number of rows every sample is padded/truncated to.
            Defaults to the module-level `seq_len`.
        pad_bssid: BSSID id written into padding rows. Defaults to the
            module-level `n_bssids`.
        pad_rssi: RSSI value written into padding rows (default -100).
            NOTE(review): the RSSI column is MinMax-scaled elsewhere in this
            notebook, so -100 lies outside the scaled range — confirm that this
            is intentional.
    """

    def __init__(self, data, padding_len=None, pad_bssid=None, pad_rssi=-100):
        self.data = data
        # Fall back to the notebook-level constants when no override is given.
        self.padding_len = seq_len if padding_len is None else padding_len
        bssid_fill = n_bssids if pad_bssid is None else pad_bssid
        # Pre-built block of padding rows; __getitem__ slices off the tail it needs.
        self.pad = np.array([[0, bssid_fill, pad_rssi]] * self.padding_len, dtype='float32')

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        """Return `(X, Y)` with X of `padding_len` rows and Y of shape (padding_len, 3)."""
        X, Y = self.data[idx]
        original_len = X.shape[0]
        if original_len > self.padding_len:
            # Too long: keep only the first padding_len rows.
            X = X[:self.padding_len]
        elif original_len < self.padding_len:
            # Too short: append the missing number of padding rows.
            X = np.vstack([X, self.pad[original_len:]])
        # Drop the first column (floor) so only the remaining feature columns are returned.
        X = X[:, 1:]
        # Repeat the target for every position so it lines up with the per-step model output.
        Y = np.repeat(Y.reshape(1, 3), self.padding_len, axis=0)
        return (X, Y)
    
    
    
class EmptyDataset(Dataset):
    """Placeholder dataset that contains no samples."""

    def __init__(self):
        """Nothing to set up."""

    def __len__(self):
        """Always report zero samples."""
        return 0

    def __getitem__(self, idx):
        """Return None for any index."""
        return None

In [54]:
# dset = MyDataset(data)
# dl = DataLoader(dset, batch_size=64, shuffle=True, device=device)

In [55]:
# %%time
# for i in range(len(dset)):
#     dset[i]

In [56]:
# pure 391ms
# padding 44.3s
# padding with values 58.8s
# padding by stacking 7.9s
# padding both X and Y 12.8s

# Model

In [57]:
class Model(Module):
    """LSTM over sequences of (BSSID id, RSSI) rows, predicting 3 values per step.

    Each BSSID id is embedded, the RSSI value is appended as one extra feature,
    the sequence is fed through a multi-layer LSTM, and a linear head maps every
    hidden state to 3 outputs.

    Args:
        vocab_sz: number of rows in the BSSID embedding table.
        embed_dim: size of each BSSID embedding.
        n_hidden: LSTM hidden size.
        n_layers: number of stacked LSTM layers.
        p: dropout probability (used inside the LSTM and on its input/output).
    """

    def __init__(self, vocab_sz, embed_dim, n_hidden, n_layers, p):
        # NOTE(review): no explicit super().__init__() call — presumably `Module`
        # is fastai's Module, which performs it automatically; confirm.
        self.i_h = nn.Embedding(vocab_sz, embed_dim)  # BSSID embedding
        self.rnn = nn.LSTM(embed_dim + 1, n_hidden, n_layers, batch_first=True, dropout=p)
        self.drop = nn.Dropout(p)
        self.h_o = nn.Linear(n_hidden, 3)

    def forward(self, x):
        # x[:, :, 0] holds the BSSID ids, x[:, :, 1] the RSSI values.
        bssid_ids = x[:, :, 0].long()
        # Embedded ids: (batch, seq, embed_dim), with dropout applied.
        embedded = self.drop(self.i_h(bssid_ids))
        # RSSI as a trailing feature: (batch, seq) -> (batch, seq, 1).
        rssi = x[:, :, 1].unsqueeze(2)
        # Concatenate to (batch, seq, embed_dim + 1).
        # Note: the LSTM treats the padded length as the sequence length,
        # so padding rows are processed like real ones.
        rnn_input = torch.cat((embedded, rssi), 2)
        raw, _ = self.rnn(rnn_input)
        return self.h_o(self.drop(raw))

# train and predict

In [None]:
%%time
# hyperparameters
batch_size = 256
embed_dim =  128
# Integer midpoint between the LSTM input width (embed_dim + 1) and the 3 outputs.
hidden_size = (embed_dim + 1 + 3) // 2
n_layers = 2
p = 0  # dropout probability passed to Model
lr = 0.015625  # = 1/64; maximum learning rate given to fit_one_cycle
cycle = 32  # number of epochs the one-cycle schedule is planned for
stop = 27  # StopAt cancels the fit before epoch 28, so epochs 0..27 are run

# One iteration per validation fold: train_test_split(i) uses its default mod=10,
# so paths with path % 10 == i form the validation set.
for i in range(10):
    # dataloader
    train, val = train_test_split(i)
    dset_train = MyDataset(train)
    dl_train = DataLoader(dset_train, batch_size=batch_size, shuffle=True)
    dset_val = MyDataset(val)
    dl_val = DataLoader(dset_val, batch_size=batch_size)
    dls = DataLoaders(dl_train, dl_val, device=device)

    # fit
    print('batch_size, embed_dim, hidden_size, n_layers, p, lr, cycle, stop')
    print(batch_size, embed_dim, hidden_size, n_layers, p, lr, cycle, stop)
    # A fresh model is built for every fold; the embedding table has n_bssids + 1
    # rows so the padding id (n_bssids) is a valid index.
    learn = Learner(dls, Model(n_bssids + 1, embed_dim, hidden_size, n_layers, p).to(device), loss_func=loss_fn, metrics=metric_fn) # loss_func: loss function, metrics: evaluation function
    # cycle: number of training epochs
    # lr: upper bound (maximum) of the learning rate
    learn.fit_one_cycle(cycle, lr, cbs=StopAt(stop))
    
    # # predict
    # dset = MyDataset(test_data)
    # test_dl = DataLoader(dset, batch_size=batch_size, device=device)

    # preds, _ = learn.get_preds(dl=test_dl)
    # pred_df = pd.DataFrame(preds[:, -1])
    # pred_df.to_csv(f'predictions_fold{i}.csv', index=False)

batch_size, embed_dim, hidden_size, n_layers, p, lr, cycle, stop
256 128 66 2 0 0.015625 32 27


epoch,train_loss,valid_loss,metric_fn,time
0,83.068222,82.086716,81.186676,01:32


# ------------------------------------------------