In [1]:
import sys
sys.path.append('..')

from utils.misc import *
from utils.classifier import *
from utils.visualiser import *

## Loading pre-processed data

In [2]:
def _read_frame(path):
    """Read one pre-processed CSV, using its first column as the index."""
    return pd.read_csv(path, index_col=0)


def _read_wap(path):
    """Load one saved ``.npy`` array.

    NOTE(review): allow_pickle=True lets NumPy unpickle object arrays, which can
    execute arbitrary code when the file is untrusted -- confirm it is needed here.
    """
    return np.load(path, allow_pickle=True)


# Training frames: the full set plus the b0/b1/b2 files, which are used further
# down as the building 0/1/2 subsets.
df_train = _read_frame("../data/train/df_train.csv")
df_train_b0 = _read_frame("../data/train/df_train_b0.csv")
df_train_b1 = _read_frame("../data/train/df_train_b1.csv")
df_train_b2 = _read_frame("../data/train/df_train_b2.csv")

# Test frames, same layout as the training files.
df_test = _read_frame("../data/test/df_test.csv")
df_test_b0 = _read_frame("../data/test/df_test_b0.csv")
df_test_b1 = _read_frame("../data/test/df_test_b1.csv")
df_test_b2 = _read_frame("../data/test/df_test_b2.csv")

# "wap" arrays for training; df_train_wap is what the classifiers below are fitted on.
df_train_wap = _read_wap("../data/train/df_train_wap.npy")
df_train_b0_wap = _read_wap("../data/train/df_train_b0_wap.npy")
df_train_b1_wap = _read_wap("../data/train/df_train_b1_wap.npy")
df_train_b2_wap = _read_wap("../data/train/df_train_b2_wap.npy")

# "wap" arrays for testing; df_test_wap is what the fitted classifiers predict on.
df_test_wap = _read_wap("../data/test/df_test_wap.npy")
df_test_b0_wap = _read_wap("../data/test/df_test_b0_wap.npy")
df_test_b1_wap = _read_wap("../data/test/df_test_b1_wap.npy")
df_test_b2_wap = _read_wap("../data/test/df_test_b2_wap.npy")

## First approach

In [3]:
%%time

# Score the 'knn' option of the project's `classification` helper on the training
# data (presumably cross-validated balanced accuracy per fold -- TODO confirm).
cvs = classification(df_train, df_train_wap, 'building', 'knn')

# Summarise once: entry 0 is printed as the average, entry 1 as the +/- margin.
score_summary = mean_ci(cvs)
print("Average balanced accuracy: %.2f%% ± %.2f%%" % (score_summary[0] * 100, score_summary[1] * 100))

Average balanced accuracy: 99.78% ± 0.38%
CPU times: user 1min 54s, sys: 1.11 s, total: 1min 56s
Wall time: 2min 7s


In [4]:
%%time

# Score the 'lr' option of the project's `classification` helper on the training
# data (presumably cross-validated balanced accuracy per fold -- TODO confirm).
cvs = classification(df_train, df_train_wap, 'building', 'lr')

# Summarise once: entry 0 is printed as the average, entry 1 as the +/- margin.
score_summary = mean_ci(cvs)
print("Average balanced accuracy: %.2f%% ± %.2f%%" % (score_summary[0] * 100, score_summary[1] * 100))

Average balanced accuracy: 99.95% ± 0.12%
CPU times: user 34.1 s, sys: 1.16 s, total: 35.3 s
Wall time: 20.2 s


In [5]:
%%time

# Score the 'svm' option of the project's `classification` helper on the training
# data (presumably cross-validated balanced accuracy per fold -- TODO confirm).
cvs = classification(df_train, df_train_wap, 'building', 'svm')

# Summarise once: entry 0 is printed as the average, entry 1 as the +/- margin.
score_summary = mean_ci(cvs)
print("Average balanced accuracy: %.2f%% ± %.2f%%" % (score_summary[0] * 100, score_summary[1] * 100))

Average balanced accuracy: 99.78% ± 0.45%
CPU times: user 10.5 s, sys: 457 ms, total: 11 s
Wall time: 11.3 s


### Test result

In [6]:
# Fit k-NN (3 neighbours, p=2, KD-tree search) on every training fingerprint and
# score its building predictions on the test set.
y_train = df_train['BUILDINGID']
y_test = df_test['BUILDINGID']

clf = KNeighborsClassifier(n_neighbors=3, p=2, algorithm='kd_tree').fit(df_train_wap, y_train)
pred = clf.predict(df_test_wap)

# The figure printed here is balanced accuracy, although the label says "accuracy".
test_score = balanced_accuracy_score(y_test, pred)
print("Test accuracy: %.2f%%" % (test_score * 100))

Test accuracy: 99.24%


In [7]:
# Fit logistic regression (seeded with random_state=1) on every training
# fingerprint and score its building predictions on the test set.
y_train = df_train['BUILDINGID']
y_test = df_test['BUILDINGID']

clf = LogisticRegression(random_state=1).fit(df_train_wap, y_train)
pred = clf.predict(df_test_wap)

# The figure printed here is balanced accuracy, although the label says "accuracy".
test_score = balanced_accuracy_score(y_test, pred)
print("Test accuracy: %.2f%%" % (test_score * 100))

Test accuracy: 100.00%


In [8]:
# Fit a linear-kernel SVM (C=0.1, random_state=1) on every training fingerprint
# and score its building predictions on the test set.
y_train = df_train['BUILDINGID']
y_test = df_test['BUILDINGID']

clf = SVC(kernel='linear', C=0.1, random_state=1).fit(df_train_wap, y_train)
pred = clf.predict(df_test_wap)

# The figure printed here is balanced accuracy, although the label says "accuracy".
test_score = balanced_accuracy_score(y_test, pred)
print("Test accuracy: %.2f%%" % (test_score * 100))

Test accuracy: 99.50%


## Second approach

Each building has a subset of WAPs that are observed only in that building. If a WiFi fingerprint contains RSSI readings from any of these building-specific WAPs, it can be assigned to the corresponding building.

In [9]:
# For each per-building training frame, count the columns (among the first 465,
# presumably the WAP RSSI columns -- TODO confirm) that hold at least one
# non-zero value, i.e. the WAPs seen at least once in that building.
for label, building_frame in (
    ("Total number of WAPs in use in building 0: ", df_train_b0),
    ("Total number of WAPs in use in building 1: ", df_train_b1),
    ("Total number of WAPs in use in building 2: ", df_train_b2),
):
    wap_seen = building_frame.iloc[:, :465].astype(bool).any(axis=0)
    print(label, wap_seen.sum())

Total number of WAPs in use in building 0:  200
Total number of WAPs in use in building 1:  207
Total number of WAPs in use in building 2:  203


In [10]:
# Presence/absence view of the first 465 columns: any non-zero entry becomes True.
df_wap_bool = df_train.iloc[:, :465].astype(bool)

# Per building, flag every WAP that appears in at least one of its fingerprints.
df_wap_building = (
    pd.concat([df_wap_bool, df_train['BUILDINGID']], axis=1)
    .groupby('BUILDINGID')
    .sum()
    .astype(bool)
)

# Positions of the WAPs flagged for exactly one building
# (np.where yields a 1-tuple holding the position array).
buildings_per_wap = df_wap_building.sum(axis=0)
exclusive_waps = np.where(buildings_per_wap == 1)

# Number of building-exclusive WAPs for each building (displayed as the cell output).
exclusive_columns = df_wap_building.columns[exclusive_waps]
df_wap_building[exclusive_columns].sum(axis=1)

BUILDINGID
0    137
1     69
2    117
dtype: int64

In [11]:
%%time

# Cross-validate (cv=10) a multinomial Naive Bayes model on the boolean
# WAP-presence features, scoring every fold with balanced accuracy.
clf = MultinomialNB()
balanced_scorer = make_scorer(balanced_accuracy_score)
cvs = cross_val_score(
    clf,
    df_wap_bool,
    df_train['BUILDINGID'],
    scoring=balanced_scorer,
    cv=10,
)

# Summarise once: entry 0 is printed as the average, entry 1 as the +/- margin.
score_summary = mean_ci(cvs)
print("Average balanced accuracy: %.2f%% ± %.2f%%" % (score_summary[0] * 100, score_summary[1] * 100))

Average balanced accuracy: 99.86% ± 0.11%
CPU times: user 4.37 s, sys: 232 ms, total: 4.6 s
Wall time: 3.96 s


### Test result

In [12]:
# Fit the Naive Bayes model on the boolean WAP-presence features of the training
# set and score its building predictions on the test set.
#
# Train and test data keep separate names on purpose: rebinding `df_wap_bool` and
# `building` to the test data would leave the cross-validation cell above pairing
# test features with training labels if it were re-run after this cell.
df_wap_bool = df_train.iloc[:, :465].astype(bool)
building = df_train['BUILDINGID']

# Fresh estimator, so the result does not depend on whatever `clf` was last bound to.
clf = MultinomialNB()
clf.fit(df_wap_bool, building)

df_wap_bool_test = df_test.iloc[:, :465].astype(bool)
building_test = df_test['BUILDINGID']
pred = clf.predict(df_wap_bool_test)

# The figure printed here is balanced accuracy, although the label says "accuracy".
print("Test accuracy: %.2f%%" % (balanced_accuracy_score(building_test, pred) * 100))

Test accuracy: 99.88%
