# Import libraries

In [1]:
import pandas as pd

In [2]:
import numpy as np

In [3]:
import matplotlib.pyplot as plt

In [4]:
import seaborn as sns

In [5]:
import mllib

# Import dataset

Import and read dataset:

In [6]:
# Load the cleaned training set; the first CSV column becomes the row index.
trainingData = pd.read_csv(
    'csv/cleaned/trainingData_clean.csv',
    index_col=0,
)

In [7]:
# Load the validation set from its CSV file (no index column is specified).
validationData = pd.read_csv(
    'csv/validationData.csv',
)

# Transform data

## Split dataset

Let's select the WAP features as input:

In [8]:
# Collect every training column whose name begins with the 'WAP' prefix,
# then take those columns as the input feature table.
WAP_cols = list(filter(lambda name: name.startswith('WAP'), trainingData.columns))
X = trainingData.loc[:, WAP_cols]

Select target 

In [9]:
# y = trainingData['building_floor']

Split using train_test_split:

In [10]:
from sklearn.model_selection import train_test_split

# Hold out a fixed share of the rows as a test split; the fixed seed makes
# the split reproducible between runs.
test_ratio = 0.2
X_train, X_test = train_test_split(
    X,
    test_size=test_ratio,
    random_state=42,
)

In [11]:
# Take the same WAP columns from the validation data.
X_val = validationData.loc[:, WAP_cols]

## Feature selection

### Drop with high correlation

Let's check which features have a correlation of more than 0.9:

In [12]:
# Ask the project helper for correlated feature pairs in the training split.
# The displayed result has the columns f1, f2 and corr.
threshold = 0.9
high_corr_features_df = mllib.get_correlation(
    X_train,
    threshold,
)
high_corr_features_df

Unnamed: 0,f1,f2,corr
0,WAP016,WAP015,0.909556
1,WAP054,WAP053,0.931331
2,WAP081,WAP080,0.923826
3,WAP102,WAP101,0.914404
4,WAP104,WAP103,0.907454
5,WAP120,WAP119,0.90555
6,WAP124,WAP123,0.910565
7,WAP126,WAP125,0.911669
8,WAP130,WAP129,0.931914
9,WAP135,WAP134,0.91098


LONGITUDE correlates with BUILDINGID (because each building occupies a distinct LONGITUDE range); note that this pair does not appear in the table above, because `X_train` contains only WAP columns. The correlation between WAPs can be explained by their close physical proximity to each other. Now we form a list of features to drop:

In [13]:
# Distinct names from the f2 column, in order of first appearance.
high_corr_features = high_corr_features_df['f2'].unique().tolist()

As we are interested only in WAPs, let's keep only those:

In [14]:
# Retain only the names that carry the 'WAP' prefix.
high_corr_features = [
    name
    for name in high_corr_features
    if name.startswith('WAP')
]
high_corr_features

['WAP015',
 'WAP053',
 'WAP080',
 'WAP101',
 'WAP103',
 'WAP119',
 'WAP123',
 'WAP125',
 'WAP129',
 'WAP134',
 'WAP136',
 'WAP155',
 'WAP161',
 'WAP166',
 'WAP193',
 'WAP208',
 'WAP286',
 'WAP492',
 'WAP493',
 'WAP498',
 'WAP499']

Finally, we drop the highly correlated features:

In [15]:
# Remove the highly correlated columns from all three splits.
X_train, X_test, X_val = (
    frame.drop(columns=high_corr_features)
    for frame in (X_train, X_test, X_val)
)

### Drop low variance features

Let's look for features with zero variance:

In [16]:
# Per-column variance of the training features, kept as a one-column table.
waps_train_var = X_train.var().to_frame(name='variance')
# Names of the columns whose variance is exactly zero.
low_var_col = waps_train_var.loc[waps_train_var['variance'] == 0].index.tolist()
low_var_col

['WAP003',
 'WAP004',
 'WAP092',
 'WAP093',
 'WAP094',
 'WAP095',
 'WAP152',
 'WAP158',
 'WAP159',
 'WAP160',
 'WAP190',
 'WAP215',
 'WAP217',
 'WAP226',
 'WAP227',
 'WAP238',
 'WAP239',
 'WAP240',
 'WAP241',
 'WAP242',
 'WAP243',
 'WAP244',
 'WAP245',
 'WAP246',
 'WAP247',
 'WAP254',
 'WAP293',
 'WAP296',
 'WAP301',
 'WAP303',
 'WAP304',
 'WAP307',
 'WAP333',
 'WAP349',
 'WAP353',
 'WAP354',
 'WAP360',
 'WAP365',
 'WAP416',
 'WAP419',
 'WAP423',
 'WAP429',
 'WAP433',
 'WAP438',
 'WAP441',
 'WAP442',
 'WAP444',
 'WAP445',
 'WAP451',
 'WAP458',
 'WAP482',
 'WAP485',
 'WAP487',
 'WAP488',
 'WAP491',
 'WAP497',
 'WAP519',
 'WAP520']

In [17]:
# Remove the zero-variance columns from all three splits.
X_train, X_test, X_val = (
    frame.drop(columns=low_var_col)
    for frame in (X_train, X_test, X_val)
)

### Drop WAPs found in building 0 and 2 at the same time 

These WAPs were found in the previous notebook

In [18]:
# Drop the listed WAP columns from every split.
col_inconsistent_drop = ['WAP212']
X_train = X_train.drop(col_inconsistent_drop, axis='columns')
X_test = X_test.drop(col_inconsistent_drop, axis='columns')
X_val = X_val.drop(col_inconsistent_drop, axis='columns')

### Drop WAPs removed/added - in validation dataset (OPTION)

In [19]:
# # get variance of signal
# waps_var = pd.DataFrame(X_train.var(),columns=['train'])
# waps_val_var = X_val.var()

In [20]:
# # common variance table for train and validation
# waps_var['validation'] = waps_val_var

In [21]:
# # ADDED WAPs
# # a WAP counts as added if its variance was 0 in train and is > 0 in validation
# added = waps_var.query('(train == 0 and validation != 0)').index

In [22]:
# # REMOVED WAPs
# removed = waps_var.query('(train != 0 and validation == 0)').index

In [23]:
# X_train = X_train.drop(columns = added)
# X_train = X_train.drop(columns = removed)

# X_test = X_test.drop(columns = added)
# X_test = X_test.drop(columns = removed)

# X_val = X_val.drop(columns = added)
# X_val = X_val.drop(columns = removed)

## Scale WAP signal

The task is to transform the measured values (-104..0 dBm) to the 0..1 range and to replace 100 (not available) with 0.

In [24]:
# replace 100 with NaN
X_train.replace(100, np.nan, inplace=True)
X_test.replace(100, np.nan, inplace=True)
X_val.replace(100, np.nan, inplace=True)

In [25]:
# shift RSSI values so they are higher than 0 (-104..0-->196..300)
X_train = mllib.transform(X_train,300,'shift')
X_test =  mllib.transform(X_test,300,'shift')
X_val =  mllib.transform(X_val,300,'shift')

In [26]:
# replace NaNs with 0
X_train.replace(np.nan, 0, inplace=True)
X_test.replace(np.nan, 0, inplace=True)
X_val.replace(np.nan, 0, inplace=True)

In [27]:
# Scale data to (0,1) range
from sklearn.preprocessing import MinMaxScaler

scaler = MinMaxScaler(feature_range=(0,1))
scaler.fit(X_train)

X_train_sc = pd.DataFrame(scaler.transform(X_train))
X_train_sc.columns = X_train.columns
X_train_sc.index = X_train.index

X_test_sc =  pd.DataFrame(scaler.transform(X_test))
X_test_sc.columns = X_test.columns
X_test_sc.index = X_test.index

X_val_sc =  pd.DataFrame(scaler.transform(X_val))
X_val_sc.columns = X_val.columns
X_val_sc.index = X_val.index

# Export

In [28]:
# Write the scaled training features to disk.
X_train_sc.to_csv(path_or_buf='csv/transformed/X_train.csv')

In [29]:
# Write the scaled test features to disk.
X_test_sc.to_csv(path_or_buf='csv/transformed/X_test.csv')

In [30]:
# Write the scaled validation features to disk.
X_val_sc.to_csv(path_or_buf='csv/transformed/X_val.csv')