## Data balancing

Types of sampling used:

- Stratified sampling: we will divide the rows by class column: 1 OK 0 ERR

- For each strata (group): systematic sampling will be used, where we will choose a row every 5 rows until we have 50% ERR and 50% OK.

In [5]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from keras.models import Sequential
from keras.layers import Dense # dense is all neurons connected to all next layer neurons
from keras.layers import Dropout
from keras.wrappers.scikit_learn import KerasClassifier
from keras.constraints import maxnorm
from sklearn.model_selection import cross_val_score
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import StratifiedKFold
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline

In [6]:
db = pd.read_csv('db.csv', low_memory=False)
db.shape

(33543, 35)

In [7]:
db.columns

Index(['Unnamed: 0', 'FCS', 'CORR', 'BUFF_OVERFLOW', 'GENERIC', 'PCKT_NUM',
       'TIME(MS)', 'LENGTH', 'PAYLOAD', 'RSSI', 'CRC_OK', 'FRAME_TYPE',
       'SECURITY_ENABLED', 'FRAME_PENDING', 'ACKNOWLEDGE_REQ',
       'PAN_COMPRESSION', 'DEST_PAN', 'DEST_ADD', 'SRC_PAN', 'SRC_ADD', 'DATA',
       'CMD', 'ACK', '1_OCT_HEADER', 'CSL_WAKEUP', 'CSL_SECURE_ACK',
       'RFID_BLINK', 'BCN', 'TEST_TYPE', 'CONTROL', 'DOUBLENETWORK', 'ELECTRO',
       'NORMAL', 'VIENTO', 'WIRELESS'],
      dtype='object')

In [8]:
db = db.drop(['Unnamed: 0'], axis = 1)
db.head()

Unnamed: 0,FCS,CORR,BUFF_OVERFLOW,GENERIC,PCKT_NUM,TIME(MS),LENGTH,PAYLOAD,RSSI,CRC_OK,...,CSL_SECURE_ACK,RFID_BLINK,BCN,TEST_TYPE,CONTROL,DOUBLENETWORK,ELECTRO,NORMAL,VIENTO,WIRELESS
0,1,1,0,0,1,0.0,11,,-98.0,1.0,...,0,0,0,control,1,0,0,0,0,0
1,1,1,0,0,2,16751.182375,11,,-99.0,1.0,...,0,0,0,control,1,0,0,0,0,0
2,1,1,0,0,3,1.07,14,,-90.0,1.0,...,0,0,1,control,1,0,0,0,0,0
3,1,1,0,0,4,652.69775,11,,-91.0,1.0,...,0,0,0,control,1,0,0,0,0,0
4,1,1,0,0,5,2.006625,14,,-90.0,1.0,...,0,0,1,control,1,0,0,0,0,0


In [9]:
ignore = ['PAYLOAD', 'CRC_OK', 'FRAME_TYPE', 'DEST_PAN', 'DEST_ADD', 'SRC_PAN', 'SRC_ADD', 'TEST_TYPE', 'BUFF_OVERFLOW',
          'FCS', 'GENERIC']
y = db['CRC_OK']
y

0        1.0
1        1.0
2        1.0
3        1.0
4        1.0
        ... 
33538    1.0
33539    0.0
33540    0.0
33541    1.0
33542    1.0
Name: CRC_OK, Length: 33543, dtype: float64

In [10]:
numdb = db.drop(ignore, axis = 1)
d = pd.DataFrame({'OK':y.values})
numdb = pd.concat([numdb, d], axis=1)
numdb.head()

Unnamed: 0,CORR,PCKT_NUM,TIME(MS),LENGTH,RSSI,SECURITY_ENABLED,FRAME_PENDING,ACKNOWLEDGE_REQ,PAN_COMPRESSION,DATA,...,CSL_SECURE_ACK,RFID_BLINK,BCN,CONTROL,DOUBLENETWORK,ELECTRO,NORMAL,VIENTO,WIRELESS,OK
0,1,1,0.0,11,-98.0,0.0,0.0,0.0,0.0,0,...,0,0,0,1,0,0,0,0,0,1.0
1,1,2,16751.182375,11,-99.0,0.0,0.0,0.0,0.0,0,...,0,0,0,1,0,0,0,0,0,1.0
2,1,3,1.07,14,-90.0,0.0,0.0,0.0,0.0,0,...,0,0,1,1,0,0,0,0,0,1.0
3,1,4,652.69775,11,-91.0,0.0,0.0,0.0,0.0,0,...,0,0,0,1,0,0,0,0,0,1.0
4,1,5,2.006625,14,-90.0,0.0,0.0,0.0,0.0,0,...,0,0,1,1,0,0,0,0,0,1.0


In [11]:
mean_rssi = int(numdb['RSSI'].mean())
numdb['RSSI'] = numdb['RSSI'].fillna(mean_rssi)

In [12]:
numdb['SECURITY_ENABLED'] = numdb['SECURITY_ENABLED'].fillna(0)
numdb['FRAME_PENDING'] = numdb['FRAME_PENDING'].fillna(0)
numdb['ACKNOWLEDGE_REQ'] = numdb['ACKNOWLEDGE_REQ'].fillna(0)
numdb['PAN_COMPRESSION'] = numdb['PAN_COMPRESSION'].fillna(0)
numdb['OK'] = numdb['OK'].fillna(0)

In [13]:
numdb[numdb.isna().any(axis=1)]

Unnamed: 0,CORR,PCKT_NUM,TIME(MS),LENGTH,RSSI,SECURITY_ENABLED,FRAME_PENDING,ACKNOWLEDGE_REQ,PAN_COMPRESSION,DATA,...,CSL_SECURE_ACK,RFID_BLINK,BCN,CONTROL,DOUBLENETWORK,ELECTRO,NORMAL,VIENTO,WIRELESS,OK


In [14]:
xdb = numdb.drop(['OK'], axis = 1)
y = numdb['OK']
y

0        1.0
1        1.0
2        1.0
3        1.0
4        1.0
        ... 
33538    1.0
33539    0.0
33540    0.0
33541    1.0
33542    1.0
Name: OK, Length: 33543, dtype: float64

In [15]:
X = xdb.to_numpy()
len(X[0]) # n features

23

In [16]:
n_samples, n_features = X.shape
print(n_samples, n_features)

33543 23


In [17]:
xdb.head()

Unnamed: 0,CORR,PCKT_NUM,TIME(MS),LENGTH,RSSI,SECURITY_ENABLED,FRAME_PENDING,ACKNOWLEDGE_REQ,PAN_COMPRESSION,DATA,...,CSL_WAKEUP,CSL_SECURE_ACK,RFID_BLINK,BCN,CONTROL,DOUBLENETWORK,ELECTRO,NORMAL,VIENTO,WIRELESS
0,1,1,0.0,11,-98.0,0.0,0.0,0.0,0.0,0,...,0,0,0,0,1,0,0,0,0,0
1,1,2,16751.182375,11,-99.0,0.0,0.0,0.0,0.0,0,...,0,0,0,0,1,0,0,0,0,0
2,1,3,1.07,14,-90.0,0.0,0.0,0.0,0.0,0,...,0,0,0,1,1,0,0,0,0,0
3,1,4,652.69775,11,-91.0,0.0,0.0,0.0,0.0,0,...,0,0,0,0,1,0,0,0,0,0
4,1,5,2.006625,14,-90.0,0.0,0.0,0.0,0.0,0,...,0,0,0,1,1,0,0,0,0,0


In [18]:
err = numdb[numdb['OK'] != 1]
err.shape

(658, 24)

In [24]:
import random
num_err = err.shape[0]
indices = list(err.index)
ok = numdb[numdb['OK'] != 0]
num_ok = ok.shape[0]
rows = 0
visited = []
interval = 50
r = 0
x_bal_ok = np.zeros((err.shape[0], err.shape[1]-1), dtype=float)
while(rows < (num_err-1)):
    r += interval
    r = r % (numdb.shape[0]-1)
    if r not in visited and r not in indices and r < (len(ok)-1):
        # without ok
        row = ok.values[r, :-1]
        x_bal_ok[rows, :] = row
        visited.append(r)
        rows += 1
x_bal_ok

array([[  1.       ,  52.       , 185.927875 , ...,   0.       ,
          0.       ,   0.       ],
       [  1.       , 102.       ,   2.652625 , ...,   0.       ,
          0.       ,   0.       ],
       [  1.       , 152.       , 472.13975  , ...,   0.       ,
          0.       ,   0.       ],
       ...,
       [  1.       , 412.       ,  53.268625 , ...,   0.       ,
          0.       ,   0.       ],
       [  1.       , 464.       ,   3.5838125, ...,   0.       ,
          0.       ,   0.       ],
       [  0.       ,   0.       ,   0.       , ...,   0.       ,
          0.       ,   0.       ]])

In [25]:
x_bal_ok.shape

(658, 23)