In [1]:
import pandas as pd
import numpy as np
import re
from datetime import datetime as dt
from scipy.spatial import distance
import scipy.stats as stats

%matplotlib inline

In [2]:
# Width of the final aggregation window. It is used below both as the
# pd.Grouper frequency for the sampled dataset and as the window size of the
# rolling dataset.
TIME_SAMPLE_FREQ = '60s'

In [3]:
# Load the raw Bluetooth scan log: a ';'-separated file without a header row,
# so the column names are supplied explicitly.
df = pd.read_csv(
    "..\\..\\scripts\\_split_all\\user_1\\base_bt.data",
    sep = ';',
    index_col = False,
    header = None,
    low_memory = False,
    names = ['timestamp', 'action', 'bssid', 'major_class', 'class', 'bond_state', 'type'],
)

In [4]:
# Preview the first rows of the raw scan log.
df.head()

Unnamed: 0,timestamp,action,bssid,major_class,class,bond_state,type
0,08.03.2021_15:15:19.867,android.bluetooth.device.action.FOUND,5C:78:F8:72:FB:9B,512,524,10,1
1,08.03.2021_15:15:59.171,android.bluetooth.device.action.FOUND,63:33:92:B9:A9:7F,7936,7936,10,2
2,08.03.2021_15:16:01.657,android.bluetooth.device.action.FOUND,63:33:92:B9:A9:7F,7936,7936,10,2
3,08.03.2021_15:16:16.788,android.bluetooth.device.action.FOUND,63:33:92:B9:A9:7F,7936,7936,10,2
4,08.03.2021_15:16:39.046,android.bluetooth.device.action.FOUND,5C:78:F8:72:FB:9B,512,524,10,1


In [5]:
# Show column dtypes and non-null counts of the raw scan log.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 16945 entries, 0 to 16944
Data columns (total 7 columns):
timestamp      16945 non-null object
action         16945 non-null object
bssid          16945 non-null object
major_class    16945 non-null int64
class          16945 non-null int64
bond_state     16945 non-null int64
type           16945 non-null int64
dtypes: int64(4), object(3)
memory usage: 926.8+ KB


In [6]:
# Keep only device-discovery events. The explicit copy makes `df` an independent
# frame, so the column assignments in the following cells do not write into a
# slice of the raw data (which pandas reports as SettingWithCopyWarning).
df = df[df['action'] == 'android.bluetooth.device.action.FOUND'].copy()

In [7]:
# Parse the 'dd.mm.YYYY_HH:MM:SS.fff' strings in one vectorised call instead of
# a per-row strptime, and build the result with assign() so no column is written
# into the filtered frame in place.
df = df.assign(timestamp = pd.to_datetime(df['timestamp'], format = '%d.%m.%Y_%H:%M:%S.%f'))
# Index by time (the 'timestamp' column is kept as well) and order chronologically.
df.index = pd.DatetimeIndex(df.timestamp)
df = df.sort_index()

In [8]:
# Discard every column except 'bssid'; the time information lives in the index.
df = df.drop(columns = ['timestamp', 'action', 'class', 'major_class', 'bond_state', 'type'])

In [9]:
# Map every distinct device id to a position in the presence vector built by
# agg_bssid_col. The ids are normalised (spaces removed) *before* de-duplication
# so that the positions are always the dense range 0..len(bssid_map)-1; numbering
# the raw ids instead leaves gaps whenever two raw ids normalise to the same key,
# and the vector (sized len(bssid_map)) is then too short for the largest index.
bssid_map = { bssid: idx for idx, bssid in enumerate(df.bssid.apply(lambda x: str(x).replace(' ', '')).unique()) }

In [10]:
# Normalise the device ids: cast to str and strip every space character.
df['bssid'] = df['bssid'].map(lambda raw_bssid: str(raw_bssid).replace(' ', ''))

In [11]:
# One unit per scan event; summed per time bucket in the aggregation below.
df['count'] = 1

In [12]:
def agg_string_join(col):
    """Join the string form of every value in `col` with commas.

    All space characters are removed from the joined result.
    """
    as_text = col.map(str)
    joined = as_text.str.cat(sep = ',')
    return joined.replace(' ', '')

In [13]:
def agg_bssid_col(col, mapping = None):
    """Encode the device ids in `col` as a comma-separated 0/1 presence vector.

    Parameters
    ----------
    col : iterable of str
        Device ids seen in one time bucket. Spaces are stripped before lookup.
    mapping : dict, optional
        Device id -> vector position. Defaults to the notebook-level `bssid_map`.

    Returns
    -------
    str
        One '0' or '1' per entry of `mapping`, joined by ',' without any
        whitespace; '1' marks ids that occur in `col`.

    Raises
    ------
    KeyError
        If an id in `col` is missing from `mapping`.
    """
    if mapping is None:
        mapping = bssid_map

    presence = np.zeros(len(mapping), dtype = 'int8')
    for bssid in col:
        presence[mapping[bssid.replace(' ', '')]] = 1

    # Plain join instead of np.array2string: array2string abbreviates arrays above
    # its print threshold with '...' and wraps long output over several lines,
    # which would corrupt the encoded vector for large device sets.
    return ','.join(presence.astype(str))

In [14]:
# Count the columns whose name contains the substring 'one_hot'.
one_hot_columns_count = 0
for col in df.columns:
    one_hot_columns_count += 1 if 'one_hot' in col else 0

In [15]:
# Inspect the columns that remain before aggregation.
df.columns

Index(['bssid', 'count'], dtype='object')

In [16]:
# Columns 1 .. one_hot_columns_count are averaged per bucket.
cat_columns = df.columns[1:1 + one_hot_columns_count]
cat_columns_map = dict.fromkeys(cat_columns, 'mean')

# Per-column aggregations for the short time buckets: presence vector for the
# device ids, event total for 'count', plus the entries from cat_columns_map.
all_func_dicts_quantum = {
    'bssid' : agg_bssid_col,
    'count' : 'sum',
    **cat_columns_map,
}

In [17]:
# Aggregate the scan events into consecutive 5-second buckets of the time index.
quantum_groups = df.groupby(pd.Grouper(freq = '5s'), as_index = True)
df_quantum = quantum_groups.agg(all_func_dicts_quantum)

In [18]:
# Expose the bucket start time as a regular 'timestamp' column while keeping it
# as the index too.
df_quantum = df_quantum.reset_index().set_index('timestamp', drop = False)

In [19]:
# Remove every bucket that has a missing value in any column.
df_quantum = df_quantum.dropna()

In [20]:
# Load the Bluetooth LE scan log (';'-separated, no header row). Only
# 'timestamp', 'level' and 'connectable' are used; the other fields get unique
# placeholder names because duplicate entries in `names` are rejected by newer
# pandas versions (older ones only warn about them).
df_le = pd.read_csv("..\\..\\scripts\\_split_all\\user_1\\le_bt.data", sep = ';', index_col = False, header = None, low_memory = False, \
                 names = ['timestamp', 'unused_1', 'unused_2', 'unused_3', 'level', 'unused_4', 'connectable', 'unused_5'])

# Vectorised parsing of the 'dd.mm.YYYY_HH:MM:SS.fff' timestamps.
df_le['timestamp'] = pd.to_datetime(df_le['timestamp'], format = '%d.%m.%Y_%H:%M:%S.%f')
df_le = df_le[['timestamp', 'level', 'connectable']]
# Index by time and order chronologically.
df_le.index = pd.DatetimeIndex(df_le.timestamp)
df_le = df_le.sort_index()

  return _read(filepath_or_buffer, kwds)


In [21]:
# Turn the textual flag into 0/1: exactly 'true' (case-insensitive) becomes 1,
# every other value becomes 0.
is_connectable = df_le['connectable'].map(lambda flag: str(flag).lower() == 'true')
df_le['connectable'] = is_connectable.map(lambda matched: 1 if matched else 0)

In [22]:
# Average signal level and connectable share per 5-second bucket.
le_quantum_groups = df_le.groupby(pd.Grouper(freq = '5s'), as_index = True)
df_le = le_quantum_groups.agg({'level':'mean', 'connectable':'mean'})

In [23]:
# Remove every bucket that has a missing value in any column.
df_le = df_le.dropna()

In [24]:
def _nearest_le_value(row, column, le_frame = None, max_gap_seconds = 10):
    """Return `column` of the LE bucket closest in time to `row.name`.

    Returns 0 when `le_frame` is empty or when the closest bucket is
    `max_gap_seconds` or more away from `row.name`.
    """
    if le_frame is None:
        le_frame = df_le
    if len(le_frame) == 0:
        return 0

    # get_indexer(..., method='nearest') replaces Index.get_loc(..., method=...),
    # which newer pandas versions no longer provide.
    position = le_frame.index.get_indexer([row.name], method = 'nearest')[0]
    nearest = le_frame.iloc[position]
    gap_seconds = abs((nearest.name - row.name).total_seconds())
    return nearest[column] if gap_seconds < max_gap_seconds else 0

def get_le_conn_status_from_row(row, le_frame = None):
    """Look up the 'connectable' value of the LE bucket nearest to `row.name`.

    Parameters
    ----------
    row : pandas.Series
        A row whose `name` is the timestamp to look up.
    le_frame : pandas.DataFrame, optional
        Time-indexed LE frame with a 'connectable' column. Defaults to the
        notebook-level `df_le`.

    Returns
    -------
    The 'connectable' value of the nearest bucket, or 0 if that bucket is 10
    seconds or more away (or the frame is empty).
    """
    return _nearest_le_value(row, 'connectable', le_frame)

def get_le_level_from_row(row, le_frame = None):
    """Look up the 'level' value of the LE bucket nearest to `row.name`.

    Parameters
    ----------
    row : pandas.Series
        A row whose `name` is the timestamp to look up.
    le_frame : pandas.DataFrame, optional
        Time-indexed LE frame with a 'level' column. Defaults to the
        notebook-level `df_le`.

    Returns
    -------
    The 'level' value of the nearest bucket, or 0 if that bucket is 10 seconds
    or more away (or the frame is empty).
    """
    return _nearest_le_value(row, 'level', le_frame)


# Attach the LE features of the nearest LE bucket to every quantum row.
le_feature_getters = [
    ('le_connectable', get_le_conn_status_from_row),
    ('le_level', get_le_level_from_row),
]
for le_feature_name, le_feature_getter in le_feature_getters:
    df_quantum[le_feature_name] = df_quantum.apply(lambda row: le_feature_getter(row), axis = 1)

In [25]:
def string2array(string):
    """Parse a comma-separated string of numbers into a 1-D numpy array.

    Returns `np.nan` when `string` cannot be parsed (for example when it is not
    a string at all). Only ordinary exceptions are treated as "unparsable";
    KeyboardInterrupt and SystemExit propagate instead of being swallowed.
    """
    try:
        return np.fromstring(string, sep=',')
    except Exception:
        return np.nan

def to_ones_array(array):
    """Set every non-zero element of `array` to 1 and return the array.

    The input is modified in place. Returns `np.nan` when the operation is not
    supported by `array` (for example a scalar). Only ordinary exceptions are
    handled this way; KeyboardInterrupt and SystemExit propagate instead of
    being swallowed.
    """
    try:
        array[array != 0] = 1
        return array
    except Exception:
        return np.nan

def get_len(obj):
    """Return `len(obj)`, or `np.nan` when `obj` has no usable length.

    Only ordinary exceptions are handled this way; KeyboardInterrupt and
    SystemExit propagate instead of being swallowed.
    """
    try:
        return len(obj)
    except Exception:
        return np.nan

In [26]:
def get_occured_nets_count(row, prev_col, curr_col):
    """Share of devices present in `curr_col` but not in `prev_col`.

    Both columns hold comma-separated presence vectors. The count of newly
    present devices is divided by the size of the union of both device sets;
    0 is returned when that union is empty.
    """
    previous_mask = to_ones_array(string2array(row[prev_col]))
    current_mask = to_ones_array(string2array(row[curr_col]))
    in_both = np.logical_and(current_mask, previous_mask)
    newly_present = np.logical_and(current_mask, np.logical_not(in_both))

    union_size = np.count_nonzero(np.logical_or(previous_mask, current_mask))
    if union_size == 0:
        return 0

    return np.count_nonzero(newly_present) / union_size

def get_disappeared_nets_count(row, prev_col, curr_col):
    """Share of devices present in `prev_col` but no longer in `curr_col`.

    Both columns hold comma-separated presence vectors. The count of vanished
    devices is divided by the size of the union of both device sets; 0 is
    returned when that union is empty.
    """
    previous_mask = to_ones_array(string2array(row[prev_col]))
    current_mask = to_ones_array(string2array(row[curr_col]))
    in_both = np.logical_and(current_mask, previous_mask)
    vanished = np.logical_and(previous_mask, np.logical_not(in_both))

    union_size = np.count_nonzero(np.logical_or(previous_mask, current_mask))
    if union_size == 0:
        return 0

    return np.count_nonzero(vanished) / union_size

def get_jaccard_index(row, prev_col, curr_col):
    """Apply `scipy.spatial.distance.jaccard` to the two presence vectors.

    Note that the value returned is scipy's Jaccard *dissimilarity* between the
    vectors stored in `prev_col` and `curr_col`.
    """
    previous_mask = to_ones_array(string2array(row[prev_col]))
    current_mask = to_ones_array(string2array(row[curr_col]))
    dissimilarity = distance.jaccard(previous_mask, current_mask)
    return dissimilarity

def get_occur_speed(row, prev_col, curr_col):
    """Norm of the difference between two presence vectors, scaled by length.

    Computes `np.linalg.norm(prev - curr)` and divides it by the square root of
    the length of the previous vector.
    """
    previous_mask = to_ones_array(string2array(row[prev_col]))
    current_mask = to_ones_array(string2array(row[curr_col]))
    change_norm = np.linalg.norm(previous_mask - current_mask)
    scale = np.sqrt(get_len(previous_mask))
    return change_norm / scale

def calc_single_cols_in_window(df, col, new_col, window, func, max_gap_seconds = 10 * 60):
    """Compare each row of `col` with its `window` preceding rows and summarise.

    For every lag k in 1..window a column `<new_col>_<k>` is filled with
    `func(row, 'prev_<col>_<k>', col)`, where the 'prev_...' entry of `row`
    holds the value of `col` k rows earlier. The row-wise mean, median and
    variance over these per-lag columns are stored in `<new_col>_mean`,
    `<new_col>_median` and `<new_col>_var`.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a 'timestamp' column and the column `col`. It is not modified.
    col : str
        Name of the column to compare across rows.
    new_col : str
        Prefix of the generated columns.
    window : int
        Number of preceding rows (lags) to compare with.
    func : callable
        Called as `func(row, prev_col_name, curr_col_name)`; returns the value
        for one row and one lag.
    max_gap_seconds : float, optional
        Largest allowed time distance between a row and the lagged row.

    Returns
    -------
    pandas.DataFrame
        A copy of `df` with the per-lag and summary columns appended. A per-lag
        value is NaN when there is no lagged row or when it is more than
        `max_gap_seconds` away.
    """
    def func_wrapper(func, row, prev_col, curr_col):
        delta = row.timestamp - row.prev_timestamp
        # No earlier sample to compare with (start of the series): there is
        # nothing meaningful to compute, so do not call `func` on a NaN value.
        if pd.isnull(delta) or abs(delta.total_seconds()) > max_gap_seconds:
            return np.nan
        return func(row, prev_col, curr_col)

    # Work on a copy so the caller's frame never receives helper columns.
    df = df.copy()
    new_cols = []

    for lag in range(1, window + 1):
        prev_col_name = "_".join(['prev', col, str(lag)])
        new_col_name = "_".join([new_col, str(lag)])

        # The lagged helper columns only exist on this temporary frame.
        lagged = df.assign(**{
            'prev_timestamp': df.timestamp.shift(lag),
            prev_col_name: df[col].shift(lag),
        })
        df[new_col_name] = lagged.apply(lambda row: func_wrapper(func, row, prev_col_name, col), axis = 1)
        new_cols.append(new_col_name)

    df["_".join([new_col, 'mean'])] = df[new_cols].mean(axis = 1)
    df["_".join([new_col, 'median'])] = df[new_cols].median(axis = 1)
    df["_".join([new_col, 'var'])] = df[new_cols].var(axis = 1)

    return df

In [27]:
# Number of preceding buckets each bucket is compared with.
WINDOW_SIZE = 5

# (source column, prefix of generated columns, window, comparison function)
occur_and_level_columns_map = [
    ("bssid", "occured_devices_count", WINDOW_SIZE, get_occured_nets_count),
    ("bssid", "disappeared_devices_count", WINDOW_SIZE, get_disappeared_nets_count),
    ("bssid", "jaccard_index", WINDOW_SIZE, get_jaccard_index),
    ("bssid", "occur_speed", WINDOW_SIZE, get_occur_speed),
]

for feature_spec in occur_and_level_columns_map:
    df_quantum = calc_single_cols_in_window(df_quantum, *feature_spec)

In [28]:
def get_conn_level_speed(row, prev_col, curr_col):
    """Return the change from `row[prev_col]` to `row[curr_col]`."""
    current_value = row[curr_col]
    previous_value = row[prev_col]
    return current_value - previous_value

In [29]:
# (source column, prefix of generated columns, window, comparison function)
single_columns_map = [
    ("count", "count_speed", WINDOW_SIZE, get_conn_level_speed),
]

for feature_spec in single_columns_map:
    df_quantum = calc_single_cols_in_window(df_quantum, *feature_spec)

In [30]:
def agg_str(col):
    """Join the strings in `col` with commas and parse the result with string2array."""
    return string2array(col.str.cat(sep=','))

def str_mean(col):
    """`np.mean` of the values parsed from `col` by agg_str."""
    parsed_values = agg_str(col)
    return np.mean(parsed_values)

def str_var(col):
    """`np.var` of the values parsed from `col` by agg_str."""
    parsed_values = agg_str(col)
    return np.var(parsed_values)

def str_median(col):
    """`np.median` of the values parsed from `col` by agg_str."""
    parsed_values = agg_str(col)
    return np.median(parsed_values)

def str_skew(col):
    """`scipy.stats.skew` of the values parsed from `col` by agg_str."""
    parsed_values = agg_str(col)
    return stats.skew(parsed_values)

def str_kurt(col):
    """`scipy.stats.kurtosis` of the values parsed from `col` by agg_str."""
    parsed_values = agg_str(col)
    return stats.kurtosis(parsed_values)

def mean(col):
    """Named wrapper around `np.mean`, usable as an aggregation function."""
    average = np.mean(col)
    return average

def var(col):
    """Named wrapper around `np.var`, usable as an aggregation function."""
    variance = np.var(col)
    return variance

def median(col):
    """Named wrapper around `np.median`, usable as an aggregation function."""
    middle = np.median(col)
    return middle

def skew(col):
    """Named wrapper around `scipy.stats.skew`, usable as an aggregation function."""
    skewness = stats.skew(col)
    return skewness

def kurt(col):
    """Named wrapper around `scipy.stats.kurtosis`, usable as an aggregation function."""
    kurtosis_value = stats.kurtosis(col)
    return kurtosis_value

In [31]:
# Prefixes of the per-lag helper columns created by calc_single_cols_in_window.
names = [
    "occured_devices_count",
    "disappeared_devices_count",
    "jaccard_index",
    "occur_speed",
    "count_speed"
]

# '<prefix>_<lag>' for every lag 1..WINDOW_SIZE; only the mean/median/var
# summaries of these columns are kept.
cols_for_drop = [
    '_'.join([name, str(i)])
    for i in range(1, WINDOW_SIZE + 1)
    for name in names
]

df_quantum = df_quantum.drop(['bssid', 'timestamp'], axis = 1).drop(cols_for_drop, axis = 1)

In [32]:
# Inspect the feature columns that remain after dropping the helper columns.
df_quantum.columns

Index(['count', 'le_connectable', 'le_level', 'occured_devices_count_mean',
       'occured_devices_count_median', 'occured_devices_count_var',
       'disappeared_devices_count_mean', 'disappeared_devices_count_median',
       'disappeared_devices_count_var', 'jaccard_index_mean',
       'jaccard_index_median', 'jaccard_index_var', 'occur_speed_mean',
       'occur_speed_median', 'occur_speed_var', 'count_speed_mean',
       'count_speed_median', 'count_speed_var'],
      dtype='object')

In [33]:
def mad(col):
    """Mean absolute deviation of `col` around its mean.

    Local replacement for `pd.DataFrame.mad`, which newer pandas versions no
    longer provide. The function name is kept as `mad` so the aggregated
    column labels stay the same.
    """
    return np.mean(np.abs(col - np.mean(col)))

# The first one_hot_columns_count + 3 columns get the full set of statistics;
# all remaining columns get the reduced set.
common_cols = df_quantum.columns[:one_hot_columns_count + 3]
speed_acc_cols = df_quantum.columns[one_hot_columns_count + 3:]

common_funcs_list = [mean, var, median, skew, kurt]
special_funcs_list = [mean, mad, skew]

common_cols_map = { col : common_funcs_list for col in common_cols }
speed_acc_cols_map = { col : special_funcs_list for col in speed_acc_cols }

# Column -> list of aggregation functions, for both groups of columns.
agg_dict = common_cols_map
agg_dict.update(speed_acc_cols_map)

In [34]:
# Make sure the speed/summary columns are numeric before aggregating them.
df_quantum[speed_acc_cols] = df_quantum[speed_acc_cols].apply(pd.to_numeric)

In [35]:
# Sampled dataset: aggregate the buckets into consecutive, non-overlapping
# windows of TIME_SAMPLE_FREQ.
df_sampling = df_quantum.groupby(pd.Grouper(freq = TIME_SAMPLE_FREQ)).agg(agg_dict)

In [None]:
# Rolling dataset: the same aggregations over a time-based rolling window of
# TIME_SAMPLE_FREQ; min_periods = 1 yields a value from the first row on.
df_rolling = df_quantum.rolling(TIME_SAMPLE_FREQ, min_periods = 1, center = False).agg(agg_dict)

In [None]:
# Flatten the two-level (feature, statistic) column labels into single
# 'feature_statistic' strings.
df_sampling.columns = [
    "{}_{}".format(str(feature), str(statistic))
    for (feature, statistic) in df_sampling.columns.values
]

df_rolling.columns = [
    "{}_{}".format(str(feature), str(statistic))
    for (feature, statistic) in df_rolling.columns.values
]

In [36]:
# Display the sampled dataset.
df_sampling

Unnamed: 0_level_0,count,count,count,count,count,le_connectable,le_connectable,le_connectable,le_connectable,le_connectable,...,occur_speed_var,count_speed_mean,count_speed_mean,count_speed_mean,count_speed_median,count_speed_median,count_speed_median,count_speed_var,count_speed_var,count_speed_var
Unnamed: 0_level_1,mean,var,median,skew,kurt,mean,var,median,skew,kurt,...,skew,mean,mad,skew,mean,mad,skew,mean,mad,skew
timestamp,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2,Unnamed: 14_level_2,Unnamed: 15_level_2,Unnamed: 16_level_2,Unnamed: 17_level_2,Unnamed: 18_level_2,Unnamed: 19_level_2,Unnamed: 20_level_2,Unnamed: 21_level_2
2021-03-08 15:15:00,0.222222,0.172840,0.0,1.336306,-0.214286,0.222222,0.172840,0.0,1.336306,-0.214286,...,,-1.604167e-01,0.370312,,-0.062500,0.343750,,0.183333,0.157143,
2021-03-08 15:16:00,0.500000,0.416667,0.0,0.929516,-0.240000,0.666667,0.222222,1.0,-0.707107,-1.500000,...,-1.401879e-01,8.333333e-02,0.630556,0.747936,0.416667,0.652778,0.283007,0.400000,0.200000,1.046622e+00
2021-03-08 15:17:00,0.416667,0.576389,0.0,1.425613,0.249964,0.000000,0.000000,0.0,0.000000,-3.000000,...,-2.790147e-01,-8.333333e-02,0.775000,1.181799,0.333333,0.666667,0.935323,0.741667,0.220833,-8.827791e-01
2021-03-08 15:18:00,0.416667,0.243056,0.0,0.338062,-1.885714,0.250000,0.187500,0.0,1.154701,-0.666667,...,-1.844838e-15,1.666667e-02,0.519444,0.139593,0.083333,0.611111,-0.140181,0.250000,0.050000,2.674365e-15
2021-03-08 15:19:00,0.083333,0.076389,0.0,3.015113,7.090909,0.250000,0.187500,0.0,1.154701,-0.666667,...,5.252924e-01,-5.000000e-02,0.208333,2.365505,0.083333,0.152778,3.015113,0.125000,0.104167,-1.207451e-01
2021-03-08 15:20:00,0.166667,0.138889,0.0,1.788854,1.200000,0.000000,0.000000,0.0,0.000000,-3.000000,...,6.403569e-16,1.666667e-02,0.327778,1.733200,0.166667,0.277778,1.788854,0.150000,0.075000,-1.154701e+00
2021-03-08 15:21:00,0.416667,0.409722,0.0,1.266584,0.411951,0.000000,0.000000,0.0,0.000000,-3.000000,...,-3.380617e-01,1.850372e-17,0.600000,0.927724,0.333333,0.611111,0.581378,0.458333,0.268056,3.177374e-01
2021-03-08 15:22:00,0.583333,0.243056,1.0,-0.338062,-1.885714,0.000000,0.000000,0.0,0.000000,-3.000000,...,-3.380617e-01,1.333333e-01,0.477778,-0.271412,0.250000,0.500000,-0.148265,0.258333,0.048611,-3.380617e-01
2021-03-08 15:23:00,0.416667,0.243056,0.0,0.338062,-1.885714,0.000000,0.000000,0.0,0.000000,-3.000000,...,-1.788854e+00,-6.666667e-02,0.588889,0.283007,0.000000,0.833333,0.000000,0.283333,0.027778,-1.788854e+00
2021-03-08 15:24:00,0.333333,0.222222,0.0,0.707107,-1.500000,0.250000,0.187500,0.0,1.154701,-0.666667,...,-1.279898e+00,-5.000000e-02,0.508333,0.614928,-0.083333,0.611111,0.140181,0.233333,0.088889,-1.425326e+00


In [None]:
# Remove every window that has a missing value in any column.
# NOTE(review): dropna() already removes all rows containing NaN, so the
# fillna(0) that follows has nothing left to fill - confirm which of the two
# behaviours (drop or zero-fill) is intended.
df_sampling = df_sampling.dropna()
df_sampling = df_sampling.fillna(0)

df_rolling = df_rolling.dropna()
df_rolling = df_rolling.fillna(0)

In [None]:
# Write both datasets into the folder named after the configured window size,
# so that changing TIME_SAMPLE_FREQ cannot overwrite the datasets of another
# window size. With TIME_SAMPLE_FREQ = '60s' the paths are unchanged.
df_sampling.to_csv(".\\_datasets\\{}\\bt_sampling_dataset_3.csv".format(TIME_SAMPLE_FREQ))
df_rolling.to_csv(".\\_datasets\\{}\\bt_rolling_dataset_3.csv".format(TIME_SAMPLE_FREQ))