In [1]:
import pandas as pd
import numpy as np
import re
from datetime import datetime as dt
from scipy.spatial import distance
import scipy.stats as stats

%matplotlib inline

In [2]:
TIME_SAMPLE_FREQ = '30s'

In [3]:
df = pd.read_csv("..\\pipelines\\_gen\\base_wifi_filtered_1.data", sep = ';', index_col = False, header = None, low_memory = False, \
                 names = ['timestamp', 'uuid', 'bssid', 'chwidth', 'freq', 'level'])

In [4]:
df.head()

Unnamed: 0,timestamp,uuid,bssid,chwidth,freq,level
0,2021-03-08 15:15:05.666,9363aa0a-c910-4d28-bc52-db311897a1c3,50:ff:20:23:22:20,2,5180,-49
1,2021-03-08 15:15:05.666,9363aa0a-c910-4d28-bc52-db311897a1c3,54:64:d9:b6:4d:14,1,2417,-56
2,2021-03-08 15:15:05.666,9363aa0a-c910-4d28-bc52-db311897a1c3,88:d7:f6:5e:c5:64,0,2472,-78
3,2021-03-08 15:15:10.688,9c45ea0a-4bd8-4250-b933-3ff9540b6f31,c8:60:00:70:e4:0c,0,2412,-85
4,2021-03-08 15:15:10.688,9c45ea0a-4bd8-4250-b933-3ff9540b6f31,52:ff:20:93:22:20,1,2447,-43


In [5]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 110277 entries, 0 to 110276
Data columns (total 6 columns):
timestamp    110277 non-null object
uuid         110277 non-null object
bssid        110277 non-null object
chwidth      110277 non-null int64
freq         110277 non-null int64
level        110277 non-null int64
dtypes: int64(3), object(3)
memory usage: 5.0+ MB


In [6]:
df.index = pd.DatetimeIndex(df.timestamp)
df = df.sort_index()

In [7]:
df = df.drop(['timestamp', 'chwidth'], axis = 1)

In [8]:
df.info()

<class 'pandas.core.frame.DataFrame'>
DatetimeIndex: 110277 entries, 2021-03-08 15:15:05.666000 to 2021-03-14 00:57:08.353000
Data columns (total 4 columns):
uuid     110277 non-null object
bssid    110277 non-null object
freq     110277 non-null int64
level    110277 non-null int64
dtypes: int64(2), object(2)
memory usage: 4.2+ MB


In [9]:
bssid_map = { bssid.replace(' ', ''): idx for bssid, idx in zip(df.bssid.unique(), range(len(df.bssid.unique()))) }

In [10]:
df.bssid = df.bssid.apply(lambda x: str(x).replace(' ', ''))
df.level = df.level.apply(lambda x: str(x).replace(' ', ''))
df.freq = df.freq.apply(lambda x: str(x).replace(' ', ''))

In [11]:
df['bssid_level'] = df[['bssid', 'level']].agg(','.join, axis=1)
df['count'] = 1

In [12]:
def agg_string_join(col):
    col = col.apply(lambda x: str(x))
    return col.str.cat(sep = ',').replace(' ', '')

In [13]:
def agg_bssid_col(col):
    array_len = len(bssid_map)
    array = np.zeros(array_len, dtype = 'float')
    def fill_array(x):
        tmp = x.split(',')
        bssid = tmp[0]
        level = float(tmp[1])
        array[bssid_map[bssid.replace(' ', '')]] = level
        return
        
    col.apply(lambda x: fill_array(x))
    return np.array2string(array, separator = ',').replace(' ', '')[1:-1]

In [14]:
all_func_dicts_quantum = { 'freq': agg_string_join, 'level': agg_string_join, 'bssid_level' : agg_bssid_col, 'count' : 'sum' }

In [15]:
df_quantum = df.groupby(['timestamp', 'uuid'], as_index=True).agg(all_func_dicts_quantum)

In [16]:
df_quantum

Unnamed: 0_level_0,Unnamed: 1_level_0,freq,level,bssid_level,count
timestamp,uuid,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
2021-03-08 15:15:05.666,9363aa0a-c910-4d28-bc52-db311897a1c3,518024172472,"-49,-56,-78","-49.,-56.,-78.,...,0.,0.,0.",3
2021-03-08 15:15:10.688,9c45ea0a-4bd8-4250-b933-3ff9540b6f31,2412244751805180245224172472,"-85,-43,-49,-49,-76,-82,-82","-49.,-82.,-82.,...,0.,0.,0.",7
2021-03-08 15:15:15.689,1ab922aa-33ea-42d6-b222-ef829063ea61,2437247224175180244724475180,"-83,-80,-71,-50,-43,-43,-49","-50.,-71.,-80.,...,0.,0.,0.",7
2021-03-08 15:15:20.623,969990ab-53eb-4361-b5df-d2ca52ee77fb,244224722452241724375180244724475180,"-88,-76,-77,-74,-83,-49,-45,-42,-52","-52.,-74.,-76.,...,0.,0.,0.",9
2021-03-08 15:15:28.487,926784ec-a151-46a3-9a3a-8e9107aee398,243724722452241724225180244724475180,"-86,-77,-79,-57,-88,-48,-47,-42,-55","-55.,-57.,-77.,...,0.,0.,0.",9
2021-03-08 15:15:33.521,049b48ae-3552-4ddd-adfb-1deaa0fc22fa,518024172472,"-52,-66,-79","-52.,-66.,-79.,...,0.,0.,0.",3
2021-03-08 15:15:43.045,b24ddc30-f0b3-4667-9453-2e026ddfb60d,241224222452247224175180244724475180,"-89,-86,-79,-75,-67,-49,-42,-41,-50","-49.,-67.,-75.,...,0.,0.,0.",9
2021-03-08 15:15:48.054,acae557a-7666-431d-92ef-2ca8599247c8,241224722437245224175180244724475180,"-87,-77,-85,-78,-58,-50,-42,-42,-50","-50.,-58.,-77.,...,0.,0.,0.",9
2021-03-08 15:15:53.590,5cb16f36-83ca-4cd3-8286-4b4db9ddcd01,247224422422243724175180244724475180,"-76,-88,-88,-85,-57,-49,-43,-42,-51","-51.,-57.,-76.,...,0.,0.,0.",9
2021-03-08 15:15:59.006,35e4619b-38f0-4730-93e3-c3a485ca6297,24722412518024172412,"-76,-88,-49,-66,-87","-49.,-66.,-76.,...,0.,0.,0.",5


In [17]:
df_quantum = df_quantum.reset_index()
df_quantum.index = pd.DatetimeIndex(df_quantum.timestamp)

In [18]:
df_quantum = df_quantum[df_quantum['count'] != 0]

In [19]:
df_conn = pd.read_csv("..\\pipelines\\_gen\\conn_wifi_filtered_1.data", sep = ';', index_col = False, header = None, low_memory = False, \
                 names = ['timestamp', '1', 'bssid', '2', '3', '4', '5', 'level', '6'])

df_conn = df_conn.drop(df_conn.columns.difference(['bssid','timestamp', 'level']), axis = 1)
df_conn.index = pd.DatetimeIndex(df_conn.timestamp)
df_conn = df_conn.sort_index()

In [20]:
def get_level_from_row(row):
    bssid = df_conn.iloc[df_conn.index.get_loc(row.name, method = 'nearest')]['bssid']
    if str(bssid) == 'nan' or str(bssid) == 'null' or str(bssid) == '':
        return 0
    
    level = df_conn.iloc[df_conn.index.get_loc(row.name, method = 'nearest')]['level']
    time = dt.strptime(df_conn.iloc[df_conn.index.get_loc(row.name, method = 'nearest')]['timestamp'], '%Y-%m-%d %H:%M:%S.%f')
    return level if abs((time - row.name).total_seconds()) <= 10 else 0

df_quantum['conn_level'] = df_quantum.apply(lambda row: get_level_from_row(row), axis = 1)

In [21]:
def string2array(string):
    try:
        array = np.fromstring(string, sep=',')
        return array
    except:
        return np.nan

def to_ones_array(array):
    try:
        array[array != 0] = 1
        return array
    except:
        return np.nan

def get_len(obj):
    try:
        length = len(obj)
        return length
    except:
        return np.nan

In [22]:
def get_occured_nets_count(row, prev_col, curr_col):
    prev = to_ones_array(string2array(row[prev_col]))
    curr = to_ones_array(string2array(row[curr_col]))
    intersection = np.logical_and(curr, prev)
    diff = np.logical_and(curr, np.logical_not(intersection))
    
    if (np.count_nonzero(np.logical_or(prev, curr)) == 0):
        return 0
    
    return np.count_nonzero(diff) / np.count_nonzero(np.logical_or(prev, curr))

def get_disappeared_nets_count(row, prev_col, curr_col):
    prev = to_ones_array(string2array(row[prev_col]))
    curr = to_ones_array(string2array(row[curr_col]))
    intersection = np.logical_and(curr, prev)
    diff = np.logical_and(prev, np.logical_not(intersection))
    
    if (np.count_nonzero(np.logical_or(prev, curr)) == 0):
        return 0
    
    return np.count_nonzero(diff) / np.count_nonzero(np.logical_or(prev, curr))

def get_jaccard_index(row, prev_col, curr_col):
    prev = to_ones_array(string2array(row[prev_col]))
    curr = to_ones_array(string2array(row[curr_col]))
    return distance.jaccard(prev, curr)

def get_occur_speed(row, prev_col, curr_col):
    prev = to_ones_array(string2array(row[prev_col]))
    curr = to_ones_array(string2array(row[curr_col]))
    return np.linalg.norm(prev - curr) / np.sqrt(get_len(prev))
    
def get_level_speed(row, prev_col, curr_col):
    prev = string2array(row[prev_col])
    curr = string2array(row[curr_col])
    return np.linalg.norm(prev - curr) / np.sqrt(get_len(prev))

def calc_single_cols_in_window(df, col, new_col, window, func):
    def func_wrapper(func, row, prev_col, curr_col):
        delta = row.timestamp - row.prev_timestamp
        if pd.isnull(delta):
            delta = 0
        else:
            delta = abs(delta.total_seconds())
        if delta > 10 * 60:
            return np.nan
        else:
            return func(row, prev_col_name, col)
        
    new_cols = []
        
    for i in range(window):
        prev_col_name = "_".join(['prev', col, str(i + 1)])
        new_col_name = "_".join([new_col, str(i + 1)])
        
        df['prev_timestamp'] = df.timestamp.shift(i + 1)
        df[prev_col_name] = df[col].shift(i + 1)
        df[new_col_name] = df.apply(lambda row: func_wrapper(func, row, prev_col_name, col), axis = 1)
        df = df.drop(prev_col_name, axis = 1)
        df = df.drop('prev_timestamp', axis = 1)
        new_cols.append(new_col_name)
        
    df["_".join([new_col, 'mean'])] = df[new_cols].mean(axis = 1)
    df["_".join([new_col, 'median'])] = df[new_cols].median(axis = 1)
    df["_".join([new_col, 'var'])] = df[new_cols].var(axis = 1)
        
    return df

In [23]:
WINDOW_SIZE = 5

occur_and_level_columns_map = [
    ("bssid_level", "occured_nets_count", WINDOW_SIZE, get_occured_nets_count),
    ("bssid_level", "disappeared_nets_count", WINDOW_SIZE, get_disappeared_nets_count),
    ("bssid_level", "jaccard_index", WINDOW_SIZE, get_jaccard_index), 
    ("bssid_level", "occur_speed", WINDOW_SIZE, get_occur_speed),
    ("bssid_level", "level_speed", WINDOW_SIZE, get_level_speed)
]

for (col, new_col, window, func) in occur_and_level_columns_map:
    df_quantum = calc_single_cols_in_window(df_quantum, col, new_col, window, func)

In [24]:
def get_conn_level_speed(row, prev_col, curr_col):
    return row[curr_col] - row[prev_col]

In [25]:
single_columns_map = [
    ("conn_level", "conn_level_speed", WINDOW_SIZE, get_conn_level_speed),
    ("count", "count_speed", WINDOW_SIZE, get_conn_level_speed)
]

for (col, new_col, window, func) in single_columns_map:
    df_quantum = calc_single_cols_in_window(df_quantum, col, new_col, window, func)

In [26]:
def agg_str(col):
#     all_freq = col.str.cat(sep=',')
    return string2array(col)

def str_mean(col):
    array = agg_str(col)
    if str(array) == 'nan':
        return 0 
    return np.mean(array)

def mean(col):
    return np.mean(col)

def var(col):
    return np.var(col)

def median(col):
    return np.median(col)

def skew(col):
    return stats.skew(col)

def kurt(col):
    return stats.kurtosis(col)

In [27]:
df_quantum['freq'] = df_quantum.apply(lambda row: str_mean(row['freq']), axis = 1)
df_quantum['level'] = df_quantum.apply(lambda row: str_mean(row['level']), axis = 1)

In [28]:
cols_for_drop = []
names = [
    "occured_nets_count",
    "disappeared_nets_count",
    "jaccard_index",
    "occur_speed",
    "count_speed",
    "conn_level_speed",
    "level_speed",
    "count_speed"
]

for i in range(1, WINDOW_SIZE + 1):
    for name in names:
        cols_for_drop.append('_'.join([name, str(i)]))
        
df_quantum = df_quantum.drop(['bssid_level', 'timestamp', 'uuid'], axis = 1)
df_quantum = df_quantum.drop(cols_for_drop, axis = 1)

In [29]:
df_quantum.columns

Index(['freq', 'level', 'count', 'conn_level', 'occured_nets_count_mean',
       'occured_nets_count_median', 'occured_nets_count_var',
       'disappeared_nets_count_mean', 'disappeared_nets_count_median',
       'disappeared_nets_count_var', 'jaccard_index_mean',
       'jaccard_index_median', 'jaccard_index_var', 'occur_speed_mean',
       'occur_speed_median', 'occur_speed_var', 'level_speed_1',
       'level_speed_2', 'level_speed_3', 'level_speed_4', 'level_speed_5',
       'level_speed_mean', 'level_speed_median', 'level_speed_var',
       'conn_level_speed_mean', 'conn_level_speed_median',
       'conn_level_speed_var', 'count_speed_mean', 'count_speed_median',
       'count_speed_var'],
      dtype='object')

In [None]:
common_cols = df_quantum.columns[0:4]
speed_acc_cols = df_quantum.columns[4:]

common_funcs_list = [mean, var, median, skew, kurt]
special_funcs_list = [mean, pd.DataFrame.mad, skew]

common_cols_map = { col : common_funcs_list for col in common_cols }
speed_acc_cols_map = { col : special_funcs_list for col in speed_acc_cols }

agg_dict = common_cols_map
agg_dict.update(speed_acc_cols_map)

In [None]:
df_quantum[speed_acc_cols] = df_quantum[speed_acc_cols].apply(pd.to_numeric)

In [None]:
df_sampling = df_quantum.groupby(pd.Grouper(freq = TIME_SAMPLE_FREQ)).agg(agg_dict)

In [None]:
df_rolling = df_quantum.rolling(TIME_SAMPLE_FREQ, min_periods = 1, center = False).agg(agg_dict)

In [None]:
df_sampling.columns = ["_".join([str(high_level_name), str(low_level_name)]) \
                       for (high_level_name, low_level_name) in df_sampling.columns.values]

df_rolling.columns = ["_".join([str(high_level_name), str(low_level_name)]) \
                      for (high_level_name, low_level_name) in df_rolling.columns.values]

In [None]:
df_sampling = df_sampling.dropna()
df_sampling = df_sampling.fillna(0)

df_rolling = df_rolling.dropna()
df_rolling = df_rolling.fillna(0)

In [None]:
df_sampling.to_csv(".\\_datasets\\5s\\wifi_sampling_dataset_5.csv")
df_rolling.to_csv(".\\_datasets\\5s\\wifi_rolling_dataset_5.csv")