# zerina project

## setup

In [4]:
import pandas as pd
import numpy as np
import scipy
from tqdm import tqdm_notebook, tnrange

from sklearn.pipeline import make_pipeline
from sklearn.model_selection import train_test_split

from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import MinMaxScaler

from keras.models import Sequential
from keras.layers import Dense
from keras.layers import LSTM

## data processing

### transition matrices

In [107]:
tmat_file = 'data/wifi_t_matrices.csv'

tmat_df = (
    pd
    .read_csv(
        tmat_file, 
        names=['OnOn','OnOff','OffOn','OffOff','timestep']
    )
    [['OnOn', 'OnOff', 'OffOff', 'OffOn', 'timestep']]
)

tmat_df.head()

Unnamed: 0,OnOn,OnOff,OffOff,OffOn,timestep
0,0.99546,0.004543,0.002411,0.99759,1
1,0.95586,0.044138,0.025682,0.97432,11
2,0.93161,0.068395,0.043521,0.95648,21
3,0.90886,0.091141,0.062781,0.93722,31
4,0.91697,0.083027,0.059557,0.94044,41


### read csv and preprocess

In [3]:
tar_file = r'data/wifitrafficstats2.csv'
#tar_file = r'data/wifitrafficstats3.csv'
#tar_file = r'data/wifitrafficstats4.csv'
#tar_file = r'data/wifitrafficstats5.csv'
#tar_file = r'data/wifitrafficstats6.csv'

df = (
    pd
    .read_csv(tar_file)
    .rename({ 
        'Time':'time', 
        'Length':'length', 
        'TX Rate':'rate'
    }, axis=1)
    .eval('time = time * 1e6')
    .eval('rate = rate * 1e6')
    .eval('length = length * 8')
    .eval('on_time = 1e6 * length / rate')
    .eval('end_time = time + on_time')
    .assign(
        shift_time = lambda x: x.end_time.shift(1),
        off_time = lambda x: [
            (time - shift_time) 
                if (time - shift_time) > 0 else 0 
            for time, shift_time in (zip(x.time, x.shift_time))
    ])
    .drop(['No.','Delta Time','shift_time'], axis=1)
    .round(0)
)



df.head(10)

Unnamed: 0,time,length,rate,on_time,end_time,off_time
0,0.0,1352,6000000.0,225.0,225.0,0.0
1,36341.0,312,24000000.0,13.0,36354.0,36116.0
2,39469.0,312,24000000.0,13.0,39482.0,3115.0
3,49356.0,2288,1000000.0,2288.0,51644.0,9874.0
4,50556.0,1008,1000000.0,1008.0,51564.0,0.0
5,64737.0,6584,6000000.0,1097.0,65834.0,13173.0
6,82497.0,312,24000000.0,13.0,82510.0,16663.0
7,85683.0,312,24000000.0,13.0,85696.0,3173.0
8,101354.0,312,6000000.0,52.0,101406.0,15658.0
9,103643.0,456,12000000.0,38.0,103681.0,2237.0


### generate data list of `1`'s and `0`'s

In [5]:
def generate_data_list(df):
    '''
    takes dataframe with 'on_time' and 'off_time' time stamp columns 
    and returns a numpy array of 1's (on_time) and 0's (off_time)
    '''
    
    data_list = []

    for row in tqdm_notebook(df[['on_time','off_time']].iloc[1:].itertuples(), total=df.shape[0]-1):
        data_list.extend([1 for i in range(int(row.on_time))])
        data_list.extend([0 for i in range(int(row.off_time))])
    
    return np.array(data_list, copy=True)

## compute statistics 

### fast get all state lengths

In [13]:
def get_state_length_list(data):
    '''
    takes data list ([1,1,0,1,0,0,...]) and returns numpy array 
    of duration of consecutive bits ([13,245,2588,19,1056,...])
    '''
    
    data = np.array(data)    
    
    return np.diff(
        np.where(
            np.concatenate(
                ([data[0]],
                 data[:-1] != data[1:],
                 [0]
                )
            )
        )[0]
    )[::2]

## data simulator

In [39]:
from itertools import cycle

In [None]:
tmat_df.head(10)

### setup

In [109]:
state_cycler = cycle(transition_matrix)
    
cycle_dict = {
    (1,1,1): 0,
    (1,1,0): 1,
    (1,0,1): 2,
    (1,0,0): 1,
    (0,1,1): 1,
    (0,1,0): 2,
    (0,0,1): 1,
    (0,0,0): 0,
}

global p
p = 1

def cycle_states():
    return next(state_cycler)

def generate_bit(n):
    global p
    
    if n == 0:
        print(p)
        return np.random.binomial(1, p)
        
    else:
        for i in range(n):
            p = cycle_states()
        
        print(p)
        return np.random.binomial(1, p)

In [137]:
generate_bit(0)

0.99546


1

In [127]:
transition_matrix

array([0.99546  , 0.0045434, 0.0024107, 0.99759  ])

In [140]:
def generate_sim_data(n_samples, transition_matrix):
    '''
    takes a transition matrix and generates a simulated signal
    of size <length>
    '''
    
    state_cycler = cycle(transition_matrix)
    
    cycle_dict = {
        (1,1,1): 0,
        (1,1,0): 1,
        (1,0,1): 2,
        (1,0,0): 1,
        (0,1,1): 1,
        (0,1,0): 2,
        (0,0,1): 1,
        (0,0,0): 0,
    }
    
    global p
    p = 1
    sim_data = [1,1,1]  
    
    def cycle_states():
        return next(state_cycler)
    
    def generate_bit(n):
        global p
        
        if n == 0:
            print(p)
            return np.random.binomial(1, p)
            
        else:
            for i in range(n):
                p = cycle_states()
            
            print(p)
            return np.random.binomial(1, p)
        
    for i in range(n_samples+3):        
        new_bit = generate_bit(cycle_dict[sim_data[-3:]])        
        sim_data.append(new_bit)        
        
    return np.array(sim_data[3:])
    

In [108]:
transition_matrix = tmat_df.iloc[0,:4].values

### execution

In [7]:
trials = 100
length = 100000

sim_matrix = np.zeros((trials, length))
sim_matrix.shape

(100, 100000)

In [8]:
for i in tnrange(trials):
    sim_matrix[i,:] = generate_sim_data(length, tmat_df)

HBox(children=(IntProgress(value=0), HTML(value='')))




### stats

In [24]:
from collections import Counter

In [23]:
stats = sim_matrix.sum(axis=1)

print(f'normalized mean: {np.mean(stats) / length:0.4f}')
print(f'standard deviation: {np.std(stats):0.4f}')

normalized mean: 0.4997
standard deviation: 596.6634


In [35]:
state_lengths = get_state_length_list(sim_matrix[0,:])
print(f'unique state lengths:\n{list(set(state_lengths))}')

sl_counts = Counter(sorted(state_lengths))
print(f'\nstate length frequency:\n{sl_counts.items()}')

unique state lengths:
[1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 56, 57, 58, 59, 61, 63, 64, 65, 67, 68, 72, 73]

state length frequency:
dict_items([(1, 23318), (2, 163), (3, 161), (4, 136), (5, 121), (6, 113), (7, 106), (8, 86), (9, 86), (10, 74), (11, 86), (12, 61), (13, 50), (14, 60), (15, 47), (16, 42), (17, 39), (18, 48), (19, 40), (20, 34), (21, 25), (22, 29), (23, 30), (24, 26), (25, 31), (26, 21), (27, 28), (28, 18), (29, 9), (30, 18), (31, 11), (32, 12), (33, 14), (34, 11), (35, 11), (36, 13), (37, 9), (38, 4), (39, 12), (40, 8), (41, 4), (42, 7), (43, 13), (44, 4), (45, 4), (46, 5), (47, 3), (48, 4), (49, 4), (50, 2), (51, 2), (52, 2), (53, 1), (56, 1), (57, 2), (58, 2), (59, 1), (61, 2), (63, 1), (64, 1), (65, 1), (67, 1), (68, 1), (72, 1), (73, 1)])
