# zerina project

## setup

In [4]:
import pandas as pd
import numpy as np
import scipy
from tqdm import tqdm_notebook, tnrange

from sklearn.pipeline import make_pipeline
from sklearn.model_selection import train_test_split

from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import MinMaxScaler

from keras.models import Sequential
from keras.layers import Dense
from keras.layers import LSTM

## data processing

### transition matrices

In [231]:
tmat_file = 'data/wifi_t_matrices.csv'

# `names=` is given without `header=`, so pandas treats the first line of the
# file as data and labels the five columns with the names below.
tmat_df = pd.read_csv(
    tmat_file,
    names=['OnOn', 'OnOff', 'OffOn', 'OffOff', 'timestep'],
)

# Disabled experiments kept from the original cell:
#   .eval('OffOff = 1 - OffOff')
#   .eval('OffOn = 1 - OffOn')

# Reorder the columns so the first four read OnOn, OnOff, OffOff, OffOn;
# later cells take "the first four columns" positionally.
tmat_df = tmat_df.loc[:, ['OnOn', 'OnOff', 'OffOff', 'OffOn', 'timestep']]

tmat_df.head()

Unnamed: 0,OnOn,OnOff,OffOff,OffOn,timestep
0,0.99546,0.004543,0.002411,0.99759,1
1,0.95586,0.044138,0.025682,0.97432,11
2,0.93161,0.068395,0.043521,0.95648,21
3,0.90886,0.091141,0.062781,0.93722,31
4,0.91697,0.083027,0.059557,0.94044,41


### read csv and preprocess

In [236]:
# Capture to analyse. Alternatives listed (commented out) in the original cell:
#   r'data/wifitrafficstats3.csv', r'data/wifitrafficstats4.csv',
#   r'data/wifitrafficstats5.csv', r'data/wifitrafficstats6.csv'
tar_file = r'data/wifitrafficstats2.csv'

df = (
    pd
    .read_csv(tar_file)
    .rename(columns={'Time': 'time', 'Length': 'length', 'TX Rate': 'rate'})
    .assign(
        # scale factors as in the original cell
        # (presumably s -> us, Mbit/s -> bit/s, bytes -> bits; TODO confirm)
        time=lambda x: x['time'] * 1e6,
        rate=lambda x: x['rate'] * 1e6,
        length=lambda x: x['length'] * 8,
        # transmission duration of the row, and the instant it ends
        on_time=lambda x: 1e6 * x['length'] / x['rate'],
        end_time=lambda x: x['time'] + x['on_time'],
        # gap between the previous row's end and this row's start; negative
        # gaps and the first row (no previous row) become 0
        off_time=lambda x: (
            (x['time'] - x['end_time'].shift(1))
            .clip(lower=0)
            .fillna(0)
        ),
    )
    .drop(columns=['No.', 'Delta Time'])
    .round(0)
)

df.head(10)

Unnamed: 0,time,length,rate,on_time,end_time,off_time
0,0.0,1352,6000000.0,225.0,225.0,0.0
1,36341.0,312,24000000.0,13.0,36354.0,36116.0
2,39469.0,312,24000000.0,13.0,39482.0,3115.0
3,49356.0,2288,1000000.0,2288.0,51644.0,9874.0
4,50556.0,1008,1000000.0,1008.0,51564.0,0.0
5,64737.0,6584,6000000.0,1097.0,65834.0,13173.0
6,82497.0,312,24000000.0,13.0,82510.0,16663.0
7,85683.0,312,24000000.0,13.0,85696.0,3173.0
8,101354.0,312,6000000.0,52.0,101406.0,15658.0
9,103643.0,456,12000000.0,38.0,103681.0,2237.0


### generate data list of `1`'s and `0`'s

In [237]:
def generate_data_list(df):
    '''
    takes dataframe with 'on_time' and 'off_time' duration columns
    and returns a numpy array of 1's (on_time) and 0's (off_time)

    every row except the first contributes int(on_time) ones followed by
    int(off_time) zeros; durations are truncated toward zero and negative
    durations contribute nothing

    the expansion is a single vectorised np.repeat call, so no progress
    bar is shown

    NOTE(review): the first row is skipped and each row is emitted as
    "on" then "off" -- confirm this matches how off_time is defined upstream

    raises ValueError if any used duration is NaN or infinite
    '''

    durations = df[['on_time', 'off_time']].iloc[1:].values

    if not np.isfinite(durations).all():
        raise ValueError('on_time / off_time must be finite numbers')

    # same semantics as int(): truncate toward zero; range(negative) was empty
    repeats = np.clip(durations.astype(np.int64), 0, None).ravel()

    # row-major ravel gives on_1, off_1, on_2, off_2, ... so the matching
    # bit pattern is 1, 0, 1, 0, ...
    bits = np.tile(np.array([1, 0]), len(durations))

    return np.repeat(bits, repeats)

## compute statistics 

### fast get all state lengths

In [13]:
def get_state_length_list(data):
    '''
    takes a sequence of 0/1 bits and returns a numpy array with the
    lengths of its runs of 1's, in order of appearance

    example: [1,1,0,1,0,0] -> [2, 1]

    a run of 1's that reaches the last element is not reported, because
    the closing marker appended below is 0 rather than 1
    NOTE(review): confirm that dropping such a trailing run is intended
    '''

    bits = np.array(data)

    # True wherever a bit differs from its predecessor
    flips = bits[1:] != bits[:-1]

    # leading marker is set only when the data opens with a 1, so the first
    # measured run is always a run of 1's; the trailing 0 adds no boundary
    markers = np.concatenate(([bits[0]], flips, [0]))

    boundaries = np.flatnonzero(markers)
    run_lengths = np.diff(boundaries)

    # runs alternate 1's / 0's starting with 1's -> keep every other one
    return run_lengths[::2]

## data simulator

In [39]:
from itertools import cycle

In [None]:
# Preview the first ten rows of the transition-probability table.
tmat_df.head(10)

### setup

In [109]:
# Endless iterator over the transition probabilities.
# NOTE: `transition_matrix` is assigned in the "execution" section further
# down, so that cell has to be run before this one.
state_cycler = cycle(transition_matrix)

# Number of steps to advance `state_cycler`, keyed by the three most recent
# bits; each value equals the number of adjacent pairs in the key that differ.
cycle_dict = {
    (1, 1, 1): 0,
    (1, 1, 0): 1,
    (1, 0, 1): 2,
    (1, 0, 0): 1,
    (0, 1, 1): 1,
    (0, 1, 0): 2,
    (0, 0, 1): 1,
    (0, 0, 0): 0,
}

# Probability of drawing a 1; read and updated by generate_bit().
p = 1

def cycle_states():
    '''
    returns the next value from the notebook-global `state_cycler` iterator
    '''
    return next(state_cycler)

def generate_bit(n):
    '''
    advances the notebook-global probability `p` by `n` calls to
    cycle_states(), prints the resulting `p`, and returns one draw of
    np.random.binomial(1, p)

    with n == 0 the current `p` is used unchanged
    '''
    global p

    # range(0) is empty, so n == 0 needs no separate branch
    for _ in range(n):
        p = cycle_states()

    print(p)
    return np.random.binomial(1, p)

In [137]:
# Draw one bit without advancing the probability cycle (n = 0);
# generate_bit also prints the probability it used.
generate_bit(0)

0.99546


1

In [127]:
# Show the transition probabilities currently held in `transition_matrix`.
transition_matrix

array([0.99546  , 0.0045434, 0.0024107, 0.99759  ])

In [304]:
def generate_sim_data(n_samples, m_trials, transition_matrix):
    '''
    takes a transition matrix and generates `m_trials` simulated binary
    signals of `n_samples` bits each

    every trial starts from the seed bits [1, 1, 1] with p = 0.5 and a fresh
    cycle over `transition_matrix`; before each new bit the cycle is advanced
    by the number of steps `cycle_dict` assigns to the three most recent
    bits (0 steps keeps the current p), and the bit is then drawn with
    np.random.binomial(1, p)

    n_samples         : number of bits kept per trial (columns of the result)
    m_trials          : number of trials (rows of the result)
    transition_matrix : sequence of probabilities that is cycled through in
                        order; the notebook passes [OnOn, OnOff, OffOff, OffOn]

    returns a numpy float array of shape (m_trials, n_samples) of 0's and 1's

    all simulation state is local to the call: no module-level `p` is
    created or modified
    '''

    # steps to advance through the probability cycle, keyed by the three
    # most recent bits (= number of adjacent pairs in the key that differ)
    cycle_dict = {
        (1,1,1): 0,
        (1,1,0): 1,
        (1,0,1): 2,
        (1,0,0): 1,
        (0,1,1): 1,
        (0,1,0): 2,
        (0,0,1): 1,
        (0,0,0): 0,
    }

    sim_matrix = np.zeros((m_trials, n_samples))

    for i in tnrange(m_trials, leave=False):
        # restart the cycle together with p and the seed bits, so that every
        # trial begins from the same state instead of inheriting the cycle
        # position the previous trial happened to stop at
        state_cycler = cycle(transition_matrix)
        p = 0.5
        sim_data = [1,1,1]

        # one extra draw: the three seed bits and the first generated bit
        # are discarded below, leaving exactly n_samples bits
        for _ in range(n_samples + 1):
            for _step in range(cycle_dict[tuple(sim_data[-3:])]):
                p = next(state_cycler)

            sim_data.append(np.random.binomial(1, p))

        sim_matrix[i,:] = np.array(sim_data[4:])

    return sim_matrix
    

### execution

In [269]:
# 10e6 == 1e7 bits per simulated trial
sample_size = int(10e6)
trial_size = 10
# row 15 of tmat_df, first four columns (OnOn, OnOff, OffOff, OffOn)
transition_matrix = tmat_df.iloc[15,:4].values
print(f'transition probabilities: {transition_matrix}')

transition probabilities: [0.91189  0.088106 0.11047  0.88953 ]


In [270]:
# Time one simulation run; the (trial_size x sample_size) result stays in `sim_matrix`.
%time sim_matrix = generate_sim_data(sample_size, trial_size, transition_matrix)

HBox(children=(IntProgress(value=0), HTML(value='')))

CPU times: user 26.6 s, sys: 491 ms, total: 27.1 s
Wall time: 26.9 s


## main loop

In [None]:
%%time

# Run the simulator once per row of tmat_df and store the summary statistics
# under that row's timestep.
# NOTE: compute_stats is defined in the "stats" section further down, so that
# cell has to be run before this one.
master_dictionary = {}
sample_size = int(10e6)
trial_size = 10

for row in tqdm_notebook(tmat_df.itertuples(), total=tmat_df.shape[0]):
    # same order as the first four tmat_df columns
    transition_matrix = [row.OnOn, row.OnOff, row.OffOff, row.OffOn]
    sim_matrix = generate_sim_data(sample_size, trial_size, transition_matrix)    
    
    master_dictionary[row.timestep] = compute_stats(sim_matrix)

HBox(children=(IntProgress(value=0), HTML(value='')))

HBox(children=(IntProgress(value=0, max=10), HTML(value='')))

### stats

In [257]:
from collections import Counter

In [286]:
def compute_stats(sim_matrix):
    '''
    summarises a simulated signal matrix of shape (trials, samples)

    returns a dict with
      'mean'          : mean over trials of the per-trial count of 1's,
                        divided by the number of samples per trial
      'std'           : standard deviation over trials of the same count,
                        divided by the number of samples per trial
      'state_lengths' : flat list of the values get_state_length_list
                        returns for each trial, in trial order
    '''
    # normalise by samples per trial (columns), not by the number of trials
    sample_size = sim_matrix.shape[1]

    # number of 1's in each trial
    stats = sim_matrix.sum(axis=1)

    state_lengths = [
        length
        for trial in sim_matrix
        for length in get_state_length_list(trial)
    ]

    return {
        'mean':np.mean(stats) / sample_size,
        'std':np.std(stats) / sample_size,
        'state_lengths':state_lengths
    }

In [288]:
# Summary statistics for the simulation currently held in `sim_matrix`.
compute_stats(sim_matrix)

In [282]:
# Number of 1's per trial (row sums).
# Single-trial variant from the original cell: sim_matrix[0,:].sum()
stats = sim_matrix.sum(axis=1)

print(f'normalized mean: {np.mean(stats) / sample_size:0.4f}')
print(f'standard deviation: {np.std(stats / sample_size):0.4f}')

# Pool the run lengths of every trial into one flat list.
state_lengths = [
    length
    for trial in sim_matrix
    for length in get_state_length_list(trial)
]
print(f'unique state lengths:\n{list(set(state_lengths))}')

# Sorting first makes the Counter list its items in ascending run length.
sl_counts = Counter(sorted(state_lengths))
print(f'\nstate length frequency:\n{sl_counts.items()}')

normalized mean: 0.5128
standard deviation: 0.0219
unique state lengths:
[1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79, 80, 81, 82, 83, 84, 85, 86, 87, 88, 89, 90, 91, 92, 93, 94, 95, 96, 97, 98, 99, 100, 101, 103, 105, 107, 110, 112, 115, 116, 122, 124, 128]

state length frequency:
dict_items([(1, 1544700), (2, 513080), (3, 63660), (4, 20857), (5, 15120), (6, 13649), (7, 12184), (8, 10790), (9, 9976), (10, 9099), (11, 8127), (12, 7399), (13, 6706), (14, 5958), (15, 5374), (16, 4958), (17, 4456), (18, 3991), (19, 3654), (20, 3341), (21, 2955), (22, 2746), (23, 2416), (24, 2326), (25, 1979), (26, 1781), (27, 1692), (28, 1539), (29, 1300), (30, 1113), (31, 1164), (32, 1042), (33, 923), (34, 846), (35, 728), (36, 647), (37, 590), (

## real data ref

In [238]:
# Expand the measured on/off durations in `df` into one array of 1's and 0's.
data_list = generate_data_list(df)

HBox(children=(IntProgress(value=0, max=5310), HTML(value='')))

In [242]:
# Run lengths of 1's in the measured trace built by generate_data_list(df).
state_lengths = get_state_length_list(data_list)
print(f'unique state lengths:\n{list(set(state_lengths))}')

# Sorting first makes the Counter list its items in ascending run length.
sl_counts = Counter(sorted(state_lengths))
print(f'\nstate length frequency:\n{sl_counts.items()}')

unique state lengths:
[3584, 3073, 1027, 4, 5, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 1046, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 3104, 38, 39, 40, 1063, 42, 3113, 2600, 45, 46, 3625, 2608, 47, 1067, 48, 52, 53, 51, 1591, 563, 2105, 57, 59, 60, 1084, 63, 64, 2112, 65, 67, 70, 583, 71, 1097, 2118, 76, 1101, 78, 77, 82, 86, 87, 1110, 1113, 1114, 91, 90, 602, 1035, 1112, 101, 104, 4200, 106, 108, 621, 110, 1135, 624, 1648, 111, 1145, 24, 123, 1149, 128, 3200, 133, 1160, 649, 537, 1680, 148, 149, 2065, 151, 152, 3224, 156, 158, 6816, 164, 1700, 2216, 3240, 3760, 5297, 178, 2232, 2241, 195, 2245, 752, 41, 1233, 209, 1237, 2776, 219, 44, 736, 225, 2080, 229, 1768, 238, 2288, 240, 242, 3824, 753, 245, 246, 1271, 2296, 244, 2305, 3329, 259, 2309, 263, 2824, 783, 277, 2326, 2328, 2334, 1822, 2848, 2336, 289, 295, 808, 1833, 1322, 2348, 303, 1840, 304, 819, 2871, 312, 2872, 1848, 3385, 316, 2879, 1856, 2372, 325, 327, 1864, 329, 330, 3400, 4425, 342, 856, 4952