# occupancy prediction

![img](dev/IMG_6435.jpg)

plan:
- split `real_data_array` into `timeunit` chunks
- compute transition matrix for each chunk
- simulate `timeunit` length of data using each transition matrix
- compare simulated data to each respective real data chunk
- make sure to normalize the *`timestep`* with `scipy.signal.decimate`
<br>
<br>

In [5]:
import pandas as pd
import numpy as np
import scipy as sp
from scipy import signal
from tqdm import tqdm_notebook, tnrange, trange, tqdm
from itertools import cycle
from collections import Counter
from pathlib import Path
import time
from contextlib import suppress

import altair as alt
from altair.expr import datum
alt.data_transformers.enable('json')

from _modules.wifi_traffic_analyzer import WifiTrafficAnalyzer

## params

In [6]:
# timestep: value later handed to the predictor as the decimation factor.
# timeunit: chunk length before it is divided by the timestep.
timestep, timeunit = 250, 10_000

## configs

In [7]:
# Real capture files, keyed by the short name used to select them.
real_data_path_dict = dict(
    real_2=Path('data/wifitrafficstats2.csv'),
    real_3=Path('data/wifitrafficstats3.csv'),
    real_4=Path('data/wifitrafficstats4.csv'),
    real_5=Path('data/wifitrafficstats5.csv'),
    real_6=Path('data/wifitrafficstats6.csv'),
)

# Transition-matrix files; the second entry is currently disabled.
transition_matrices_path_dict = dict(
    tmat_1=Path('data/wifi_t_matrices.csv'),
    #tmat_2=Path('data/wifi_t_matrices2.csv'),
)

## occupancy predictor

### classing it up

In [14]:
class OccupancyPredictor:
    """Compare real occupancy data against transition-matrix simulations.

    The real trace is loaded through ``WifiTrafficAnalyzer``, decimated, cut
    into fixed-length chunks, and a two-state (ON/OFF) transition matrix is
    estimated for every chunk.  Each matrix is then simulated and the
    per-chunk sums of real and simulated data are compared.

    Either several timeunits with one timestep, or one timeunit with several
    timesteps, may be supplied -- never several of both.
    """

    # Container types treated as "several values" for timeunits / timesteps.
    _SEQUENCE_TYPES = (list, tuple, range)

    def __init__(self, real_data_path_dict, timeunits, timesteps, autoselect_file=True):
        """Store the sweep parameters and load the selected real data file.

        Args:
            real_data_path_dict: Mapping of file key -> path, forwarded to
                ``WifiTrafficAnalyzer``.
            timeunits: A single chunk length or a list/tuple/range of them.
            timesteps: A single decimation factor or a list/tuple/range of
                them.
            autoselect_file: When true the input file is picked without
                prompting; otherwise the user is asked for a file key.

        Raises:
            ValueError: If ``timesteps`` is a sequence while more than one
                timeunit was supplied.
        """
        print('initializing OP..')

        # attributes
        self.real_data_path_dict = real_data_path_dict

        # set timeunits
        if isinstance(timeunits, self._SEQUENCE_TYPES):
            self.timeunits = list(timeunits)
            print(f'{len(self.timeunits)} timeunits received')
        else:
            self.timeunits = [timeunits]
            print(f'single timeunit received {self.timeunits}')

        # set timesteps
        if isinstance(timesteps, self._SEQUENCE_TYPES):
            if len(self.timeunits) > 1:
                print('ERROR: timeunit already list, timestep must be single value')
                raise ValueError(
                    'timeunits and timesteps cannot both hold multiple values'
                )
            self.timesteps = list(timesteps)
            print(f'{len(self.timesteps)} timesteps received')
        else:
            self.timesteps = [timesteps]
            print(f'single timestep received {self.timesteps}')

        # startup tasks
        self.WTA_real = WifiTrafficAnalyzer(mode='real', path_dict=self.real_data_path_dict)
        self.select_file_key(autoselect=autoselect_file)

    def select_file_key(self, autoselect):
        """Choose the real data file and have the analyzer process it.

        Args:
            autoselect: When true, take the second configured file key (or
                the only one if just a single file is configured).  When
                false, prompt for a key on stdin.

        Raises:
            IndexError: If autoselecting with no configured files.
            AssertionError: If the key typed by the user is not configured.
        """
        available = list(self.WTA_real.path_dict.keys())

        if autoselect:
            # The second entry has always been the default; fall back to the
            # first entry instead of crashing when only one file exists.
            self.file_key = available[1] if len(available) > 1 else available[0]
            self.WTA_real.process_real_data(self.file_key)
            print(f'\tautoselected and processed \'{self.file_key}\'')
            return

        file_key = input('please select a target file..')
        candidate = file_key.strip().lower()

        # Explicit check rather than ``assert`` so validation still runs under
        # ``python -O``; AssertionError is kept for existing callers.
        if candidate not in available:
            print(f'invalid target file: {file_key}, options are {self.WTA_real.path_dict.keys()}')
            raise AssertionError(f'invalid target file: {file_key}')

        self.file_key = candidate
        self.WTA_real.process_real_data(self.file_key)
        print(f'\tset and processed target file: {self.file_key}')

    def convert_and_reshape(self, timeunit, downsample_factor):
        """Decimate the real data and cut it into rows of ``timeunit`` samples.

        The decimated array is also stored on ``self.data_array``.  Trailing
        samples that do not fill a complete row are dropped.

        Args:
            timeunit: Number of decimated samples per row.
            downsample_factor: Factor passed to ``decimate_signal``.

        Returns:
            2-D array with ``timeunit`` columns.
        """
        self.data_array = self.decimate_signal(
            self.WTA_real.real_data_array,
            downsample_factor
        )
        print(f'generated real data array, size: {self.data_array.shape}')

        n_chunks = len(self.data_array) // timeunit
        data_matrix = self.data_array[:n_chunks * timeunit].reshape((-1, timeunit))
        # NOTE: ``timeunit`` here is a sample count (the row length); the
        # 'µs' suffix in the message is kept for output compatibility.
        print(f'reshaped into {data_matrix.shape[0]} : {timeunit}µs chunks')

        return data_matrix

    def decimate_signal(self, data_array, factor):
        """Downsample ``data_array`` by ``factor`` using an FIR filter.

        ``FutureWarning`` emitted during the call is silenced.

        Args:
            data_array: 1-D sequence of samples.
            factor: Integer downsampling factor.

        Returns:
            The decimated signal as returned by ``scipy.signal.decimate``.
        """
        import warnings

        # ``contextlib.suppress`` only swallows *raised* exceptions; warnings
        # have to be filtered through the warnings machinery instead.
        with warnings.catch_warnings():
            warnings.simplefilter('ignore', FutureWarning)
            return signal.decimate(
                np.asarray(data_array),
                factor,
                ftype='fir'
            )

    @staticmethod
    def _safe_ratio(count, total):
        """Return ``count / total``, or 0 when ``total`` is zero."""
        return count / total if total else 0

    def compute_transition_matrix(self, row, timeunit):
        """Estimate ON/OFF transition probabilities from one chunk.

        Consecutive sample pairs within the first ``timeunit`` samples are
        counted.  Only samples equal to exactly 1 (ON) or 0 (OFF) take part;
        a pair containing any other value is ignored.

        NOTE(review): rows normally come from FIR-decimated data, which is
        presumably not exactly 0/1 in general -- confirm whether the input
        should be thresholded before counting.

        Args:
            row: 1-D sequence of samples.
            timeunit: Number of leading samples of ``row`` to consider.

        Returns:
            Tuple ``(p_onon, p_onoff, p_offon, p_offoff)``.  A probability
            whose source state never occurs is reported as 0.
        """
        states = np.asarray(row)[:timeunit]
        current, following = states[:-1], states[1:]

        on_now, off_now = current == 1, current == 0
        on_next, off_next = following == 1, following == 0

        # Plain ints keep the division semantics (and result types) of the
        # pure-Python counters.
        onon = int(np.count_nonzero(on_now & on_next))
        onoff = int(np.count_nonzero(on_now & off_next))
        offon = int(np.count_nonzero(off_now & on_next))
        offoff = int(np.count_nonzero(off_now & off_next))

        from_on = onon + onoff
        from_off = offoff + offon

        return (
            self._safe_ratio(onon, from_on),
            self._safe_ratio(onoff, from_on),
            self._safe_ratio(offon, from_off),
            self._safe_ratio(offoff, from_off),
        )

    def compute_tmats_and_load_df(self, data_matrix, timeunit):
        """Compute one transition matrix per row and collect them in a frame.

        Args:
            data_matrix: 2-D array, one chunk per row.
            timeunit: Row length, forwarded to ``compute_transition_matrix``
                and stored in the ``timeunit`` column.

        Returns:
            DataFrame with columns ``OnOn, OnOff, OffOff, OffOn, timeunit,
            source`` (in that order), one row per chunk.
        """
        # compute all transition matrices
        tmat_array = [
            self.compute_transition_matrix(
                data_matrix[idx,:],
                timeunit
            )
            for idx in tnrange(data_matrix.shape[0])
        ]

        # load dataframe
        tmat_df = (
            pd
            .DataFrame(
                tmat_array,
                columns=['OnOn', 'OnOff', 'OffOn', 'OffOff']
            ).assign(
                timeunit = timeunit,
                source = self.file_key
            )
            [['OnOn','OnOff','OffOff','OffOn','timeunit','source']]
        )
        print(f'created transition matrix dataframe, {tmat_df.shape[0]} rows')

        return tmat_df

    def process_timeunit(self, timeunit, timestep):
        """Build the chunked real data and its transition matrices.

        Args:
            timeunit: Chunk length in decimated samples.
            timestep: Decimation factor.

        Returns:
            Tuple ``(data_matrix, tmat_df)``.
        """
        print(f'processing data for timeunit: {timeunit}µs')

        data_matrix = self.convert_and_reshape(timeunit, timestep)
        tmat_df = self.compute_tmats_and_load_df(data_matrix, timeunit)

        return data_matrix, tmat_df

    def _build_comparison_df(self, timeunit, timestep):
        """Simulate every chunk for one (timeunit, timestep) pair and compare.

        Returns:
            DataFrame with integer per-chunk sums ``real_data`` and
            ``sim_data``, the ``timestep`` and ``timeunit`` used, and
            ``diff = sim_data - real_data``.
        """
        n_samples = int(timeunit/timestep)

        data_matrix, tmat_df = self.process_timeunit(n_samples, timestep)
        print(f'data matrix: {data_matrix.shape}')

        # The first row of the simulator output is discarded, as before.
        # NOTE(review): presumably a placeholder row -- confirm against
        # WifiTrafficAnalyzer.
        sim_data_matrix = self.WTA_real.simulate_all_OP_transition_matrices(
            tmat_dataframe=tmat_df,
            n_samples=n_samples,
            m_trials=1
        )[1:,:]
        print(f'sim data matrix: {sim_data_matrix.shape}')

        return (
            pd
            .DataFrame({
                'real_data': data_matrix.sum(axis=1),
                'sim_data': sim_data_matrix.sum(axis=1)
            })
            .assign(
                real_data = lambda x: x.real_data.astype('int'),
                sim_data = lambda x: x.sim_data.astype('int'),
                timestep = timestep,
                timeunit = timeunit
            )
            .eval('diff = sim_data - real_data')
        )

    def iterate_timeunits(self):
        """Run the comparison for every timeunit at the first timestep.

        Returns:
            Concatenation of the per-timeunit comparison frames.
        """
        df_dict = {}
        timestep = self.timesteps[0]
        print(f'iterating timeunits, using fixed timestep: {timestep}')

        for timeunit in tqdm_notebook(self.timeunits):
            df_dict[timeunit] = self._build_comparison_df(timeunit, timestep)

        return pd.concat(list(df_dict.values()))

    def iterate_timesteps(self):
        """Run the comparison for every timestep at the first timeunit.

        Returns:
            Concatenation of the per-timestep comparison frames.
        """
        df_dict = {}
        timeunit = self.timeunits[0]
        print(f'iterating timesteps, using fixed timeunit: {timeunit}')

        for timestep in tqdm_notebook(self.timesteps):
            # Key by the value being swept; keying by the fixed timeunit
            # would overwrite the result of every previous iteration.
            df_dict[timestep] = self._build_comparison_df(timeunit, timestep)

        return pd.concat(list(df_dict.values()))
    
        

## dev

### iterating timeunits

In [15]:
# Predictor for the timeunit sweep: 40 timeunits (9000..10950 in steps of 50)
# at a single timestep of 1000; the input file is picked without prompting.
OP = OccupancyPredictor(
    real_data_path_dict, 
    timeunits=list(range(9000, 11000, 50)), 
    timesteps=1000, 
    autoselect_file=True
)

initializing OP..
40 timeunits received
single timestep received [1000]
initializing WTA..
initialization complete, mode: real
real data options: 
	key: real_2, path: data\wifitrafficstats2.csv
	key: real_3, path: data\wifitrafficstats3.csv
	key: real_4, path: data\wifitrafficstats4.csv
	key: real_5, path: data\wifitrafficstats5.csv
	key: real_6, path: data\wifitrafficstats6.csv


HBox(children=(IntProgress(value=0, max=148455), HTML(value='')))


	autoselected and processed 'real_3'


In [None]:
# Run the real-vs-simulated comparison once per configured timeunit.
full_comp_df = OP.iterate_timeunits()

iterating timeunits, using fixed timestep: 1000


HBox(children=(IntProgress(value=0, max=40), HTML(value='')))

processing data for timeunit: 9µs
generated real data array, size: (71885,)
reshaped into 7987 : 9µs chunks


HBox(children=(IntProgress(value=0, max=7987), HTML(value='')))

created transition matrix dataframe, 7987 rows
data matrix: (7987, 9)


HBox(children=(IntProgress(value=0, max=7987), HTML(value='')))

total elapsed time: 0.01 minutes
sim data matrix: (7987, 9)
processing data for timeunit: 9µs
generated real data array, size: (71885,)
reshaped into 7987 : 9µs chunks


HBox(children=(IntProgress(value=0, max=7987), HTML(value='')))

created transition matrix dataframe, 7987 rows
data matrix: (7987, 9)


HBox(children=(IntProgress(value=0, max=7987), HTML(value='')))

total elapsed time: 0.01 minutes
sim data matrix: (7987, 9)
processing data for timeunit: 9µs
generated real data array, size: (71885,)
reshaped into 7987 : 9µs chunks


HBox(children=(IntProgress(value=0, max=7987), HTML(value='')))

created transition matrix dataframe, 7987 rows
data matrix: (7987, 9)


HBox(children=(IntProgress(value=0, max=7987), HTML(value='')))

total elapsed time: 0.01 minutes
sim data matrix: (7987, 9)
processing data for timeunit: 9µs
generated real data array, size: (71885,)
reshaped into 7987 : 9µs chunks


HBox(children=(IntProgress(value=0, max=7987), HTML(value='')))

created transition matrix dataframe, 7987 rows
data matrix: (7987, 9)


HBox(children=(IntProgress(value=0, max=7987), HTML(value='')))

total elapsed time: 0.01 minutes
sim data matrix: (7987, 9)
processing data for timeunit: 9µs
generated real data array, size: (71885,)
reshaped into 7987 : 9µs chunks


HBox(children=(IntProgress(value=0, max=7987), HTML(value='')))

created transition matrix dataframe, 7987 rows
data matrix: (7987, 9)


HBox(children=(IntProgress(value=0, max=7987), HTML(value='')))

total elapsed time: 0.01 minutes
sim data matrix: (7987, 9)
processing data for timeunit: 9µs

### iterating timesteps

In [15]:
# NOTE(review): these are the same arguments as the timeunit sweep (many
# timeunits, one timestep). `iterate_timesteps` loops over the timesteps and
# uses only the first timeunit, so with this configuration it performs a
# single run. A timestep sweep presumably wants one timeunit and a list of
# timesteps instead -- confirm.
OP = OccupancyPredictor(
    real_data_path_dict, 
    timeunits=list(range(9000, 11000, 50)), 
    timesteps=1000, 
    autoselect_file=True
)

initializing OP..
40 timeunits received
single timestep received [1000]
initializing WTA..
initialization complete, mode: real
real data options: 
	key: real_2, path: data\wifitrafficstats2.csv
	key: real_3, path: data\wifitrafficstats3.csv
	key: real_4, path: data\wifitrafficstats4.csv
	key: real_5, path: data\wifitrafficstats5.csv
	key: real_6, path: data\wifitrafficstats6.csv


HBox(children=(IntProgress(value=0, max=148455), HTML(value='')))


	autoselected and processed 'real_3'


In [None]:
# Run the real-vs-simulated comparison once per configured timestep.
full_comp_df = OP.iterate_timesteps()

In [None]:
# Manual timestep sweep: one result per timestep (50..1950 in steps of 50) at
# a fixed timeunit, collected in a dict keyed by timestep.
# NOTE(review): `iterate_params` is not defined in the visible part of this
# notebook -- confirm it still exists before running this cell.
df_dict = {}
timesteps = list(range(50, 2000, 50))
timeunit = 10_000

for tstep in timesteps:
    df_dict[tstep] = iterate_params(tstep, timeunit)

In [None]:
# Number of sweep results collected so far.
len(df_dict)

In [None]:
# Stack the per-timestep result frames into one frame.
full_comp_df = pd.concat(list(df_dict.values()))

In [None]:
# Preview only (nothing is assigned): rescale both sums by timestep / timeunit
# and show the absolute difference of the rescaled values.
# NOTE(review): `pct_diff` is a plain absolute difference, not divided by
# anything -- confirm the intended meaning of the name.
(full_comp_df
 .eval('real_data = real_data * timestep / timeunit')
 .eval('sim_data = sim_data * timestep / timeunit')
 .assign(pct_diff = lambda x: np.abs(x.sim_data - x.real_data))
 .head()
)

In [None]:
# Show the first rows of the combined comparison frame.
full_comp_df.head()

In [None]:
# Mean simulated-minus-real difference per timestep.
# The column is selected with ['diff'] because attribute access (`.diff`) on
# a GroupBy resolves to the GroupBy.diff method rather than the column.
full_comp_df.groupby('timestep')['diff'].mean()

In [None]:
# Scatter plot of the mean `diff` (y) for each timestep (x, treated as a
# nominal category) on a grey background.
alt.Chart(
    full_comp_df
).mark_circle(
).encode(
    alt.X('timestep:N'),
    alt.Y('mean(diff):Q')
).configure(
    background='#abb2bf'
)

In [None]:
# Single-timeunit predictor for the step-by-step cells below; the input file
# is chosen interactively because autoselect_file=False.
# `timesteps` is a required constructor argument, so the notebook-level
# `timestep` is passed explicitly (omitting it raises TypeError).
OP = OccupancyPredictor(real_data_path_dict, timeunit, timestep, autoselect_file=False)

In [None]:
# `process_timeunit` needs both the chunk length in samples and the timestep
# (decimation factor); calling it with only the first raises TypeError.
data_matrix, tmat_df = OP.process_timeunit(int(OP.timeunits[0]/timestep), timestep)

print(f'data matrix: {data_matrix.shape}')
display(tmat_df.head())

In [None]:
# Simulate one trial per transition matrix, with as many samples per chunk as
# the real data rows; the first row of the simulator output is dropped.
# NOTE(review): presumably the first row is a placeholder -- confirm against
# WifiTrafficAnalyzer.
sim_data_matrix = OP.WTA_real.simulate_all_OP_transition_matrices(
    tmat_dataframe=tmat_df,
    n_samples=int(OP.timeunits[0]/timestep), 
    m_trials=1
)[1:,:]

print(f'sim data matrix: {sim_data_matrix.shape}')

In [None]:
# Shape of the real data matrix and the row sums of its first ten chunks.
print(f'real data shape: {data_matrix.shape}')
data_matrix.sum(axis=1)[:10]

In [None]:
# Shape of the simulated data matrix and the row sums of its first ten chunks.
print(f'sim data shape: {sim_data_matrix.shape}')
sim_data_matrix.sum(axis=1)[:10]

In [None]:
# Per-chunk totals of real and simulated data, cast to integers, plus the
# simulated-minus-real difference.
comp_df = (
    pd.DataFrame({
        'real_data': data_matrix.sum(axis=1),
        'sim_data': sim_data_matrix.sum(axis=1)
    })
    .astype({'real_data': 'int', 'sim_data': 'int'})
    .eval('diff = sim_data - real_data')
)

comp_df.head()

In [None]:
# (rows, columns) of the comparison frame.
comp_df.shape

In [None]:
# Interval selection on the x axis; it is drawn on the `wide` chart below and
# drives the x domain / filter of the three detail charts.
x_scaler = alt.selection_interval(encodings=['x'])

# Shared base: line chart over a random half of the comparison rows, with the
# row index on x. The y encoding is supplied per chart.
base = alt.Chart(
    comp_df.reset_index().sample(frac=0.5),
    height=200,
    width=800
#).transform_filter(
#    x_scaler
).mark_line(
    opacity=0.5
).encode(
    alt.X('index:Q', scale={'domain': x_scaler.ref()}),
    #alt.Y('real_data:Q')
)

# Three stacked detail charts (real, simulated, difference), each filtered to
# the selected x interval.
layered = alt.vconcat(
    base.encode(
        alt.Y('real_data:Q'),
        color=alt.value('#4286f4') # blue
    ).transform_filter(x_scaler),
    base.encode(
        alt.Y('sim_data:Q'),
        color=alt.value('#41f470') # green
    ).transform_filter(x_scaler),
    base.encode(
        alt.Y('diff:Q'),
        color=alt.value('#f44141') # red
    ).transform_filter(x_scaler),
)

# Overview chart of `diff` that carries the selection brush.
wide = base.encode(alt.Y('diff:Q')).add_selection(x_scaler).properties(height=100)

# Detail charts on top, overview underneath, on a grey background.
alt.vconcat(layered, wide, background='#abb2bf')



In [None]:
# Pairwise correlation between the real_data, sim_data and diff columns.
comp_df.corr()

In [None]:
# Show the first 50 rows of the comparison frame.
comp_df.head(50)

## misfits