# Hybrid = Markov + hierarchical model
1. Generate sequences
2. Compute MI

In [None]:
import numpy as np
import pandas as pd
from parallelspaper.models import gen_seq_hierarchical, gen_seq_markov, gen_balanced_matrix
from parallelspaper.utils import nowstring
import parallelspaper.information_theory as it
from parallelspaper.config.paths import DATA_DIR
from tqdm.autonotebook import tqdm

### Parameters

In [None]:
#### Hierarchical parameters

# alphabet size; the hierarchical symbols are the integers 0 .. a_n - 1
a_n = 5
alphabet = np.arange(a_n)
# how many branches to sample in the hierarchical model
# (presumably one entry is chosen per level -- confirm against gen_seq_hierarchical)
n_subsamples = [2]
# how many rounds of subsampling to perform (depth of the hierarchy)
depth = 12
# how many hierarchical sequences to generate
nseq = 1000

# report mean(n_subsamples) ** depth as the sequence length
print('seq len ', np.mean(n_subsamples) ** depth)

In [None]:
#### Markov parameters

# number of elements in the Markov alphabet (integers 0 .. a_n_markov - 1)
a_n_markov = 25
markov_alphabet_items = np.arange(a_n_markov)
# [low, high] bounds for the number of Markov items to sample per sequence;
# these are handed to np.random.randint when the sequences are generated
markov_seq_len_range = [2, 5]
# how many alternative Markov sequences correspond to each hierarchical element
markov_n_seq_per_element = 5

### Generate sequences

In [None]:
# random (a_n_markov x a_n_markov) weights; squaring the uniform draws
# skews them toward small values
markov_probs = np.random.rand(a_n_markov, a_n_markov) ** 2
# normalize so that every column sums to 1
markov_probs = markov_probs / markov_probs.sum(axis=0)
# smoke test: generate one short Markov sequence (shown as the cell output)
gen_seq_markov(markov_alphabet_items, markov_probs, 10)

In [None]:
# probability matrix used by the hierarchical (recursive) sampling step
probs = gen_balanced_matrix(ps=[0.85, 0.15])

In [None]:
# Each leaf of the tree grammar (each element of the hierarchical `alphabet`)
# maps to `markov_n_seq_per_element` Markov-generated sequences.
# The dictionary is keyed by `alphabet` rather than `markov_alphabet_items`:
# it is looked up with hierarchical symbols, and the two alphabets do not have
# to be the same size (keying by the Markov alphabet only works while
# a_n <= a_n_markov and builds entries that are never used).
# NOTE(review): np.random.randint excludes its upper bound, so the length
# argument is drawn from markov_seq_len_range[0] .. markov_seq_len_range[1] - 1
# -- confirm this is the intended range.
markov_alphabet = {
    element: [
        gen_seq_markov(
            markov_alphabet_items,
            markov_probs,
            np.random.randint(markov_seq_len_range[0], markov_seq_len_range[1]),
        )
        for _ in range(markov_n_seq_per_element)
    ]
    for element in alphabet
}
# inspect the sequences generated for the first hierarchical element
markov_alphabet[alphabet[0]]

In [None]:
# `sklearn.externals.joblib` was a vendored copy of joblib that newer
# scikit-learn releases no longer ship; prefer the standalone package and
# fall back to the vendored path only when joblib itself is unavailable.
try:
    from joblib import Parallel, delayed
except ImportError:
    from sklearn.externals.joblib import Parallel, delayed
import parallelspaper.information_theory as it
# number of parallel jobs and verbosity level for the parallel calls below
n_jobs = 12
verbosity = 0

In [None]:
# sample `nseq` hierarchical sequences in parallel; the tqdm-wrapped range
# only drives the progress bar, its values are not used
seqs_list = tqdm(range(nseq), leave=False)
with Parallel(n_jobs=n_jobs, verbose=verbosity) as parallel:
    sequences = parallel(
        delayed(gen_seq_hierarchical)(alphabet, probs, depth, n_subsamples)
        for _ in seqs_list
    )

In [None]:
# expand every hierarchical sequence: each element is replaced by one of its
# Markov-sampled sequences (picked at random), and the pieces are joined
seqs = [
    np.concatenate(
        [
            markov_alphabet[element][np.random.choice(markov_n_seq_per_element)]
            for element in hier_seq
        ]
    )
    for hier_seq in tqdm(sequences)
]

In [None]:
# total number of elements over all expanded sequences
np.concatenate(seqs).shape[0]

### Calculate MI

In [None]:
# sequence statistics
# length of each individual expanded sequence
bout_lens = [len(bout) for bout in seqs]
# total number of elements (equals the length of all sequences joined together)
seq_len = sum(bout_lens)
# number of distinct symbols that occur anywhere in the data
unique_elements = len(np.unique(np.concatenate(seqs)))

In [None]:
# sequential distances at which MI is computed: 1 through 100 inclusive
distances = np.arange(1, 101)

In [None]:
# empty results table; a row is appended for each MI computation
MI_DF = pd.DataFrame(
    columns=[
        'name',
        'type',
        'rep',
        'MI',
        'MI_shuff',
        'distances',
        'MI_var',
        'MI_shuff_var',
        'n_elements',
        'unique_elements',
        'bout_lens',
    ]
)

In [None]:
# `sklearn.externals.joblib` was a vendored copy of joblib that newer
# scikit-learn releases no longer ship; prefer the standalone package and
# fall back to the vendored path only when joblib itself is unavailable.
try:
    from joblib import Parallel, delayed
except ImportError:
    from sklearn.externals.joblib import Parallel, delayed
# number of parallel jobs and verbosity level for the MI computation
n_jobs = 12
verbosity = 0

In [None]:
# calculate MI on all sequences joined into a single long sequence; the result
# is unpacked as an (MI, variance) pair followed by a second pair that, going
# by the names, belongs to a shuffled baseline
(MI, var_MI), (MI_shuff, MI_shuff_var) = it.sequential_mutual_information(
    [np.concatenate(seqs)],
    distances,
    n_jobs=n_jobs,
    verbosity=verbosity,
)
# append the result as a new row; values are listed in MI_DF's column order
MI_DF.loc[len(MI_DF)] = [
    'hybrid', 'full', 0,
    MI, MI_shuff, distances,
    var_MI, MI_shuff_var,
    seq_len, unique_elements, bout_lens,
]

In [None]:
# tag the output file name with the string returned by nowstring()
# (presumably a timestamp -- confirm in parallelspaper.utils)
now_string = nowstring()
MI_DF.to_pickle(
    str(DATA_DIR / ('MI_DF/models/hybrid_' + now_string + '.pickle'))
)