# Calculate mutual information (MI) for each unit/language
1. load datasets
2. calculate MI

In [None]:
import pandas as pd
import numpy as np
from parallelspaper.config.paths import DATA_DIR
from parallelspaper import information_theory as it 
from tqdm.autonotebook import tqdm

### load datasets

In [None]:
# Read the four per-corpus sequence pickles; each path is relative to DATA_DIR.
german_seqs, italian_seqs, english_seqs, japanese_seqs = (
    pd.read_pickle(DATA_DIR/pickle_path)
    for pickle_path in (
        'speech_seq_df/GECO_seq_df.pickle',
        'speech_seq_df/AsiCA_seq_df.pickle',
        'speech_seq_df/BUCKEYE_seq_df.pickle',
        'speech_seq_df/CSJ_seq_df.pickle',
    )
)

# Stack the four corpora into a single dataframe.
seq_df = pd.concat([german_seqs, italian_seqs, english_seqs, japanese_seqs])

In [None]:
# Inspect the column labels of the combined dataframe (notebook cell output).
seq_df.columns

In [None]:
# Inspect the 'language' and 'levels' columns, transposed so that each
# dataframe row is shown as a column (notebook cell output).
seq_df[['language', 'levels']].T

### Calculate MI
- For each unit, calculate MI within speaker; when a word level is available, also calculate MI after shuffling the order of words and after shuffling the units within each word

In [None]:
# Distances at which MI is computed: the integers 1 through 100 inclusive.
distances = np.arange(1, 101)

# Forwarded as keyword arguments to it.sequential_mutual_information.
verbosity = 0
n_jobs = 20

In [None]:
def flatlist(list_of_lists):
    """Flatten one level of nesting and return the inner items as a single list."""
    flattened = []
    for sublist in list_of_lists:
        flattened.extend(sublist)
    return flattened

In [None]:
# Results table: one row is appended per (language, unit, analysis) combination.
MI_DF = pd.DataFrame(columns=['language', 'unit', 'analysis', 'MI', 'MI_shuff', 'distances', 'MI_var', 'MI_shuff_var', 'n_elements'])


def _object_array(sequences):
    """Pack `sequences` into a 1-D object array, one sequence per element.

    `np.array` on sequences of unequal length raises ValueError on
    NumPy >= 1.24 (and builds a 2-D array when the lengths happen to match),
    so the array is filled element by element instead.
    """
    packed = np.empty(len(sequences), dtype=object)
    for i, sequence in enumerate(sequences):
        packed[i] = sequence
    return packed


def _permuted(items):
    """Return the elements of `items` in a random order, as a list.

    Indices are shuffled rather than calling `np.random.permutation(items)`,
    because that call first converts `items` to an array and therefore fails
    on ragged nested lists with NumPy >= 1.24.
    """
    return [items[i] for i in np.random.permutation(len(items))]


# Each row of seq_df is unpacked into (language, levels, data); `levels` is a
# '/'-separated string naming the nesting levels of `data`.
for idx, (language, levels, data) in tqdm(seq_df.iterrows(), total=len(seq_df)):
    levels = levels.split('/')

    # buckeye has an additional 'utterance' level to ignore
    if language == 'english':
        # merge the second nesting level away for every speaker
        data = [flatlist(speaker) for speaker in data]
        # drop the matching (second) level name
        if len(levels) == 4:
            levels = [levels[i] for i in (0, 2, 3)]
        elif len(levels) == 3:
            levels = [levels[i] for i in (0, 2)]

    if len(levels) == 2:
        # speakers is the highest level of organization so just compute MI
        units = data
        (MI, var_MI), (MI_shuff, MI_shuff_var) = it.sequential_mutual_information(
            units, distances, n_jobs=n_jobs, verbosity=verbosity
        )
        MI_DF.loc[len(MI_DF)] = [language, levels[-1], 'session', MI, MI_shuff, distances, var_MI, MI_shuff_var, len(flatlist(units))]

    else:
        # concatenate across words, compute MI
        units = _object_array([flatlist(speaker) for speaker in data])
        (MI, var_MI), (MI_shuff, MI_shuff_var) = it.sequential_mutual_information(
            units, distances, n_jobs=n_jobs, verbosity=verbosity
        )
        MI_DF.loc[len(MI_DF)] = [language, levels[-1], 'session', MI, MI_shuff, distances, var_MI, MI_shuff_var, len(flatlist(units))]

        # permute between words order, compute MI
        units = _object_array([flatlist(_permuted(speaker)) for speaker in data])
        (MI, var_MI), (MI_shuff, MI_shuff_var) = it.sequential_mutual_information(
            units, distances, n_jobs=n_jobs, verbosity=verbosity
        )
        MI_DF.loc[len(MI_DF)] = [language, levels[-1], 'shuffled_between_word', MI, MI_shuff, distances, var_MI, MI_shuff_var, len(flatlist(units))]

        # permute within word order, compute MI
        units = _object_array([flatlist([_permuted(word) for word in speaker]) for speaker in data])
        (MI, var_MI), (MI_shuff, MI_shuff_var) = it.sequential_mutual_information(
            units, distances, n_jobs=n_jobs, verbosity=verbosity
        )
        MI_DF.loc[len(MI_DF)] = [language, levels[-1], 'shuffled_within_word', MI, MI_shuff, distances, var_MI, MI_shuff_var, len(flatlist(units))]

    # save dataframe (inside the loop, so the pickle on disk is rewritten
    # after every processed row)
    MI_DF.to_pickle(DATA_DIR / 'MI_DF/language/language_MI_DF.pickle')