# Calculate MI for each language and plot goodness of fit by length of analysis
1. Load datasets
2. Calculate MI
3. Fit decay models to the MI decay at increasing analysis lengths

In [None]:
import pandas as pd
import numpy as np
from parallelspaper.config.paths import DATA_DIR, FIGURE_DIR
from parallelspaper.speech_datasets import LCOL_DICT
from parallelspaper import information_theory as it 
from tqdm.autonotebook import tqdm
from parallelspaper import model_fitting as mf
from parallelspaper.utils import save_fig

### load datasets

In [None]:
# Read the German, Italian, English and Japanese sequence dataframes
# (in that order) and stack them into one dataframe.
german_seqs, italian_seqs, english_seqs, japanese_seqs = (
    pd.read_pickle(DATA_DIR/pickle_path)
    for pickle_path in (
        'speech_seq_df/GECO_seq_df.pickle',
        'speech_seq_df/AsiCA_seq_df.pickle',
        'speech_seq_df/BUCKEYE_seq_df.pickle',
        'speech_seq_df/CSJ_seq_df.pickle',
    )
)

seq_df = pd.concat([german_seqs, italian_seqs, english_seqs, japanese_seqs])

In [None]:
# Inspect the column names of the combined dataframe.
seq_df.columns

In [None]:
# Show the language and levels columns, transposed so each row of seq_df
# becomes a column of the display.
seq_df[['language', 'levels']].T

In [None]:
# [language, levels] pairs; each pair is compared against the `language` and
# `levels` columns of seq_df to pick the rows used in the main analyses.
# NOTE(review): the Japanese entry ends in 'phonemes' while the other
# languages use 'phoneme' -- presumably this mirrors the label stored in the
# CSJ dataframe; confirm, since a mismatch would silently select no rows.
subsets = [
    ['german', 'speaker/word/phoneme'],
    ['italian', 'speaker/word/phoneme'],
    ['english', 'speaker/utterance/word/phonetic'],
    ['japanese', 'speaker/word/phonemes'],
]

In [None]:
# Keep only the rows matching one of the (language, levels) pairs in
# `subsets`, concatenated in the order the pairs are listed.
subset_seq_df = pd.concat(
    [
        seq_df[(seq_df.language == lang) & (seq_df.levels == level_spec)]
        for lang, level_spec in subsets
    ]
)

In [None]:
# Number of rows kept after subsetting.
len(subset_seq_df)

### Calculate MI
- For each unit, calculate MI within speaker and, when word boundaries are available, within speaker after shuffling words

In [None]:
# Distances at which MI is evaluated: the integers 1 through 1000.
distances = np.arange(1, 1001)
# Settings passed to the MI computation below.
verbosity = 0
n_jobs = 20

In [None]:
def flatlist(list_of_lists):
    """Return a single list holding the items of every sub-iterable, in order."""
    flattened = []
    for sublist in list_of_lists:
        flattened.extend(sublist)
    return flattened

In [None]:
# One row is appended per (language, unit) analysis.
MI_DF = pd.DataFrame(columns=['language', 'unit', 'analysis', 'MI', 'MI_shuff', 'distances', 'MI_var', 'MI_shuff_var', 'n_elements'])

# Each row is unpacked positionally, so subset_seq_df must hold exactly the
# three values (language, levels, data) in that order.
for idx, (language, levels, data) in tqdm(subset_seq_df.iterrows(), total=len(subset_seq_df)):
    levels = levels.split('/')

    # buckeye has an additional 'utterance' level to ignore
    if language == 'english':
        # merge the utterances of each speaker into one sequence
        data = [flatlist(speaker) for speaker in data]
        if len(levels) == 4:
            levels = [levels[0], levels[2], levels[3]]
        elif len(levels) == 3:
            levels = [levels[0], levels[2]]

    if len(levels) == 2:
        # speakers is the highest level of organization so just compute MI
        units = data
    else:
        # concatenate across words, one sequence per speaker.
        # dtype=object is required because speakers have different sequence
        # lengths: NumPy >= 1.24 raises ValueError when asked to build an
        # array from ragged sequences without it.
        units = np.array([flatlist(i) for i in data], dtype=object)

    (MI, var_MI), (MI_shuff, MI_shuff_var) = it.sequential_mutual_information(
        units, distances, n_jobs=n_jobs, verbosity=verbosity)
    MI_DF.loc[len(MI_DF)] = [language, levels[-1], 'session', MI, MI_shuff, distances, var_MI, MI_shuff_var, len(flatlist(units))]

    # save dataframe (inside the loop, so the file on disk is rewritten
    # after every processed row)
    MI_DF.to_pickle(DATA_DIR / 'MI_DF/language/language_MI_DF_long.pickle')

### Calculate fit

In [None]:
# `sklearn.externals.joblib` was removed from scikit-learn (0.23); prefer the
# standalone joblib package and fall back to the vendored copy only on
# environments that still ship it.
try:
    from joblib import Parallel, delayed
except ImportError:
    from sklearn.externals.joblib import Parallel, delayed

# Settings for the parallel model fitting below.
n_jobs = 20
verbosity = 0

In [None]:
def get_fit(language, d, distances, sig):
    """Fit the decay models to the first ``d`` points of a signal and score them.

    Args:
        language: label passed through unchanged into the returned tuple.
        d: number of leading points of ``distances`` / ``sig`` used for fitting
            and scoring.
        distances: distances at which ``sig`` was measured.
        sig: signal to fit (the caller passes MI minus shuffled MI); must
            support element-wise arithmetic and boolean-mask indexing.

    Returns:
        Tuple ``(language, d, R2_exp, R2_concat, R2_power, AICc_exp, AICc_pow,
        AICc_concat, R2_pow_comp, R2_exp_comp, AICc_exp_comp, AICc_pow_comp)``.
        The ``*_comp`` entries score each component of the combined fit against
        the signal with the other component subtracted.
    """
    dist_d = distances[:d]
    sig_d = sig[:d]

    results_power, results_exp, results_pow_exp, _ = mf.fit_models(dist_d, sig_d)
    R2_exp, R2_concat, R2_power, AICc_exp, AICc_pow, AICc_concat = mf.fit_results(
        sig_d, dist_d, results_exp, results_power, results_pow_exp)

    # Both component curves use the parameters of the combined fit; they are
    # evaluated over all distances and then truncated to the first d points.
    y_pow = mf.get_y(mf.powerlaw_decay, results_pow_exp, distances)[:d]
    y_exp = mf.get_y(mf.exp_decay, results_pow_exp, distances)[:d]
    intercept = results_pow_exp.params['intercept'].value

    # Exponential component: signal minus the power-law curve, compared with
    # the exponential curve minus the intercept.
    exp_target = sig_d - y_pow
    exp_model = y_exp - intercept
    # Power-law component: the same construction with the roles swapped.
    pow_target = sig_d - y_exp
    pow_model = y_pow - intercept

    R2_exp_comp = mf.r2(exp_target, exp_model, dist_d, logscaled=True)

    # Only strictly positive residuals enter the power-law R2.
    positive = pow_target > 0
    R2_pow_comp = mf.r2(
        pow_target[positive], pow_model[positive], dist_d[positive], logscaled=True)

    AICc_exp_comp = mf.AICc(
        d, len(results_exp.params), exp_target, exp_model, dist_d, logscaled=True)
    # NOTE(review): unlike R2_pow_comp, this uses the unmasked residual --
    # confirm that non-positive values are handled as intended by mf.AICc.
    AICc_pow_comp = mf.AICc(
        d, len(results_power.params), pow_target, pow_model, dist_d, logscaled=True)

    return (language, d, R2_exp, R2_concat, R2_power, AICc_exp, AICc_pow,
            AICc_concat, R2_pow_comp, R2_exp_comp, AICc_exp_comp, AICc_pow_comp)

In [None]:
# aic / r2 for individual components
columns = ['language', 'd', 'R2_exp', 'R2_concat', 'R2_power', 'AICc_exp', 'AICc_pow', 
                                 'AICc_concat', 'R2_pow_comp', 'R2_exp_comp',  'AICc_exp_comp', 'AICc_pow_comp']

# Analysis lengths to fit: the distinct integers obtained from 200 evenly
# spaced values between 16 and 1000. Identical for every row, so built once.
fit_lengths = np.unique(np.linspace(16, 1000, 200).astype(int))

fit_frames = []
# Open the worker pool once and reuse it for every row instead of creating
# a new one per row.
with Parallel(n_jobs=n_jobs, verbose=verbosity) as parallel:
    for idx, row in MI_DF.sort_values(by=['unit', 'analysis']).iterrows():
        language = row.language
        # fit the MI that remains after subtracting the shuffled baseline
        sig = row.MI - row.MI_shuff
        x = parallel(
            delayed(get_fit)(language, d, row.distances, sig)
            for d in tqdm(fit_lengths))
        fit_frames.append(pd.DataFrame(x, columns=columns))

# One row per (language, analysis length).
fit_df = pd.concat(fit_frames)

In [None]:
# Write the fit results to disk as a pickle under DATA_DIR.
fit_df.to_pickle(DATA_DIR / 'MI_DF/language/fit_df_long.pickle')

In [None]:
# Preview the first three rows of the fit results.
fit_df[:3]