# Calculate mutual information (MI) for each unit/language
1. Load datasets
2. Calculate MI

In [None]:
import pandas as pd
import numpy as np
from parallelspaper.config.paths import DATA_DIR, FIGURE_DIR
from parallelspaper.speech_datasets import LCOL_DICT

from parallelspaper import information_theory as it 
from parallelspaper.quickplots import plot_model_fits
from tqdm.autonotebook import tqdm
from parallelspaper import model_fitting as mf
from parallelspaper.utils import save_fig

In [None]:
import matplotlib.pyplot as plt
%matplotlib inline

### load datasets

In [None]:
# Read the pickled per-corpus sequence dataframes (BUCKEYE for English, CSJ for Japanese)
english_seqs, japanese_seqs = (
    pd.read_pickle(DATA_DIR/pickle_name)
    for pickle_name in ('speech_seq_df/BUCKEYE_seq_df.pickle',
                        'speech_seq_df/CSJ_seq_df.pickle')
)
# stack both corpora into a single dataframe
seq_df = pd.concat([english_seqs, japanese_seqs])

In [None]:
# display the column names of the combined sequence dataframe
seq_df.columns

In [None]:
# display the language and annotation levels of every dataset row, one column per row
seq_df.loc[:, ['language', 'levels']].transpose()

### Subset relevant utterance datasets

In [None]:
def flatlist(list_of_lists):
    """Flatten exactly one level of nesting.

    Args:
        list_of_lists: iterable whose items are themselves iterable.

    Returns:
        A new list holding the items of every inner iterable, in their
        original order. Deeper nesting levels are left untouched.
    """
    flattened = []
    for inner in list_of_lists:
        flattened.extend(inner)
    return flattened

#### For English

In [None]:
# keep only the rows annotated at the speaker/utterance/word/phonetic levels
eng_level_mask = seq_df['levels'] == "speaker/utterance/word/phonetic"
eng_utterance_seq_df = seq_df[eng_level_mask]
# unpack the first matching row: old index, language, levels and the nested data
eng_first_row = eng_utterance_seq_df.reset_index().iloc[0]
idx, language, levels, data = eng_first_row

In [None]:
### Shuffling
# nesting of the English data: phones < words < utterances < speakers
# NOTE: no random seed is set, so the shuffles differ from run to run
# collapse the word level so that each utterance is a flat list of phones
eng_utterances = [[flatlist(utterance) for utterance in speaker] for speaker in data]
# utterance lengths (in phones), pooled over all speakers
eng_utterance_lens = [len(utterance) for utterance in flatlist(eng_utterances)]
# shuffle order of phones within utterance
eng_utterance_shuffled_within = [[np.random.permutation(utterance) for utterance in speaker] for speaker in eng_utterances]
# shuffle order of utterances within speakers.
# Permute indices instead of the utterances themselves: utterances differ in
# length, and NumPy >= 1.24 raises when asked to build an array from such a
# ragged list; equal-length utterances would instead be merged into a 2-D
# array whose rows are no longer plain phone lists.
eng_utterance_shuffled_between = [
    [speaker[i] for i in np.random.permutation(len(speaker))]
    for speaker in eng_utterances
]

In [None]:
# median and mean English utterance length
eng_len_median = np.median(eng_utterance_lens)
eng_len_mean = np.mean(eng_utterance_lens)
print(eng_len_median, eng_len_mean)

#### For Japanese

In [None]:
# keep only the rows annotated at the speaker/IPU/phonemes levels
jap_level_mask = seq_df['levels'] == "speaker/IPU/phonemes"
jap_utterance_seq_df = seq_df[jap_level_mask]
# unpack the first matching row: old index, language, levels and the nested data
jap_first_row = jap_utterance_seq_df.reset_index().iloc[0]
idx, language, levels, data = jap_first_row

In [None]:
# peek at the first inner element of the first top-level entry
# (presumably the first IPU of the first speaker, given the 'speaker/IPU/phonemes' levels)
data[0][0]

In [None]:
### Shuffling
# the Japanese data is used as loaded: a list of utterances per speaker
# NOTE: no random seed is set, so the shuffles differ from run to run
jap_utterances = data
# shuffle order of phones within utterance
jap_utterance_shuffled_within = [[np.random.permutation(utterance) for utterance in speaker] for speaker in jap_utterances]
# shuffle order of utterances within speakers.
# Permute indices instead of the utterances themselves: utterances differ in
# length, and NumPy >= 1.24 raises when asked to build an array from such a
# ragged list; equal-length utterances would instead be merged into a 2-D
# array whose rows are no longer plain phone lists.
jap_utterance_shuffled_between = [
    [speaker[i] for i in np.random.permutation(len(speaker))]
    for speaker in jap_utterances
]
# utterance lengths, pooled over all speakers
jap_utterance_lens = [len(utterance) for utterance in flatlist(jap_utterances)]

In [None]:
# median and mean Japanese utterance length
print(np.median(jap_utterance_lens), np.mean(jap_utterance_lens))
# histograms of utterance length for both languages, side by side
# (unit-width bins covering lengths 1..199)
len_bins = np.arange(1, 200)
fig, ax = plt.subplots(ncols=2, figsize=(12, 3))
jap_ax, eng_ax = ax
jap_ax.hist(jap_utterance_lens, bins=len_bins)
jap_ax.set_title('Jap utterance sequence lens')

eng_ax.hist(eng_utterance_lens, bins=len_bins)
eng_ax.set_title('English utterance sequence lens')

### Calculate MI
- for each language, calculate MI over each speaker's sequence after (a) shuffling phones within each utterance and (b) shuffling the order of utterances within each speaker

In [None]:
# distances at which MI is computed: 1 through 100 inclusive
distances = np.arange(1, 101)
# settings passed on to it.sequential_mutual_information
n_jobs = 20
verbosity = 0

In [None]:
# empty results table; one row is appended per language / shuffling analysis
mi_columns = [
    'language', 'unit', 'analysis',
    'MI', 'MI_shuff', 'distances',
    'MI_var', 'MI_shuff_var', 'n_elements',
]
MI_DF = pd.DataFrame(columns=mi_columns)

In [None]:
# English, phones shuffled within each utterance
# join every speaker's utterances into one sequence per speaker
units = [flatlist(speaker) for speaker in eng_utterance_shuffled_within]
(MI, var_MI), (MI_shuff, MI_shuff_var) = it.sequential_mutual_information(
    units, distances, n_jobs=n_jobs, verbosity=verbosity
)
# total number of elements over all speakers
n_elements = sum(len(sequence) for sequence in units)
# append the result as a new row of MI_DF
MI_DF.loc[len(MI_DF)] = [
    'english', 'phone', 'shuffled_within_utterance',
    MI, MI_shuff, distances, var_MI, MI_shuff_var, n_elements,
]

In [None]:
# English, utterance order shuffled within each speaker
# join every speaker's utterances into one sequence per speaker
units = [flatlist(speaker) for speaker in eng_utterance_shuffled_between]
(MI, var_MI), (MI_shuff, MI_shuff_var) = it.sequential_mutual_information(
    units, distances, n_jobs=n_jobs, verbosity=verbosity
)
# total number of elements over all speakers
n_elements = sum(len(sequence) for sequence in units)
# append the result as a new row of MI_DF
MI_DF.loc[len(MI_DF)] = [
    'english', 'phone', 'shuffled_between_utterance',
    MI, MI_shuff, distances, var_MI, MI_shuff_var, n_elements,
]

In [None]:
# Japanese, phonemes shuffled within each utterance
# join every speaker's utterances into one sequence per speaker
units = [flatlist(speaker) for speaker in jap_utterance_shuffled_within]
(MI, var_MI), (MI_shuff, MI_shuff_var) = it.sequential_mutual_information(
    units, distances, n_jobs=n_jobs, verbosity=verbosity
)
# total number of elements over all speakers
n_elements = sum(len(sequence) for sequence in units)
# append the result as a new row of MI_DF
MI_DF.loc[len(MI_DF)] = [
    'japanese', 'phoneme', 'shuffled_within_utterance',
    MI, MI_shuff, distances, var_MI, MI_shuff_var, n_elements,
]

In [None]:
# Japanese, utterance order shuffled within each speaker
# join every speaker's utterances into one sequence per speaker
units = [flatlist(speaker) for speaker in jap_utterance_shuffled_between]
(MI, var_MI), (MI_shuff, MI_shuff_var) = it.sequential_mutual_information(
    units, distances, n_jobs=n_jobs, verbosity=verbosity
)
# total number of elements over all speakers
n_elements = sum(len(sequence) for sequence in units)
# append the result as a new row of MI_DF
MI_DF.loc[len(MI_DF)] = [
    'japanese', 'phoneme', 'shuffled_between_utterance',
    MI, MI_shuff, distances, var_MI, MI_shuff_var, n_elements,
]

### Fit models

In [None]:
# prep for new data in dataframe: add one NaN-filled column per fit output
fit_columns = ['exp_results', 'pow_results', 'concat_results',
     'R2_exp', 'R2_concat', 'R2_power', 'AICc_exp',
     'AICc_concat', 'AICc_power', 'bestfitmodel', 'curvature', 'min_peak']
MI_DF = MI_DF.assign(**{col: np.nan for col in fit_columns})
# These columns later receive non-numeric values (fit-result objects, the model
# name, a curvature array). Give them object dtype up front: recent pandas
# versions deprecate silently upcasting a float column when such a value is
# assigned into it.
object_columns = ['exp_results', 'pow_results', 'concat_results', 'bestfitmodel', 'curvature']
MI_DF = MI_DF.astype({col: object for col in object_columns})

In [None]:
n = 100 # max distance for computation
# log-spaced grid (1..n, 1000 points) on which the best-fit model is evaluated;
# it is the same for every row, so build it once
distances_mod = np.logspace(0,np.log10(n), base=10, num=1000)
for idx, row in tqdm(MI_DF.iterrows(), total=len(MI_DF)):
    # signal to fit: MI minus MI_shuff
    sig = np.array(row.MI-row.MI_shuff)
    # kept under its own name so the notebook-level `distances` is not overwritten
    row_distances = row.distances

    # fit models
    results_power, results_exp, results_pow_exp, best_fit_model = mf.fit_models(row_distances, sig)

    # get fit results
    R2_exp, R2_concat, R2_power, AICc_exp, \
        AICc_pow, AICc_concat = mf.fit_results(sig, row_distances,
                                              results_exp, results_power,
                                              results_pow_exp)

    # get model y for the best-fitting model
    if best_fit_model == 'pow_exp':
        y_model = mf.get_y(mf.pow_exp_decay, results_pow_exp, distances_mod)
    elif best_fit_model == 'exp':
        y_model = mf.get_y(mf.exp_decay, results_exp, distances_mod)
    elif best_fit_model == 'pow':
        y_model = mf.get_y(mf.powerlaw_decay, results_power, distances_mod)
    else:
        # fail loudly instead of silently reusing y_model from the previous row
        raise ValueError('unexpected best fit model: {!r}'.format(best_fit_model))

    # get curvature of model_y
    curvature_model = mf.curvature(np.log(y_model))

    # if the best fit model is pow_exp, then grab the min peak
    if best_fit_model == 'pow_exp':
        # strict local minima of the curvature: interior points that are lower
        # than both of their neighbours
        interior = curvature_model[1:-1]
        is_local_min = (interior < curvature_model[:-2]) & (interior < curvature_model[2:])
        peaks = np.where(is_local_min)
        # NOTE(review): the index is relative to curvature_model[1:-1], i.e. one
        # less than the minimum's position in curvature_model -- confirm whether
        # a +1 offset is intended (kept as-is to leave stored results unchanged)
        # no minimum found -> store NaN rather than raising IndexError
        min_peak = peaks[0][0] if len(peaks[0]) > 0 else np.nan
    else:
        min_peak = np.nan

    # save model fit results to MI_DF
    MI_DF.loc[idx, np.array(['exp_results', 'pow_results', 'concat_results',
                         'R2_exp', 'R2_concat', 'R2_power', 'AICc_exp',
                         'AICc_concat', 'AICc_power', 'bestfitmodel', 'curvature', 'min_peak'])] = [
        results_exp, results_power, results_pow_exp,
        R2_exp, R2_concat, R2_power, AICc_exp,
        AICc_concat, AICc_pow, best_fit_model,
        curvature_model, min_peak
    ]

    # quick plot of model fitting
    plot_model_fits(row.MI, row.MI_shuff, row_distances, results_power, results_exp, results_pow_exp)

    print(row.unit, row.analysis, best_fit_model, row.language)

In [None]:
# write the fitted results table to disk as a pickle
fitted_pickle_path = DATA_DIR / 'MI_DF/language/language_MI_DF_fitted-utterance.pickle'
MI_DF.to_pickle(fitted_pickle_path)

### Plot shuffling analysis (within vs. between utterances) for Japanese and English

In [None]:
# One log-log panel per MI_DF row: the MI signal (MI - MI_shuff) as a scatter,
# overlaid with the model that has the lowest AICc.
fontsize=19
yoff=-.20
ncol = 4
# round up so a partially filled last row of panels still gets axes
nrow = max(1, int(np.ceil(len(MI_DF)/ncol)))
zoom = 5
# squeeze=False keeps `axs` two-dimensional whatever the grid shape
fig, axs = plt.subplots(ncols=ncol, nrows=nrow, figsize=zoom*np.array([ncol,nrow]), squeeze=False)
flat_axs = axs.flatten()
for axi, (idx, row) in enumerate(MI_DF.sort_values(by=['analysis', 'language', 'unit']).iterrows()):
    ax = flat_axs[axi]

    color = LCOL_DICT[row.language]
    sig = np.array(row.MI-row.MI_shuff)
    # kept under its own name so the notebook-level `distances` is not overwritten
    row_distances = row.distances
    # get signal limits in log space, padded by 10% of the log range on each side
    sig_lims = np.log([np.min(sig[sig>0]), np.nanmax(sig)])
    sig_lims = [sig_lims[0] - (sig_lims[1]-sig_lims[0])/10,
                sig_lims[1] + (sig_lims[1]-sig_lims[0])/10]

    # y label on the first column only, x label on the last row only
    if axi%ncol == 0:
        ax.set_ylabel('Mutual Information (bits)', labelpad=5, fontsize=fontsize)
        ax.yaxis.set_label_coords(yoff,0.5)
    if axi >= (nrow-1)*ncol:
        ax.set_xlabel('Distance (phones)', labelpad=5, fontsize=fontsize)

    # plot real data
    ax.scatter(row_distances, sig, alpha = 1, s=40, color=color)

    # model with the lowest AICc (recomputed here rather than read from row.bestfitmodel)
    best_fit_model = np.array(['exp','pow','pow_exp'])[np.argmin(row[['AICc_exp', 'AICc_power', 'AICc_concat']].values)]

    # set title
    analysis = 'within utterance' if row.analysis == 'shuffled_within_utterance' else 'between utterance'
    model_type = {'pow_exp': 'composite', 'exp': 'exponential', 'pow':'power law'}[best_fit_model]
    ax.set_title(' | '.join([row.language.capitalize(), analysis, model_type]), fontsize=fontsize)

    # grid on which the model curves are drawn
    distances_model = np.logspace(0,np.log10(row_distances[-1]), base=10, num=1000)

    if best_fit_model == 'pow_exp':
        # mark the stored curvature minimum; skipped when none was stored (NaN)
        if not pd.isnull(row.min_peak):
            ax.axvline(distances_model[int(row.min_peak)], lw=3,alpha=0.5, color=color, ls='dashed')

        # composite model and its two components, all from the composite fit
        y_model = mf.get_y(mf.pow_exp_decay, row.concat_results, distances_model)
        y_pow = mf.get_y(mf.powerlaw_decay, row.concat_results, distances_model)
        y_exp = mf.get_y(mf.exp_decay, row.concat_results, distances_model)

        ax.plot(distances_model, y_pow, ls='dotted', color= 'k', lw=5, alpha=0.5)
        # the fitted intercept is subtracted from the exponential component
        ax.plot(distances_model, y_exp-row.concat_results.params['intercept'].value, ls='dashed', color= 'k', lw=5, alpha=0.5)

        # plot modelled data
        ax.plot(distances_model, y_model, alpha = 0.5, lw=10, color=color)

    elif best_fit_model == 'pow':
        y_model = mf.get_y(mf.powerlaw_decay, row.pow_results, distances_model)
        # plot modelled data
        ax.plot(distances_model, y_model, alpha = 0.5, lw=10, color=color)

    elif best_fit_model == 'exp':
        y_model = mf.get_y(mf.exp_decay, row.exp_results, distances_model)
        # plot modelled data
        ax.plot(distances_model, y_model, alpha = 0.5, lw=10, color=color)

    # axis params
    # fixed lower y limit (note: 10e-6 == 1e-5); the padded upper limit is kept
    sig_lims[0] = np.log(10e-6)
    ax.set_ylim(np.exp(sig_lims))
    ax.tick_params(which='both', direction='in', labelsize=14, pad=10)
    ax.tick_params(which='major', length=10, width =3)
    ax.tick_params(which='minor', length=5, width =2)
    # base 10 is the default for log scales; the `basex`/`basey` keywords were
    # removed in Matplotlib 3.5, so they are not passed
    ax.set_xscale("log")
    ax.set_yscale("log")
    for axis in ['top','bottom','left','right']:
        ax.spines[axis].set_linewidth(3)
        ax.spines[axis].set_color('k')

    ax.set_xlim([1,100])
    ax.set_xticks([1,10,100])
    ax.set_xticklabels(['1','10','100'])

# hide panels of the grid that received no data
for unused_ax in flat_axs[len(MI_DF):]:
    unused_ax.axis('off')

save_fig(FIGURE_DIR/'speech_shuffle_utterance')