# Get model contribution by distance

In [None]:
import pandas as pd
from parallelspaper.config.paths import DATA_DIR, FIGURE_DIR
from parallelspaper.speech_datasets import LCOL_DICT
import numpy as np
from parallelspaper import model_fitting as mf
from parallelspaper.utils import save_fig
from parallelspaper import information_theory as it 

In [None]:
from matplotlib import gridspec
import matplotlib.pyplot as plt
%matplotlib inline

In [None]:
# Per-corpus statistics tables (GECO, AsiCA, BUCKEYE, CSJ pickles). Each
# table is tagged with its language before the four are stacked, so rows can
# still be selected by language afterwards.
german_stats = pd.read_pickle(
    DATA_DIR/'stats_df/GECO_stats_df.pickle'
).assign(Language='German')

italian_stats = pd.read_pickle(
    DATA_DIR/'stats_df/AsiCA_stats_df.pickle'
).assign(Language='Italian')

english_stats = pd.read_pickle(
    DATA_DIR/'stats_df/BUCKEYE_stats_df.pickle'
).assign(Language='English')

japanese_stats = pd.read_pickle(
    DATA_DIR/'stats_df/CSJ_stats_df.pickle'
).assign(Language='Japanese')

# Row-wise stack of the four per-language tables (original indices are kept).
stats_df = pd.concat(
    [german_stats, italian_stats, english_stats, japanese_stats]
)

In [None]:
# Load the pickled language MI table ("fitted" per its filename).
mi_df_path = DATA_DIR / 'MI_DF/language/language_MI_DF_fitted.pickle'
MI_DF = pd.read_pickle(mi_df_path)

In [None]:
# Subset the dataset to only look at the major units, restricted to the
# 'session' analysis.
# A vectorised `isin` mask replaces the row-by-row `iterrows()` loop: it picks
# the same rows without a Python-level pass over the frame, and it still
# returns a frame with all columns when MI_DF is empty (an empty list mask
# would be interpreted as an empty column selection).
major_units = ['phonetic', 'phonemes', 'phoneme', 'ortho-phonetic']
subset_MI_DF = MI_DF[
    MI_DF['unit'].isin(major_units) & MI_DF['analysis'].isin(['session'])
]

In [None]:
# Bare expression: in a notebook this displays the filtered table so the
# selected rows can be inspected before plotting.
subset_MI_DF

In [None]:
# 1000 log-spaced sample points covering 10**0 .. 10**2 (i.e. 1 to 100).
distances = np.logspace(start=0, stop=2, num=1000, base=10)

In [None]:
# One log-log panel per row of subset_MI_DF, showing the combined decay model
# (default line style) together with its power-law (dotted) and exponential
# (dashed) curves. The figure has four panels, so at most four rows fit.
fig, axs = plt.subplots(ncols=4, figsize=(16, 4))
for axi, (idx, row) in enumerate(subset_MI_DF.iterrows()):
    # Median of the phone_duration_s entry taken from the first stats row
    # whose Language equals this row's capitalised language name.
    langrow = stats_df.Language.values == row.language.capitalize()
    phone_lens = stats_df[langrow].phone_duration_s.values[0]
    median_phone_len = np.median(phone_lens)

    # Distance at the index where the curvature is largest.
    # NOTE(review): assumes row.curvature was sampled on the same grid as the
    # global `distances` -- confirm against the fitting code.
    max_peak_dist = distances[int(np.argmax(row.curvature))]
    lower_mask = row.distances < max_peak_dist

    # Evaluate the three decay functions (combined, power-law, exponential),
    # in that order, with this row's concat_results over its distances.
    y_model, y_pow, y_exp = [
        mf.get_y(decay_fn, row.concat_results, row.distances)
        for decay_fn in (mf.pow_exp_decay, mf.powerlaw_decay, mf.exp_decay)
    ]

    # MI minus its shuffled counterpart; not used further in this cell.
    y = row.MI - row.MI_shuff

    # Report, per language: the mean power-law / combined-model ratio over
    # the distances below the curvature peak, and the peak distance scaled by
    # the median phone length.
    pow_share = (
        np.sum(y_pow[lower_mask] / y_model[lower_mask]) / np.sum(lower_mask)
    )
    print(row.language, pow_share, median_phone_len * max_peak_dist)

    # Draw the three curves in the language's colour on this row's panel.
    ax = axs[axi]
    colour = LCOL_DICT[row.language]
    for curve, line_kwargs in (
        (y_model, {}),
        (y_pow, {'ls': 'dotted'}),
        (y_exp, {'ls': 'dashed'}),
    ):
        ax.loglog(row.distances, curve, color=colour, **line_kwargs)
