# View dataset statistics for each language
1. Load datasets
2. Plot/view statistics

In [None]:
import pandas as pd
import numpy as np
from parallelspaper.config.paths import DATA_DIR, FIGURE_DIR
from parallelspaper.speech_datasets import LCOL_DICT
from parallelspaper.utils import save_fig

In [None]:
# Read each corpus' pickled statistics table and tag it with the language it
# covers, then stack the four tables into a single frame.
german_stats = pd.read_pickle(
    DATA_DIR / 'stats_df/GECO_stats_df.pickle'
).assign(Language='German')

italian_stats = pd.read_pickle(
    DATA_DIR / 'stats_df/AsiCA_stats_df.pickle'
).assign(Language='Italian')

english_stats = pd.read_pickle(
    DATA_DIR / 'stats_df/BUCKEYE_stats_df.pickle'
).assign(Language='English')

japanese_stats = pd.read_pickle(
    DATA_DIR / 'stats_df/CSJ_stats_df.pickle'
).assign(Language='Japanese')

# Row order follows the list order: German, Italian, English, Japanese.
stats_df = pd.concat(
    [
        german_stats,
        italian_stats,
        english_stats,
        japanese_stats,
    ]
)

In [None]:
# Show the concatenated per-language statistics table as the cell output.
stats_df

In [None]:
# For every row, print its language followed by the fraction of words that
# are exactly one phone long.
for idx, row in stats_df.iterrows():
    phones_per_word = row.word_length_phones
    n_single_phone = np.sum(np.array(phones_per_word) == 1)
    print(row.Language)
    print(n_single_phone / len(phones_per_word))

In [None]:
from matplotlib import gridspec
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns

In [None]:
# Show the colour lookup as the cell output; the plotting cell below indexes
# it by lower-cased language name.
LCOL_DICT

In [None]:
# Lower-case letters 'a'..'z', used as panel labels when annotating subplots.
letters = [chr(code_point) for code_point in range(ord('a'), ord('z') + 1)]

In [None]:
# Histogram of utterance lengths (in phones) for the English and Japanese
# rows of stats_df, one panel per row, saved as the "utt_len_phones" figure.

# NOTE(review): `bw` and `kwk` are not used anywhere in this cell; they are
# kept defined because cells outside this view may still read them.
bw = 0.25
kwk = {"lw": 6, "bw": bw}
# Horizontal position of the y-axis label passed to set_label_coords below
# (negative values sit to the left of the axes).
yoff = -0.20

# Rows to plot, ordered alphabetically by language (English, then Japanese).
panel_rows = stats_df[
    stats_df.Language.isin(['English', 'Japanese'])
].sort_values(by="Language")

fig, axs = plt.subplots(ncols=2, figsize=(10, 4))
panel_axes = axs.flatten()  # flatten once rather than on every iteration
for ix, (idx, row) in enumerate(panel_rows.iterrows()):
    ax = panel_axes[ix]
    # Bold panel letter just above the top-left corner of the axes.
    ax.annotate(letters[ix], xy=(-0.05, 1.05), xycoords="axes fraction", size=20, fontweight='bold', fontfamily='Arial')
    # Normalised histogram with 2-phone-wide bins over [0, 100), coloured by
    # the language's entry in LCOL_DICT.
    ax.hist(
        np.array(row.utterance_length_phones),
        density=True,
        bins=np.arange(0, 100, 2),
        color=LCOL_DICT[row.Language.lower()],
    )
    ax.set_xlim([0, 100])
    ax.set_xlabel("Utterance length (phones)", fontsize=18)

    # Thick black frame, no grid.
    for axis in ["top", "bottom", "left", "right"]:
        ax.spines[axis].set_linewidth(3)
        ax.spines[axis].set_color("k")
    ax.grid(False)

    # Inward-pointing ticks. This single which="both" call sets the label size
    # and padding for major and minor ticks alike; the earlier
    # tick_params(axis="both", labelsize=14, pad=15) call was fully overridden
    # by it and has been dropped.
    ax.tick_params(which="both", direction="in", labelsize=14, pad=10)
    ax.tick_params(which="major", length=10, width=3)
    ax.tick_params(which="minor", length=5, width=2)


# Only the left-most panel carries the y-axis label.
axs[0].set_ylabel("Prob. Density", labelpad=5, fontsize=18)
axs[0].yaxis.set_label_coords(yoff, 0.5)

save_fig(FIGURE_DIR / "utt_len_phones")