In [8]:
# reload modules if needed
%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [9]:
import json
import os
import numpy as np

import pandas as pd
from matplotlib import pyplot as plt
import seaborn as sns

from scipy.stats import pearsonr, spearmanr
from notebook_utils import get_distribution_over_vocabulary, get_mlm_results, get_downstream_results, corpus_sizes

In [10]:
import warnings
warnings.filterwarnings('ignore')

In [11]:
FIGURE_DIR = "/home/limisiewicz/my-luster/entangled-in-scripts/output/figures"
TABLE_DIR =  "/home/limisiewicz/my-luster/entangled-in-scripts/output/tables"

In [12]:
plt.rcParams['font.family'] = 'serif'
plt.rcParams['text.usetex'] = True
#plt.rcParams['text.usetex'] = False
plt.rcParams['font.family'] = 'serif'
plt.rcParams['font.serif'] = ['Times New Roman']

# plt.rcParams['font.monospace'] = 'Ubuntu Mono'
plt.rcParams['font.size'] = 26
plt.rcParams['axes.labelsize'] = 26
plt.rcParams["axes.labelweight"] = "bold"
plt.rcParams['axes.titlesize'] = 26
plt.rcParams['axes.titleweight'] = 'bold'
plt.rcParams['xtick.labelsize'] = 16
plt.rcParams['ytick.labelsize'] = 16
plt.rcParams['legend.fontsize'] = 18


## Parameteres

In [13]:
languages = ('ar', 'tr', 'zh','el', 'es', 'en')
alpha = 0.25
tokenization_types = ('multilingual', 'bpe', 'merged', 'nooverlap')
N_vocab = 120000


ft_type = 'PROBE'

# Tokenizers and Distributions over Vocabulary

In [14]:
vocab_distributions = dict()
vocab_dist_arrays = dict()
vocab_frequencies = dict()

for tok_type in tokenization_types:
    vocab_distributions[tok_type], vocab_frequencies[tok_type] \
        = get_distribution_over_vocabulary(tok_type, alpha, N_vocab, languages)
    vocab_dist_arrays[tok_type] = {lang: np.array(list(vocab_distributions[tok_type][lang].values()))
                                   for lang in languages + ('All',)}

In [15]:
vocab_distributions

{'multilingual': {'ar': OrderedDict([('0', 0.028106502296312938),
               ('1', 0.0),
               ('2', 0.028106502296312938),
               ('3', 0.0),
               ('4', 0.0),
               ('5', 0.11761247869133551),
               ('6', 0.0019267199213934263),
               ('7', 0.01303628882496822),
               ('8', 4.498815354559051e-06),
               ('9', 1.0275703253026922e-05),
               ('10', 0.0001271937795698059),
               ('11', 1.3291954456651742e-05),
               ('12', 0.0),
               ('13', 5.214535979147991e-06),
               ('14', 0.0),
               ('15', 8.690893298579984e-06),
               ('16', 1.6819434677840088e-05),
               ('17', 0.004843537081102138),
               ('18', 1.6359328562032913e-06),
               ('19', 0.0),
               ('20', 0.020880535747561244),
               ('21', 5.981379505493284e-06),
               ('22', 9.202122316143513e-07),
               ('23', 0.0),
              