In [1]:
import os
import sys
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import matplotlib as mpl
from scipy import stats

sys.path.insert(1, os.path.join(sys.path[0], '..'))
from util import constants

  from .autonotebook import tqdm as notebook_tqdm


# Get File Paths

In [2]:
# Root directory holding one sub-directory per language (relative to this notebook).
checkpoints_path = '../../checkpoint'


def get_fname(language, seed):
    """Return the path of the ``codes.tsv`` file for a language and seed.

    The path has the form ``<checkpoints_path>/<language>/seed_XX/codes.tsv``,
    where ``XX`` is the seed zero-padded to two digits.
    """
    seed_dir = 'seed_%02d' % seed
    return os.path.join(checkpoints_path, language, seed_dir, 'codes.tsv')

# Load Data

In [3]:
def rename_iid_columns(df, wordform_column, length_column):
    """Expose the chosen wordform/length columns as ``iid`` / ``iid_length``.

    The frame is modified in place and also returned.

    Args:
        df: DataFrame containing ``wordform_column`` and ``length_column``.
        wordform_column: Name of the column copied into ``iid``.
        length_column: Name of the column copied into ``iid_length``.

    Returns:
        The same DataFrame object, with the two source columns replaced by
        ``iid`` and ``iid_length``.

    Raises:
        KeyError: If either source column is missing from ``df``.
    """
    df['iid'] = df[wordform_column]
    df['iid_length'] = df[length_column]

    # Drop the columns that were actually copied. Previously 'caplan' and
    # 'caplan_length' were always deleted, regardless of the arguments, which
    # removed unrelated columns (or raised KeyError) for any other selection.
    for source, target in ((wordform_column, 'iid'), (length_column, 'iid_length')):
        if source != target:
            del df[source]

    return df

def compile_iid_results(df, compile_polysemy=False):
    """Collapse rows sharing the same ``iid`` value into one row each.

    Per ``iid`` group the result holds the mean of ``iid_length``, the sum of
    ``frequencies`` and the count of non-null ``natural`` entries; that count
    is exposed as ``n_homophones``.

    Args:
        df: DataFrame with ``iid``, ``iid_length``, ``frequencies`` and
            ``natural`` columns.
        compile_polysemy: Accepted but not used by this function.

    Returns:
        DataFrame indexed by ``iid`` with columns ``iid_length``,
        ``frequencies`` and ``n_homophones``.
    """
    aggregations = {
        'iid_length': 'mean',
        'frequencies': 'sum',
        'natural': 'count',
    }
    grouped = df.groupby('iid').agg(aggregations)
    return grouped.rename(columns={'natural': 'n_homophones'})


In [4]:
def load_data(fname, wordform_column='caplan', length_column='caplan_length'):
    """Read a tab-separated codes file and aggregate it per IID wordform.

    Args:
        fname: Path to the TSV file to read.
        wordform_column: Column holding the sampled wordforms.
        length_column: Column holding the lengths of those wordforms.

    Returns:
        The result of ``compile_iid_results`` on the loaded frame.

    Raises:
        KeyError: If the file has no ``'Unnamed: 0'`` column, or lacks a
            column required by the helper functions.
    """
    # 'Unnamed: 0' is presumably the index written out when the file was
    # saved -- TODO confirm; it is not needed downstream.
    codes = pd.read_csv(fname, sep='\t').drop(columns='Unnamed: 0')
    codes = rename_iid_columns(codes, wordform_column, length_column)
    return compile_iid_results(codes)

def get_homophony_rate(df):
    """Return the fraction of rows whose ``n_homophones`` is greater than 1."""
    has_homophone = df['n_homophones'].gt(1)
    return has_homophone.mean()

def get_homophony_amount(df):
    """Return the mean of ``n_homophones`` minus one."""
    mean_count = df['n_homophones'].mean()
    return mean_count - 1

def get_homophony_max(df):
    """Return the largest ``n_homophones`` value minus one."""
    largest_count = df['n_homophones'].max()
    return largest_count - 1

# Print homophony statistics for IID samples

## Temperature 5

In [5]:
# Homophony statistics for the default IID samples ('caplan' columns), seed 0.
seed = 0
homophony_rates, homophony_ammounts, homophony_maxs = [], [], []

for language in constants.LANGUAGES:
    fname = get_fname(language, seed)
    df = load_data(fname)

    # Per-language statistics over the aggregated wordforms.
    homophony_rate = get_homophony_rate(df)
    homophony_ammount = get_homophony_amount(df)
    homophony_max = get_homophony_max(df)

    homophony_rates.append(homophony_rate)
    homophony_ammounts.append(homophony_ammount)
    homophony_maxs.append(homophony_max)

    print(f'{language}.\tRate: {homophony_rate*100:.2f}%\tAmmount: '
          f'{homophony_ammount*100:.2f}%\tMax: {homophony_max:.2f}')

print()

# Unweighted averages across languages; rate and amount are shown as
# percentages, the maximum as a raw count.
overall_rate = 100*sum(homophony_rates) / len(homophony_rates)
overall_ammount = 100*sum(homophony_ammounts) / len(homophony_ammounts)
overall_max = sum(homophony_maxs) / len(homophony_maxs)
print('Overall.'
      f'\tRate (%): {overall_rate:.2f}%'
      f'\t\tAmmount (%): {overall_ammount:.2f}%'
      f'\tMax (not %): {overall_max:.2f}')

en.	Rate: 1.65%	Ammount: 8.79%	Max: 178.00
fi.	Rate: 1.52%	Ammount: 8.56%	Max: 358.00
he.	Rate: 4.14%	Ammount: 13.90%	Max: 187.00
id.	Rate: 1.82%	Ammount: 10.13%	Max: 203.00
pt.	Rate: 1.39%	Ammount: 6.55%	Max: 151.00
tr.	Rate: 1.54%	Ammount: 8.12%	Max: 223.00

Overall.	Rate (%): 2.01%		Ammount (%): 9.34%	Max (not %): 216.67


## Temperature .5

In [6]:
# Homophony statistics for the low-temperature IID samples, seed 0.
seed = 0
homophony_rates, homophony_ammounts, homophony_maxs = [], [], []

for language in constants.LANGUAGES:
    fname = get_fname(language, seed)
    df = load_data(
        fname,
        wordform_column='caplan_low_temperature',
        length_column='caplan_low_temperature_length',
    )

    # Per-language statistics over the aggregated wordforms.
    homophony_rate = get_homophony_rate(df)
    homophony_ammount = get_homophony_amount(df)
    homophony_max = get_homophony_max(df)

    homophony_rates.append(homophony_rate)
    homophony_ammounts.append(homophony_ammount)
    homophony_maxs.append(homophony_max)

    print(f'{language}.\tRate: {homophony_rate*100:.2f}%\tAmmount: '
          f'{homophony_ammount*100:.2f}%\tMax: {homophony_max:.2f}')

print()

# Unweighted averages across languages; rate and amount are shown as
# percentages, the maximum as a raw count.
overall_rate = 100*sum(homophony_rates) / len(homophony_rates)
overall_ammount = 100*sum(homophony_ammounts) / len(homophony_ammounts)
overall_max = sum(homophony_maxs) / len(homophony_maxs)
print('Overall.'
      f'\tRate (%): {overall_rate:.2f}%'
      f'\tAmmount (%): {overall_ammount:.2f}%'
      f'\tMax (not %): {overall_max:.2f}')

en.	Rate: 28.18%	Ammount: 148.41%	Max: 300.00
fi.	Rate: 32.32%	Ammount: 293.27%	Max: 866.00
he.	Rate: 40.96%	Ammount: 333.24%	Max: 391.00
id.	Rate: 30.08%	Ammount: 138.83%	Max: 466.00
pt.	Rate: 32.49%	Ammount: 246.52%	Max: 443.00
tr.	Rate: 34.65%	Ammount: 296.03%	Max: 800.00

Overall.	Rate (%): 33.11%	Ammount (%): 242.72%	Max (not %): 544.33
