In [1]:
import os
import sys
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import matplotlib as mpl
from scipy import stats

sys.path.insert(1, os.path.join(sys.path[0], '..'))
from h05_analysis.compile_results import merge_homophone_entropy
from h06_results.plot_correlations import read_results
from util import constants
from util import util

  from .autonotebook import tqdm as notebook_tqdm


# Choose a Language and a Seed to Analyse

In [2]:
# Language code and seed selecting which run's results the notebook analyses.
language, seed = 'pt', 0

# Get File Paths

In [3]:
# Root checkpoint folder, and the sub-folder of the selected language/seed run.
checkpoints_path = '../../checkpoint'
seed_folder = 'seed_%02d' % seed
results_path = os.path.join(checkpoints_path, language, seed_folder)

# The three result files read from that run's folder.
results_freq_codes_file, results_ent_natural_file, results_ent_polyassign_file = (
    os.path.join(results_path, filename)
    for filename in ('codes.tsv',
                     'compiled_natural_polysemy.tsv',
                     'compiled_polyassign_polysemy.tsv')
)

# Load Data

In [4]:
def rename_iid_columns(df):
    """Rename the ``caplan`` columns of ``df`` to their ``iid`` names, in place.

    ``caplan`` becomes ``iid`` and ``caplan_length`` becomes ``iid_length``.
    Both new columns are assigned before either old column is deleted, so when
    they do not exist yet they end up as the last two columns of the frame.

    Args:
        df: DataFrame containing ``caplan`` and ``caplan_length`` columns.

    Returns:
        The same ``df`` object that was passed in, after modification.
    """
    column_pairs = (('caplan', 'iid'), ('caplan_length', 'iid_length'))

    # Copy each column under its new name first ...
    for old_name, new_name in column_pairs:
        df[new_name] = df[old_name]
    # ... and only then remove the originals.
    for old_name, _ in column_pairs:
        del df[old_name]

    return df

def compile_iid_results(df, compile_polysemy=False):
    """Collapse all rows that share an ``iid`` value into a single row.

    Args:
        df: DataFrame with ``iid``, ``iid_length``, ``frequencies`` and
            ``natural`` columns.
        compile_polysemy: Accepted for interface compatibility; this function
            does not read it.

    Returns:
        A new DataFrame indexed by ``iid`` with three columns: the mean
        ``iid_length``, the summed ``frequencies``, and ``n_homophones`` (the
        number of non-null ``natural`` entries in the group).
    """
    aggregations = {
        'iid_length': 'mean',
        'frequencies': 'sum',
        'natural': 'count',
    }

    return (
        df.groupby('iid')
        .agg(aggregations)
        .rename(columns={'natural': 'n_homophones'})
    )

In [5]:
# Load the per-type codes (natural, fcfs, ...), discard the 'Unnamed: 0' column
# (presumably an index saved with the TSV) and switch to the iid column names.
df_freq = rename_iid_columns(
    pd.read_csv(results_freq_codes_file, sep='\t').drop(columns=['Unnamed: 0'])
)
# One row per distinct iid form.
df_freq_iid = compile_iid_results(df_freq)

In [6]:
# Load the polysemy results for the natural and the polyassign lexicons.
df_nat, df_polyassign = (
    pd.read_csv(results_file, sep='\t')
    for results_file in (results_ent_natural_file, results_ent_polyassign_file)
)

# Print number of Homophones in IID samples

In [7]:
# Lexicon size before and after grouping rows by iid form.
n_types = len(df_freq)
n_merged_types = len(df_freq_iid)
# Number of original types that map onto each iid form.
homophone_counts = df_freq_iid['n_homophones']

print('# Types in original data:\t\t', n_types)
print('# Types after merging homophones:\t', n_merged_types)
print(f'% Homophones per word type:\t\t {homophone_counts.mean() - 1:.4f}')
print(f'% Types with at least one homophone:\t {(homophone_counts > 1).mean():.4f}')

# Types in original data:		 129182
# Types after merging homophones:	 121242
% Homophones per word type:		 0.0655
% Types with at least one homophone:	 0.0139


# Experiment 1. Natural, FCFS, and IID Lexicons

In [8]:
# Preview the first 10 rows of df_freq.
df_freq.head(10)

Unnamed: 0,idx,natural_length,frequencies,natural,fcfs,caplan_low_temperature,fcfs_length,caplan_low_temperature_length,iid,iid_length
0,0,4,2816,após,facho,facho,5,5,fv,2
1,1,1,76645,a,expresso,expresso,8,8,r,1
2,2,9,184,conquista,contudo,contudo,7,7,âne,3
3,3,6,159,romana,especializadas,especializadas,14,14,eupôbovlonlbjeuasaontqõâtufári,30
4,4,3,10803,dos,espera,espera,6,6,ecísrêmodl,10
5,5,6,2,volcas,protegidos,protegidos,10,10,êdwéw,5
6,6,2,21969,no,continuidade,continuidade,12,12,jâdliã,6
7,7,3,914,fim,carta,carta,5,5,cgrxa,5
8,8,2,30,ca,possuinte,possuinte,9,9,pêçê,4
9,9,5,2260,ainda,participantes,participantes,13,13,jbzn,4


In [9]:
# Spearman rank correlation between frequency and code length in each lexicon.
# Natural and FCFS use one row per type; IID uses the frame grouped by iid form.
type_frequencies = df_freq['frequencies']
iid_frequencies = df_freq_iid['frequencies']

nat_corr, _ = stats.spearmanr(type_frequencies, df_freq['natural_length'])
fcfs_corr, _ = stats.spearmanr(type_frequencies, df_freq['fcfs_length'])
iid_corr, _ = stats.spearmanr(iid_frequencies, df_freq_iid['iid_length'])

print(f'Natural Frequency--length Correlation: {nat_corr:.4f}')
print(f'FCFS Frequency--length Correlation: {fcfs_corr:.4f}')
print(f'IID Frequency--length Correlation: {iid_corr:.4f}')

Natural Frequency--length Correlation: -0.0930
FCFS Frequency--length Correlation: -0.0638
IID Frequency--length Correlation: -0.0264


# Experiment 1. PolyFCFS and PolyIID Lexicons

In [10]:
# Pass the polyassign results through the project helper merge_homophone_entropy
# and rename the caplan columns of its result to their iid names.
df_polyassign_iid = rename_iid_columns(merge_homophone_entropy(df_polyassign))
df_polyassign.head(10)

Unnamed: 0,idx,poly_var,poly_cov,length,frequencies,natural,fcfs,caplan,caplan_low_temperature,natural_length,fcfs_length,caplan_length,caplan_low_temperature_length
0,0,179.27488,160.863791,1,3904,0,facho,fv,facho,1,5,2,5
1,1,160.591511,134.356104,1,974,1,expresso,r,expresso,1,8,1,8
2,2,161.211683,51.410558,1,116,2,contudo,âne,contudo,1,7,3,7
3,3,154.122497,129.996455,1,859,3,especializadas,eupôbovlonlbjeuasaontqõâtufári,especializadas,1,14,30,14
4,4,60.37099,-2413.497633,1,3,4,espera,ecísrêmodl,espera,1,6,10,6
5,5,142.006917,-1612.876036,1,32,5,protegidos,êdwéw,protegidos,1,10,5,10
6,6,177.717185,152.954744,1,4396,6,continuidade,jâdliã,continuidade,1,12,6,12
7,7,153.155124,93.176442,1,247,7,carta,cgrxa,carta,1,5,5,5
8,8,-inf,-inf,1,1,8,possuinte,pêçê,possuinte,1,9,4,9
9,9,98.994237,-2237.121251,1,9,9,participantes,jbzn,participantes,1,13,4,13


In [11]:
# Spearman rank correlation between frequency and code length for the
# polyassign lexicons (per-row frame for FCFS, merged frame for IID).
polyassign_frequencies = df_polyassign['frequencies']
polyassign_iid_frequencies = df_polyassign_iid['frequencies']

polyfcfs_corr, _ = stats.spearmanr(polyassign_frequencies, df_polyassign['fcfs_length'])
polyiid_corr, _ = stats.spearmanr(polyassign_iid_frequencies, df_polyassign_iid['iid_length'])

print(f'PolyFCFS Frequency--length Correlation: {polyfcfs_corr:.4f}')
print(f'PolyIID Frequency--length Correlation: {polyiid_corr:.4f}')

PolyFCFS Frequency--length Correlation: -0.0425
PolyIID Frequency--length Correlation: -0.0343


# Experiment 2. Natural, FCFS, and IID Lexicons

In [12]:
# Polysemy estimates only exist for words with a frequency higher than 2.
# To get more stable estimates we keep only words whose frequency is strictly
# greater than 10; this threshold does not seem to impact results a lot.
frequent_mask = df_nat['frequencies'] > 10
df_nat = df_nat.loc[frequent_mask]

# Merge via the project helper, then rename the caplan columns to iid.
df_nat_iid = rename_iid_columns(merge_homophone_entropy(df_nat))
df_nat.head(10)

Unnamed: 0,idx,poly_var,poly_cov,length,frequencies,natural,fcfs,caplan,caplan_low_temperature,natural_length,fcfs_length,caplan_length,caplan_low_temperature_length
0,0,148.257531,119.582082,4,1271,após,facho,fv,facho,4,5,2,5
1,1,180.952374,164.222689,1,34624,a,expresso,r,expresso,1,8,1,8
2,2,143.523644,-495.505061,9,78,conquista,contudo,âne,contudo,9,7,3,7
3,3,143.568068,-474.46646,6,79,romana,especializadas,eupôbovlonlbjeuasaontqõâtufári,especializadas,6,14,30,14
4,4,162.155127,144.354266,3,4784,dos,espera,ecísrêmodl,espera,3,6,10,6
6,6,162.643404,144.905828,2,9891,no,continuidade,jâdliã,continuidade,2,12,6,12
7,7,170.850702,125.862454,3,389,fim,carta,cgrxa,carta,3,5,5,5
9,9,167.461648,142.290441,5,1049,ainda,participantes,jbzn,participantes,5,13,4,13
10,10,168.898685,152.384093,2,21667,em,contrata,éçcmuy,contrata,2,8,6,8
11,11,137.226261,-561.709836,8,76,disputou,canto,ch,canto,8,5,2,5


In [13]:
# Spearman rank correlation between the polysemy estimate (poly_var) and code
# length in each lexicon. Natural and FCFS use the per-row frame; IID uses the
# merged frame.
nat_corr, _ = stats.spearmanr(df_nat['poly_var'], df_nat['natural_length'])
fcfs_corr, _ = stats.spearmanr(df_nat['poly_var'], df_nat['fcfs_length'])
iid_corr, _ = stats.spearmanr(df_nat_iid['poly_var'], df_nat_iid['iid_length'])

# These are polysemy--length correlations: the first argument above is
# poly_var, not frequencies, so the labels must not say "Frequency".
print(f'Natural Polysemy--length Correlation: {nat_corr:.4f}')
print(f'FCFS Polysemy--length Correlation: {fcfs_corr:.4f}')
print(f'IID Polysemy--length Correlation: {iid_corr:.4f}')

Natural Frequency--length Correlation: -0.0782
FCFS Frequency--length Correlation: -0.0103
IID Frequency--length Correlation: -0.0064


# Experiment 2. PolyFCFS and PolyIID Lexicons

In [14]:
# Polysemy estimates only exist for words with a frequency higher than 2.
# To get more stable estimates we keep only words whose frequency is strictly
# greater than 10; this threshold does not seem to impact results a lot.
frequent_mask = df_polyassign['frequencies'] > 10
df_polyassign = df_polyassign.loc[frequent_mask]

# Recompute the merged frame from the filtered rows (this replaces any
# df_polyassign_iid built earlier), then rename the caplan columns to iid.
df_polyassign_iid = rename_iid_columns(merge_homophone_entropy(df_polyassign))
df_polyassign.head(10)

Unnamed: 0,idx,poly_var,poly_cov,length,frequencies,natural,fcfs,caplan,caplan_low_temperature,natural_length,fcfs_length,caplan_length,caplan_low_temperature_length
0,0,179.27488,160.863791,1,3904,0,facho,fv,facho,1,5,2,5
1,1,160.591511,134.356104,1,974,1,expresso,r,expresso,1,8,1,8
2,2,161.211683,51.410558,1,116,2,contudo,âne,contudo,1,7,3,7
3,3,154.122497,129.996455,1,859,3,especializadas,eupôbovlonlbjeuasaontqõâtufári,especializadas,1,14,30,14
5,5,142.006917,-1612.876036,1,32,5,protegidos,êdwéw,protegidos,1,10,5,10
6,6,177.717185,152.954744,1,4396,6,continuidade,jâdliã,continuidade,1,12,6,12
7,7,153.155124,93.176442,1,247,7,carta,cgrxa,carta,1,5,5,5
10,10,156.000944,119.870207,2,742,10,contrata,éçcmuy,contrata,2,8,6,8
11,11,131.900391,-1896.531214,2,21,11,canto,ch,canto,2,5,2,5
12,12,141.285867,-237.582581,2,89,12,acorda,nínu,acorda,2,6,4,6


In [15]:
# Spearman rank correlation between the polysemy estimate (poly_var) and code
# length for the polyassign lexicons (per-row frame for FCFS, merged frame for
# IID).
polyfcfs_corr, _ = stats.spearmanr(df_polyassign['poly_var'], df_polyassign['fcfs_length'])
polyiid_corr, _ = stats.spearmanr(df_polyassign_iid['poly_var'], df_polyassign_iid['iid_length'])

# These are polysemy--length correlations: the first argument above is
# poly_var, not frequencies, so the labels must not say "Frequency".
print(f'PolyFCFS Polysemy--length Correlation: {polyfcfs_corr:.4f}')
print(f'PolyIID Polysemy--length Correlation: {polyiid_corr:.4f}')

PolyFCFS Frequency--length Correlation: -0.0215
PolyIID Frequency--length Correlation: 0.0025
