# Use randomized language model for each context and each syllable length to sample lexicons

Modeled after `main04.py`.

## Setting up the environment

In [4]:
import sys
import csv
import pandas as pd
import yaml
import bz2
import numpy as np
from time import gmtime, strftime
from datetime import datetime
# Limit DataFrame previews to 10 rows. The fully qualified option name is
# used because the bare pattern 'max_rows' can match more than one pandas
# option (e.g. 'styler.render.max_rows'), which pandas rejects as ambiguous.
pd.set_option('display.max_rows', 10)
# Make the helper modules in the sampling directory importable.
sys.path.insert(0, '04_Sampling_with_randomization/')
from assign_probs_to_words import *
from randomize_language_models_and_sampling import *
print("Setup complete.")

Setup complete.


## Meta-parameters

In [5]:
# Run configuration. Numeric settings are given as strings and converted to
# their working types right away.
LanCode = 'tr'
n_phones = int('4')
n_vowels = int('2')
num_rounds = int('11')
sample_num = int('3')
# '1' -> True, '0' -> False
write_real_probs = bool(int('1'))
# Maximum number of segments a word may have: one fewer than num_rounds.
num_max_segs = num_rounds - 1

## 1. Importing language models and sizes for each syllable length

### 1.1 language model

In [8]:
# Load the per-syllable-length language model from its bz2-compressed YAML dump.
lm_dict_path = 'language_models/' + LanCode + '.LM.bysyl'
with bz2.open(lm_dict_path) as dbfile:
    # PyYAML >= 6 requires an explicit Loader argument. The dump holds
    # Python-specific objects (defaultdict / Counter with tuple keys), which
    # the safe loader cannot rebuild, so the full Loader is needed.
    # SECURITY: yaml.Loader can construct arbitrary Python objects -- only
    # load model files that come from a trusted source.
    lm_dict = yaml.load(dbfile, Loader=yaml.Loader)

In [9]:
lm_dict

{1: defaultdict(collections.Counter,
             {((),
               ()): Counter({('a0', True): 28,
                       ('aː0', True): 1,
                       ('b', False): 49,
                       ('d', False): 55,
                       ('dʒ', False): 23,
                       ('e0', True): 12,
                       ('f', False): 18,
                       ('g', False): 35,
                       ('h', False): 36,
                       ('i0', True): 14,
                       ('j', False): 32,
                       ('k', False): 76,
                       ('l', False): 22,
                       ('m', False): 28,
                       ('n', False): 27,
                       ('o0', True): 10,
                       ('p', False): 25,
                       ('s', False): 64,
                       ('t', False): 53,
                       ('tʃ', False): 13,
                       ('u0', True): 8,
                       ('v', False): 10,
                       ('y0', True)

### 1.2 orig lexicon -> $\log_2 \Pr(\text{syllen})$ in `df_counts_bysyl`

In [12]:
# Read the original lexicon (tab-separated) and annotate each word with the
# results of the syl_count / seg_count helpers applied to its 'ipa' string.
words_path = f'mono_di_5000/{LanCode}.mono.di.5000'

print(words_path)
words = pd.read_csv(words_path, sep="\t")
words["syllen"] = words["ipa"].map(syl_count)
words["seglen"] = words["ipa"].map(seg_count)

mono_di_5000/tr.mono.di.5000


In [13]:
# logprob for each syllable length according to the real orig lexicon
# Log2 probability of each syllable length, taken over the whole original
# lexicon (before any segment-length filtering).
words_total_count = len(words)
df_counts_bysyl = (
    words
    .groupby(['syllen'])
    .size()
    .reset_index(name='counts')
)
df_counts_bysyl['logprob'] = np.log2(df_counts_bysyl['counts'] / words_total_count)
#probs_syl3 = df_counts_bysyl.loc[df_counts_bysyl['syllen'] == 3].reset_index()

In [14]:
df_counts_bysyl

Unnamed: 0,syllen,counts,logprob
0,1,700,-2.952334
1,2,4718,-0.199585


### 1.3 real lexicon used for counts in sampling -> len_usable_df

In [15]:
# The part of the orig lexicon that is actually used: words with at most
# num_max_segs segments.
words_update = words.loc[words['seglen'] <= num_max_segs]
# Distinct syllable lengths present in the usable lexicon.
syl_array = words_update['syllen'].unique()

# Sub-lexicon for each syllable length.
wordlist_bysyl = {}
for syl_len in syl_array:
    has_this_syllen = words_update['syllen'] == syl_len
    wordlist_bysyl[syl_len] = words_update.loc[has_this_syllen]

In [16]:
words_update

Unnamed: 0,ipa,word,freq,syllen,seglen
0,b y0 t y0 n,bütün,965,2,5
1,j e0 ɾ,yer,953,1,3
2,g e0 l e0 n,gelen,950,2,5
3,a0 d a0 m,adam,948,2,4
4,k a0 ɾ ʃ ɯ0,karşı,943,2,5
...,...,...,...,...,...
5413,z a0 m,zam,6,1,3
5414,z a0 ɾ i0 f,zarif,6,2,5
5415,z o0 ɾ b a0,zorba,6,2,5
5416,z o0 ɾ l a0 ɾ,zorlar,6,2,6


Count the words for each combination of syllable length and segment length; sampling follows these counts.

In [17]:
# Keep only the two length columns needed for counting.
words_update_len = words_update.loc[:, ['syllen', 'seglen']]

In [18]:
#len_usable_df = words_update_len.pivot_table(index='syllen', 
#                             columns='seglen', 
#                             aggfunc=len,
#                             fill_value = 0)

In [19]:
# Number of words for every (syllen, seglen) pair. The count column is still
# unnamed (labelled 0) at this point.
len_usable_df = (
    words_update_len
    .groupby(['syllen', 'seglen'])
    .size()
    .to_frame()
    .reset_index()
)

In [20]:
# Name the columns: syllable length, segment length, and the number of words
# with that combination.
len_usable_df.columns = ['syllen', 'seglen', 'count']

In [21]:
len_usable_df

Unnamed: 0,syllen,seglen,count
0,1,1,7
1,1,2,133
2,1,3,408
3,1,4,141
4,1,5,9
...,...,...,...
8,2,4,1143
9,2,5,2264
10,2,6,1065
11,2,7,87


### 1.4 enum by syllable

In [22]:
syl_array

array([2, 1])

In [25]:
# Load the enumerated word forms, one CSV file per syllable length, and keep
# only the rows whose computed syllable count equals that file's syllable length.
enum_path_base = f'mono_di_5000/bySyl/{LanCode}.enum.word.10.syl'
enum_bysyl = {}
for syl_len in syl_array:
    syl = pd.read_csv(f'{enum_path_base}{syl_len}.csv', sep=',')
    syl.columns = ['ipa', 'seglen']
    # Recompute the syllable count so mismatching rows can be dropped.
    syl['syllen'] = syl['ipa'].apply(syl_count)
    syl_filtered = syl.loc[syl['syllen'] == syl_len]
    enum_bysyl[syl_len] = syl_filtered

In [26]:
# Stack the per-syllable-length tables into one frame, in syl_array order.
enum_words = pd.concat(
    [enum_bysyl[syl_len] for syl_len in syl_array],
    axis=0,
)

In [27]:
enum_words

Unnamed: 0,ipa,seglen,syllen
0,b y0 t y0 n,5,2
1,b y0 t y0,4,2
2,b y0 t y0 k,5,2
3,b y0 t tʃ e0,5,2
4,b y0 l b y0 l,6,2
...,...,...,...
714,ʃ ɯ0 n,3,1
715,ʃ i0 ɾ k,4,1
716,ʃ i0 ʃ,3,1
717,ʒ e0 s t,4,1


In [28]:
# Same (syllen, seglen) count table as for the real lexicon, built here from
# the enumerated word forms.
enum_words_len = enum_words.loc[:, ['syllen', 'seglen']]

enum_len_usable_df = (
    enum_words_len
    .groupby(['syllen', 'seglen'])
    .size()
    .reset_index(name='count')
)

In [34]:
enum_len_usable_df

Unnamed: 0,syllen,seglen,count
0,1,1,7
1,1,2,133
2,1,3,411
3,1,4,146
4,1,5,17
...,...,...,...
8,2,4,2208
9,2,5,7210
10,2,6,3726
11,2,7,811


Compare the syl-seg combination counts in the real lexicon and enumerated words

In [35]:
# Put the real-lexicon and enumerated counts side by side for every
# (syllen, seglen) combination that occurs in the real lexicon.
enum_len_usable_df = enum_len_usable_df.astype({'syllen': 'int64', 'seglen': 'int64'})
# A left join keeps combinations that have no enumerated word forms at all.
# The default inner join would drop those rows, so the coverage check that
# reads 'lack_enum' could never see them.
len_usable_df_all = pd.merge(
    len_usable_df,
    enum_len_usable_df,
    on=['syllen', 'seglen'],
    how='left',
)
len_usable_df_all.columns = ['syllen', 'seglen', 'count_orig', 'count_enum']
# A combination missing from the enumeration has zero enumerated words;
# converting back to int64 undoes the float upcast caused by NaN.
len_usable_df_all['count_enum'] = len_usable_df_all['count_enum'].fillna(0).astype('int64')
# True where there are fewer enumerated word forms than real words.
len_usable_df_all['lack_enum'] = (
    len_usable_df_all['count_enum'] < len_usable_df_all['count_orig']
)

In [37]:
len_usable_df_all 

Unnamed: 0,syllen,seglen,count_orig,count_enum,lack_enum
0,1,1,7,7,False
1,1,2,133,133,False
2,1,3,408,411,False
3,1,4,141,146,False
4,1,5,9,17,False
...,...,...,...,...,...
8,2,4,1143,2208,False
9,2,5,2264,7210,False
10,2,6,1065,3726,False
11,2,7,87,811,False


Check that the enumerated counts are valid: for every syllable–segment length combination, there must be at least as many enumerated word forms as words in the real lexicon.

In [39]:
# Abort when any (syllen, seglen) combination has fewer enumerated word forms
# than real words.
if len_usable_df_all['lack_enum'].any():
    print(LanCode, ": enumberated wordforms do not cover every word in the training lexicon")
    # Exit with a non-zero status so a calling script can detect the failure.
    # The bare exit() helper is injected by the `site` module (not guaranteed
    # to exist) and, called without arguments, reports success.
    sys.exit(1)

## 2. Get a random language model and generate a new lexicon for each syllen

In [57]:
import os
# number of existing samples
#len([name for name in os.listdir('samples/tr')])
# Timestamp of this run; the commented-out code in a later cell builds the
# sample file name from its month/day/hour/minute/second/microsecond fields.
now = datetime.now()

In [58]:
# Draw one new lexicon from the enumerated word forms.
# NOTE(review): the helper comes from the star-imported sampling modules;
# presumably it randomizes lm_dict with a Dirichlet draw and samples words to
# match the counts in len_usable_df -- confirm against its definition.
new_sample = sampling_from_enum_with_dirichlet_lm(enum_bysyl, df_counts_bysyl, len_usable_df, lm_dict)

In [63]:
new_sample
#sample_writename = ('samples/' + 
#        LanCode + '/'
#        + 'sample.'
#        + str(now.month) + str(now.day) + '_' 
#        + str(now.hour) + str(now.minute) + str(now.second) 
#        + '_' + str(now.microsecond)
#        + '.bz2'
#        )
#sample_writename
#write to 'samples/tr' the new sample
#new_sample.to_csv(sample_writename, index = False, compression = 'bz2')

Unnamed: 0,ipa,logprob,seglen,syllen
542,u0,-12.408056,1,1
367,y0,-12.387314,1,1
361,ɯ0,-12.473917,1,1
658,aː0,-12.330421,1,1
112,i0,-12.370184,1,1
...,...,...,...,...
8532,s t ɾ e0 n k s i0,-18.254735,8,2
3449,k o0 m p l e0 k s,-12.459718,8,2
8278,s u0 ɾ p ɾ i0 z m,-13.386633,8,2
13692,l a0 s t l a0 ɾ m,-22.143592,8,2
