In [1]:
from pathlib import Path
import torch
from torch.nn import CrossEntropyLoss
from torch.optim import Adam 
from torch.utils.data import DataLoader

from eff.data.dataset import CLTSDataset
from eff.train import get_class_weights_balanced, get_train_test_valid_split
from eff.train.dataset import TrainDataset, UnmaskedTestSet, \
    ConsonantMaskingTestSet, VowelMaskingTestSet
from eff.train.scripts import train, test
from eff.util import constants
from eff.util.util import save_results

from eff.data import load_ipa_transcriptions
from eff.data.dataset import CLTSDataset


# Directory containing the kaikki.org dictionary JSON files, relative to this notebook.
wikt_dir = Path("../../data/en-wikt")


def wikt_json_path(language: str) -> Path:
    """Return the path of the non-inflected-senses JSON file for `language`.

    `language` is the language name exactly as it appears in the file name,
    e.g. "Finnish".
    """
    return wikt_dir / f"kaikki.org-dictionary-{language}-all-non-inflected-senses.json"


json_path_fin = wikt_json_path("Finnish")
json_path_tur = wikt_json_path("Turkish")
# Too few IPA-transcribed items for Estonian
# json_path_ekk = wikt_json_path("Estonian")
json_path_arb = wikt_json_path("Arabic")
json_path_hye = wikt_json_path("Armenian")
# Passed to save_results(...) in the last cell.
base_path = Path("./out/wikt_unique")

In [2]:
# Load the IPA transcriptions for each language (order: fin, tur, hye, arb).
ipa_trans_fin, ipa_trans_tur, ipa_trans_hye, ipa_trans_arb = (
    load_ipa_transcriptions(json_path)
    for json_path in (json_path_fin, json_path_tur, json_path_hye, json_path_arb)
)
all_transcriptions = (ipa_trans_fin, ipa_trans_tur, ipa_trans_hye, ipa_trans_arb)

# Report how many transcriptions each language provides.
for transcriptions in all_transcriptions:
    print(len(transcriptions))

# Cut every language down to the size of the smallest list, so that each
# dataset is built from the same number of leading items.
N = min(map(len, all_transcriptions))
print("N =", N)
clts_ds_fin, clts_ds_tur, clts_ds_hye, clts_ds_arb = (
    CLTSDataset(transcriptions[:N], unique_sequences=True)
    for transcriptions in all_transcriptions
)

76714
4829
14972
7821
N = 4829


In [3]:
# Language code -> CLTS dataset. Insertion order is the order in which the
# training loop iterates over the languages.
data = {
    "fin": clts_ds_fin,
    "tur": clts_ds_tur,
    "hye": clts_ds_hye,
    "arb": clts_ds_arb,
}

In [4]:
# Hyperparameters for the per-language models and their training runs.
batch_size = 32       # batch size handed to every DataLoader
# The training cell builds LstmLM with two layers; this value records that
# depth (it previously said 1, which did not match the model being trained).
n_layers = 2
embedding_size = 64   # embedding dimension of the model
hidden_size = 256     # hidden dimension of the model
dropout = 0.33
patience = 1          # forwarded to train(...) as `patience`

In [5]:
# %%capture log
# (Uncomment the magic above to capture this cell's stdout into `log`.)

from collections import defaultdict
import itertools
from importlib import reload

from eff.model import lstm
from eff.train import generate_batch

# Re-import eff.model.lstm so that edits to the module are picked up without
# restarting the kernel.
lstm = reload(lstm)

# Per-language containers that are handed to save_results(...) afterwards:
#   datasets[lang]['clts']          -> the CLTS dataset
#   datasets[lang]['torch'][name]   -> the test set of flavour `name`
#   res[lang][name]                 -> logprobs / targets / indices from test(...)
datasets = defaultdict(lambda: defaultdict(dict))
res = defaultdict(lambda: defaultdict(dict))
models = {}
criteria = {}

# Test-set flavours evaluated for every language, in evaluation order.
test_set_classes = {
    'unmasked': UnmaskedTestSet,
    'vowel_masking': VowelMaskingTestSet,
    'consonant_masking': ConsonantMaskingTestSet,
}


def make_loader(dataset):
    """Wrap `dataset` in a DataLoader using the notebook-wide batch settings."""
    return DataLoader(dataset, batch_size=batch_size, collate_fn=generate_batch)


for lang_id, clts_dataset in data.items():
    print(lang_id)
    datasets[lang_id]['clts'] = clts_dataset
    train_words, valid_words, test_words = get_train_test_valid_split(
        clts_dataset.words, test_size=0.3, valid_size=0.1)

    # Arguments shared by every dataset built for this language.
    alphabet_kwargs = dict(
        input_alphabet=clts_dataset.input_alphabet,
        output_alphabet=clts_dataset.output_alphabet,
        bipa=clts_dataset.bipa,
    )
    train_set = TrainDataset(words=train_words, masking=0.25, **alphabet_kwargs)
    valid_set = TrainDataset(words=valid_words, masking=0.25, **alphabet_kwargs)
    # All test flavours are built from the same held-out words.
    test_sets = {
        name: test_set_cls(words=test_words, **alphabet_kwargs)
        for name, test_set_cls in test_set_classes.items()
    }
    datasets[lang_id]['torch'].update(test_sets)

    train_loader = make_loader(train_set)
    valid_loader = make_loader(valid_set)
    test_loaders = {name: make_loader(test_set) for name, test_set in test_sets.items()}

    # Flatten the training targets into one list of label indices.
    train_labels = list(itertools.chain.from_iterable(
        t.cpu().tolist() for t in train_set._Y))

    # Append PAD and every output label that never occurs in the training
    # targets, so each of them appears in `y` at least once.
    # NOTE(review): presumably needed so the weight computation covers every
    # class in `classes` — confirm in get_class_weights_balanced.
    missing_labels = list(
        set(clts_dataset.output_alphabet.indices).difference(set(train_labels)))
    train_labels = train_labels + [clts_dataset.output_alphabet.PAD_IDX] + missing_labels
    weight = get_class_weights_balanced(
        ignore_classes=[clts_dataset.pad_idx, clts_dataset.mask_idx],
        classes=clts_dataset.output_alphabet.indices,
        y=train_labels)

    criterion = CrossEntropyLoss(weight=weight)
    criteria[lang_id] = criterion
    model = lstm.LstmLM(
        input_dim=len(clts_dataset.input_alphabet),
        output_dim=len(clts_dataset.output_alphabet),
        embedding_dim=embedding_size,
        hidden_dim=hidden_size,
        dropout=dropout,
        # NOTE(review): the depth is pinned here rather than read from the
        # config cell's `n_layers`; keep the two in sync.
        n_layers=2,
        loss_fn=criterion
    )
    model.to(constants.device)
    optimizer = Adam(model.parameters())

    train(model, train_loader, valid_loader, optimizer, criterion, patience=patience)

    models[lang_id] = model

    # Evaluate the trained model on each test flavour and keep the raw outputs.
    for name, loader in test_loaders.items():
        logprobs, target_indices, targets = test(model, loader, criterion)
        res[lang_id][name] = {
            'logprobs': logprobs,
            'targets': targets,
            'indices': target_indices,
        }


fin
Epoch	Loss	Perplexity
1	0.0635	0	
2	0.0613	0	
3	0.0601	0	
	Patience lost, remaining patience: 0
4	0.0595	0	
Test loss: 0.05927572977934645
Test perplexity: 0
Test loss: 0.060217752283503886
Test perplexity: 0
Test loss: 0.060627393207639595
Test perplexity: 0
tur
Epoch	Loss	Perplexity
1	0.107	0	
	Patience lost, remaining patience: 0
2	0.1084	0	
Test loss: 0.10215801971820487
Test perplexity: 0
Test loss: 0.10011725981782794
Test perplexity: 0
Test loss: 0.10301283374120619
Test perplexity: 0
hye
Epoch	Loss	Perplexity
1	0.0627	0	
2	0.0616	0	
3	0.0601	0	
4	0.0591	0	
	Patience lost, remaining patience: 0
5	0.0592	0	
Test loss: 0.059826581312066404
Test perplexity: 0
Test loss: 0.05294920096654488
Test perplexity: 0
Test loss: 0.06236687182150109
Test perplexity: 0
arb
Epoch	Loss	Perplexity
1	0.0619	0	
2	0.0565	0	
	Patience lost, remaining patience: 0
3	0.0615	0	
Test loss: 0.03834541480234064
Test perplexity: 0
Test loss: 0.039504065947246814
Test perplexity: 0
Test loss: 0.0403213475

In [6]:
# Pass everything collected in the training loop (datasets, test outputs, loss
# criteria and trained models) to save_results together with base_path.
# NOTE(review): presumably base_path is the output directory — confirm in eff.util.util.
save_results(base_path, datasets, res, criteria, models)