In [None]:
%load_ext autoreload
%autoreload 2

import random
import shutil
import sys
import time

import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim

# NOTE: the star import stays ahead of the imports below (as in the original
# order) so that they keep precedence over any same-named star exports.
from src.model import *
from src.load_data import load_data

from pathlib import Path
from torchtext import data
from torchtext import datasets
from torchtext.vocab import Vectors

In [None]:
# Experiment-wide constants.
SEED = 1234       # shared seed for every random number generator used below
MIN_FREQ = 1      # minimum token frequency passed to TEXT.build_vocab
BATCH_SIZE = 64   # batch size handed to the BucketIterator

# Seed Python, NumPy and PyTorch (in that order) with the same value.
for _seed_fn in (random.seed, np.random.seed, torch.manual_seed):
    _seed_fn(SEED)
del _seed_fn

# Ask cuDNN to select deterministic algorithms.
torch.backends.cudnn.deterministic = True

# Use the GPU when one is available, otherwise fall back to the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

In [None]:
# Experiment grid: every combination of the four lists below is evaluated.

# Language codes; the full set used previously was ["bxr", "myv", "kv"].
languages = ["bxr"]

# Embedding methods. Other values used previously: "pmi", and -1, which makes
# the evaluation loop build the vocabulary without pretrained vectors.
methods = ["cbow", "sg", "glove"]

# Embedding dimensionalities. For 0 the evaluation loop skips its
# dimensionality assertion and takes the size from the loaded vectors.
vector_sizes = [0, 50, 100, 500]

# Context-window sizes (used as a path component of the embedding files).
windows = [2, 5, 10]

In [None]:
# Extrinsic evaluation sweep.
#
# For every (language, method, vector_size, window) combination a POS tagger is
# trained on each fold produced by `data_generator.get_fold_data()`, and the
# mean of the per-fold scores is stored in
# `scores_d[language][method][vector_size][window]`.
# Combinations that are skipped (missing embedding file, or method -1 with
# vector_size 0) keep an empty list as their value.
#
# NOTE: `pos` is defined in a later cell of this notebook; that cell has to be
# executed before this one, otherwise the call below raises NameError on a
# fresh kernel.
scores_d = {}
for language in languages:
    data_generator = load_data(language = language, SEED = SEED)

    language_scores = scores_d[language] = {}
    for method in methods:
        method_scores = language_scores.setdefault(method, {})

        for vector_size in vector_sizes:
            size_scores = method_scores.setdefault(vector_size, {})

            for window in windows:
                size_scores.setdefault(window, [])

                print("language:", language, "method:", method, "vector_size:", vector_size, "window:", window)
                if method != -1:
                    path = Path("./embeddings/"+language+"/"+str(vector_size)+"/"+str(window)+"/"+method)
                    if not path.is_file():
                        print("File", path, "doesn't exist")
                        continue

                    # The torchtext cache file is named after the basename of
                    # `path`, which is just the method name and therefore the
                    # same for every size/window, so a stale cache must be
                    # removed before loading. Neither the cache wipe nor the
                    # load depends on the fold, so both happen once here.
                    shutil.rmtree('/tmp/vec', ignore_errors=True)
                    _vectors = Vectors(name=path, cache='/tmp/vec')
                    print("loaded word embeddings", path)
                elif vector_size == 0:
                    # No pretrained vectors and no embedding size: nothing to train.
                    print("skipping")
                    continue

                for NUM, TEXT, LEMMA, UD_TAGS, train_data, val_data in data_generator.get_fold_data():

                    print("train_data", len(train_data.examples), "val_data", len(val_data.examples))

                    if method == -1:
                        # Baseline: vocabulary without pretrained vectors.
                        TEXT.build_vocab(train_data, min_freq = MIN_FREQ)
                        emb_size = vector_size
                    else:
                        TEXT.build_vocab(train_data,
                            min_freq = MIN_FREQ,
                            vectors = _vectors,
                            unk_init = torch.Tensor.normal_
                        )
                        # The embedding size is read from the loaded vectors.
                        emb_size = TEXT.vocab.vectors[1].shape[0]

                        # vector_size 0 carries no expected dimensionality to check.
                        if vector_size != 0:
                            assert vector_size == emb_size, "Different sizes"

                    LEMMA.build_vocab(train_data)
                    NUM.build_vocab(train_data)
                    UD_TAGS.build_vocab(train_data)

                    print(f"Unique tokens in TEXT vocabulary: {len(TEXT.vocab)}")
                    print(f"Unique tokens in UD_TAG vocabulary: {len(UD_TAGS.vocab)}")

                    train_iterator, val_iterator = data.BucketIterator.splits(
                                                        (train_data, val_data),
                                                        sort=True,
                                                        sort_key=lambda x: len(x.text),
                                                        sort_within_batch=False,
                                                        batch_size=BATCH_SIZE,
                                                        repeat=False,
                                                        shuffle=True,
                                                        device=device)

                    pos_score = pos(TEXT, UD_TAGS, train_iterator, val_iterator, emb_size, epochs=30, verbose = False)
                    print("score:", pos_score)
                    size_scores[window].append(pos_score)

                fold_scores = size_scores[window]
                if fold_scores:
                    # Store a built-in float so the value pretty-prints and
                    # JSON-serialises regardless of the NumPy scalar type.
                    size_scores[window] = float(np.average(fold_scores))
                else:
                    # Averaging an empty list would give NaN (and a warning);
                    # keep the empty list, like the skipped combinations do.
                    print("no folds were evaluated, nothing to average")

In [None]:
# Diagnostic: count the non-numeric vocabulary entries whose embedding row is
# entirely zero. The printed label reports these as "not found"; presumably
# they are tokens without a pretrained vector - confirm how unknown tokens are
# initialised in the installed torchtext version.
#
# `TEXT` is the field left over from the last fold of the evaluation loop, so
# this cell only works after that loop has run with pretrained vectors.
count = 0
for idx, vec in enumerate(TEXT.vocab.vectors):
    # Test the whole row: a real embedding may legitimately start with 0.0,
    # so looking at the first component alone would over-count.
    is_zero_vector = bool((vec == 0).all())
    if is_zero_vector and not TEXT.vocab.itos[idx].isdigit():
        count += 1

# Labels are Russian for "vocabulary" and "not found".
print('словарь', len(TEXT.vocab.vectors))
print('не найдено', count)

In [None]:
# Display the vocabulary index of the padding token of `TEXT` (the field left
# over from the last fold of the evaluation loop).
TEXT.vocab.stoi[TEXT.pad_token]

In [None]:
def pos(TEXT, UD_TAGS, train_iterator, val_iterator, emb_size, epochs=10, verbose = True):
    """Train a POS tagger and return its accuracy on ``val_iterator``.

    A ``BiLSTMPOSTagger`` (configured here as a single unidirectional layer
    with 128 hidden units and dropout 0.25) is built, optionally initialised
    with the pretrained vectors of ``TEXT.vocab``, trained with Adam for
    ``epochs`` epochs and evaluated once more after training.

    Args:
        TEXT: field whose ``vocab`` provides the input vocabulary, the pad
            token index and, when present, the pretrained ``vectors``.
        UD_TAGS: field whose ``vocab`` provides the tag set and tag pad index.
        train_iterator: batches passed to ``train`` each epoch.
        val_iterator: batches passed to ``evaluate``.
        emb_size: embedding dimensionality of the model.
        epochs: number of training epochs.
        verbose: when true, print timing and validation metrics per epoch.

    Returns:
        The second value returned by ``evaluate`` on ``val_iterator`` after
        the last epoch (unpacked here as the accuracy).
    """
    # Model hyper-parameters.
    vocab_size = len(TEXT.vocab)
    tagset_size = len(UD_TAGS.vocab)
    hidden_dim = 128
    n_layers = 1
    bidirectional = False
    dropout = 0.25
    text_pad_idx = TEXT.vocab.stoi[TEXT.pad_token]

    model = BiLSTMPOSTagger(
        vocab_size,
        emb_size,
        hidden_dim,
        tagset_size,
        n_layers,
        bidirectional,
        dropout,
        text_pad_idx,
    )
    model.apply(init_weights)
    print(f'The model has {count_parameters(model):,} trainable parameters')

    if TEXT.vocab.vectors is None:
        print("no loading any pretrained embs")
    else:
        print("load pretrained embs")
        pretrained_embeddings = TEXT.vocab.vectors
        print('embeddings', pretrained_embeddings.shape)

        # Copy the pretrained table into the model, then zero the padding row.
        embedding_weights = model.embedding.weight.data
        embedding_weights.copy_(pretrained_embeddings)
        embedding_weights[text_pad_idx] = torch.zeros(emb_size)

    optimizer = optim.Adam(model.parameters())

    # Padding positions in the tag sequences are excluded from the loss.
    tag_pad_idx = UD_TAGS.vocab.stoi[UD_TAGS.pad_token]
    criterion = nn.CrossEntropyLoss(ignore_index = tag_pad_idx)

    model = model.to(device)
    criterion = criterion.to(device)

    for epoch in range(epochs):
        start_time = time.time()

        train(model, train_iterator, optimizer, criterion, tag_pad_idx)
        valid_loss, valid_acc = evaluate(model, val_iterator, criterion, tag_pad_idx)

        epoch_mins, epoch_secs = epoch_time(start_time, time.time())

        if not verbose:
            continue
        print(f'Epoch: {epoch+1:02} | Epoch Time: {epoch_mins}m {epoch_secs}s')
        print(f'\t Val. Loss: {valid_loss:.3f} |  Val. Acc: {valid_acc*100:.2f}%')

    # Final evaluation on the validation iterator; only the accuracy is returned.
    _, final_acc = evaluate(model, val_iterator, criterion, tag_pad_idx)
    return final_acc

In [None]:
import pprint
# Pretty-print the nested results dictionary built by the evaluation loop
# (language -> method -> vector_size -> window -> score).
pp = pprint.PrettyPrinter(indent=4)
pp.pprint(scores_d)

In [None]:
import json

# Persist the nested results dictionary as indented UTF-8 JSON, keeping
# non-ASCII characters as-is and preserving the insertion order of the keys.
with open('extrinsic.json', mode='w', encoding='utf-8') as results_file:
    json.dump(
        scores_d,
        results_file,
        indent=4,
        sort_keys=False,
        ensure_ascii=False,
    )