In [1]:
%load_ext autoreload
%autoreload 2
from model import *

import numpy as np
from pathlib import Path

import time
import random
# One seed shared by every random number generator used in this notebook.
SEED = 1234

# Seed Python's `random`, NumPy's legacy global RNG and PyTorch, in that order.
for _seed_rng in (random.seed, np.random.seed, torch.manual_seed):
    _seed_rng(SEED)

# Request deterministic cuDNN kernels so GPU runs are repeatable.
torch.backends.cudnn.deterministic = True

In [2]:
# Experiment grid. The evaluation loop walks
# ./embeddings/<language>/<vector_size>/<window>/ for every combination below.
languages = [
    'bxr',
    'myv',
    'kv',
]
vector_sizes = [0, 50, 100, 500]  # NOTE(review): 0 presumably marks unreduced matrices -- confirm
windows = [2, 5, 10]

# Not referenced by the cells shown here; presumably describe how the
# embeddings were trained -- TODO confirm.
methods = ['cbow', 'sg', 'glove', 'pmi']
min_count = 10

In [None]:
import sys

# Run the POS-tagging evaluation for every embedding file found under
# ./embeddings/<language>/<vector_size>/<window>/ and collect the scores as
#     scores_d[language][method][vector_size][window] -> test accuracy
# where `method` is the embedding file name without its extension.
# NOTE: `pos` is defined in a later cell of this notebook; run that cell first.
scores_d = {}
embeddings_root = Path("./embeddings")
for language in languages:
    scores_d[language] = {}
    for vector_size in vector_sizes:
        for window in windows:
            emb_dir = embeddings_root / language / str(vector_size) / str(window)
            if not emb_dir.is_dir():
                # Not every size/window combination has to exist on disk;
                # report it and carry on instead of aborting the whole sweep.
                print("skipping missing directory:", emb_dir)
                continue

            # Regular, non-hidden files only (skips e.g. .ipynb_checkpoints),
            # sorted so that reruns visit the files in the same order.
            wes = sorted(
                p for p in emb_dir.iterdir()
                if p.is_file() and not p.name.startswith(".")
            )
            print(wes)
            for we in wes:
                method = we.stem
                size_scores = (
                    scores_d[language]
                    .setdefault(method, {})
                    .setdefault(vector_size, {})
                )

                print(str(we.absolute()))
                pos_score = pos(language, str(we.absolute()), epochs=10, vector_load=True)
                print("score:", pos_score)
                size_scores[window] = pos_score

In [8]:
import sys
# Print the 'kv' scores as LaTeX table rows: a heading line per method, then
# one row per vector size with one cell per window size.
for method, scores in scores_d['kv'].items():
    print(method)
    # The loop variable is deliberately not called `windows`: that name holds
    # the list of window sizes from the configuration cell, and rebinding it
    # here would silently change what the evaluation loop iterates on a rerun.
    for vector_size, window_scores in scores.items():
        # Fixed three decimals (0.170, not 0.17) keeps the columns uniform;
        # sorting by window size keeps the column order stable.
        ws = [f"{window_scores[w]:.3f}" for w in sorted(window_scores)]
        print("&",vector_size," & "," & ".join(ws),"\\\\")

EpmiWeighting
& 0  &  0.254 & 0.258 & 0.265 \\
PlmiWeighting
& 0  &  0.258 & 0.269 & 0.249 \\
pmi
& 0  &  0.301 & 0.308 & 0.318 \\
& 50  &  0.221 & 0.171 & 0.193 \\
& 100  &  0.218 & 0.215 & 0.226 \\
& 500  &  0.219 & 0.216 & 0.218 \\
cbow
& 0  &  0.185 & 0.178 & 0.185 \\
& 50  &  0.171 & 0.173 & 0.17 \\
& 100  &  0.185 & 0.178 & 0.185 \\
& 500  &  0.199 & 0.196 & 0.206 \\
PlogWeighting
& 0  &  0.317 & 0.323 & 0.305 \\
PpmiWeighting
& 0  &  0.301 & 0.308 & 0.318 \\
ft
& 50  &  0.188 & 0.188 & 0.192 \\
& 100  &  0.198 & 0.191 & 0.197 \\
& 500  &  0.238 & 0.231 & 0.229 \\
sg
& 50  &  0.153 & 0.119 & 0.113 \\
& 100  &  0.174 & 0.167 & 0.163 \\
& 500  &  0.215 & 0.213 & 0.209 \\
glove
& 50  &  0.18 & 0.178 & 0.175 \\
& 100  &  0.184 & 0.195 & 0.186 \\
& 500  &  0.242 & 0.245 & 0.24 \\


In [None]:
# Show the raw nested score dictionary filled in by the evaluation loop.
scores_d

In [3]:
import torch
import torch.nn as nn
import torch.optim as optim

from torchtext.legacy import data
from torchtext.legacy import datasets
from torchtext.vocab import Vectors

def pos(language, vector_f, epochs=10, vector_load=True):
    """Train a BiLSTM POS tagger for one language and return its test accuracy.

    Reads ``train.conll`` / ``valid.conll`` / ``test.conll`` from
    ``./extrinsic/<language>``, optionally initialises the embedding layer from
    the word vectors in ``vector_f``, trains for ``epochs`` epochs and scores
    the resulting model on the test split.

    Args:
        language: Name of the sub-directory of ``./extrinsic/`` holding the data.
        vector_f: Path to a word-vector file loadable by
            ``torchtext.vocab.Vectors``. Only used when ``vector_load`` is true.
        epochs: Number of training epochs.
        vector_load: If true, the embedding size is taken from ``vector_f`` and
            the embedding layer is initialised with those vectors. Otherwise
            100-dimensional embeddings initialised by ``init_weights`` are used.

    Returns:
        The test accuracy reported by ``evaluate`` for the model as it stands
        after the last training epoch.

    Side effects:
        Deletes and repopulates ``/tmp/vec`` (when ``vector_load`` is true),
        writes ``tut1-model.pt`` to the working directory after every epoch,
        and prints progress information.
    """
    # Local import keeps this cell self-contained.
    import shutil

    NUM = data.Field(lower = True)
    TEXT = data.Field(lower = True)
    LEMMA = data.Field(lower = True)
    # Tags have no unknown token: every tag must be present in the vocabulary.
    UD_TAGS = data.Field(unk_token=None)

    # Data columns are bound to fields in this order.
    fields = (("num", NUM), ("text", TEXT), ("lemma", LEMMA), ("udtags", UD_TAGS))

    train_data, valid_data, test_data = datasets.UDPOS.splits(
                                                            fields,
                                                            root='./extrinsic/'+language,
                                                            train='train.conll',
                                                            test='test.conll',
                                                            validation='valid.conll'
                                                          )

    print(f"Number of training examples: {len(train_data)}")
    print(f"Number of validation examples: {len(valid_data)}")
    print(f"Number of testing examples: {len(test_data)}")

    MIN_FREQ = 1

    if vector_load:
        # Start from an empty vector cache on every call. Presumably needed
        # because the cache entry is named after the vector file's base name,
        # which repeats across size/window directories -- TODO confirm.
        # shutil.rmtree replaces the IPython-only `!rm -rf` shell escape.
        shutil.rmtree('/tmp/vec', ignore_errors=True)
        _vectors = Vectors(name=vector_f, cache='/tmp/vec')
        TEXT.build_vocab(train_data,
                         min_freq = MIN_FREQ,
                         vectors = _vectors,
                         unk_init = torch.Tensor.normal_
                        )
        # vocab.vectors is (vocabulary size, embedding dimension).
        EMBEDDING_DIM = TEXT.vocab.vectors.shape[1]
    else:
        TEXT.build_vocab(train_data,
                     min_freq = MIN_FREQ)
        EMBEDDING_DIM = 100

    # The tag inventory is taken from all three splits. With unk_token=None a
    # tag that occurs in train/valid but not in test could not be numericalised
    # if the vocabulary were built from the test split alone.
    UD_TAGS.build_vocab(train_data, valid_data, test_data)
    # NUM and LEMMA only need some vocabulary so batches can be built;
    # NOTE(review): they appear unused by the model itself -- confirm in model.py.
    LEMMA.build_vocab(test_data)
    NUM.build_vocab(test_data)

    print(f"Unique tokens in TEXT vocabulary: {len(TEXT.vocab)}")
    print(f"Unique tokens in UD_TAG vocabulary: {len(UD_TAGS.vocab)}")

    BATCH_SIZE = 10

    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

    train_iterator, valid_iterator, test_iterator = data.BucketIterator.splits(
        (train_data, valid_data, test_data),
        batch_size = BATCH_SIZE,
        device = device)

    INPUT_DIM = len(TEXT.vocab)
    HIDDEN_DIM = 128
    OUTPUT_DIM = len(UD_TAGS.vocab)
    N_LAYERS = 2
    BIDIRECTIONAL = True
    DROPOUT = 0.25
    PAD_IDX = TEXT.vocab.stoi[TEXT.pad_token]

    model = BiLSTMPOSTagger(INPUT_DIM,
                        EMBEDDING_DIM,
                        HIDDEN_DIM,
                        OUTPUT_DIM,
                        N_LAYERS,
                        BIDIRECTIONAL,
                        DROPOUT,
                        PAD_IDX)

    model.apply(init_weights)
    print(f'The model has {count_parameters(model):,} trainable parameters')

    if vector_load:
        pretrained_embeddings = TEXT.vocab.vectors
        print(pretrained_embeddings.shape)

        # Overwrite the randomly initialised embeddings with the loaded
        # vectors, then zero the padding row again.
        model.embedding.weight.data.copy_(pretrained_embeddings)
        model.embedding.weight.data[PAD_IDX] = torch.zeros(EMBEDDING_DIM)

    optimizer = optim.Adam(model.parameters())
    TAG_PAD_IDX = UD_TAGS.vocab.stoi[UD_TAGS.pad_token]
    # Padding positions do not contribute to the loss.
    criterion = nn.CrossEntropyLoss(ignore_index = TAG_PAD_IDX)

    model = model.to(device)
    criterion = criterion.to(device)

    N_EPOCHS = epochs

    for epoch in range(N_EPOCHS):

        start_time = time.time()

        train_loss = train(model, train_iterator, optimizer, criterion, TAG_PAD_IDX)
        valid_loss, valid_acc = evaluate(model, valid_iterator, criterion, TAG_PAD_IDX)

        end_time = time.time()

        epoch_mins, epoch_secs = epoch_time(start_time, end_time)

        # Checkpoint after every epoch (no best-on-validation selection).
        torch.save(model.state_dict(), 'tut1-model.pt')

        print(f'Epoch: {epoch+1:02} | Epoch Time: {epoch_mins}m {epoch_secs}s')
        print(f'\t Val. Loss: {valid_loss:.3f} |  Val. Acc: {valid_acc*100:.2f}%')

    # The in-memory model already holds the weights of the last checkpoint, so
    # it is evaluated directly. Reloading 'tut1-model.pt' would change nothing
    # after training and, with epochs == 0, would pick up a stale file left
    # behind by an earlier call.
    test_loss, test_acc = evaluate(model, test_iterator, criterion, TAG_PAD_IDX)
    return test_acc