In [2]:
# Quick sanity check of the Vietnamese spaCy pipeline on one sample sentence.
import spacy

nlp = spacy.load('vi_spacy_model')
doc = nlp('Cộng đồng xử lý ngôn ngữ tự nhiên')

# Print one line per token: text, lemma, coarse POS, fine-grained tag,
# dependency label, shape, and the is_alpha / is_stop flags.
for token in doc:
    attributes = (
        token.text, token.lemma_, token.pos_, token.tag_, token.dep_,
        token.shape_, token.is_alpha, token.is_stop,
    )
    print(*attributes)

Cộng_đồng Cộng_đồng X N nsubj xxxxxxxxx False False
xử_lý xử_lý X V ROOT xxxxx False True
ngôn_ngữ ngôn_ngữ X N obj xxxxxxxx False False
tự_nhiên tự_nhiên X A compound xxxxxxxx False False


In [4]:
import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F

from torchtext.datasets import Multi30k
from torchtext.data import Field, BucketIterator

import matplotlib.pyplot as plt
import matplotlib.ticker as ticker

import spacy
import numpy as np

import random
import math
import time

In [5]:
SEED = 1234

# Seed every random number generator the notebook relies on (Python's random,
# NumPy, PyTorch CPU, PyTorch CUDA) with the same value, in that order.
for _seed_fn in (random.seed, np.random.seed, torch.manual_seed, torch.cuda.manual_seed):
    _seed_fn(SEED)

# Ask cuDNN to use deterministic algorithms.
torch.backends.cudnn.deterministic = True

In [6]:
spacy_vi = spacy.load('vi_spacy_model')
!python3 -m spacy download en
spacy_en = spacy.load('en')

[38;5;2m✔ Download and installation successful[0m
You can now load the model via spacy.load('en_core_web_sm')
[38;5;2m✔ Linking successful[0m
/home/bluesky/anaconda3/lib/python3.8/site-packages/en_core_web_sm -->
/home/bluesky/anaconda3/lib/python3.8/site-packages/spacy/data/en
You can now load the model via spacy.load('en')


In [8]:
def tokenize_vi(text):
    """
    Split a Vietnamese string into a list of token strings.

    The text is run through the tokenizer of the module-level `spacy_vi`
    pipeline and the `.text` of every resulting token is returned, in order.
    """
    tokens = []
    for tok in spacy_vi.tokenizer(text):
        tokens.append(tok.text)
    return tokens

def tokenize_en(text):
    """
    Split an English string into a list of token strings.

    The text is run through the tokenizer of the module-level `spacy_en`
    pipeline and the `.text` of every resulting token is returned, in order.
    """
    from operator import attrgetter

    return list(map(attrgetter('text'), spacy_en.tokenizer(text)))

In [9]:
# Source (English) and target (Vietnamese) fields share every option except
# the tokenizer: sentence boundary markers, lowercasing, batch-first tensors.
_shared_field_options = dict(
    init_token = '<sos>',
    eos_token = '<eos>',
    lower = True,
    batch_first = True,
)

SRC = Field(tokenize = tokenize_en, **_shared_field_options)
TRG = Field(tokenize = tokenize_vi, **_shared_field_options)



# TODO:
# - Build a tokenizer function.
# - Build a function to load and preprocess the data.




In [None]:
# Build the train / validation / test datasets through Multi30k.splits,
# passing the '.en' and '.vn' file extensions together with the SRC and TRG
# fields (presumably paired positionally: '.en' -> SRC, '.vn' -> TRG).
# NOTE(review): Multi30k is normally distributed as an English-German corpus.
# Confirm that '.vn' files really exist in the dataset directory; if not, this
# call needs a loader pointed at the English-Vietnamese data instead.
train_data, valid_data, test_data = Multi30k.splits(exts=('.en', '.vn'), 
                                                    fields=(SRC, TRG))

In [None]:
# Build the source and target vocabularies from the training split only,
# using the same min_freq = 2 setting for both fields.
for _field in (SRC, TRG):
    _field.build_vocab(train_data, min_freq = 2)

In [None]:
# Run on the GPU when CUDA is available, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device('cuda')
else:
    device = torch.device('cpu')

In [None]:
BATCH_SIZE = 128

# Create one iterator per split, all with the same batch size and all placing
# their batches on the selected device.
_dataset_splits = (train_data, valid_data, test_data)
train_iterator, valid_iterator, test_iterator = BucketIterator.splits(
    _dataset_splits,
    batch_size = BATCH_SIZE,
    device = device,
)