In [2]:
!pip install keras
!pip install torch
!pip install torchtext
!pip install mosestokenizer



In [10]:
!pip install gensim

Collecting gensim
  Downloading gensim-4.2.0-cp38-cp38-win_amd64.whl (24.0 MB)
Collecting smart-open>=1.8.1
  Downloading smart_open-6.0.0-py3-none-any.whl (58 kB)
Collecting Cython==0.29.28
  Downloading Cython-0.29.28-py2.py3-none-any.whl (983 kB)
Installing collected packages: smart-open, Cython, gensim
  Attempting uninstall: Cython
    Found existing installation: Cython 0.29.24
    Uninstalling Cython-0.29.24:
      Successfully uninstalled Cython-0.29.24
Successfully installed Cython-0.29.28 gensim-4.2.0 smart-open-6.0.0


# Imports

In [37]:
import pandas as pd
import numpy as np
import torch.optim as optim
import numpy as np
import pandas as pd
from torch.utils.data import DataLoader
import torch
import torch.nn as nn
import torchtext
from mosestokenizer import *
import gensim
from tqdm import tqdm
import collections
import math
from tqdm import tqdm
import warnings
warnings.filterwarnings("ignore")
warnings.simplefilter(action='ignore', category=FutureWarning)

# Load model and data

In [38]:
from gensim.models import Word2Vec
w2v_model = Word2Vec.load('../resources/georgian_word2vec.model')
w2v = w2v_model.wv

In [39]:
sequences = pd.read_csv('../data/data.csv',nrows=100000).values.flatten()
nrows = len(sequences)
train = sequences[:int(nrows*0.9)]
test = sequences[int(nrows*0.9):]

# Words To Index

In [40]:
word_counts = collections.Counter()
word_counts['<pad>'] = 1

for sentence in train:
    for word in sentence.split():
        word_counts[word] += 1
        
unique_words = sorted(word_counts, key=word_counts.get, reverse=True)
vocab_size = len(unique_words) + 1

In [41]:
index_to_word = {index: word for index, word in enumerate(unique_words)}

In [42]:
word_to_index = {word: index for index, word in enumerate(unique_words)}


# Embeddings

In [43]:
index_to_embedings = {}

for idx, token in index_to_word.items():
    try:
        index_to_embedings[idx] = w2v[token]
    except:
        index_to_embedings[idx] = torch.zeros(100)

In [44]:
embedding_vectors = [x for _, x in index_to_embedings.items()]

# Model

In [45]:
class LSTM(nn.Module):
    def __init__(self,*, n_vocab, num_layers, embedding_dim, hidden_size):
        super(LSTM, self).__init__()
        self.n_vocab = n_vocab
        self.num_layers = num_layers
        self.embedding_dim = embedding_dim
        self.hidden_size = hidden_size
        
        self.emb = nn.Embedding.from_pretrained(
            torch.FloatTensor(embedding_vectors), 
            freeze=False
        ) 
        
        self.lstm = nn.LSTM(
            input_size=self.embedding_dim,
            hidden_size=self.hidden_size,
            num_layers=self.num_layers,
            batch_first=True,
            dropout=0.3,
        )

        self.fc = nn.Linear( self.hidden_size, vocab_size)

    def forward(self, x):
        h0 = torch.zeros(self.num_layers, x.size(0), self.hidden_size)
        c0 = torch.zeros(self.num_layers, x.size(0), self.hidden_size)
        x = self.emb(x)
        output, state = self.lstm(x, (h0, c0))
        logits = self.fc(output)
        return logits, state

In [46]:
model = LSTM(
    n_vocab=139769, # vocab_size
    num_layers=2,
    embedding_dim = 100,
    hidden_size= 128
)
model.load_state_dict(torch.load('../resources/nn_lstm_model_state'))

<All keys matched successfully>

In [47]:
def word2index(w):
    try:
        return word_to_index[w]
    except:
        return word_to_index['<pad>']
def predict(model, text, next_words=100):
    model.eval()

    words = text.split(' ')

    for i in range(0, next_words):
        x = torch.tensor([[word2index(w) for w in words[i:]]])
        y_pred, (state_h, state_c) = model(x)

        last_word_logits = y_pred[0][-1]
        p = torch.nn.functional.softmax(last_word_logits, dim=0).detach().numpy()
        word_index = np.random.choice(len(last_word_logits), p=p)
        while(index_to_word[word_index] == '<pad>'):
            word_index = np.random.choice(len(last_word_logits), p=p)
        words.append(index_to_word[word_index])
    return ' '.join(words)

In [48]:
predict(model, 'ბიჭი', 5)

'ბიჭი ჩვენს განცხადებებს გავიდა # #'

In [51]:
avg_sequence_len = (sum([len(x.split()) for x in sequences]) // len(sequences))
pad_index = word_to_index['<pad>']
n_gram_size = avg_sequence_len // 2
test_input = []
for i in range(len(test)):
    test[i] = [pad_index if w not in word_to_index else word_to_index[w] for w in train[i].split()]
def pad(x):
    if len(x) < avg_sequence_len:
            x += [pad_index] * (avg_sequence_len - len(x))
    return x

for line in test:
    pad(line)
    for i in range(len(line) - n_gram_size):
        n_gram_sequence = line[i:i+n_gram_size]
        test_input.append(n_gram_sequence)
# pad or strip data, also add <eos>
def collate_batch(batch):
    label_list, text_list = [], []
    for x in batch:
        label_list.append(x[1:])
        x = x[:-1]
        text_list.append(torch.Tensor(x).long())
    text_list = torch.stack(text_list, dim=0)
    label_list = torch.Tensor(label_list).long()
    
    return text_list, label_list
test_dl = DataLoader(test_input, 
                      batch_size=128, 
                      shuffle=False, 
                      collate_fn=collate_batch)

# Computing perplexity on test data

In [64]:
def compute_perplexity(model, dl):
    """
    Compute perplexity
    """
    model.eval()
    criterion = nn.CrossEntropyLoss()
    with torch.no_grad(): # tells Pytorch not to store values of intermediate computations for backward pass because we not gonna need gradients.
        loss = 0
        for batch, (x, y) in tqdm(enumerate(dl)):
            y_pred, (state_h, state_c) = model(x)
            loss += criterion(y_pred.transpose(1, 2), y)
    model.train()
    return np.exp(loss / len(dl))
    
compute_perplexity(model, test_dl)


643it [06:49,  1.57it/s]


tensor(812.8319)