In [None]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import math
import numpy as np
import gensim
import gensim.downloader
!pip install datasets
import datasets
from datasets import load_dataset
import pandas as pd
import torchtext
from torchtext.data import get_tokenizer
import warnings
# NOTE: this silences every warning for the rest of the notebook, including
# deprecation notices raised by numpy / torch.
warnings.filterwarnings('ignore')
# Fixed seed so that dataset shuffling and parameter initialisation are
# repeatable across "Restart & Run All".
SEED = 0
np.random.seed(SEED)
torch.manual_seed(SEED)
# Run on the GPU when one is available, otherwise fall back to the CPU.
DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu")
# WikiText-2 train / test splits, shuffled deterministically with SEED.
wiki_data_train = load_dataset("wikitext", 'wikitext-2-v1', split="train").shuffle(seed=SEED)
wiki_data_test = load_dataset("wikitext", 'wikitext-2-v1', split="test").shuffle(seed=SEED)
# DataFrame views of the splits; the code below reads their "text" column.
WIKI_TRAIN = pd.DataFrame(wiki_data_train)
WIKI_TEST = pd.DataFrame(wiki_data_test)
#WIKI_ALL = pd.concat([WIKI_TRAIN, WIKI_TEST])
# Tokenizer applied to every text row before vocabulary lookup.
my_tokenizer = get_tokenizer("basic_english")
# Index and surface form reserved for out-of-vocabulary words.
UNK_token = 0
UNK_symbol = '<unk>'


class Vocab:
  """Bidirectional word <-> index mapping with per-word occurrence counts.

  Index ``UNK_token`` is reserved for ``UNK_symbol``; every word that has not
  been added maps to that index on lookup.
  """

  def __init__(self, name=''):
    """Create a vocabulary that contains only the unknown symbol."""
    self.name = name
    self._n_words = 1  # the unknown symbol already occupies index 0
    self._word2index = {UNK_symbol: UNK_token}
    self._index2word = {UNK_token: UNK_symbol}
    self._word2count = {UNK_symbol: 0}

  def get_words(self):
    """Return all known words in the order they were first added."""
    return list(self._word2count.keys())

  def num_words(self):
    """Return the number of distinct entries, unknown symbol included."""
    return self._n_words

  def word2index(self, word):
    """Return the index of ``word``, or the unknown index if it is absent."""
    return self._word2index.get(word, self._word2index[UNK_symbol])

  def index2word(self, word):
    """Return the word stored at index ``word`` (raises KeyError if absent)."""
    return self._index2word[word]

  def word2count(self, word):
    """Return how often ``word`` was added (raises KeyError if absent)."""
    return self._word2count[word]

  def add_sentence(self, sentence):
    """Add every piece of ``sentence`` obtained by splitting on single spaces."""
    for piece in sentence.split(' '):
      self.add_word(piece)

  def add_word(self, word):
    """Register one occurrence of ``word``, assigning a new index if needed."""
    if word in self._word2index:
      # Known word: only its count changes.
      self._word2count[word] += 1
      return
    new_index = self._n_words
    self._word2index[word] = new_index
    self._index2word[new_index] = word
    self._word2count[word] = 1
    self._n_words = new_index + 1

In [None]:
# Maximum number of tokens per text row that are turned into CBOW examples
# (passed as `max_length` to the data-preparation functions).
CBOW_MAX_LENGTH = 400
# Number of context tokens taken on each side of the target word
# (passed as `window` to the data-preparation functions).
CBOW_WINDOW = 4

In [None]:
def prep_cbow_data(data_frame, tokenizer_fn, window=2, max_length=50):
  """Build CBOW training examples and the vocabulary from a text column.

  Every token of every row is added to the vocabulary, but only the first
  ``max_length`` tokens of a row are used to generate examples.

  Args:
    data_frame: object whose ``"text"`` entry is an iterable of strings.
    tokenizer_fn: callable mapping a string to a list of tokens.
    window: number of context tokens on each side of the target.
    max_length: maximum number of leading tokens per row to use.

  Returns:
    A tuple ``(data_out, vocab, data_oo)`` where
      * ``data_out`` is a list of ``(context, target)`` pairs; ``context`` is
        a list of ``2 * window`` word ids and ``target`` is the centre id,
      * ``vocab`` is the populated ``Vocab``,
      * ``data_oo`` is a list of flat rows ``context + [target]``.
    Ids are stored as numpy float64 values because they are read back from a
    float array; ``get_batch`` casts them to int64.
  """
  data_out = []
  data_oo = []
  vocab = Vocab()
  # Iterate over the values directly instead of label-based `[i]` lookups,
  # which only work when the index is exactly 0..n-1.
  for text in data_frame["text"]:
    tokens = tokenizer_fn(text)
    for token in tokens:
      vocab.add_word(token)
    end = min(len(tokens), max_length)
    indices = np.zeros(end)
    for j in range(end):
      indices[j] = vocab.word2index(tokens[j])
    for j in range(window, end - window):
      x = [indices[k] for k in range(j - window, j + window + 1) if k != j]
      y = indices[j]
      data_out.append((x, y))
      # Build the flat row from a new list. Appending `y` to `x` in place
      # would also change the context already stored in `data_out`, leaking
      # the target word into the model input.
      data_oo.append(x + [y])
  return data_out, vocab, data_oo

In [None]:
# Build the (context, target) training pairs, the vocabulary, and the flat
# context+target rows (DATA) from the training split.
CBOW_DATA, CBOW_VOCAB, DATA = prep_cbow_data(WIKI_TRAIN, tokenizer_fn=my_tokenizer, window=CBOW_WINDOW, max_length=CBOW_MAX_LENGTH)
# Sanity check: number of text rows versus number of generated examples.
print("len dataframe=", len(WIKI_TRAIN), "len data=", len(CBOW_DATA))

In [None]:
!pip install twinning
# Flat context+target rows: every row has the same length, so this becomes a
# regular 2-D numeric array.
DATA = np.array(DATA)
# Each example is a (context list, target scalar) pair, i.e. a ragged nested
# sequence. NumPy >= 1.24 raises ValueError for ragged input unless
# dtype=object is requested explicitly; the result is an (n_examples, 2)
# object array that can be row-indexed.
CBOW_DATA = np.array(CBOW_DATA, dtype=object)

In [None]:
from twinning import twin
# Ask twin() for a set of row indices computed from the flat context+target
# rows. NOTE(review): presumably r=2 selects roughly half of the rows as a
# representative subset -- confirm against the twinning documentation.
INDEX = twin(DATA,r=2)
# Keep only the selected examples; the indices are cast to int64 before being
# used to index the array.
CBOW_DATA = CBOW_DATA[np.int64(INDEX)]
# Back to a plain Python list, which is what get_batch slices.
CBOW_DATA = CBOW_DATA.tolist()
# Release the large intermediates that are no longer needed.
del DATA
del INDEX

In [None]:
def get_batch(data, index, batch_size=10):
  """Return the ``index``-th mini-batch of ``data`` as int64 tensors on DEVICE.

  Args:
    data: sliceable sequence of ``(context, target)`` pairs.
    index: zero-based batch number.
    batch_size: number of examples per batch.

  Returns:
    ``(x, y)`` where ``x`` stacks the contexts and ``y`` the targets.
  """
  start = index * batch_size
  stop = (index + 1) * batch_size
  # Transpose the list of pairs into (all contexts, all targets).
  columns = list(zip(*data[start:stop]))

  def _as_device_ids(values):
    # list/tuple -> ndarray -> tensor, then force integer ids on DEVICE.
    return torch.tensor(np.array(values)).to(torch.int64).to(DEVICE)

  return _as_device_ids(columns[0]), _as_device_ids(columns[1])
class CBOW(nn.Module):
  """Continuous bag-of-words model: summed context embeddings -> log-probabilities.

  Args:
    vocab_size: number of words in the vocabulary (cast to int).
    embed_size: embedding dimensionality (cast to int).
  """

  def __init__(self, vocab_size, embed_size):
    super(CBOW, self).__init__()
    # int() casts allow float hyperparameters, e.g. from a continuous search.
    self.emb1 = nn.Embedding(int(vocab_size), int(embed_size))
    self.linear2 = nn.Linear(int(embed_size), int(vocab_size))
    # Normalise over the vocabulary axis explicitly. Omitting `dim` relies on
    # a deprecated implicit choice and emits a warning on every forward pass.
    self.log3 = nn.LogSoftmax(dim=-1)

  def forward(self, x):
    """Return log-probabilities over the vocabulary for each context.

    Args:
      x: integer tensor of word ids; axis 1 indexes the context words and is
        summed out after the embedding lookup.

    Returns:
      Float tensor of log-probabilities whose last axis has size vocab_size.
    """
    context_sum = self.emb1(x).sum(dim=1)
    logits = self.linear2(context_sum)
    log_probs = self.log3(logits)
    return log_probs.to(torch.float)
def train_cbow(model, data, num_epochs, batch_size, criterion, optimizer):
  """Train ``model`` on ``data`` with plain mini-batch optimisation.

  ``num_epochs`` and ``batch_size`` are cast to int, so float values are
  accepted. Examples beyond the last full batch are not used. The running
  mean loss is printed every 100 iterations and at the end of every epoch.

  Args:
    model: module mapping a batch of contexts to predictions.
    data: sequence of ``(context, target)`` pairs understood by ``get_batch``.
    num_epochs: number of passes over ``data``.
    batch_size: number of examples per optimisation step.
    criterion: loss callable ``criterion(prediction, target)``.
    optimizer: optimiser bound to ``model``'s parameters.
  """
  n_epochs = int(num_epochs)
  for epoch in range(n_epochs):
    n_batches = len(data) // int(batch_size)
    epoch_losses = []
    for step in range(n_batches):
      contexts, targets = get_batch(data, step, int(batch_size))
      loss = criterion(model(contexts), targets)
      optimizer.zero_grad()
      loss.backward()
      epoch_losses.append(loss.item())
      optimizer.step()
      if step % 100 == 0:
        print('iter', step, 'loss', np.array(epoch_losses).mean())
    print('epoch', epoch, 'loss', np.array(epoch_losses).mean())

In [None]:
# Preview the first ten training examples.
CBOW_DATA[0:10]

In [None]:
# Preview the first two raw text rows of the training split.
WIKI_TRAIN["text"][0:2]

In [None]:
# Hyperparameters for the single training run below.
# Size of each word-embedding vector.
CBOW_EMBED_DIMENSIONS = 70
# Number of examples per optimisation step.
CBOW_BATCH_SIZE = 418
# Number of passes over the training examples.
CBOW_NUM_EPOCHS = 13
# Learning rate handed to the AdamW optimiser.
CBOW_LEARNING_RATE = 0.00132

In [None]:
# Build the model with one output per vocabulary entry and move it to DEVICE.
cbow_model = CBOW(CBOW_VOCAB.num_words(), CBOW_EMBED_DIMENSIONS)
cbow_model.to(DEVICE)
# The model's last layer is a LogSoftmax, so NLLLoss is applied to its output.
CBOW_CRITERION = nn.NLLLoss()
CBOW_OPTIMIZER = torch.optim.AdamW(cbow_model.parameters(), lr=CBOW_LEARNING_RATE)
# Train with the hyperparameters defined above; progress is printed by train_cbow.
train_cbow(cbow_model, CBOW_DATA, num_epochs=CBOW_NUM_EPOCHS, batch_size=CBOW_BATCH_SIZE, criterion=CBOW_CRITERION, optimizer=CBOW_OPTIMIZER)

In [None]:
def prep_test_data(data_frame, vocab, tokenizer_fn, window=2, max_length=50):
  """Turn the ``'text'`` rows into ``(context, target)`` evaluation examples.

  Words are mapped through ``vocab.word2index``. Rows with fewer than
  ``2 * window + 1`` tokens are skipped; longer rows are cut to their first
  ``max_length`` tokens before examples are generated.

  Args:
    data_frame: object whose ``'text'`` entry is an iterable of strings.
    vocab: object exposing ``word2index(word)``.
    tokenizer_fn: callable mapping a string to a list of tokens.
    window: number of context tokens on each side of the target.
    max_length: maximum number of leading tokens per row to use.

  Returns:
    List of ``(context_ids, target_id)`` tuples, ``context_ids`` being the
    ``window`` ids before and the ``window`` ids after the target.
  """
  min_tokens = (window * 2) + 1
  examples = []
  for text in data_frame['text']:
    words = tokenizer_fn(text)
    ids = [vocab.word2index(word) for word in words]
    if len(ids) < min_tokens:
      continue  # too short to hold one full window
    ids = ids[0:min(len(ids), max_length)]
    for center in range(window, len(ids) - window):
      before = ids[center - window:center]
      after = ids[center + 1:center + window + 1]
      examples.append((before + after, ids[center]))
  return examples
# Evaluation examples from the test split, built with the training vocabulary
# and the same window / length limits as the training data.
TEST_DATA = prep_test_data(WIKI_TEST, CBOW_VOCAB, tokenizer_fn=my_tokenizer, window=CBOW_WINDOW, max_length=CBOW_MAX_LENGTH)
def test_cbow_performance(model, data, batch_size, k=10):
  """Compute top-``k`` accuracy of ``model`` on ``data``.

  A prediction counts as correct when the target id is among the ``k``
  highest-scoring outputs. Only full batches are evaluated; trailing examples
  that do not fill a batch are ignored, and the denominator counts only the
  evaluated examples. With fewer than ``batch_size`` examples no batch is
  evaluated and the final division raises ZeroDivisionError.

  Args:
    model: module mapping a batch of contexts to per-word scores.
    data: sequence of ``(context, target)`` pairs understood by ``get_batch``.
    batch_size: number of examples per evaluation batch.
    k: how many top-scoring words to consider (default 10).

  Returns:
    The accuracy as a 0-dim tensor (the hit count is accumulated as a tensor).
  """
  num_batches = len(data) // batch_size
  num_correct = 0.
  # Evaluation needs no gradients; disabling autograd avoids building a graph
  # for every batch.
  with torch.no_grad():
    for i in range(num_batches):
      x, y = get_batch(data, i, batch_size)
      top_k = torch.topk(model(x), k, dim=1).indices
      num_correct += (top_k == y.unsqueeze(dim=1)).any(dim=1).sum()
  accuracy = num_correct / (num_batches * batch_size)
  return accuracy
# Top-10 accuracy of the trained model on the test examples, in batches of 512.
accuracy = test_cbow_performance(cbow_model, TEST_DATA, 512)
print(accuracy)

In [None]:
def test_cbow_performance(model, data, batch_size, k=10):
    """Compute top-``k`` accuracy of ``model`` on ``data`` as a Python float.

    This redefinition shadows the earlier ``test_cbow_performance`` in this
    notebook; it differs by returning a plain float instead of a tensor.

    A prediction counts as correct when the target id is among the ``k``
    highest-scoring outputs. Only full batches are evaluated; trailing
    examples that do not fill a batch are ignored. With fewer than
    ``batch_size`` examples the final division raises ZeroDivisionError.

    Args:
      model: module mapping a batch of contexts to per-word scores.
      data: sequence of ``(context, target)`` pairs understood by ``get_batch``.
      batch_size: number of examples per evaluation batch.
      k: how many top-scoring words to consider (default 10).

    Returns:
      The accuracy as a float.
    """
    num_batches = len(data) // batch_size
    num_correct = 0.
    # Evaluation needs no gradients; disabling autograd avoids building a
    # graph for every batch.
    with torch.no_grad():
      for i in range(num_batches):
        x, y = get_batch(data, i, batch_size)
        top_k = torch.topk(model(x), k, dim=1).indices
        num_correct += (top_k == y.unsqueeze(dim=1)).any(dim=1).sum()
    accuracy = num_correct / (num_batches * batch_size)
    return float(accuracy)
def prep_test_data(data_frame, vocab, tokenizer_fn, window=2, max_length=50):
    """Build ``(context, target)`` evaluation examples from the ``'text'`` rows.

    This redefinition shadows the earlier ``prep_test_data`` in this notebook.

    Each row is tokenised and mapped to ids with ``vocab.word2index``. A row
    contributes examples only if it has at least ``2 * window + 1`` tokens;
    it is then limited to its first ``max_length`` tokens.

    Args:
      data_frame: object whose ``'text'`` entry is an iterable of strings.
      vocab: object exposing ``word2index(word)``.
      tokenizer_fn: callable mapping a string to a list of tokens.
      window: number of context tokens on each side of the target.
      max_length: maximum number of leading tokens per row to use.

    Returns:
      List of ``(context_ids, target_id)`` tuples.
    """
    span = (window * 2) + 1
    pairs = []
    for sentence in data_frame['text']:
      id_seq = [vocab.word2index(tok) for tok in tokenizer_fn(sentence)]
      if len(id_seq) >= span:
        clipped = id_seq[0:min(len(id_seq), max_length)]
        last_center = len(clipped) - window
        pos = window
        while pos < last_center:
          # Context = `window` ids before plus `window` ids after the target.
          context = clipped[pos - window:pos] + clipped[pos + 1:pos + window + 1]
          pairs.append((context, clipped[pos]))
          pos += 1
    return pairs

In [None]:
!pip install bayesian-optimization

In [None]:
def black_box_function(CBOW_EMBED_DIMENSIONS, CBOW_BATCH_SIZE,CBOW_NUM_EPOCHS ,CBOW_LEARNING_RATE):
  """Train a fresh CBOW model with the given hyperparameters and score it.

  The parameters shadow the notebook-level constants of the same names. A new
  model is trained on ``CBOW_DATA`` and evaluated on examples built from
  ``WIKI_TEST``.

  Args:
    CBOW_EMBED_DIMENSIONS: embedding size (the model casts it to int).
    CBOW_BATCH_SIZE: training batch size (train_cbow casts it to int).
    CBOW_NUM_EPOCHS: number of training epochs (train_cbow casts it to int).
    CBOW_LEARNING_RATE: learning rate for AdamW.

  Returns:
    The value of ``test_cbow_performance`` on the test examples with an
    evaluation batch size of 512.
  """
  model = CBOW(CBOW_VOCAB.num_words(), CBOW_EMBED_DIMENSIONS)
  model.to(DEVICE)
  loss_fn = nn.NLLLoss()
  adamw = torch.optim.AdamW(model.parameters(), lr=CBOW_LEARNING_RATE)
  train_cbow(
      model,
      CBOW_DATA,
      num_epochs=CBOW_NUM_EPOCHS,
      batch_size=CBOW_BATCH_SIZE,
      criterion=loss_fn,
      optimizer=adamw,
  )
  held_out = prep_test_data(
      WIKI_TEST,
      CBOW_VOCAB,
      tokenizer_fn=my_tokenizer,
      window=CBOW_WINDOW,
      max_length=CBOW_MAX_LENGTH,
  )
  return test_cbow_performance(model, held_out, 512)
from bayes_opt import BayesianOptimization
# Search ranges (lower, upper) for each argument of black_box_function; the
# keys must match its parameter names. Note that 10e-4 equals 1e-3.
# NOTE(review): the optimiser presumably proposes continuous (float) values;
# the model and train_cbow cast the size-like arguments with int().
pbounds = {'CBOW_EMBED_DIMENSIONS': (50, 200), 'CBOW_BATCH_SIZE': (128, 1024), "CBOW_NUM_EPOCHS":(1,15) ,"CBOW_LEARNING_RATE":(1e-4,10e-4)}
# Optimiser that maximises the accuracy returned by black_box_function.
# Note: this rebinds the notebook-level name `optimizer`.
optimizer = BayesianOptimization(
    f=black_box_function,
    pbounds=pbounds,
    verbose=2, # verbose = 1 prints only when a maximum is observed, verbose = 0 is silent
    random_state=1,
)

In [None]:
# Run the hyperparameter search. Every evaluation of black_box_function
# trains a new model from scratch, so this cell is expensive.
# NOTE(review): presumably 6 initial random probes followed by 200 guided
# iterations -- confirm against the bayes_opt documentation.
optimizer.maximize(
    init_points=6,
    n_iter=200,
)