<a href="https://colab.research.google.com/github/wadaka0821/nlp-tutorial/blob/main/questions/3_5_word2vec_implementation_question.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Install the Hugging Face `datasets` package, which provides `load_dataset` used below.
!pip install datasets

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting datasets
  Downloading datasets-2.10.0-py3-none-any.whl (469 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m469.0/469.0 KB[0m [31m5.3 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting huggingface-hub<1.0.0,>=0.2.0
  Downloading huggingface_hub-0.12.1-py3-none-any.whl (190 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m190.3/190.3 KB[0m [31m7.6 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting xxhash
  Downloading xxhash-3.2.0-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (213 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m213.0/213.0 KB[0m [31m3.4 MB/s[0m eta [36m0:00:00[0m
Collecting multiprocess
  Downloading multiprocess-0.70.14-py38-none-any.whl (132 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m132.0/132.0 KB[0m [31m4.6 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting responses<0

In [None]:
from datasets import load_dataset
import nltk
import gensim
# Fetch the tokenizer data for NLTK.
# NOTE(review): recent NLTK releases may additionally require the 'punkt_tab' resource
# for word_tokenize — confirm against the installed version.
nltk.download('punkt')

# Seed value shared by the random number generators seeded later in the notebook.
seed = 42

# Download (or reuse the cached copy of) the ACL Anthology corpus from the Hugging Face Hub.
dataset = load_dataset("ACL-OCL/acl-anthology-corpus")

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


Downloading readme:   0%|          | 0.00/3.88k [00:00<?, ?B/s]

Downloading and preparing dataset parquet/ACL-OCL--acl-anthology-corpus to /root/.cache/huggingface/datasets/ACL-OCL___parquet/ACL-OCL--acl-anthology-corpus-850594295bb268da/0.0.0/2a3b91fbd88a2c90d1dbbb32b460cf621d31bd5b05b934492fdef7d8d6f236ec...


Downloading data files:   0%|          | 0/1 [00:00<?, ?it/s]

Downloading data:   0%|          | 0.00/515M [00:00<?, ?B/s]

Extracting data files:   0%|          | 0/1 [00:00<?, ?it/s]

Generating train split: 0 examples [00:00, ? examples/s]

Dataset parquet downloaded and prepared to /root/.cache/huggingface/datasets/ACL-OCL___parquet/ACL-OCL--acl-anthology-corpus-850594295bb268da/0.0.0/2a3b91fbd88a2c90d1dbbb32b460cf621d31bd5b05b934492fdef7d8d6f236ec. Subsequent calls will reuse this data.


  0%|          | 0/1 [00:00<?, ?it/s]

In [None]:
# Extract only the abstracts. Reading the column in one call avoids decoding
# every row (with all of its columns) one index at a time in a Python loop.
abstracts = list(dataset['train']['abstract'])
# Tokenize: lowercase every non-empty abstract and split it into word tokens.
abstracts_tokenized = [nltk.word_tokenize(abst.lower()) for abst in abstracts if abst]

In [None]:
import torch

In [None]:
# Fix the PyTorch seeds.
# For full reproducibility the Python and NumPy generators may need seeding as well.
torch.manual_seed(seed)
torch.cuda.manual_seed(seed)
torch.backends.cudnn.deterministic = True
# `use_deterministic_algorithms` is a function and has to be called; assigning to it
# would only overwrite the function and enable nothing.
# warn_only=True: operations without a deterministic implementation emit a warning
# instead of raising, so training still runs.
torch.use_deterministic_algorithms(True, warn_only=True)

# Prefer the GPU when one is available.
device = 'cuda' if torch.cuda.is_available() else 'cpu'

In [None]:
# Check which device will be used
f'{device=}'

"device='cuda'"

In [None]:
class Word2Vec(torch.nn.Module):
  """Two-layer word2vec network: one-hot word id -> hidden vector -> vocabulary logits.

  The vocabulary (word -> integer id) is built from the corpus at construction time.
  """

  def __init__(self, corpus, dim=100):
    """Build the vocabulary from `corpus` and create the two bias-free linear layers.

    Args:
      corpus: iterable of sentences, each an iterable of word tokens.
      dim: size of the hidden (embedding) vector.
    """
    super().__init__()

    self.dim = dim
    self.vocab = {}
    self.vocab_size = 0

    print('start building vocabulary')
    self.build_vocab(corpus)
    print(f'finish building vocabulary\nvocabulary size is {self.vocab_size}')

    # Input projection is created before the output projection.
    self.embedding = torch.nn.Linear(self.vocab_size, dim, bias=False)
    self.output_layer = torch.nn.Linear(dim, self.vocab_size, bias=False)

  def build_vocab(self, corpus):
    """Give every unseen word the next free integer id, then refresh `vocab_size`."""
    word_ids = self.vocab
    for sentence in corpus:
      for token in sentence:
        # setdefault keeps the id of a known word and registers a new word otherwise
        word_ids.setdefault(token, len(word_ids))
    self.vocab_size = len(word_ids)

  def word2id(self, x):
    """Return the vocabulary ids of the words in `x` as a list (KeyError on unknown words)."""
    lookup = self.vocab
    return [lookup[token] for token in x]

  def forward(self, x):
    """Map a tensor of word ids to unnormalized scores over the whole vocabulary."""
    one_hot = torch.nn.functional.one_hot(x, num_classes=self.vocab_size).float()
    hidden = self.embedding(one_hot)
    return self.output_layer(hidden)

In [None]:
# Number of tokenized abstracts used to build the vocabulary (and, below, the training pairs).
N = 10000
# Constructing the model builds the vocabulary and prints its size.
model = Word2Vec(abstracts_tokenized[:N])

start building vocabulary
finish building vocabulary
vocabulary size is 41407


In [None]:
# Build skip-gram training pairs: x holds the centre word, y one of its context words.
# Every centre word is paired with up to `window` words on EACH side.
window = 2
x, y = list(), list()
for sent in abstracts_tokenized[:N]:
  for i, center in enumerate(sent):
    # Left context: up to `window` words before position i (clipped at the sentence start).
    left = sent[max(0, i - window):i]
    # Right context: the slice end is exclusive, so it must be i + window + 1 to take
    # `window` words after position i; slicing clips at the sentence end by itself.
    right = sent[i + 1:i + window + 1]
    context = left + right
    y += context
    x += [center] * len(context)

In [None]:
# Number of (centre word, context word) training pairs.
len(x)

4059869

In [None]:
# Convert centre words and context words to their integer vocabulary ids.
x_ids = model.word2id(x)
y_ids = model.word2id(y)

In [None]:
# Number of (centre, context) pairs per mini-batch.
batch_size = 512

# Wrap the id lists as tensors; each dataset item is a (centre id, context id) pair.
# NOTE(review): the name `datasets` is also the name of the package imported from at the
# top of the notebook — consider a more specific name.
datasets = torch.utils.data.TensorDataset(torch.tensor(x_ids), torch.tensor(y_ids))
# NOTE(review): `shuffle` is not passed, so the DataLoader default applies; presumably
# batches then follow corpus order — confirm this is intended.
dataloader = torch.utils.data.DataLoader(datasets, batch_size=batch_size)

In [None]:
from tqdm import tqdm

In [None]:
# Train the model: Adam optimizer, cross-entropy between the predicted vocabulary
# logits and the id of the context word.
optimizer = torch.optim.Adam(model.parameters(), lr=1e-2)
MAX_EPOCH = 5
criterion = torch.nn.CrossEntropyLoss()

# Move the model to the selected device before training.
model.to(device)

for epoch in range(1, MAX_EPOCH+1):
  # One progress bar per epoch; `it` is the 1-based batch counter.
  for it, batch in tqdm(enumerate(dataloader, 1), total=len(dataloader)):
    # Clear the gradients left over from the previous step.
    optimizer.zero_grad()

    # batch[0]: centre word ids (model input), batch[1]: context word ids (targets).
    pred = model(batch[0].to(device))
    loss = criterion(pred, batch[1].to(device))
    loss.backward()

    optimizer.step()

    # print(f'{epoch=}, {it=}, loss={loss.item():.5f}', end='\r')

100%|██████████| 7930/7930 [02:03<00:00, 64.38it/s]
100%|██████████| 7930/7930 [02:08<00:00, 61.63it/s]
100%|██████████| 7930/7930 [02:05<00:00, 63.06it/s]
100%|██████████| 7930/7930 [02:02<00:00, 64.53it/s]
100%|██████████| 7930/7930 [02:02<00:00, 64.69it/s]


In [None]:
def most_similar(model, word):
  """Return the cosine similarity between `word` and every word in the vocabulary.

  Args:
    model: object with a `vocab` dict (word -> id) and an `embedding` linear layer
      whose weight has one column per vocabulary word.
    word: query word; must be a key of `model.vocab`.

  Returns:
    1-D CPU tensor with one entry per vocabulary word; entry i is the cosine
    similarity between the vector of `word` and the vector of the word with id i.

  Raises:
    KeyError: if `word` is not in `model.vocab`.
  """
  word_id = model.vocab[word]
  with torch.no_grad():
    # Rows of weight.T are the word vectors, so picking a row gives the same vector
    # as multiplying by a one-hot input, without building the one-hot tensor.
    vectors = model.embedding.weight.T
    word_vec = vectors[word_id]
    # cosine_similarity guards against zero norms, so an all-zero vector gives 0, not NaN.
    sims = torch.nn.functional.cosine_similarity(vectors, word_vec.unsqueeze(0), dim=1)
  # The model stays on whatever device it is on; only the result is moved to the CPU.
  return sims.cpu()

In [None]:
# Six highest cosine similarities to 'translation' and the vocabulary ids they belong to.
torch.topk(most_similar(model, 'translation'), k=6)

torch.return_types.topk(
values=tensor([1.0000, 0.4428, 0.4320, 0.4244, 0.4181, 0.4156]),
indices=tensor([  177,   768, 24269,   316,   815, 14265]))

In [None]:
# Map the top-6 ids back to words. The ids are taken from the ranking itself rather than
# hard-coded from an earlier run, and the words are printed from most to least similar.
id2word = {idx: w for w, idx in model.vocab.items()}
for idx in torch.topk(most_similar(model, 'translation'), k=6).indices.tolist():
  print(id2word[idx])

translation
learning
estimation
translations
norms
shallow-transfer


In [None]:
# Six highest cosine similarities to 'rnn' and the vocabulary ids they belong to.
torch.topk(most_similar(model, 'rnn'), k=6)

torch.return_types.topk(
values=tensor([1.0000, 0.4047, 0.3845, 0.3694, 0.3506, 0.3465]),
indices=tensor([12208,  3587,  3552,  5802, 31092, 25040]))

In [None]:
# Map the top-6 ids back to words. The ids are taken from the ranking itself rather than
# hard-coded from an earlier run, and the words are printed from most to least similar.
id2word = {idx: w for w, idx in model.vocab.items()}
for idx in torch.topk(most_similar(model, 'rnn'), k=6).indices.tolist():
  print(id2word[idx])

transformer
encoder
t5
rnn
lwans
yih


In [None]:
# Six highest cosine similarities to 'sentiment' and the vocabulary ids they belong to.
torch.topk(most_similar(model, 'sentiment'), k=6)

torch.return_types.topk(
values=tensor([1.0000, 0.4898, 0.4843, 0.4838, 0.4818, 0.4800]),
indices=tensor([  895, 34433,  1173, 31848,  3234,  5662]))

In [None]:
# Map the top-6 ids back to words. The ids are taken from the ranking itself rather than
# hard-coded from an earlier run, and the words are printed from most to least similar.
id2word = {idx: w for w, idx in model.vocab.items()}
for idx in torch.topk(most_similar(model, 'sentiment'), k=6).indices.tolist():
  print(id2word[idx])

sentiment
polarity
opinion
emotion
domain-based
discourse-link


## 問題
---
今回は word2vec の学習に skip-gram法を使用しました．別の方法として，CBOWという方法もあります．この CBOW による実装をしてみてください．