# Getting Embeddings

In [7]:
from transformers import BertModel, BertTokenizer,BertForPreTraining, BertConfig
from tqdm.auto import tqdm
import torch
import nltk
from pathlib import Path
import re
from scipy.spatial.distance import  cosine
import pickle
import numpy as np
import pandas as pd

# Root folder for the input data; dataset paths are built relative to it.
DATA_DIR = Path('data')
# Run on the GPU when torch can see one, otherwise stay on the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

In [None]:

def get_embeddings(model, tokens):
    """Return the hidden state at position 1 of the first sequence as a flat vector.

    Args:
        model: Callable invoked as ``model(**tokens)``; its result must expose
            ``last_hidden_state``, indexed here as ``[batch][position]``.
        tokens: Mapping of keyword arguments forwarded to ``model``.

    Returns:
        A 1-D CPU tensor whose length is whatever hidden size the model
        produces (no fixed size is assumed).

    Only position 1 is read: position 0 holds the leading special token, so
    position 1 is the first token of the actual input. Any further tokens
    influence the result only through the model itself.
    """
    with torch.no_grad():
        output = model(**tokens)
    embedding = output.last_hidden_state[0][1]
    # Flatten without assuming a hidden size, and move to the CPU so the
    # vector can be handed straight to numpy/scipy routines.
    return embedding.reshape(-1).cpu()


In [16]:
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
model = BertModel.from_pretrained("bert-base-uncased")

# Encode every probe word on its own and embed it with the pretrained model.
probe_words = ["medicine", "cat", "kitty", "feline", "doctor", "medical"]
tokens = {
    word: tokenizer(word, return_tensors='pt', truncation=True)
    for word in probe_words
}
embeddings = {key: get_embeddings(model, val) for key, val in tokens.items()}

# Print scipy's cosine *distance* for a few pairs (smaller means closer).
pairs = [
    ('cat', 'kitty'),
    ('cat', 'doctor'),
    ('medicine', 'cat'),
    ('medicine', 'medical'),
]
for first, second in pairs:
    print(f"{first}-{second}: {cosine(embeddings[first], embeddings[second])}")


Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertModel: ['cls.predictions.transform.LayerNorm.weight', 'cls.predictions.bias', 'cls.predictions.decoder.weight', 'cls.seq_relationship.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.seq_relationship.bias', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.dense.weight']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


NameError: name 'get_embeddings' is not defined

In [17]:
# Show the ids the tokenizer produces for the literal '[MASK]' string.
tokenizer('[MASK]')

{'input_ids': [101, 103, 102], 'token_type_ids': [0, 0, 0], 'attention_mask': [1, 1, 1]}

In [None]:
# Embed three probe words and compare them with scipy's cosine distance.
# NOTE(review): the original cell passed undefined `*_outputs` variables to
# get_embeddings; the words below are inferred from the variable names — confirm.
cat_emb = get_embeddings(model, tokenizer("cat", return_tensors='pt'))
hello_emb = get_embeddings(model, tokenizer("hello", return_tensors='pt'))
hi_emb = get_embeddings(model, tokenizer("hi", return_tensors='pt'))

print(cosine(hi_emb, hello_emb))
print(cosine(hi_emb, cat_emb))

In [None]:
# Scratch check: scipy's `cosine` applied to two one-element vectors.
cosine([1], [0.1])

# Building vocab for the corpus

In [None]:
# Read the corpus from the shared data folder (the constant is DATA_DIR;
# the lowercase `data_dir` used before is never defined).
df = pd.read_csv(DATA_DIR / 'dataset.csv')
df.head()

In [None]:
# Download the tokenizer and stop-word data only when NLTK cannot find them.
try:
    nltk.data.find('tokenizers/punkt.zip')
except LookupError:
    nltk.download('punkt')

try:
    nltk.data.find('corpora/stopwords.zip')
except LookupError:
    # Catch only the "resource missing" error so that unrelated failures
    # (and KeyboardInterrupt) are not silently turned into a download.
    nltk.download('stopwords')
# The lemmatizer resources are requested on every run.
nltk.download('wordnet')
nltk.download('omw-1.4')
stop_words = nltk.corpus.stopwords.words('english')

In [None]:
def vocab_preprocess(row, lemmatize=True):
    """Turn one description string into a list of cleaned word tokens.

    Steps: lowercase; if the text contains a hyphen, drop everything up to
    and including the first one and delete the remaining hyphens; extract
    alphabetic runs; remove entries of the module-level ``stop_words``;
    optionally lemmatize with NLTK's ``WordNetLemmatizer``.

    Args:
        row: The text to process.
        lemmatize: When true (default), lemmatize every remaining word.

    Returns:
        List of word strings in their order of appearance.
    """
    row = row.lower()
    # Text without any hyphen is kept whole; previously it was discarded
    # entirely because slicing off the first split piece left nothing.
    _, hyphen, tail = row.partition('-')
    if hyphen:
        row = tail.replace('-', '')
    # tokenize words
    words = re.findall('[a-zA-Z]+', row)
    # Remove stop words (set lookup instead of scanning the list per word)
    excluded = set(stop_words)
    words = [word for word in words if word not in excluded]
    if lemmatize:
        # Imported lazily so the non-lemmatizing path does not need NLTK.
        from nltk.stem import WordNetLemmatizer
        lemmatizer = WordNetLemmatizer()
        words = [lemmatizer.lemmatize(word) for word in words]

    return words

In [None]:
# Tokenize every description, collect the distinct words and persist them.
desc = df['description'].astype(str)
# Iterate over the values directly: `Series.iteritems` no longer exists in
# pandas 2.x and the index was never used here.
data = [vocab_preprocess(row) for row in desc]
vocab = set()
for words in data:
    # In-place update avoids rebuilding the whole set for every row.
    vocab.update(words)
with open("vocab.pkl", "wb") as f:
    pickle.dump(vocab, f)

# Build global rankings

In [None]:

# Seed topic words: each gets a similarity column and its own word set.
SEEDS = [
    'finance',
    'medicine',
    'sports',
    'technology',
]

In [None]:
# Reload the vocabulary pickled by this notebook and report its size.
with open('vocab.pkl', "rb") as vocab_file:
    vocab = pickle.load(vocab_file)
print(len(vocab))

In [None]:

# One embedding per seed word, in SEEDS order.
topic_embeddings = [
    get_embeddings(model, tokenizer(seed, return_tensors='pt'))
    for seed in SEEDS
]

# df_dict = { "_vocab": list(vocab) }
# for seed in SEEDS:
#     df_dict[seed] = [np.NaN for _ in range(len(vocab))]
# cos_scores = pd.DataFrame(df_dict)
# cos_scores = cos_scores.set_index(['_vocab'])


In [None]:
# Cosine distance from every vocabulary word to every seed-topic embedding.
res = []
for word in tqdm(vocab):
    token = tokenizer(word, return_tensors='pt')
    # With return_tensors='pt' the ids carry a leading batch axis, so the
    # token count is the second dimension; len() would be the batch size (1)
    # and the warning could never fire. More than 3 ids means the word was
    # split into several pieces between the two special tokens.
    if token['input_ids'].shape[1] > 3:
        print(f"WARNING: Word '{word}' is not in BERT's vocabulary")
    word_emb = get_embeddings(model, token)
    res.append([cosine(topic, word_emb) for topic in topic_embeddings])


In [None]:
# Convert the cosine distances to similarities (1 - distance).
arr = 1 - np.array(res)
# One column per seed topic, indexed by vocabulary word.
res_dict = {'_vocab': list(vocab)}
res_dict.update({topic: arr[:, col] for col, topic in enumerate(SEEDS)})
res_df = pd.DataFrame(res_dict).set_index(['_vocab'])
res_df.to_csv('global_cos_similarity.csv')

# Initialize word sets from $e$

In [None]:

# Size parameter for each seed's word set.
# NOTE(review): the selection loop stops once NUM_WORDS_PER_SET + 1 words were
# added — presumably the extra slot is for the seed word itself; confirm.
NUM_WORDS_PER_SET = 10



In [None]:
# For each seed, pick the most similar words that no earlier seed has claimed.
res_df = pd.read_csv('global_cos_similarity.csv')
res_df = res_df.set_index(['_vocab'])
word_set = {}
added_words = set()
for seed in SEEDS:
    word_set[seed] = []
    # Ascending sort reversed, i.e. highest similarity first.
    sorted_col = res_df[seed].sort_values()[::-1]
    # Walk the index directly: `Series.iteritems` is gone in pandas 2.x, the
    # values were unused, and binding `_` would shadow IPython's last output.
    for word in sorted_col.index:
        # NOTE(review): stops at NUM_WORDS_PER_SET + 1 words, as before —
        # presumably to leave room for the seed word itself; confirm.
        if len(word_set[seed]) == NUM_WORDS_PER_SET + 1:
            break
        if word not in added_words:
            word_set[seed].append(word)
            added_words.add(word)

pd.DataFrame(word_set)

In [None]:
# Display the similarity table (vocabulary words as index, one column per seed).
res_df

In [None]:
# Build local embeddings
## 1. Train local embeddings

In [3]:
# Load the dataset from the shared data folder and report its (rows, columns) shape.
dataset_path = DATA_DIR / 'dataset.csv'
dataset = pd.read_csv(dataset_path)
dataset.shape

(127600, 3)

In [5]:
import random

# Build next-sentence-prediction pairs: for each description with at least
# two sentences, pair a random sentence either with its successor (label 0)
# or with a randomly chosen description (label 1).
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
sentence_a = []
sentence_b = []
label = []
news = dataset['description']
num_docs = dataset.shape[0]
# `.items()` replaces `iteritems()`, which no longer exists in pandas 2.x.
for i, paragraph in news.items():
    sentences = [
        sentence for sentence in paragraph.split('.') if sentence != ''
    ]
    num_sentences = len(sentences)
    if num_sentences > 1:
        start = random.randint(0, num_sentences-2)
        # 50/50 whether is IsNextSentence or NotNextSentence
        if random.random() >= 0.5:
            # this is IsNextSentence
            sentence_a.append(sentences[start])
            sentence_b.append(sentences[start+1])
            label.append(0)
        else:
            index = random.randint(0, num_docs - 1)
            # Re-draw if the pick is the current row, otherwise the pair
            # would be labelled NotNextSentence while containing the very
            # text that follows sentence A.
            # assumes the default 0..n-1 index, so labels equal positions — TODO confirm
            while index == i and num_docs > 1:
                index = random.randint(0, num_docs - 1)
            # this is NotNextSentence
            sentence_a.append(sentences[start])
            sentence_b.append(news[index])
            label.append(1)

In [6]:
# Encode the sentence pairs, truncated/padded to a fixed length of 512.
inputs = tokenizer(
    sentence_a,
    sentence_b,
    max_length=512,
    truncation=True,
    padding='max_length',
    return_tensors='pt',
)

In [7]:
# Next-sentence labels as an (N, 1) integer column, one row per pair.
inputs['next_sentence_label'] = torch.tensor(label, dtype=torch.long).unsqueeze(1)
# Keep an independent copy of the token ids as the labels tensor.
inputs['labels'] = inputs['input_ids'].clone().detach()

In [8]:
# One uniform random draw per token position.
rand = torch.rand(inputs.input_ids.shape)
# Positions eligible for masking: roughly 15% of all tokens, excluding ids
# 101 and 102 (the ids that bracket an encoded sequence) and id 0
# (presumably padding — confirm against the tokenizer).
ids = inputs.input_ids
is_excluded = (ids == 101) | (ids == 102) | (ids == 0)
mask_arr = (rand < 0.15) & ~is_excluded

In [9]:
# Column positions chosen for masking, one list per row.
selection = [
    torch.flatten(row_mask.nonzero()).tolist()
    for row_mask in mask_arr
]
# Overwrite the chosen positions with id 103 (the id shown for '[MASK]').
for row, positions in enumerate(selection):
    inputs.input_ids[row, positions] = 103
# Persist the masked encodings for later runs.
with open('raw_inputs.pkl', 'wb') as f:
    pickle.dump(inputs, f)
inputs.keys()

dict_keys(['input_ids', 'token_type_ids', 'attention_mask', 'next_sentence_label', 'labels'])

In [None]:
class MyDataset(torch.utils.data.Dataset):
    """Map-style dataset over a mapping of equally long, indexable fields.

    ``encodings`` must contain an ``'input_ids'`` entry (its length defines
    the dataset size); every entry is indexed with the same integer.
    """

    def __init__(self, encodings):
        # Mapping of field name -> indexable container (tensor or list).
        self.encodings = encodings

    def __len__(self):
        """Number of examples, taken from the ``'input_ids'`` field."""
        return len(self.encodings['input_ids'])

    def __getitem__(self, i):
        """Return example ``i`` as a dict of independent tensors."""
        return {key: self._as_tensor(val[i]) for key, val in self.encodings.items()}

    @staticmethod
    def _as_tensor(item):
        """Copy ``item`` into a new tensor without the copy-construct warning."""
        if isinstance(item, torch.Tensor):
            # torch.tensor(existing_tensor) warns on every call; this is the
            # recommended equivalent copy.
            return item.clone().detach()
        return torch.tensor(item)

In [None]:

# Wrap the encodings for batching. Note that this rebinds `dataset`, which
# until here referred to the DataFrame read from dataset.csv.
dataset = MyDataset(inputs)
loader = torch.utils.data.DataLoader(dataset, shuffle=True, batch_size=16)
# Start from the pretrained checkpoint and place the model on `device`.
model = BertForPreTraining.from_pretrained('bert-base-uncased')
model.to(device)

In [None]:
# transformers' own AdamW is deprecated and missing from newer releases;
# fall back to the PyTorch implementation so this cell keeps importing.
# Note the defaults differ: torch uses eps=1e-8 and weight_decay=0.01,
# whereas the transformers class used eps=1e-6 and weight_decay=0.0.
try:
    from transformers import AdamW
except ImportError:
    from torch.optim import AdamW

# Switch the model to training mode.
model.train()

In [None]:

# initialize optimizer
# torch.optim.AdamW replaces the deprecated transformers.AdamW; eps and
# weight_decay are set explicitly to the values that class used by default.
optim = torch.optim.AdamW(model.parameters(), lr=5e-5, eps=1e-6, weight_decay=0.0)

epochs = 2

for epoch in range(epochs):
    # setup loop with TQDM and dataloader; `tqdm` is the tqdm.auto import
    # from the first cell, and the description is set once per epoch.
    loop = tqdm(loader, leave=True, desc=f'Epoch {epoch}')
    for batch in loop:
        # clear gradients left over from the previous step
        optim.zero_grad()
        # pull all tensor batches required for training
        input_ids = batch['input_ids'].to(device)
        token_type_ids = batch['token_type_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        next_sentence_label = batch['next_sentence_label'].to(device)
        labels = batch['labels'].to(device)
        # forward pass with both label tensors supplied
        outputs = model(input_ids, attention_mask=attention_mask,
                        token_type_ids=token_type_ids,
                        next_sentence_label=next_sentence_label,
                        labels=labels)
        # extract loss
        loss = outputs.loss
        # backpropagate to get gradients for every trainable parameter
        loss.backward()
        # update parameters
        optim.step()
        # show the current loss on the progress bar
        loop.set_postfix(loss=loss.item())

#

In [15]:
# Reload a pickled encoding and report the shape of its `input_ids`.
# NOTE(review): no cell visible in this notebook writes 'tokens.pkl' (the
# masking cell saves 'raw_inputs.pkl') — confirm which run produced this file.
# NOTE(review): this rebinds `tokens`, which an earlier cell uses for the
# per-word tokenizer outputs.
with open('tokens.pkl', 'rb') as f:
    tokens = pickle.load(f)
tokens['input_ids'].shape

torch.Size([54764, 512])