# Getting Embeddings

In [1]:
from transformers import BertModel, BertTokenizer,BertForPreTraining, BertConfig
from tqdm.auto import tqdm
import torch
import nltk
from pathlib import Path
import re
from scipy.spatial.distance import  cosine
import pickle
import numpy as np
import pandas as pd
from joblib import Parallel, delayed
DATA_DIR = Path('data')
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
SEEDS = ['finance', 'medicine', 'sports', 'technology']
NUM_WORDS_PER_SET = 10

tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
model = BertModel.from_pretrained("bert-base-uncased")


def get_embeddings(model, tokens, embedding_size=768):
    with torch.no_grad():
        output = model(**tokens)
        embedding = output.last_hidden_state[0][1]
        return torch.reshape(embedding, (embedding_size, ))


Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertModel: ['cls.predictions.transform.dense.bias', 'cls.seq_relationship.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.seq_relationship.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.decoder.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [None]:

tokens = {
    "medicine": tokenizer("medicine", return_tensors='pt', truncation=True),
    "cat": tokenizer("cat", return_tensors='pt', truncation=True),
    "kitty": tokenizer('kitty', return_tensors='pt', truncation=True),
    "feline": tokenizer('feline', return_tensors='pt', truncation=True),
    "doctor": tokenizer("doctor", return_tensors='pt', truncation=True),
    "medical": tokenizer("medical", return_tensors='pt', truncation=True),
}
embeddings = {}
for key, val in tokens.items():
    embeddings[key] = get_embeddings(model, val)

print("cat-kitty: " + f"{cosine(embeddings['cat'], embeddings['kitty'])}")
print('cat-doctor: ' + f"{cosine(embeddings['cat'], embeddings['doctor'])}")
print('medicine-cat: ' +
      f"{cosine(embeddings['medicine'], embeddings['cat'])}")
print('medicine-medical: ' +
      f"{cosine(embeddings['medicine'], embeddings['medical'])}")


In [None]:
tokenizer('[MASK]')

In [None]:
cat_emb = get_embeddings(cat_outputs)
hello_emb = get_embeddings(hello_outputs)
hi_emb = get_embeddings(hi_outputs)

print(cosine(hi_emb, hello_emb))
print(cosine(hi_emb, cat_emb))

In [None]:
cosine([1], [0.1])

# Building vocab for the corpus

In [7]:
df = pd.read_csv(Path('data') / 'dataset.csv')
df.head()

Unnamed: 0,category,title,description
0,3,Wall St. Bears Claw Back Into the Black (Reuters),"Reuters - Short-sellers, Wall Street's dwindli..."
1,3,Carlyle Looks Toward Commercial Aerospace (Reu...,Reuters - Private investment firm Carlyle Grou...
2,3,Oil and Economy Cloud Stocks' Outlook (Reuters),Reuters - Soaring crude prices plus worries\ab...
3,3,Iraq Halts Oil Exports from Main Southern Pipe...,Reuters - Authorities have halted oil export\f...
4,3,"Oil prices soar to all-time record, posing new...","AFP - Tearaway world oil prices, toppling reco..."


In [8]:
try:
    nltk.data.find('tokenizers/punkt.zip')
except LookupError:
    nltk.download('punkt')

try:
    nltk.data.find('corpora/stopwords.zip')
except:
    nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('omw-1.4')
stop_words = nltk.corpus.stopwords.words('english')

[nltk_data] Downloading package punkt to /home/zhang/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to /home/zhang/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to /home/zhang/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


In [9]:
def vocab_preprocess(row, lemmatize=True):
    from nltk.stem import WordNetLemmatizer
    row = row.lower()
    row = row.split('-')[1::]
    row = ''.join(row)
    lemmatizer = WordNetLemmatizer()
    # tokenize words
    words = re.findall(re.compile('[a-zA-Z]+'), row)
    # Remove stop words
    words = [word for word in words if word not in stop_words]
    if (lemmatize):
        words = [lemmatizer.lemmatize(word) for word in words]

    return words

In [None]:
desc = df['description'].astype(str)
data = []
for i, row in desc.iteritems():
    data.append(vocab_preprocess(row))
vocab = set()
for words in data:
    vocab = vocab.union(set(words))
with open(DATA_DIR / "vocab" / "global_vocab.pkl", "wb") as f:
    pickle.dump(vocab, f)

# Build global rankings

In [2]:
vocab = None
with open(DATA_DIR / "vocab" / "global_vocab.pkl", "rb") as f:
    vocab = pickle.load(f)
print(len(vocab))

45394


In [3]:

topic_embeddings = []
for seed in SEEDS:
    topic_embeddings.append(get_embeddings(model, tokenizer(seed, return_tensors='pt')))



In [4]:
res = []
for word in tqdm(vocab):
    token = tokenizer(word, return_tensors='pt')
    if len(token['input_ids']) > 3:
        print(f"WARNING: Word '{word}' is not in BERT's vocabulary")
    word_emb = get_embeddings(model, token)
    res_row = []
    for topic in topic_embeddings:
        res_row.append(cosine(topic, word_emb))
    res.append(res_row)


  0%|          | 0/45394 [00:00<?, ?it/s]

In [6]:
arr = 1 - np.array(res)
res_dict = {
    '_vocab': list(vocab),
}
for i, topic in enumerate(SEEDS):
    res_dict[topic] = arr[:, i]
res_df = pd.DataFrame(res_dict)
res_df = res_df.set_index(['_vocab'])
res_df.to_csv(DATA_DIR / 'global_cos_similarity.csv')

In [8]:
res_df.sort_values(by='finance', ascending=False).head(10)

Unnamed: 0_level_0,finance,medicine,sports,technology
_vocab,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
finance,1.0,0.72356,0.561142,0.68801
accounting,0.85287,0.753559,0.54479,0.710905
logistics,0.841821,0.743687,0.547152,0.750695
marketing,0.836664,0.715334,0.589341,0.735878
banking,0.831688,0.732591,0.513997,0.692174
procurement,0.786893,0.691719,0.476832,0.664342
financial,0.777519,0.647263,0.509762,0.705595
debt,0.773262,0.65907,0.459897,0.63541
engineering,0.773229,0.726308,0.627797,0.805677
advertising,0.769661,0.681448,0.512919,0.688041


# Initialize word sets from $e$

In [10]:
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
model = BertModel.from_pretrained("bert-base-uncased")
res_df = pd.read_csv('global_cos_similarity.csv')
res_df = res_df.set_index(['_vocab'])
word_set = {}
added_words = set()
for seed in SEEDS:
    word_set[seed] = []
    col = res_df[seed]
    sorted_col = col.sort_values()[::-1]
    i = 0
    for word, _ in sorted_col.iteritems():
        if i == NUM_WORDS_PER_SET + 1:
            break
        if word not in added_words:
            word_set[seed].append(word)
            added_words.add(word)
            i += 1

pd.DataFrame(word_set)

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertModel: ['cls.seq_relationship.weight', 'cls.seq_relationship.bias', 'cls.predictions.decoder.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.bias', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.dense.weight']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


Unnamed: 0,finance,medicine,sports,technology
0,finance,medicine,sport,technology
1,accounting,pharmacy,baseball,robotics
2,logistics,pathology,basketball,telecommunication
3,marketing,medical,athletics,communication
4,banking,biology,gymnastics,journalism
5,procurement,surgery,volleyball,philosophy
6,financial,health,football,industrial
7,debt,astronomy,swimming,drama
8,engineering,industry,hockey,integration
9,advertising,nutrition,education,sociology


In [None]:
res_df

# Local Knowlege using pretrained BERT

In [6]:
# Prepare datasets
from transformers import BertTokenizer, DataCollatorForLanguageModeling
from datasets import load_dataset, concatenate_datasets

dataset = load_dataset('ag_news')

Using the latest cached version of the module from /home/alexzhang/.cache/huggingface/modules/datasets_modules/datasets/ag_news/bc2bcb40336ace1a0374767fc29bb0296cdaf8a6da7298436239c54d79180548 (last modified on Tue May 31 15:48:55 2022) since it couldn't be found locally at ag_news., or remotely on the Hugging Face Hub.
Using custom data configuration default
Reusing dataset ag_news (/home/alexzhang/.cache/huggingface/datasets/ag_news/default/0.0.0/bc2bcb40336ace1a0374767fc29bb0296cdaf8a6da7298436239c54d79180548)


  0%|          | 0/2 [00:00<?, ?it/s]

In [5]:
def tokenize_function(examples):
    return tokenizer(examples["text"], padding="max_length", truncation=True)


In [7]:
t_dataset = dataset.map(tokenize_function, batched=True, num_proc=12)

             

#5:   0%|          | 0/10 [00:00<?, ?ba/s]

 

#10:   0%|          | 0/10 [00:00<?, ?ba/s]

     

#1:   0%|          | 0/10 [00:00<?, ?ba/s]

#2:   0%|          | 0/10 [00:00<?, ?ba/s]

#3:   0%|          | 0/10 [00:00<?, ?ba/s]

 

#4:   0%|          | 0/10 [00:00<?, ?ba/s]

#0:   0%|          | 0/10 [00:00<?, ?ba/s]

  

#6:   0%|          | 0/10 [00:00<?, ?ba/s]

#7:   0%|          | 0/10 [00:00<?, ?ba/s]

  

#8:   0%|          | 0/10 [00:00<?, ?ba/s]

#11:   0%|          | 0/10 [00:00<?, ?ba/s]

#9:   0%|          | 0/10 [00:00<?, ?ba/s]

                 

#0:   0%|          | 0/1 [00:00<?, ?ba/s]

#1:   0%|          | 0/1 [00:00<?, ?ba/s]

#2:   0%|          | 0/1 [00:00<?, ?ba/s]

#3:   0%|          | 0/1 [00:00<?, ?ba/s]

 

#4:   0%|          | 0/1 [00:00<?, ?ba/s]

#8:   0%|          | 0/1 [00:00<?, ?ba/s]

    

#5:   0%|          | 0/1 [00:00<?, ?ba/s]

#10:   0%|          | 0/1 [00:00<?, ?ba/s]

#6:   0%|          | 0/1 [00:00<?, ?ba/s]

#9:   0%|          | 0/1 [00:00<?, ?ba/s]

  

#7:   0%|          | 0/1 [00:00<?, ?ba/s]

#11:   0%|          | 0/1 [00:00<?, ?ba/s]

# Load local knowledge BERT

In [8]:
t_dataset

DatasetDict({
    train: Dataset({
        features: ['text', 'label', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 120000
    })
    test: Dataset({
        features: ['text', 'label', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 7600
    })
})

In [4]:
model = BertModel.from_pretrained('models/bert-pretrained-pretrained')
model.eval()
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

Some weights of the model checkpoint at models/bert-pretrained-pretrained were not used when initializing BertModel: ['cls.predictions.transform.dense.weight', 'cls.predictions.decoder.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.seq_relationship.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.bias', 'cls.predictions.decoder.bias', 'cls.seq_relationship.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [5]:
with open('vocab.pkl', 'rb') as f:
    vocab = pickle.load(f)

In [6]:
def job(vocab, topic, tokenizer, model):
    res_col = []
    loop = tqdm(vocab)
    loop.set_description(f"topic: {topic}")
    topic_token = tokenizer(topic, return_tensors='pt')
    topic_emb = get_embeddings(model, topic_token)
    for word in loop:
        token = tokenizer(word, return_tensors='pt')
        if len(token['input_ids']) > 3:
            print(f"WARNING: Word '{word}' is not in BERT's vocabulary")
        word_emb = get_embeddings(model, token)
        res_col.append(cosine(topic_emb, word_emb))
    return res_col

In [7]:
res = []
for topic in SEEDS:
    res_col = job(vocab, topic, tokenizer, model)
    res.append(res_col)

  0%|          | 0/45394 [00:00<?, ?it/s]

  0%|          | 0/45394 [00:00<?, ?it/s]

KeyboardInterrupt: 

In [9]:

arr = np.array(res)
arr = arr.T
res_dict = {
    '_vocab': list(vocab),
}
for i, topic in enumerate(SEEDS):
    res_dict[topic] = arr[:, i]
res_df = pd.DataFrame(res_dict)
res_df = res_df.set_index(['_vocab'])
res_df.to_csv('local_embeddings_bert.csv')

Unnamed: 0_level_0,finance,medicine,sports,technology
_vocab,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
medicine,0.323753,0.0,0.435356,0.22605
psychiatry,0.278695,0.135984,0.464712,0.258565
purification,0.360454,0.143276,0.460375,0.24468
wine,0.298581,0.146669,0.465945,0.242062
spear,0.353989,0.148336,0.470348,0.342442
cure,0.362462,0.150394,0.495592,0.292631
anthropology,0.31639,0.15104,0.422313,0.272204
seal,0.377228,0.151204,0.478585,0.286898
genetics,0.292066,0.151213,0.408178,0.30439
rocket,0.329482,0.151498,0.43169,0.244864


In [None]:
# BERTopic

In [None]:
from bertopic import BERTopic

seeds = np.array([SEEDS]).T.tolist()
topic_model = BERTopic(seed_topic_list=seeds)
docs = pd.read_csv(DATA_DIR / 'dataset.csv')['description']
topic_model.fit_transform(docs)

In [18]:
similar_topics, similarity = topic_model.find_topics("finance", top_n=5)
topic_model.get_topic(similar_topics[1])

[('finance', 0.030844395493737967),
 ('imf', 0.02548989436190981),
 ('monetary', 0.021516056886781838),
 ('ministers', 0.017461659355147346),
 ('budget', 0.015147534599346599),
 ('imfworld', 0.01371422148963831),
 ('crisisracked', 0.01371422148963831),
 ('justpassed', 0.01371422148963831),
 ('asteppedup', 0.01371422148963831),
 ('gatheredunder', 0.01371422148963831)]

In [None]:
seeds = np.array([SEEDS]).T
seeds.tolist()