# Getting Embeddings

In [45]:
import nltk
nltk.download('stopwords')

[nltk_data] Error loading stopwords: <urlopen error [Errno 111]
[nltk_data]     Connection refused>


False

In [1]:
from transformers import BertModel, BertTokenizer,BertForPreTraining, BertConfig
from tqdm.auto import tqdm
import torch
import nltk
from pathlib import Path
import re
from scipy.spatial.distance import  cosine
import pickle
import numpy as np
import pandas as pd
from joblib import Parallel, delayed
DATA_DIR = Path('data')
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
SEEDS = ['finance', 'medicine', 'sports', 'technology']
NUM_WORDS_PER_SET = 10

tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
model = BertModel.from_pretrained("bert-base-uncased")


def get_embeddings(model, tokens, embedding_size=768):
    with torch.no_grad():
        output = model(**tokens)
        embedding = output.last_hidden_state[0][1]
        return torch.reshape(embedding, (embedding_size, ))


Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertModel: ['cls.predictions.transform.LayerNorm.weight', 'cls.seq_relationship.weight', 'cls.predictions.decoder.weight', 'cls.predictions.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.bias', 'cls.seq_relationship.bias', 'cls.predictions.transform.dense.weight']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [2]:
def get_embeddings_batch(model, tokens, embedding_size=768, batch_size=4):
    with torch.no_grad():
        output = model(**tokens)
        embedding = output.last_hidden_state[:, 1, :]
        return embedding


def cos_distance_batch(topic, words):
    return np.inner(
        topic, words) / (np.linalg.norm(topic) * np.linalg.norm(words, axis=1))


def job(vocab, topic, tokenizer, model, batch_size=4):
    res_col = np.zeros((len(vocab), ))
    vocab = list(vocab)
    loop = tqdm(range(0, len(vocab), batch_size))
    loop.set_description(f"topic: {topic}")
    topic_token = tokenizer(topic,
                            return_tensors='pt',
                            padding=True,
                            max_length=10,
                            truncation=True)
    topic_emb = get_embeddings(model, topic_token)
    for batch_index in loop:
        lo = batch_index
        hi = min(batch_index + batch_size, len(vocab))
        batch = vocab[batch_index:batch_index + batch_size]
        tokens = tokenizer(batch,
                           return_tensors='pt',
                           padding='max_length',
                           max_length=10,
                           truncation=True)
        # if len(token['input_ids']) > 3:
        #     print(f"WARNING: Word '{word}' is not in BERT's vocabulary")
        word_embs = get_embeddings_batch(model, tokens)
        res_col[lo:hi] = cos_distance_batch(topic_emb, word_embs)
        # res_col.append(cosine(topic_emb, word_emb))
    return res_col

In [None]:
token = tokenizer("medicine", return_tensors='pt', truncation=True)
print(token.input_ids.shape)
out = model(**token)
out.last_hidden_state.shape

In [None]:
tokenizer('[MASK]')

In [None]:
cat_emb = get_embeddings(cat_outputs)
hello_emb = get_embeddings(hello_outputs)
hi_emb = get_embeddings(hi_outputs)

print(cosine(hi_emb, hello_emb))
print(cosine(hi_emb, cat_emb))

In [None]:
cosine([1], [0.1])

# Building vocab for the corpus

In [None]:
df = pd.read_csv(Path('data') / 'content_cleaned.txt')
df.head()

In [13]:

stop_words = nltk.corpus.stopwords.words('english')

In [46]:
def vocab_preprocess(row, lemmatize=True):
    from nltk.stem import WordNetLemmatizer
    row = row.lower().strip()
    words = row.split(' ')
    # Remove stop words
    words = [word for word in words if word not in stop_words]
    res = {}
    for word in words:
        count = res.get(word, 0)
        count += 1
        res[word] = count
    if (lemmatize):
        words = [lemmatizer.lemmatize(word) for word in words]

    return res

In [47]:
MIN_COUNT=3
f = open('data/content_cleaned.txt')
lines = f.readlines()
f.close()
data = []
vocab = {}
for i, row in tqdm(enumerate(lines)):
    freq = vocab_preprocess(row, False)
    for _word, _count in freq.items():
        count = vocab.get(_word, 0) + _count
        vocab[_word] = count
uncommon_words = []
for word, count in tqdm(vocab.items()):
    if count < MIN_COUNT:
       uncommon_words.append(word)
print(len(uncommon_words))
print(len(vocab.keys()))
for word in uncommon_words:
    del vocab[word]
print(len(vocab.keys()))
with open(DATA_DIR / "vocab" / "global_vocab_no_lemmatize.pkl", "wb") as f:
    pickle.dump(vocab, f)

0it [00:00, ?it/s]

  0%|          | 0/48223 [00:00<?, ?it/s]

22346
48223
25877


# Build global rankings

In [4]:
vocab = None
with open(DATA_DIR / "vocab" / "global_vocab_no_lemmatize.pkl", "rb") as f:
    vocab = pickle.load(f)
print(len(vocab))

25877


In [5]:

topic_embeddings = []
cols = []
for seed in SEEDS:
    cols.append(job(vocab, seed, tokenizer, model, batch_size=128))


  0%|          | 0/203 [00:00<?, ?it/s]

  0%|          | 0/203 [00:00<?, ?it/s]

  0%|          | 0/203 [00:00<?, ?it/s]

  0%|          | 0/203 [00:00<?, ?it/s]

In [6]:
arr = np.array(cols)
res_dict = {
    '_vocab': list(vocab),
}
for i, topic in enumerate(SEEDS):
    print(arr[i].shape)
    res_dict[topic] = arr[i]
res_df = pd.DataFrame(res_dict)
res_df = res_df.set_index(['_vocab'])
res_df.to_csv('./results/global_cos_similarity.csv')
res_df.sort_values('finance', ascending=False).head(10)

(25877,)
(25877,)
(25877,)
(25877,)


Unnamed: 0_level_0,finance,medicine,sports,technology
_vocab,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
finance,1.0,0.72356,0.561143,0.68801
accounting,0.852869,0.753559,0.54479,0.710905
logistics,0.841821,0.743688,0.547153,0.750695
marketing,0.836663,0.715334,0.589341,0.735878
banking,0.831688,0.732591,0.513998,0.692174
securities,0.802517,0.647551,0.520464,0.690203
procurement,0.786893,0.69172,0.476832,0.664342
financial,0.777519,0.647263,0.509762,0.705595
finances,0.777289,0.650105,0.544972,0.582143
debt,0.773261,0.65907,0.459897,0.63541


In [7]:
df = pd.read_csv('./results/global_cos_similarity.csv', index_col='_vocab')
for seed in SEEDS:
    line = df.sort_values(seed, ascending=False).head(4).index[1::]
    print(" ".join(line))

accounting logistics marketing
dentistry pharmacy pathology
sport baseball basketball
technologies engineering robotics


In [11]:
res_df.sort_values(by='technology', ascending=False).head(10)

Unnamed: 0_level_0,finance,medicine,sports,technology
_vocab,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
technology,0.688011,0.640761,0.592079,1.0
technologies,0.685335,0.582082,0.502387,0.819644
engineering,0.773229,0.726307,0.627797,0.805677
robotics,0.702049,0.674241,0.586133,0.802258
telecommunications,0.703807,0.68484,0.60068,0.784153
systems,0.673512,0.610776,0.539074,0.770397
telecommunication,0.686691,0.670584,0.500605,0.756258
communication,0.75589,0.723222,0.544936,0.756241
logistics,0.841821,0.743688,0.547153,0.750695
journalism,0.641313,0.656924,0.595752,0.74847


# Initialize word sets from $e$

In [2]:
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
model = BertModel.from_pretrained("bert-base-uncased")
res_df = pd.read_csv('global_cos_similarity.csv')
res_df = res_df.set_index(['_vocab'])
word_set = {}
added_words = set()
for seed in SEEDS:
    word_set[seed] = []
    col = res_df[seed]
    sorted_col = col.sort_values()[::-1]
    i = 0
    for word, _ in sorted_col.iteritems():
        if i == NUM_WORDS_PER_SET + 1:
            break
        if word not in added_words:
            word_set[seed].append(word)
            added_words.add(word)
            i += 1

word_set_df = pd.DataFrame(word_set)
word_set_df.head()

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertModel: ['cls.seq_relationship.bias', 'cls.predictions.decoder.weight', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.seq_relationship.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [8]:
for topic, words in word_set.items():
    print(f"{' '.join(words[1:4])}")

accounting logistics marketing
pharmacy pathology medical
baseball basketball athletics
robotics telecommunication communication


# Local Knowlege using pretrained BERT

In [None]:
# Prepare datasets
from transformers import BertTokenizer, DataCollatorForLanguageModeling
from datasets import load_dataset, concatenate_datasets
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
data_collator = DataCollatorForLanguageModeling(
    tokenizer=tokenizer,
    mlm=True,
    mlm_probability=0.15,
)
dataset = load_dataset('ag_news')

In [None]:
concatenate_datasets([dataset['train'], dataset['test']], axis=0)

In [None]:
%%bash

python3 scripts/train.py \
    -o ./models \
    -t ./data/tokens/tokens-pretrained-30522.pkl \
    -n bert-pretrined-30522 

# Compute Local Cosine similarity

In [None]:
model = BertModel.from_pretrained('models/bert-pretrained-pretrained')
model.eval()
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

In [None]:
a = np.ones((768, ))
b = np.random.rand(4, 768)
np_cos = np.inner(a, b) / (np.linalg.norm(a) * np.linalg.norm(b, axis=1))
scp_cos = [1-cosine(a, b[i]) for i in range(4)]
print(np_cos.shape)
print(scp_cos)

In [4]:
with open('vocab.pkl', 'rb') as f:
    vocab = pickle.load(f)

In [63]:
def get_embeddings_batch(model, tokens, embedding_size=768, batch_size=4):
    with torch.no_grad():
        output = model(**tokens)
        embedding = output.last_hidden_state[:, 1, :]
        return embedding


def cos_distance_batch(topic, words):
    return np.inner(topic, words) / (np.linalg.norm(topic) * np.linalg.norm(words, axis=1))


def job(vocab, topic, tokenizer, model, batch_size=4):
    res_col = np.zeros((len(vocab), ))
    vocab = list(vocab)
    loop = tqdm(range(0, len(vocab), batch_size))
    loop.set_description(f"topic: {topic}")
    topic_token = tokenizer(topic, return_tensors='pt', padding=True, max_length=10, truncation=True)
    topic_emb = get_embeddings(model, topic_token)
    for batch_index in loop:
        lo = batch_index
        hi = min(batch_index + batch_size, len(vocab))
        batch = vocab[batch_index:batch_index + batch_size]
        tokens = tokenizer(batch, return_tensors='pt', padding='max_length', max_length=10, truncation=True)
        # if len(token['input_ids']) > 3:
        #     print(f"WARNING: Word '{word}' is not in BERT's vocabulary")
        word_embs = get_embeddings_batch(model, tokens)
        res_col[lo:hi] = cos_distance_batch(topic_emb, word_embs)
        # res_col.append(cosine(topic_emb, word_emb))
    return res_col

In [64]:
res = []
for topic in SEEDS:
    res_col = job(vocab, topic, tokenizer, model, batch_size=256)
    res.append(res_col)

  0%|          | 0/94 [00:00<?, ?it/s]

  0%|          | 0/94 [00:00<?, ?it/s]

  0%|          | 0/94 [00:00<?, ?it/s]

  0%|          | 0/94 [00:00<?, ?it/s]

In [65]:

arr = np.array(res)
arr = arr.T
res_dict = {    
    '_vocab': list(vocab),
}
for i, topic in enumerate(SEEDS):
    res_dict[topic] = arr[:, i]
res_df = pd.DataFrame(res_dict)
res_df = res_df.set_index(['_vocab'])
res_df.to_csv('local_embeddings_bert-pretrained-pretrained.csv')

In [66]:
res_df = pd.read_csv('./local_embeddings_bert-pretrained-pretrained.csv', index_col='_vocab')
res_df.sort_values('finance', ascending=False).head(10)

Unnamed: 0_level_0,finance,medicine,sports,technology
_vocab,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
finance,1.0,0.72356,0.561143,0.68801
accounting,0.852869,0.753559,0.54479,0.710905
logistics,0.841821,0.743688,0.547153,0.750695
marketing,0.836663,0.715334,0.589341,0.735878
banking,0.831688,0.732591,0.513998,0.692174
securities,0.802517,0.647551,0.520464,0.690203
procurement,0.786893,0.69172,0.476832,0.664342
financial,0.777519,0.647263,0.509762,0.705595
finances,0.777289,0.650105,0.544972,0.582143
debt,0.773261,0.65907,0.459897,0.63541


In [32]:
# BERTopic

In [None]:
from bertopic import BERTopic

seeds = np.array([SEEDS]).T.tolist()
topic_model = BERTopic(seed_topic_list=seeds)
docs = pd.read_csv(DATA_DIR / 'dataset.csv')['description']
topic_model.fit_transform(docs)

In [44]:
similar_topics, similarity = topic_model.find_topics("finance", top_n=5)
for i, topic in enumerate(similar_topics):
    print(topic_model.get_topic(topic)[0][0], similarity[i])

monetary 0.791343401281863
finance 0.7557908228754602
funds 0.7344708126048249
freddie 0.6878446711889306
securities 0.6788982583268262


# CatE
**kwown issue**: Seg Fault when seed words > 3

In [5]:
import pandas as pd

local_emb_vocab_df = pd.read_csv('./results/CatE/emb_seeds_w.txt', sep=' ', skiprows=[0], index_col=0, header=None)
local_emb_vocab_df.dropna(axis=1, inplace=True)

local_emb_topic_df = pd.read_csv('./results/CatE/emb_seeds_t.txt', sep=' ', skiprows=[0], index_col=0, header=None)
local_emb_topic_df.dropna(axis=1, inplace=True)


In [6]:
def compute_cate_cos_similarity(vocab, vocab_embs, topic_emb, topic, batch_size=32, show_progress=True):
    res_col = np.zeros((len(vocab), ))
    loop = range(0, len(vocab), batch_size)
    if show_progress:
        loop = tqdm(loop)
        loop.set_description(f"topic: {topic}")
    for batch_index in loop:
        lo = batch_index
        hi = min(batch_index + batch_size, len(vocab))

        batch = vocab[lo:hi]
        batch_embs = vocab_embs.loc[batch, :]
        res_col[lo:hi] = cos_distance_batch(topic_emb, batch_embs)
        # res_col.append(cosine(topic_emb, word_emb))
    return res_col

In [7]:
vocab_f = open('./data/vocab/global_vocab_no_lemmatize.pkl', 'rb')
vocab = pickle.load(vocab_f)
vocab_f.close()
not_in_vocab_words = []
for word in vocab.keys():
    if word not in local_emb_vocab_df.index:
        not_in_vocab_words.append(word)
print(len(not_in_vocab_words))
for word in not_in_vocab_words:
    del vocab[word]
for word in local_emb_vocab_df.index:
    if word not in vocab:
        local_emb_vocab_df =  local_emb_vocab_df.drop(word, axis=0)

1


In [8]:
res = {}
res['_vocab'] = vocab.keys()
for topic in local_emb_topic_df.index:
    topic_emb = local_emb_topic_df.loc[topic, :]
    res_col = compute_cate_cos_similarity(list(vocab.keys()), local_emb_vocab_df, topic_emb, topic, show_progress=False)
    res[topic] = res_col


In [9]:
res_df = pd.DataFrame(res)
res_df = res_df.set_index(['_vocab'])
res_df.to_csv('./results/cate_local_embeddings.csv')

# Compute Ensemble ranking [Proposed]
Which does not work as expected -> CatE has different embeddings for the same word in topic and vocab

In [4]:
import pandas as pd

local_emb_vocab_df = pd.read_csv('./results/CatE/emb_seeds_w.txt', sep=' ', skiprows=[0], index_col=0, header=None)
local_emb_vocab_df.dropna(axis=1, inplace=True)

local_emb_topic_df = pd.read_csv('./results/CatE/emb_seeds_t.txt', sep=' ', skiprows=[0], index_col=0, header=None)
local_emb_topic_df.dropna(axis=1, inplace=True)

In [43]:
def compute_vocab_cos_similarity(emb):
    out_shape = (emb.shape[0], emb.shape[0])
    df = pd.DataFrame(np.zeros((out_shape)), index=emb.index, columns=emb.index)
    for word in tqdm(emb.index):
        topic_emb = emb.loc[word, :]
        res_row = cos_distance_batch(topic_emb, emb)
        df.loc[word, :] = res_row
    return df
res = compute_vocab_cos_similarity(local_emb_vocab_df)

  0%|          | 0/25876 [00:00<?, ?it/s]

In [44]:
res.loc['accounting', 'unumprovident']

0.5662511389486806

In [50]:
cosine(local_emb_vocab_df.loc['accounting', :], local_emb_topic_df.loc['accounting', :])

0.33416083167506283

# Compute emsemble ranking

In [35]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import MinMaxScaler
global_cos_df = pd.read_csv('results/global_cos_similarity.csv', index_col=0)
local_cos_df = pd.read_csv('./results/cate_local_embeddings.csv', index_col=0)
# min_max_scalar


In [61]:
def ensemble_ranking(score_g, score_l, rho):
    exponent = 1 / rho
    base = 0.5 * np.power(1/score_g, rho) + 0.5 * np.power(1/score_l, rho) 
    return 1/np.power(base, exponent)

def scale_data(data):
    scaler = MinMaxScaler()
    return scaler.fit_transform(data)

def compute_vocab_ensemble_rankings(score_l_df, score_g_df, vocab, rho=1): 
    raw = np.zeros((len(vocab), score_l_df.shape[1])).astype(np.double)
    res = pd.DataFrame(raw, index=vocab, columns=score_g_df.columns)
    for topic_idx in range(score_l_df.shape[1]):
        score_l = score_l_df.loc[vocab, score_l_df.columns[topic_idx]]
        score_g = score_g_df.loc[vocab, score_g_df.columns[topic_idx]]
        res.iloc[:, topic_idx] = ensemble_ranking(score_l, score_g, rho)
    return res
s_global_cos_df = scale_data(global_cos_df)
local_cos_df.loc[:, :] = scale_data(local_cos_df)
global_cos_df.loc[:, :] = scale_data(global_cos_df)
res = compute_vocab_ensemble_rankings(local_cos_df, global_cos_df, local_cos_df.index)
res['finance'].sort_values(ascending=False)

_vocab
accounting    0.880459
marketing     0.843212
logistics     0.842892
financial     0.799560
securities    0.789042
                ...   
oxley         0.028208
forgotten     0.027069
morten        0.016974
winky         0.000000
horns         0.000000
Name: finance, Length: 25876, dtype: float64

In [32]:
print(global_cos_df.loc['sirikit', 'finance']), print(local_cos_df.loc['sirikit', 'accounting'])
ensemble_ranking(global_cos_df.loc['demonstrated', 'finance'],local_cos_df.loc['demonstrated', 'accounting'], 1)

0.2320390492677688
-0.0146787450410294


0.48877448630042725