# Getting Embeddings

In [1]:
from transformers import BertModel, BertTokenizer
from scipy.spatial.distance import cosine
import torch

import pickle
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
model = BertModel.from_pretrained("bert-base-uncased")

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertModel: ['cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.bias', 'cls.seq_relationship.bias', 'cls.predictions.transform.dense.bias', 'cls.predictions.decoder.weight', 'cls.seq_relationship.weight', 'cls.predictions.transform.LayerNorm.weight']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [2]:

def get_embeddings(model, tokens):
    with torch.no_grad():
        output = model(**tokens)
        embedding = output.last_hidden_state[0][1]
        return torch.reshape(embedding, (768,))


In [116]:
tokens = {
    "medicine": tokenizer("medicine", return_tensors='pt', truncation=True),
    "cat": tokenizer("cat", return_tensors='pt', truncation=True),
    "kitty": tokenizer('kitty', return_tensors='pt', truncation=True),
    "feline": tokenizer('feline', return_tensors='pt', truncation=True),
    "doctor": tokenizer("doctor", return_tensors='pt', truncation=True),
    "medical": tokenizer("medical", return_tensors='pt', truncation=True),
}
embeddings = {}
for key, val in tokens.items():
    embeddings[key] = get_embeddings(model, val)

print("cat-kitty: " + f"{cosine(embeddings['cat'], embeddings['kitty'])}")
print('cat-doctor: ' + f"{cosine(embeddings['cat'], embeddings['doctor'])}")
print('medicine-cat: ' +
      f"{cosine(embeddings['medicine'], embeddings['cat'])}")
print('medicine-medical: ' +
      f"{cosine(embeddings['medicine'], embeddings['medical'])}")


SyntaxError: invalid syntax (890100369.py, line 9)

In [86]:
cat_emb = get_embeddings(cat_outputs)
hello_emb = get_embeddings(hello_outputs)
hi_emb = get_embeddings(hi_outputs)

print(cosine(hi_emb, hello_emb))
print(cosine(hi_emb, cat_emb))

NameError: name 'cat_outputs' is not defined

In [77]:
cosine([1], [0.1])

0

# Building vocab for the corpus

In [1]:
import pandas as pd
import torchtext.transforms as T
import nltk
from pathlib import Path
import re
import numpy as np 
import pickle
data_dir = Path('data')

In [60]:
df = pd.read_csv(data_dir / 'dataset.csv')
df.head()

Unnamed: 0,category,title,description
0,3,Wall St. Bears Claw Back Into the Black (Reuters),"Reuters - Short-sellers, Wall Street's dwindli..."
1,3,Carlyle Looks Toward Commercial Aerospace (Reu...,Reuters - Private investment firm Carlyle Grou...
2,3,Oil and Economy Cloud Stocks' Outlook (Reuters),Reuters - Soaring crude prices plus worries\ab...
3,3,Iraq Halts Oil Exports from Main Southern Pipe...,Reuters - Authorities have halted oil export\f...
4,3,"Oil prices soar to all-time record, posing new...","AFP - Tearaway world oil prices, toppling reco..."


/home/zhang/miniconda3/envs/honours


In [57]:
try:
    nltk.data.find('tokenizers/punkt.zip')
except LookupError:
    nltk.download('punkt')

try:
    nltk.data.find('corpora/stopwords.zip')
except:
    nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('omw-1.4')
stop_words = nltk.corpus.stopwords.words('english')

[nltk_data] Downloading package punkt to /home/zhang/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to /home/zhang/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to /home/zhang/nltk_data...
[nltk_data]   Unzipping corpora/omw-1.4.zip.


In [61]:
def vocab_preprocess(row, lemmatize=True):
    from nltk.stem import WordNetLemmatizer
    row = row.lower()
    row = row.split('-')[1::]
    row = ''.join(row)
    lemmatizer = WordNetLemmatizer()
    # tokenize words
    words = re.findall(re.compile('[a-zA-Z]+'), row)
    # Remove stop words
    words = [word for word in words if word not in stop_words]
    if (lemmatize):
        words = [lemmatizer.lemmatize(word) for word in words]

    return words

In [78]:
desc = df['description'].astype(str)
data = []
for i, row in desc.iteritems():
    data.append(vocab_preprocess(row))
vocab = set()
for words in data:
    vocab = vocab.union(set(words))
with open("vocab.pkl", "wb") as f:
    pickle.dump(vocab, f)

45394

# Build global rankings

In [4]:
import pandas as pd
import numpy as np
from scipy.spatial.distance import  cosine
import pickle
from tqdm import  tqdm
SEEDS = ['finance', 'medicine', 'sports', 'technology']

In [2]:
vocab = None
with open('vocab.pkl', "rb") as f:
    vocab = pickle.load(f)
print(len(vocab))

45394


In [6]:

topic_embeddings = []
for seed in SEEDS:
    topic_embeddings.append(get_embeddings(model, tokenizer(seed, return_tensors='pt')))

# df_dict = { "_vocab": list(vocab) }
# for seed in SEEDS:
#     df_dict[seed] = [np.NaN for _ in range(len(vocab))]
# cos_scores = pd.DataFrame(df_dict)
# cos_scores = cos_scores.set_index(['_vocab'])


[tensor([ 1.0937e-01, -1.4895e-01, -4.4813e-01,  8.3246e-02,  8.5608e-02,
          2.1707e-01, -3.4751e-01,  4.2658e-01, -4.5252e-01,  1.4972e-01,
         -2.3706e-01,  2.5392e-02,  6.5334e-01, -1.8769e-01, -4.3382e-01,
          1.0637e-01,  1.4481e-01, -3.9181e-01,  4.7078e-01,  3.0770e-02,
         -5.8344e-01, -1.2246e-01, -3.2593e-01, -2.3762e-01,  3.9824e-01,
          1.0029e+00,  2.9451e-02,  1.2889e-01, -4.9328e-01,  1.7769e-01,
          1.0046e-02,  1.1854e-01,  1.2704e-01,  1.0712e+00,  5.0924e-01,
          6.2405e-02,  2.4562e-01, -2.5446e-01, -4.6117e-01,  3.8513e-01,
          1.7485e-01,  7.4768e-02,  5.6576e-01, -9.4762e-01,  4.1179e-01,
         -6.9610e-01,  8.1111e-01, -2.4790e-01,  2.2211e-01,  3.8557e-01,
         -7.6049e-01,  4.1278e-01,  1.4913e-01, -3.3000e-01, -1.0360e-01,
          7.2983e-01,  4.0192e-01,  2.0712e-01, -9.8151e-02, -3.4812e-01,
          6.3719e-01, -1.8989e-01, -7.5830e-02, -1.7886e-01,  7.9193e-01,
         -2.7618e-01,  5.1947e-01,  4.

In [7]:
res = []
for word in tqdm(vocab):
    token = tokenizer(word, return_tensors='pt')
    if len(token['input_ids']) > 3:
        print(f"WARNING: Word '{word}' is not in BERT's vocabulary")
    word_emb = get_embeddings(model, token)
    res_row = []
    for topic in topic_embeddings:
        res_row.append(cosine(topic, word_emb))
    res.append(res_row)


100%|██████████| 45394/45394 [12:37<00:00, 59.92it/s]


In [18]:
arr = 1 - np.array(res)
res_dict = {
    '_vocab': list(vocab),
}
for i, topic in enumerate(SEEDS):
    res_dict[topic] = arr[:, i]
res_df = pd.DataFrame(res_dict)
res_df = res_df.set_index(['_vocab'])
res_df.to_csv('global_cos_similarity.csv')

Unnamed: 0_level_0,finance,medicine,sports,technology
_vocab,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
greensboro,0.495349,0.485258,0.368076,0.511225
icapp,0.312538,0.318983,0.166868,0.282009
rentokil,0.326850,0.281411,0.231884,0.358316
gref,0.296345,0.292976,0.203246,0.315045
daniel,0.486941,0.505451,0.373799,0.507774
...,...,...,...,...
shelby,0.489004,0.482302,0.343133,0.509093
nebraskabased,0.339947,0.299676,0.285620,0.336530
fourplayer,0.322774,0.235028,0.236218,0.262300
mourning,0.516796,0.532588,0.348531,0.506080


# Initialize word sets from $e$

In [3]:
import numpy as np
import pandas as pd
NUM_WORDS_PER_SET = 10



In [23]:
res_df = pd.read_csv('global_cos_similarity.csv')
res_df = res_df.set_index(['_vocab'])
word_set = {}
added_words = set()
for seed in SEEDS:
    word_set[seed] = []
    col = res_df[seed]
    sorted_col = col.sort_values()[::-1]
    i = 0
    for word, _ in sorted_col.iteritems():
        if i == NUM_WORDS_PER_SET + 1:
            break
        if word not in added_words:
            word_set[seed].append(word)
            added_words.add(word)
            i += 1
pd.DataFrame(word_set)

Unnamed: 0,finance,medicine,sports,technology
0,finance,medicine,sport,technology
1,accounting,pharmacy,baseball,robotics
2,logistics,pathology,basketball,telecommunication
3,marketing,medical,athletics,communication
4,banking,biology,gymnastics,journalism
5,procurement,surgery,volleyball,philosophy
6,financial,health,football,industrial
7,debt,astronomy,swimming,drama
8,engineering,industry,hockey,integration
9,advertising,nutrition,education,sociology


In [9]:
res_df

Unnamed: 0_level_0,finance,medicine,sports,technology
_vocab,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
greensboro,0.495349,0.485258,0.368076,0.511225
icapp,0.312538,0.318983,0.166868,0.282009
rentokil,0.326850,0.281411,0.231884,0.358316
gref,0.296345,0.292976,0.203246,0.315045
daniel,0.486941,0.505451,0.373799,0.507774
...,...,...,...,...
shelby,0.489004,0.482302,0.343133,0.509093
nebraskabased,0.339947,0.299676,0.285620,0.336530
fourplayer,0.322774,0.235028,0.236218,0.262300
mourning,0.516796,0.532588,0.348531,0.506080
