# Obtain Word Embeddings for Token Classifiers

### Table of Contents

[0.](#0) Preprocessing

[1.](#1) SpaCy's sense2cec

[2.](#2) GloVe

[3.](#3) Custom with fastText

In [1]:
## In both virtual envs
import config
import pandas as pd
import numpy as np
import re

## In gender-bias virtual env only
# import utils
# import spacy
# import nltk
# from nltk.stem import WordNetLemmatizer

## In fasttext virtual env only
from gensim.models import FastText
from gensim import utils
from gensim.test.utils import get_tmpfile

<a id="0"></a>
## 0. Preprocessing

In [3]:
df_train = pd.read_csv(config.tokc_path+"model_input/token_train.csv", index_col=0)
df_dev = pd.read_csv(config.tokc_path+"model_input/token_validate.csv", index_col=0)
print(df_train.shape, df_dev.shape)
df_train.head()

(467564, 10) (157740, 10)


Unnamed: 0,description_id,sentence_id,ann_id,token_id,token,token_offsets,pos,tag,field,subset
3,1,1,99999,3,Title,"(17, 22)",NN,O,Title,train
4,1,1,99999,4,:,"(22, 23)",:,O,Title,train
5,1,1,99999,5,Papers,"(24, 30)",NNS,O,Title,train
6,1,1,99999,6,of,"(31, 33)",IN,O,Title,train
7,1,1,14384,7,The,"(34, 37)",DT,B-Unknown,Title,train


Lemmatize the tokens:

In [10]:
# lmtzr = WordNetLemmatizer()

In [11]:
# tokens_train = list(df_train.token)
# lemmas_train = [lmtzr.lemmatize(token) for token in tokens_train]
# tokens_dev = list(df_dev.token)
# lemmas_dev = [lmtzr.lemmatize(token) for token in tokens_dev]

In [12]:
# df_train.insert((list(df_train.columns).index("token")+1), "lemma", lemmas_train)
# df_dev.insert((list(df_dev.columns).index("token")+1), "lemma", lemmas_dev)

Obtain the vocabulary of the annotated data:

In [4]:
df = pd.concat([df_train, df_dev])  # df_train

In [10]:
unique_tokens = list(set(list(df.token)))
# unique_lemmas = list(set(list(df.lemma))) 
# unique_lemmas = [lemma for lemma in unique_lemmas if lemma.isalpha()]
# lemmas_lower = [lemma.lower() for lemma in unique_lemmas]
# unique_lemmas_lower = list(set(lemmas_lower))
unique_words = [token for token in unique_tokens if token.isalpha()]  # keep tokens with only alphabetic characters
print(len(unique_words), len(unique_tokens)) #, len(unique_lemmas), len(unique_lemmas_lower))

unique_tokens_lower = [token.lower() if token.isalpha() else token for token in unique_tokens]
unique_tokens_lower = list(set(unique_tokens_lower))
unique_words_lower = [token.lower() for token in unique_words]
unique_words_lower = list(set(unique_words_lower))
print(len(unique_words_lower), len(unique_tokens_lower))

32957 42272
27687 37002


<a id="1"></a>
## 1. SpaCy's sense2vec

Load [spaCy's contextual word embeddings](https://github.com/explosion/sense2vec), which were trained on 2015 Reddit posts:

In [36]:
nlp = spacy.load("en_core_web_sm")
s2v = nlp.add_pipe("sense2vec")
s2v.from_disk(config.s2v_reddit_path)

#-------------
# doc = nlp("A sentence about natural language processing.")
# assert doc[3:6].text == "natural language processing"
# freq = doc[3:6]._.s2v_freq
# vector = doc[3:6]._.s2v_vec
# most_similar = doc[3:6]._.s2v_most_similar(3)
# # [(('machine learning', 'NOUN'), 0.8986967),
# #  (('computer vision', 'NOUN'), 0.8636297),
# #  (('deep learning', 'NOUN'), 0.8573361)]

<sense2vec.component.Sense2VecComponent at 0x7f87677c5d30>

In [38]:
print(unique_words[:20])

['Cliffs', 'rpm', 'Heal', 'hereditary', 'Medjid', 'presumedly', 'Mondays', 'Routine', 'recipientESPMedawar', 'Kirkuk', 'Seton', 'Venado', 'Edith', 'Mackay', 'Visiting', 'interwar', 'atherogenic', 'Cawdor', 'jockey', 'Burgesses']


In [75]:
not_in_s2v = []
for word in unique_words:
    lowercased = word.lower()
    w = (nlp(lowercased))[0]
    if w._.s2v_vec is None:
        not_in_s2v += [word]

print("Total words in vocabulary not in Sense2Vec:", len(not_in_s2v))
print("Proportion of vocabulary not in Sense2Vec:",(len(not_in_s2v))/len(unique_words))

Total words in vocabulary not in Sense2Vec: 11529
Proportion of vocabulary not in Sense2Vec: 0.40388859695218077


In [61]:
# print(not_in_s2v[:100])
# print(not_in_s2v[1000:1100])
print(not_in_s2v[-100:])

['recipient', 'squabs', 'Darby', 'poulterer', 'Scotsman', 'sepolero', 'Morphogenetic', 'Hynes', 'Repleta', 'Simal', 'Mai', 'Arithmetic', 'Duce', 'Mme', 'lectureKatchalsky', 'inA', 'Tulsk', 'Berg', 'Mode', 'Bohme', 'Envelope', 'furnitureKoestler', 'Burmester', 'Evang', 'Ilona', 'compagne', 'BarnArthur', 'Cant', 'GollyArthur', 'Hatano', 'ElectionsAccompanied', 'accomodement', 'Finney', 'poetarum', 'Realites', 'Margaropus', 'LI', 'Lennox', 'Basberg', 'Ignacio', 'Glennie', 'Duffus', 'Takagi', 'Aliza', 'emir', 'Hersham', 'Rossini', 'Bald', 'Magnus', 'Pattison', 'Skefhill', 'Sorrel', 'Landolphin', 'Staub', 'ME', 'Alumbadi', 'Model', 'Majesties', 'Harian', 'trichocysts', 'Wandervogel', 'Rumped', 'Waetjen', 'Verbena', 'ofThe', 'sturzte', 'Lee', 'Copernicus', 'Verasis', 'Altenberg', 'unnumbered', 'environs', 'Neutral', 'Wynne', 'Mossman', 'ReneeESPCutten', 'ReadingSent', 'sequelae', 'Calary', 'Soviets', 'AustriaKoestler', 'Karlsburg', 'Peckham', 'Macdougall', 'Goldschmidt', 'electroencephalogra

In [74]:
# x = "lettersBaillie"
# y = "writer'"
# z = "MunsterSent"
newly_found = 0
for word in not_in_s2v:
    found = re.findall("[A-Z]{0,1}[a-z]+", word)
    for f in found:
        lowercased = f.lower()
        w = nlp(lowercased)[0]
        if not w._.s2v_vec is None:
            newly_found += 1
print(newly_found)

2659


In [76]:
print("Proportion not found but possible to find:", newly_found/(len(not_in_s2v)))
print("Proportion of vocabulary not possible to find in Sense2Vec:",(len(not_in_s2v)-newly_found)/len(unique_words))

Proportion not found but possible to find: 0.23063578801283718
Proportion of vocabulary not possible to find in Sense2Vec: 0.3107374321247154


I'm not sure it's worth reworking the tokenization and part-of-speech tagging to increase Sense2Vec's coverage of the vocabulary by only about 9%, so we'll keep going with the model input data as it is for now.  

<a id="2"></a>
## 2. GloVe

Load the [GloVe word embeddings](https://github.com/stanfordnlp/GloVe), which were trained on 2014 English Wikipedia entries and Gigaword 5:

*Note: could also try [GN-GloVe](https://github.com/uclanlp/gn_glove), which supposedly has gender-neutral word embeddings*

In [15]:
# Reference: https://medium.com/analytics-vidhya/basics-of-using-pre-trained-glove-vectors-in-python-d38905f356db
dimensions = ["50", "100", "200", "300"]  # pretrained GloVe embeddings come as vectors with one of these four dimensions
d = dimensions[0]  # start small to begin with
glove_path = config.inf_data_path+"glove.6B/glove.6B.{}d.txt".format(d)

In [18]:
glove = dict()
with open(glove_path, "r") as f:
    for line in f:
        values = line.split()
        word = values[0]
        vector = np.asarray(values[1:], "float32")
        glove[word] = vector
print(glove["recipient"])

[-0.13412    1.5041     0.39706   -0.41357    0.71336    0.63438
  0.1214     0.16746    1.4104     0.013127   0.68386    0.85236
  0.92599    0.098158   0.425     -0.83799    0.08482    0.22135
 -0.15247    0.72654    0.052728  -1.0064    -0.032626  -0.63342
  0.0039789 -1.5288    -0.74005   -1.2051    -1.0227    -0.10588
  1.2225     0.14845   -0.1641    -0.52887   -0.29012    0.59774
  0.62847    0.49003   -0.14227   -1.2193     0.56094   -0.17673
 -0.11216   -0.41801   -0.40841   -0.41748   -0.40276    0.25091
 -0.43016    0.26412  ]


Calculate GloVe's coverage of our vocabulary: 

In [19]:
not_in_glove = []
in_glove = dict()
for word in unique_words:
    lowercased = word.lower()
    try:
        vector = glove[lowercased]
        in_glove[word] = vector
    except KeyError:
        not_in_glove += [word]

print("Total words in vocabulary not in GloVe:", len(not_in_glove))
print("Proportion of vocabulary not in GloVe:",(len(not_in_glove))/len(unique_words))

Total words in vocabulary not in GloVe: 6453
Proportion of vocabulary not in GloVe: 0.1958005886458112


In [24]:
still_not_found = []
newly_found = 0
# partial_glove_match = dict()
for word in not_in_glove:
    found = re.findall("[A-Z]{0,1}[a-z]+", word)
    for f in found:
        lowercased = f.lower()
        try:
            vector = glove[lowercased]
            in_glove[word] = vector  # partial_glove_match[word] = vector
            newly_found += 1
        except KeyError:
            still_not_found += [word]
print("Not possible to find in GloVe:", len(still_not_found))
print("Proportion not found but possible to find in GloVe:", newly_found/(len(not_in_glove)))
print("Proportion of vocabulary not possible to find in GloVe:",(len(still_not_found))/len(unique_words))

Not possible to find in GloVe: 4187
Proportion not found but possible to find in GloVe: 0.7311328064466139
Proportion of vocabulary not possible to find in GloVe: 0.12704433049124617


GloVe has much better coverage than sense2vec, as expected due to the better domain match (Wikipedia entries are more similar to archival metadata descriptions than Reddit comments!).

In [25]:
words_to_vectors = in_glove.copy()
key_array = np.array(words_to_vectors.keys())
for word in unique_words:
    if word not in key_array:
        if word.lower() in key_array:
            vector = words_to_vectors[word.lower()]
        else:
            vector = np.array([])
        words_to_vectors[word] = vector

In [26]:
assert len(words_to_vectors) == len(unique_words)

Create a dataset associating each token to a GloVe word embedding:

In [55]:
embedding_col_name = "glove_embedding"
embedding_dict = glove

In [56]:
train_embeddings = utils.createEmbeddingDataFrame(df_train, embedding_dict, embedding_col_name)
train_embeddings.head()

Unnamed: 0,token_id,glove_embedding,token
3,3,"[-0.15234, 0.98085, 1.0065, 0.97812, 0.6628, 0...",Title
4,4,[],:
5,5,"[-0.15234, 0.98085, 1.0065, 0.97812, 0.6628, 0...",Papers
6,6,"[-0.15234, 0.98085, 1.0065, 0.97812, 0.6628, 0...",of
7,7,"[-0.15234, 0.98085, 1.0065, 0.97812, 0.6628, 0...",The


In [57]:
dev_embeddings = utils.createEmbeddingDataFrame(df_dev, embedding_dict, embedding_col_name)
dev_embeddings.head()

Unnamed: 0,token_id,glove_embedding,token
172,154,"[-0.15234, 0.98085, 1.0065, 0.97812, 0.6628, 0...",After
173,155,"[-0.15234, 0.98085, 1.0065, 0.97812, 0.6628, 0...",his
174,156,"[-0.15234, 0.98085, 1.0065, 0.97812, 0.6628, 0...",ordination
175,157,"[-0.15234, 0.98085, 1.0065, 0.97812, 0.6628, 0...",he
176,158,"[-0.15234, 0.98085, 1.0065, 0.97812, 0.6628, 0...",spent


Save the data:

In [58]:
# train_embeddings.to_csv(config.tokc_path+"glove_embeddings_train.csv")
# dev_embeddings.to_csv(config.tokc_path+"glove_embeddings_dev.csv")

<a id="3"></a>
## 3. Custom with fastText

Train word embeddings on my own data (metadata descriptions from the CRC's Archives catalog) using fastText.

*References:* 
* *https://radimrehurek.com/gensim/models/fasttext.html*
* *https://radimrehurek.com/gensim/auto_examples/tutorials/run_fasttext.html#sphx-glr-auto-examples-tutorials-run-fasttext-py*

In [2]:
class CorpusIterator:
    def implodeDataFrame(self, df, cols_to_groupby):
        cols_to_agg = list(df.columns)
        for col in cols_to_groupby:
            cols_to_agg.remove(col)
        agg_dict = dict.fromkeys(cols_to_agg, lambda x: x.tolist())
        return df.groupby(cols_to_groupby).agg(agg_dict).reset_index().set_index(cols_to_groupby)
    
    def __iter__(self):
        df_train = pd.read_csv(config.tokc_path+"model_input/token_train.csv", usecols=["sentence_id", "token"])
        df_train_imploded = self.implodeDataFrame(df_train, ["sentence_id"])
        sentences = list(df_train_imploded.token)
        for token_list in sentences:
            yield(token_list)

Define the hyperparameters for the unsupervised training of the fastText model (essentially a word2vec model that uses using character n-grams so subwords can help to assign embeddings to unseen words):

In [3]:
# Specify training architecture (default = "cbow" for Continuous Bag of Words)
models = ["cbow", "skipgram"]
model = models[0]
# Specify the learning rate (default = 0.025)
alpha = 0.025
# Specify the training objective (default = "ns")
# losses = ["ns", "hs", "softmax"]
# loss = losses[0]
# Specify the number of negative words to sample for 'ns' training objective (default = 5)
negative = 5
# Specify the threshold for downsampling higher-frequency words (default = 0.001)
sample = 0.001
# Specify the word embeddings' dimensions
vector_dimensions = 100
# Specify the context window (default is 5) 
context_window = 5
# Specify the number of epochs (default is 5)
epochs = 5
# Specify the threshold of word occurrences (ignore words that occur less than specified number of times; default = 5)
min_count = 5
# Specify the minimum and maximum length of character ngrams (defaults are 3 and 6)
min_n = 2
max_n = 6  # if 0, no character n-grams (subword vectors) will be used
# Specify the number of buckets for hashing ngrams (default = 2000000) 
bucket = 2000000
# Sort vocabulary by descending frequency (default = 1)
sorted_vocab = 1
# Specify the number of threads to use (default = 12)
# threads = 12

In [4]:
model = FastText(
    alpha=alpha, negative=negative, sample=sample,
    vector_size=vector_dimensions, window=context_window, 
    epochs=epochs, min_count=min_count, min_n=min_n, 
    max_n=max_n, bucket=bucket, sorted_vocab=sorted_vocab
)

In [5]:
model.build_vocab(corpus_iterable=CorpusIterator())
total_examples = model.corpus_count

In [6]:
model.train(corpus_iterable=CorpusIterator(), total_examples=total_examples, epochs=epochs)

(1307490, 2337820)

In [7]:
model.wv["recipient"]

array([-0.6756068 ,  1.5919273 , -0.56705076,  1.6880698 ,  0.9636156 ,
        0.31721365,  0.25568432,  0.2058282 , -0.1416149 , -0.18989426,
       -0.54189414, -1.4998999 ,  0.03471487, -0.9126452 , -0.0782093 ,
       -0.14311433, -1.1163957 ,  1.0090419 ,  1.6154461 ,  0.65505093,
       -0.8446712 , -1.4965539 , -1.4920071 , -1.0338659 , -0.9557528 ,
        0.34576824, -0.69574195, -1.7805381 ,  1.5230308 , -0.47035334,
        0.03263632, -1.5507953 , -0.7169255 ,  0.15754858,  1.8103783 ,
        0.4070575 ,  0.2572637 ,  1.3328292 , -0.14403729, -1.196998  ,
        2.234289  , -1.3798985 ,  0.6903577 , -2.6350315 , -0.94290125,
        1.2201816 , -1.5511761 ,  0.26433146,  1.7597557 ,  3.1027613 ,
        2.854742  , -1.8140292 ,  2.5589483 ,  1.5211413 ,  1.2342783 ,
       -0.47556603,  2.3050432 ,  0.23175682,  1.2051044 ,  0.06902517,
        0.6075335 , -1.9958506 , -0.40674683,  0.3392531 ,  0.19527328,
       -3.3649073 ,  1.6847279 ,  0.7022183 , -0.39973742,  0.74

Save the model:

In [8]:
file_name = get_tmpfile(config.tokc_path+"fasttext100.model")
model.save(file_name)

In [9]:
len(model.wv)

7277

In [11]:
type(model.wv.key_to_index)

dict

In [12]:
print(model.wv.key_to_index["recipient"])

104


In [16]:
model.wv[104]

array([ 0.03737615, -0.13094172,  0.4192771 , -0.10888413,  0.4322802 ,
       -0.6235069 , -0.04176774,  0.5860596 , -0.97697514,  0.9668318 ,
        1.0936502 ,  0.7192852 ,  0.43130356, -0.13834317,  0.14606117,
        1.014545  ,  0.46894675, -0.02479243,  0.38670725, -1.0797498 ,
        1.552295  , -0.4531086 , -0.12248089,  0.24791214,  0.28702378,
        0.5095818 ,  0.14123417,  0.07884683, -0.2801781 ,  0.2914682 ,
        0.9706851 , -0.4849089 ,  0.98705786,  0.17181778, -1.0430001 ,
       -0.34546047,  0.13864943,  0.13679619,  0.48705992, -0.31251132,
       -0.48371068,  0.33515218, -0.07702316, -0.3278501 , -0.21714082,
       -0.28916112,  0.36632246, -0.72787535, -0.8380801 ,  0.3025424 ,
        0.42312002,  0.05307492,  0.17401198, -1.0034671 , -1.0878724 ,
       -0.83836   ,  0.8597549 , -1.3812542 , -0.8466691 , -0.7410131 ,
       -1.7701464 ,  0.05303479,  0.4593584 ,  0.03552771, -0.01448729,
        1.8695406 ,  1.0092819 ,  0.05009551,  2.3934693 ,  0.44

In [17]:
model.wv["recipient"]

array([-0.6756068 ,  1.5919273 , -0.56705076,  1.6880698 ,  0.9636156 ,
        0.31721365,  0.25568432,  0.2058282 , -0.1416149 , -0.18989426,
       -0.54189414, -1.4998999 ,  0.03471487, -0.9126452 , -0.0782093 ,
       -0.14311433, -1.1163957 ,  1.0090419 ,  1.6154461 ,  0.65505093,
       -0.8446712 , -1.4965539 , -1.4920071 , -1.0338659 , -0.9557528 ,
        0.34576824, -0.69574195, -1.7805381 ,  1.5230308 , -0.47035334,
        0.03263632, -1.5507953 , -0.7169255 ,  0.15754858,  1.8103783 ,
        0.4070575 ,  0.2572637 ,  1.3328292 , -0.14403729, -1.196998  ,
        2.234289  , -1.3798985 ,  0.6903577 , -2.6350315 , -0.94290125,
        1.2201816 , -1.5511761 ,  0.26433146,  1.7597557 ,  3.1027613 ,
        2.854742  , -1.8140292 ,  2.5589483 ,  1.5211413 ,  1.2342783 ,
       -0.47556603,  2.3050432 ,  0.23175682,  1.2051044 ,  0.06902517,
        0.6075335 , -1.9958506 , -0.40674683,  0.3392531 ,  0.19527328,
       -3.3649073 ,  1.6847279 ,  0.7022183 , -0.39973742,  0.74