# Obtain Word Embeddings for Token Classifiers

### Table of Contents

[0.](#0) Preprocessing

[1.](#1) SpaCy's sense2cec

[2.](#2) GloVe

[3.](#3) Custom with fastText

In [1]:
import config
import pandas as pd
import numpy as np
import re, os
import utils as my_utils
# import spacy
# import nltk
from gensim.models import FastText
from gensim.utils import tokenize
from gensim import utils
from gensim.test.utils import get_tmpfile

<a id="0"></a>
## 0. Preprocessing

In [2]:
df_train = pd.read_csv(config.tokc_path+"model_input/token_train.csv", index_col=0)
df_dev = pd.read_csv(config.tokc_path+"model_input/token_validate.csv", index_col=0)
print(df_train.shape, df_dev.shape)
df_train.head()

(467564, 10) (157740, 10)


Unnamed: 0,description_id,sentence_id,ann_id,token_id,token,token_offsets,pos,tag,field,subset
3,1,1,99999,3,Title,"(17, 22)",NN,O,Title,train
4,1,1,99999,4,:,"(22, 23)",:,O,Title,train
5,1,1,99999,5,Papers,"(24, 30)",NNS,O,Title,train
6,1,1,99999,6,of,"(31, 33)",IN,O,Title,train
7,1,1,14384,7,The,"(34, 37)",DT,B-Unknown,Title,train


Lemmatize the tokens:

In [3]:
# lmtzr = WordNetLemmatizer()

In [4]:
# tokens_train = list(df_train.token)
# lemmas_train = [lmtzr.lemmatize(token) for token in tokens_train]
# tokens_dev = list(df_dev.token)
# lemmas_dev = [lmtzr.lemmatize(token) for token in tokens_dev]

In [5]:
# df_train.insert((list(df_train.columns).index("token")+1), "lemma", lemmas_train)
# df_dev.insert((list(df_dev.columns).index("token")+1), "lemma", lemmas_dev)

Obtain the vocabulary of the annotated data:

In [6]:
df = pd.concat([df_train, df_dev])  # df_train

In [7]:
unique_tokens = list(set(list(df.token)))
# unique_lemmas = list(set(list(df.lemma))) 
# unique_lemmas = [lemma for lemma in unique_lemmas if lemma.isalpha()]
# lemmas_lower = [lemma.lower() for lemma in unique_lemmas]
# unique_lemmas_lower = list(set(lemmas_lower))
unique_words = [token for token in unique_tokens if token.isalpha()]  # keep tokens with only alphabetic characters
print(len(unique_words), len(unique_tokens)) #, len(unique_lemmas), len(unique_lemmas_lower))

unique_tokens_lower = [token.lower() if token.isalpha() else token for token in unique_tokens]
unique_tokens_lower = list(set(unique_tokens_lower))
unique_words_lower = [token.lower() for token in unique_words]
unique_words_lower = list(set(unique_words_lower))
print(len(unique_words_lower), len(unique_tokens_lower))

32957 42272
27687 37002


<a id="1"></a>
## 1. SpaCy's sense2vec

Load [spaCy's contextual word embeddings](https://github.com/explosion/sense2vec), which were trained on 2015 Reddit posts:

In [36]:
nlp = spacy.load("en_core_web_sm")
s2v = nlp.add_pipe("sense2vec")
s2v.from_disk(config.s2v_reddit_path)

#-------------
# doc = nlp("A sentence about natural language processing.")
# assert doc[3:6].text == "natural language processing"
# freq = doc[3:6]._.s2v_freq
# vector = doc[3:6]._.s2v_vec
# most_similar = doc[3:6]._.s2v_most_similar(3)
# # [(('machine learning', 'NOUN'), 0.8986967),
# #  (('computer vision', 'NOUN'), 0.8636297),
# #  (('deep learning', 'NOUN'), 0.8573361)]

<sense2vec.component.Sense2VecComponent at 0x7f87677c5d30>

In [38]:
print(unique_words[:20])

['Cliffs', 'rpm', 'Heal', 'hereditary', 'Medjid', 'presumedly', 'Mondays', 'Routine', 'recipientESPMedawar', 'Kirkuk', 'Seton', 'Venado', 'Edith', 'Mackay', 'Visiting', 'interwar', 'atherogenic', 'Cawdor', 'jockey', 'Burgesses']


In [75]:
not_in_s2v = []
for word in unique_words:
    lowercased = word.lower()
    w = (nlp(lowercased))[0]
    if w._.s2v_vec is None:
        not_in_s2v += [word]

print("Total words in vocabulary not in Sense2Vec:", len(not_in_s2v))
print("Proportion of vocabulary not in Sense2Vec:",(len(not_in_s2v))/len(unique_words))

Total words in vocabulary not in Sense2Vec: 11529
Proportion of vocabulary not in Sense2Vec: 0.40388859695218077


In [61]:
# print(not_in_s2v[:100])
# print(not_in_s2v[1000:1100])
print(not_in_s2v[-100:])

['recipient', 'squabs', 'Darby', 'poulterer', 'Scotsman', 'sepolero', 'Morphogenetic', 'Hynes', 'Repleta', 'Simal', 'Mai', 'Arithmetic', 'Duce', 'Mme', 'lectureKatchalsky', 'inA', 'Tulsk', 'Berg', 'Mode', 'Bohme', 'Envelope', 'furnitureKoestler', 'Burmester', 'Evang', 'Ilona', 'compagne', 'BarnArthur', 'Cant', 'GollyArthur', 'Hatano', 'ElectionsAccompanied', 'accomodement', 'Finney', 'poetarum', 'Realites', 'Margaropus', 'LI', 'Lennox', 'Basberg', 'Ignacio', 'Glennie', 'Duffus', 'Takagi', 'Aliza', 'emir', 'Hersham', 'Rossini', 'Bald', 'Magnus', 'Pattison', 'Skefhill', 'Sorrel', 'Landolphin', 'Staub', 'ME', 'Alumbadi', 'Model', 'Majesties', 'Harian', 'trichocysts', 'Wandervogel', 'Rumped', 'Waetjen', 'Verbena', 'ofThe', 'sturzte', 'Lee', 'Copernicus', 'Verasis', 'Altenberg', 'unnumbered', 'environs', 'Neutral', 'Wynne', 'Mossman', 'ReneeESPCutten', 'ReadingSent', 'sequelae', 'Calary', 'Soviets', 'AustriaKoestler', 'Karlsburg', 'Peckham', 'Macdougall', 'Goldschmidt', 'electroencephalogra

In [74]:
# x = "lettersBaillie"
# y = "writer'"
# z = "MunsterSent"
newly_found = 0
for word in not_in_s2v:
    found = re.findall("[A-Z]{0,1}[a-z]+", word)
    for f in found:
        lowercased = f.lower()
        w = nlp(lowercased)[0]
        if not w._.s2v_vec is None:
            newly_found += 1
print(newly_found)

2659


In [76]:
print("Proportion not found but possible to find:", newly_found/(len(not_in_s2v)))
print("Proportion of vocabulary not possible to find in Sense2Vec:",(len(not_in_s2v)-newly_found)/len(unique_words))

Proportion not found but possible to find: 0.23063578801283718
Proportion of vocabulary not possible to find in Sense2Vec: 0.3107374321247154


I'm not sure it's worth reworking the tokenization and part-of-speech tagging to increase Sense2Vec's coverage of the vocabulary by only about 9%, so we'll keep going with the model input data as it is for now.  

<a id="2"></a>
## 2. GloVe

Load the [GloVe word embeddings](https://github.com/stanfordnlp/GloVe), which were trained on 2014 English Wikipedia entries and Gigaword 5:

*Note: could also try [GN-GloVe](https://github.com/uclanlp/gn_glove), which supposedly has gender-neutral word embeddings*

In [8]:
# Reference: https://medium.com/analytics-vidhya/basics-of-using-pre-trained-glove-vectors-in-python-d38905f356db
dimensions = ["50", "100", "200", "300"]  # pretrained GloVe embeddings come as vectors with one of these four dimensions
d = dimensions[1]  # [0]
glove_path = config.inf_data_path+"glove.6B/glove.6B.{}d.txt".format(d)

In [9]:
glove = dict()
with open(glove_path, "r") as f:
    for line in f:
        values = line.split()
        word = values[0]
        vector = np.asarray(values[1:], "float32")
        glove[word] = vector
print(glove["recipient"])

[ 0.063877   0.95793   -0.053323  -0.068542   0.76758   -0.27335
 -0.043212  -0.39447    0.15885    0.25465   -0.34075   -0.30437
  0.24691    0.49041   -0.54421   -0.026556   0.99498   -0.22903
 -0.083907   0.40962   -1.3918    -0.37756   -0.5675     0.090421
  0.71336    0.43176   -0.057562  -0.34407    1.3235    -0.82601
  0.46754    1.1343     0.44713    0.29694    0.61125    0.080119
 -0.95791    0.43931   -0.74273    0.4412    -0.068448   0.74451
  0.16243    0.1931     0.85294    0.39898    0.24571   -0.3771
 -0.96994    0.19199    0.057375   0.047835   0.74642   -0.075984
 -0.54556   -0.72614   -0.010644  -0.60529    1.0421    -0.03876
  0.18461    0.53881   -0.225      0.47586    0.63071   -0.6616
 -0.51847    0.90297    1.1178    -0.01349    0.19686    0.13684
 -0.38346    0.59652    0.3418     0.80315    0.061273  -0.48047
 -0.38057   -0.47128    0.45696    0.44741   -0.18594   -0.29276
 -0.8917     0.092826   0.20231   -0.72893    0.58968   -0.64259
 -0.34245    0.0076589  

Calculate GloVe's coverage of our vocabulary: 

In [10]:
not_in_glove = []
in_glove = dict()
for word in unique_words:
    lowercased = word.lower()
    try:
        vector = glove[lowercased]
        in_glove[word] = vector
    except KeyError:
        not_in_glove += [word]

print("Total words in vocabulary not in GloVe:", len(not_in_glove))
print("Proportion of vocabulary not in GloVe:",(len(not_in_glove))/len(unique_words))

Total words in vocabulary not in GloVe: 6453
Proportion of vocabulary not in GloVe: 0.1958005886458112


In [11]:
still_not_found = []
newly_found = 0
# partial_glove_match = dict()
for word in not_in_glove:
    found = re.findall("[A-Z]{0,1}[a-z]+", word)
    for f in found:
        lowercased = f.lower()
        try:
            vector = glove[lowercased]
            in_glove[word] = vector  # partial_glove_match[word] = vector
            newly_found += 1
        except KeyError:
            still_not_found += [word]
print("Not possible to find in GloVe:", len(still_not_found))
print("Proportion not found but possible to find in GloVe:", newly_found/(len(not_in_glove)))
print("Proportion of vocabulary not possible to find in GloVe:",(len(still_not_found))/len(unique_words))

Not possible to find in GloVe: 4187
Proportion not found but possible to find in GloVe: 0.7311328064466139
Proportion of vocabulary not possible to find in GloVe: 0.12704433049124617


GloVe has much better coverage than sense2vec, as expected due to the better domain match (Wikipedia entries are more similar to archival metadata descriptions than Reddit comments!).

In [12]:
words_to_vectors = in_glove.copy()
key_array = np.array(words_to_vectors.keys())
for word in unique_words:
    if word not in key_array:
        if word.lower() in key_array:
            vector = words_to_vectors[word.lower()]
        else:
            vector = np.array([])
        words_to_vectors[word] = vector

In [13]:
assert len(words_to_vectors) == len(unique_words)

Create a dataset associating each token to a GloVe word embedding:

In [14]:
embedding_col_name = "glove_embedding"
embedding_dict = glove

In [16]:
train_embeddings = my_utils.createEmbeddingDataFrame(df_train, embedding_dict, embedding_col_name, int(d))
train_embeddings.head()

Unnamed: 0,token_id,glove_embedding,token
3,3,"[0.63367, -0.59531, 1.2248, -0.21655, 0.81292,...",Title
4,4,[],:
5,5,"[-0.080022, -0.0055914, -0.5385, -0.26939, 0.6...",Papers
6,6,"[-0.1529, -0.24279, 0.89837, 0.16996, 0.53516,...",of
7,7,"[-0.038194, -0.24487, 0.72812, -0.39961, 0.083...",The


In [17]:
dev_embeddings = my_utils.createEmbeddingDataFrame(df_dev, embedding_dict, embedding_col_name, int(d))
dev_embeddings.head()

Unnamed: 0,token_id,glove_embedding,token
172,154,"[0.37711, -0.34471, 0.13405, -0.01171, -0.1942...",After
173,155,"[0.12883, -0.82209, 0.27438, -0.069014, 0.1798...",his
174,156,"[1.4868, 0.53223, -0.69511, -0.00053285, -0.33...",ordination
175,157,"[0.1225, -0.058833, 0.23658, -0.28877, -0.0281...",he
176,158,"[0.19742, 0.50812, -0.43072, -0.076864, 0.5989...",spent


Save the data:

In [18]:
# train_embeddings.to_csv(config.tokc_path+"glove_embeddings_train.csv")  # 50 dimensions
# dev_embeddings.to_csv(config.tokc_path+"glove_embeddings_dev.csv")      # 50 dimensions
train_embeddings.to_csv(config.tokc_path+"glove_embeddings100_train.csv")
dev_embeddings.to_csv(config.tokc_path+"glove_embeddings100_dev.csv")

<a id="3"></a>
## 3. Custom with fastText

Train custom word embeddings on my own data (metadata descriptions from the CRC's Archives catalog) using fastText.

* Data file: `data/descriptions_by_fonds`
* Date of harvesting: October 2020
* Harvesting and transformation code: [annot-prep/PreparationForAnnotation.ipynb](https://github.com/thegoose20/annot-prep/blob/main/PreparationForAnnotation.ipynb)

*References:* 
* *https://radimrehurek.com/gensim/models/fasttext.html*
* *https://radimrehurek.com/gensim/auto_examples/tutorials/run_fasttext.html#sphx-glr-auto-examples-tutorials-run-fasttext-py*

In [82]:
dir_path = config.inf_data_path+"descriptions_by_fonds/"
file_list = os.listdir(dir_path)
print(len(file_list))

1079


In [83]:
class CorpusIterator:
    def __iter__(self):
        file_list = os.listdir(dir_path)
        for fonds_f in file_list:
            assert ".txt" in fonds_f, "All files should be Plaintext." 
            file_path = dir_path+fonds_f
            with utils.open(file_path, 'r', encoding='utf-8') as f:
                for line in f:
                    yield list(tokenize(line))
#                     yield list(tokenize(line.lower()))

Define the hyperparameters for the unsupervised training of the fastText model (essentially a word2vec model that uses using character n-grams so subwords can help to assign embeddings to unseen words):

In [84]:
# Specify training architecture (default = "cbow" for Continuous Bag of Words)
models = ["cbow", "skipgram"]
model = models[0]
# Specify the learning rate (default = 0.025)
alpha = 0.025
# Specify the training objective (default = "ns")
# losses = ["ns", "hs", "softmax"]
# loss = losses[0]
# Specify the number of negative words to sample for 'ns' training objective (default = 5)
negative = 5
# Specify the threshold for downsampling higher-frequency words (default = 0.001)
sample = 0.001
# Specify the word embeddings' dimensions
vector_dimensions = 300 #50 #100
# Specify the context window (default is 5) 
context_window = 5
# Specify the number of epochs (default is 5)
epochs = 5
# Specify the threshold of word occurrences (ignore words that occur less than specified number of times; default = 5)
min_count = 5
# Specify the minimum and maximum length of character ngrams (defaults are 3 and 6)
min_n = 2
max_n = 6  # if 0, no character n-grams (subword vectors) will be used
# Specify the number of buckets for hashing ngrams (default = 2000000) 
bucket = 2000000
# Sort vocabulary by descending frequency (default = 1)
sorted_vocab = 1
# Specify the number of threads to use (default = 12)
# threads = 12

In [85]:
model = FastText(
    alpha=alpha, negative=negative, sample=sample,
    vector_size=vector_dimensions, window=context_window, 
    epochs=epochs, min_count=min_count, min_n=min_n, 
    max_n=max_n, bucket=bucket, sorted_vocab=sorted_vocab
)

In [86]:
model.build_vocab(corpus_iterable=CorpusIterator())
total_examples = model.corpus_count

In [87]:
model.train(corpus_iterable=CorpusIterator(), total_examples=total_examples, epochs=epochs)
# Not lowercased, 300 dimensions: 
# Lowercased, 300 dimensions: (7321643, 10119275)
# Not lowercased, 200 dimensions: (7355581, 10119275)
# Lowercased, 200 dimensions: (7322249, 10119275)
# Not lowercased, 100 dimensions: (7321074, 10119275)
# Lowercased, 100 dimensions: (7322411, 10119275)
# Not lowercased, 50 dimensions: (7356468, 10119275)

(7356731, 10119275)

In [88]:
# model.wv["recipient"]

Save the model:

In [89]:
# file_name = get_tmpfile(config.tokc_path+"fasttext100.model")
# file_name = get_tmpfile(config.tokc_path+"fasttext100_lowercased.model")
# file_name = config.tokc_path+"fasttext50.model"
# file_name = config.tokc_path+"fasttext50_lowercased.model"
# file_name = config.tokc_path+"fasttext50.model"
# file_name = config.tokc_path+"fasttext200.model"
# file_name = config.tokc_path+"fasttext200_lowercased.model"
file_name = config.tokc_path+"fasttext300.model"
# file_name = config.tokc_path+"fasttext300_lowercased.model"
model.save(file_name)

In [90]:
len(model.wv) 
# Not lowercased, 300 dimensions:
# Lowercased, 300 dimensions: 17418
# Not lowercased, 200 dimensions: 20683
# Lowercased, 200 dimensions: 17418
# Not lowercased, 100 dimensions: 20683
# Lowercased, 100 dimensions: 17418
# Not lowercased, 50 dimensions: 20683
# Lowercased, 50 dimensions: 17418

20683

In [91]:
type(model.wv.key_to_index)     # Looks good

dict

In [92]:
"the" in model.wv.key_to_index  # Looks good

True

In [93]:
"The" in model.wv.key_to_index  # Looks good

True