# Obtain Word Embeddings for Token Classifiers

In [59]:
import utils, config
import pandas as pd
import numpy as np
import re
import spacy
import nltk
from nltk.stem import WordNetLemmatizer

In [60]:
df_train = pd.read_csv(config.tokc_path+"model_input/token_train.csv", index_col=0)
df_dev = pd.read_csv(config.tokc_path+"model_input/token_validate.csv", index_col=0)
print(df_train.shape, df_dev.shape)
df_train.head()

(467564, 10) (157740, 10)


Unnamed: 0,description_id,sentence_id,ann_id,token_id,token,token_offsets,pos,tag,field,subset
3,1,1,99999,3,Title,"(17, 22)",NN,O,Title,train
4,1,1,99999,4,:,"(22, 23)",:,O,Title,train
5,1,1,99999,5,Papers,"(24, 30)",NNS,O,Title,train
6,1,1,99999,6,of,"(31, 33)",IN,O,Title,train
7,1,1,14384,7,The,"(34, 37)",DT,B-Unknown,Title,train


Lemmatize the tokens:

In [10]:
lmtzr = WordNetLemmatizer()

In [11]:
tokens_train = list(df_train.token)
lemmas_train = [lmtzr.lemmatize(token) for token in tokens_train]
tokens_dev = list(df_dev.token)
lemmas_dev = [lmtzr.lemmatize(token) for token in tokens_dev]

In [12]:
df_train.insert((list(df_train.columns).index("token")+1), "lemma", lemmas_train)
df_dev.insert((list(df_dev.columns).index("token")+1), "lemma", lemmas_dev)

Obtain the vocabulary of the annotated data:

In [13]:
df = pd.concat([df_train, df_dev])  # df_train

In [14]:
unique_tokens = list(set(list(df.token)))
unique_lemmas = list(set(list(df.lemma))) 
unique_lemmas = [lemma for lemma in unique_lemmas if lemma.isalpha()]
lemmas_lower = [lemma.lower() for lemma in unique_lemmas]
unique_lemmas_lower = list(set(lemmas_lower))
unique_words = [token for token in unique_tokens if token.isalpha()]  # keep tokens with only alphabetic characters
print(len(unique_words), len(unique_lemmas), len(unique_lemmas_lower))

32957 31157 26757


### SpaCy Contextual Word Embeddings: sense2vec

Load [spaCy's contextual word embeddings](https://github.com/explosion/sense2vec), which were trained on 2015 Reddit posts:

In [36]:
nlp = spacy.load("en_core_web_sm")
s2v = nlp.add_pipe("sense2vec")
s2v.from_disk(config.s2v_reddit_path)

#-------------
# doc = nlp("A sentence about natural language processing.")
# assert doc[3:6].text == "natural language processing"
# freq = doc[3:6]._.s2v_freq
# vector = doc[3:6]._.s2v_vec
# most_similar = doc[3:6]._.s2v_most_similar(3)
# # [(('machine learning', 'NOUN'), 0.8986967),
# #  (('computer vision', 'NOUN'), 0.8636297),
# #  (('deep learning', 'NOUN'), 0.8573361)]

<sense2vec.component.Sense2VecComponent at 0x7f87677c5d30>

In [38]:
print(unique_words[:20])

['Cliffs', 'rpm', 'Heal', 'hereditary', 'Medjid', 'presumedly', 'Mondays', 'Routine', 'recipientESPMedawar', 'Kirkuk', 'Seton', 'Venado', 'Edith', 'Mackay', 'Visiting', 'interwar', 'atherogenic', 'Cawdor', 'jockey', 'Burgesses']


In [75]:
not_in_s2v = []
for word in unique_words:
    lowercased = word.lower()
    w = (nlp(lowercased))[0]
    if w._.s2v_vec is None:
        not_in_s2v += [word]

print("Total words in vocabulary not in Sense2Vec:", len(not_in_s2v))
print("Proportion of vocabulary not in Sense2Vec:",(len(not_in_s2v))/len(unique_words))

Total words in vocabulary not in Sense2Vec: 11529
Proportion of vocabulary not in Sense2Vec: 0.40388859695218077


In [61]:
# print(not_in_s2v[:100])
# print(not_in_s2v[1000:1100])
print(not_in_s2v[-100:])

['recipient', 'squabs', 'Darby', 'poulterer', 'Scotsman', 'sepolero', 'Morphogenetic', 'Hynes', 'Repleta', 'Simal', 'Mai', 'Arithmetic', 'Duce', 'Mme', 'lectureKatchalsky', 'inA', 'Tulsk', 'Berg', 'Mode', 'Bohme', 'Envelope', 'furnitureKoestler', 'Burmester', 'Evang', 'Ilona', 'compagne', 'BarnArthur', 'Cant', 'GollyArthur', 'Hatano', 'ElectionsAccompanied', 'accomodement', 'Finney', 'poetarum', 'Realites', 'Margaropus', 'LI', 'Lennox', 'Basberg', 'Ignacio', 'Glennie', 'Duffus', 'Takagi', 'Aliza', 'emir', 'Hersham', 'Rossini', 'Bald', 'Magnus', 'Pattison', 'Skefhill', 'Sorrel', 'Landolphin', 'Staub', 'ME', 'Alumbadi', 'Model', 'Majesties', 'Harian', 'trichocysts', 'Wandervogel', 'Rumped', 'Waetjen', 'Verbena', 'ofThe', 'sturzte', 'Lee', 'Copernicus', 'Verasis', 'Altenberg', 'unnumbered', 'environs', 'Neutral', 'Wynne', 'Mossman', 'ReneeESPCutten', 'ReadingSent', 'sequelae', 'Calary', 'Soviets', 'AustriaKoestler', 'Karlsburg', 'Peckham', 'Macdougall', 'Goldschmidt', 'electroencephalogra

In [74]:
# x = "lettersBaillie"
# y = "writer'"
# z = "MunsterSent"
newly_found = 0
for word in not_in_s2v:
    found = re.findall("[A-Z]{0,1}[a-z]+", word)
    for f in found:
        lowercased = f.lower()
        w = nlp(lowercased)[0]
        if not w._.s2v_vec is None:
            newly_found += 1
print(newly_found)

2659


In [76]:
print("Proportion not found but possible to find:", newly_found/(len(not_in_s2v)))
print("Proportion of vocabulary not possible to find in Sense2Vec:",(len(not_in_s2v)-newly_found)/len(unique_words))

Proportion not found but possible to find: 0.23063578801283718
Proportion of vocabulary not possible to find in Sense2Vec: 0.3107374321247154


I'm not sure it's worth reworking the tokenization and part-of-speech tagging to increase Sense2Vec's coverage of the vocabulary by only about 9%, so we'll keep going with the model input data as it is for now.  

### GloVe Word Embeddings

Load the [GloVe word embeddings](https://github.com/stanfordnlp/GloVe), which were trained on 2014 English Wikipedia entries and Gigaword 5:

*Note: could also try [GN-GloVe](https://github.com/uclanlp/gn_glove), which supposedly has gender-neutral word embeddings*

In [15]:
# Reference: https://medium.com/analytics-vidhya/basics-of-using-pre-trained-glove-vectors-in-python-d38905f356db
dimensions = ["50", "100", "200", "300"]  # pretrained GloVe embeddings come as vectors with one of these four dimensions
d = dimensions[0]  # start small to begin with
glove_path = config.inf_data_path+"glove.6B/glove.6B.{}d.txt".format(d)

In [18]:
glove = dict()
with open(glove_path, "r") as f:
    for line in f:
        values = line.split()
        word = values[0]
        vector = np.asarray(values[1:], "float32")
        glove[word] = vector
print(glove["recipient"])

[-0.13412    1.5041     0.39706   -0.41357    0.71336    0.63438
  0.1214     0.16746    1.4104     0.013127   0.68386    0.85236
  0.92599    0.098158   0.425     -0.83799    0.08482    0.22135
 -0.15247    0.72654    0.052728  -1.0064    -0.032626  -0.63342
  0.0039789 -1.5288    -0.74005   -1.2051    -1.0227    -0.10588
  1.2225     0.14845   -0.1641    -0.52887   -0.29012    0.59774
  0.62847    0.49003   -0.14227   -1.2193     0.56094   -0.17673
 -0.11216   -0.41801   -0.40841   -0.41748   -0.40276    0.25091
 -0.43016    0.26412  ]


Calculate GloVe's coverage of our vocabulary: 

In [19]:
not_in_glove = []
in_glove = dict()
for word in unique_words:
    lowercased = word.lower()
    try:
        vector = glove[lowercased]
        in_glove[word] = vector
    except KeyError:
        not_in_glove += [word]

print("Total words in vocabulary not in GloVe:", len(not_in_glove))
print("Proportion of vocabulary not in GloVe:",(len(not_in_glove))/len(unique_words))

Total words in vocabulary not in GloVe: 6453
Proportion of vocabulary not in GloVe: 0.1958005886458112


In [24]:
still_not_found = []
newly_found = 0
# partial_glove_match = dict()
for word in not_in_glove:
    found = re.findall("[A-Z]{0,1}[a-z]+", word)
    for f in found:
        lowercased = f.lower()
        try:
            vector = glove[lowercased]
            in_glove[word] = vector  # partial_glove_match[word] = vector
            newly_found += 1
        except KeyError:
            still_not_found += [word]
print("Not possible to find in GloVe:", len(still_not_found))
print("Proportion not found but possible to find in GloVe:", newly_found/(len(not_in_glove)))
print("Proportion of vocabulary not possible to find in GloVe:",(len(still_not_found))/len(unique_words))

Not possible to find in GloVe: 4187
Proportion not found but possible to find in GloVe: 0.7311328064466139
Proportion of vocabulary not possible to find in GloVe: 0.12704433049124617


GloVe has much better coverage than sense2vec, as expected due to the better domain match (Wikipedia entries are more similar to archival metadata descriptions than Reddit comments!).

In [25]:
words_to_vectors = in_glove.copy()
key_array = np.array(words_to_vectors.keys())
for word in unique_words:
    if word not in key_array:
        if word.lower() in key_array:
            vector = words_to_vectors[word.lower()]
        else:
            vector = np.array([])
        words_to_vectors[word] = vector

In [26]:
assert len(words_to_vectors) == len(unique_words)

Create a dataset associating each token to a GloVe word embedding:

In [55]:
embedding_col_name = "glove_embedding"
embedding_dict = glove

In [56]:
train_embeddings = utils.createEmbeddingDataFrame(df_train, embedding_dict, embedding_col_name)
train_embeddings.head()

Unnamed: 0,token_id,glove_embedding,token
3,3,"[-0.15234, 0.98085, 1.0065, 0.97812, 0.6628, 0...",Title
4,4,[],:
5,5,"[-0.15234, 0.98085, 1.0065, 0.97812, 0.6628, 0...",Papers
6,6,"[-0.15234, 0.98085, 1.0065, 0.97812, 0.6628, 0...",of
7,7,"[-0.15234, 0.98085, 1.0065, 0.97812, 0.6628, 0...",The


In [57]:
dev_embeddings = utils.createEmbeddingDataFrame(df_dev, embedding_dict, embedding_col_name)
dev_embeddings.head()

Unnamed: 0,token_id,glove_embedding,token
172,154,"[-0.15234, 0.98085, 1.0065, 0.97812, 0.6628, 0...",After
173,155,"[-0.15234, 0.98085, 1.0065, 0.97812, 0.6628, 0...",his
174,156,"[-0.15234, 0.98085, 1.0065, 0.97812, 0.6628, 0...",ordination
175,157,"[-0.15234, 0.98085, 1.0065, 0.97812, 0.6628, 0...",he
176,158,"[-0.15234, 0.98085, 1.0065, 0.97812, 0.6628, 0...",spent


Save the data:

In [58]:
# train_embeddings.to_csv(config.tokc_path+"glove_embeddings_train.csv")
# dev_embeddings.to_csv(config.tokc_path+"glove_embeddings_dev.csv")