In [1]:
from allennlp.modules.elmo import Elmo, batch_to_ids

options_file = "https://allennlp.s3.amazonaws.com/models/elmo/2x4096_512_2048cnn_2xhighway/elmo_2x4096_512_2048cnn_2xhighway_options.json"
weight_file = "https://allennlp.s3.amazonaws.com/models/elmo/2x4096_512_2048cnn_2xhighway/elmo_2x4096_512_2048cnn_2xhighway_weights.hdf5"


In [131]:
import pandas as pd
from scipy.spatial.distance import cosine
import pickle

In [3]:

# Compute two different representation for each token.
# Each representation is a linear weighted combination for the
# 3 layers in ELMo (i.e., charcnn, the outputs of the two BiLSTM))
elmo = Elmo(options_file, weight_file, 1, dropout=0)


In [134]:
df = pd.read_csv('dem_srl_df.tsv', sep='\t')

In [135]:
df.head()

Unnamed: 0,Text,Verb_annotation
0,"['We', 'had', 'our', 'own', 'bad', 'weather', ...","{'verb': 'had', 'description': ""[ARG0: We] [V:..."
1,"['We', 'had', 'our', 'own', 'bad', 'weather', ...","{'verb': ""'re"", 'description': ""We had our own..."
2,"['We', 'had', 'our', 'own', 'bad', 'weather', ...","{'verb': 'thinking', 'description': ""We had ou..."
3,"['We', 'had', 'our', 'own', 'bad', 'weather', ...","{'verb': 'were', 'description': ""We had our ow..."
4,"['We', 'had', 'our', 'own', 'bad', 'weather', ...","{'verb': 'hit', 'description': ""We had our own..."


In [136]:
len(df)

90894

In [137]:
all_text = df.Text.drop_duplicates().tolist()

In [138]:
len(all_text)

28869

In [None]:
batch_size = 64
start = 0
all_embeddings = []
while start < len(all_text):
    batch = all_text[start: start+batch_size]
    character_ids = batch_to_ids(batch)
    emb = elmo(character_ids)['elmo_representations']
    for item in emb[0]:
        all_embeddings.append(item)
    start += batch_size

In [39]:
all_embeddings_np = [e.detach().numpy() for e in all_embeddings]

In [40]:
df_text_embeddings = pd.DataFrame(zip(df_text[0:11455], all_embeddings_np), columns=['Text', 'Embeddings'])

In [41]:
df_text_embeddings.head()

Unnamed: 0,Text,Embeddings
0,"['We', 'had', 'our', 'own', 'bad', 'weather', ...","[[0.02317109, -0.7599905, -0.66356254, -0.4027..."
1,"['We', 'are', 'with', 'you', 'in', 'spirit', '...","[[0.046381567, -0.75730634, -0.65926844, -0.39..."
2,"['RT', '@AGNeronha', ':', 'Questions', 'about'...","[[0.041496996, -0.7381885, -0.64177155, -0.373..."
3,"['RI', ""'s"", '@AGNeronha', 'is', 'leading', 'e...","[[0.065919176, -0.75744694, -0.64841074, -0.39..."
4,"['If', 'you', 'see', 'a', '#', 'COVID19scam', ...","[[0.012046866, -0.8399536, -0.46612412, -0.552..."


In [89]:
df_srl_emb = pd.merge(df, df_text_embeddings, on='Text', how='inner')

In [109]:
df_srl_emb.head()

Unnamed: 0,Text,Verb_annotation,Embeddings
0,"['We', 'had', 'our', 'own', 'bad', 'weather', ...","{'verb': 'had', 'description': ""[ARG0: We] [V:...","[[0.02317109, -0.7599905, -0.66356254, -0.4027..."
1,"['We', 'had', 'our', 'own', 'bad', 'weather', ...","{'verb': ""'re"", 'description': ""We had our own...","[[0.02317109, -0.7599905, -0.66356254, -0.4027..."
2,"['We', 'had', 'our', 'own', 'bad', 'weather', ...","{'verb': 'thinking', 'description': ""We had ou...","[[0.02317109, -0.7599905, -0.66356254, -0.4027..."
3,"['We', 'had', 'our', 'own', 'bad', 'weather', ...","{'verb': 'were', 'description': ""We had our ow...","[[0.02317109, -0.7599905, -0.66356254, -0.4027..."
4,"['We', 'had', 'our', 'own', 'bad', 'weather', ...","{'verb': 'hit', 'description': ""We had our own...","[[0.02317109, -0.7599905, -0.66356254, -0.4027..."


In [None]:
all_wor

In [91]:
len(df_srl_emb)

35564

In [133]:
pickle.dump(df_srl_emb, open('srl_elmo_dem.tsv', 'wb'))