In [65]:
from sentence_transformers import SentenceTransformer, util
import pandas as pd
import re

In [66]:
# one of the quicker pre-trained transformer models, although there are more accurate ones
sbert_model = SentenceTransformer('all-MiniLM-L12-v2')

In [67]:
# read in and clean file with top 100k words
df = pd.read_csv("count_1w100k.txt", header=None)
df["words"] = df[0].str.split("\\t", expand=True)[0]
df.head()

Unnamed: 0,0,words
0,THE\t23135851162,THE
1,OF\t13151942776,OF
2,AND\t12997637966,AND
3,TO\t12136980858,TO
4,A\t9081174698,A


In [68]:
# create list of words
words = df["words"].tolist()

In [69]:
# TODO: limit file to only top x words if desired to limit file size of embedding file
#words = words[0:20000]

In [70]:
# clean words as prep for embeddings
words = [str(word) for word in words]
words = [word.lower() for word in words if len(word) > 2]
p = re.compile("[.!?\\-]")
words = [word for word in words if not p.match(word)]

In [71]:
embeddings = sbert_model.encode(words)
df2 = pd.DataFrame({'words': words, 'Embedding': None})
df2['Embedding'] = embeddings.tolist()
df2.head()

Unnamed: 0,words,Embedding
0,the,"[-0.05535852909088135, 0.06034570187330246, 0...."
1,and,"[0.0064349789172410965, 0.03416347876191139, 0..."
2,for,"[-0.09993784129619598, 0.0073022376745939255, ..."
3,that,"[0.011291169561445713, 0.04566764459013939, 0...."
4,this,"[-0.04582986980676651, 0.027872908860445023, 0..."


In [None]:
df2.to_pickle('words_pickle.pickle')

In [90]:
# reading in dataset with full set of words
emb_df = pd.read_pickle('words_pickle.pickle')[['words', 'Embedding']]
words = emb_df['words']

# creating a smaller set of *popular* words to be used as the prompt words
words_small = words[0:5000]

In [94]:
rand_sample = random.sample(range(1, len(set(words_small))), 5)
r1 = words_small[rand_sample[0]]
r2 = words_small[rand_sample[1]]
prompt_words = [r1, r2]

In [95]:
input_words = ["here", "there", "mom", "dad"]
input_words = [word.lower() for word in input_words]

In [100]:
all_words = [prompt_words[0]] + input_words + [prompt_words[1]]

In [108]:
# create df that will serve as main df to join emb_df with
df = pd.DataFrame(all_words).rename(columns = {0:'word'})
df['word2'] = df['word'].shift(-1)
df['word3'] = df['word'].shift(-2)
df = df.iloc[0:4]

In [109]:
# add embbeddings for input words onto dataset via joining
df = df.merge(emb_df, left_on = "word", right_on = "words", how = "left").rename(columns = {"Embedding":"word_embs"}).drop(columns = {'words'})
df = df.merge(emb_df, left_on = "word2", right_on = "words", how = "left").rename(columns = {"Embedding":"word2_embs"}).drop(columns = {'words'})
df = df.merge(emb_df, left_on = "word3", right_on = "words", how = "left").rename(columns = {"Embedding":"word3_embs"}).drop(columns = {'words'})

In [110]:
df

Unnamed: 0,word,word2,word3,word_embs,word2_embs,word3_embs
0,characters,here,there,"[-0.07646714150905609, 0.04853050038218498, 0....","[-0.018403148278594017, -0.03153776004910469, ...","[-0.020763833075761795, 0.007058394607156515, ..."
1,here,there,mom,"[-0.018403148278594017, -0.03153776004910469, ...","[-0.020763833075761795, 0.007058394607156515, ...","[-0.013820691034197807, -0.0153655419126153, 0..."
2,there,mom,dad,"[-0.020763833075761795, 0.007058394607156515, ...","[-0.013820691034197807, -0.0153655419126153, 0...","[-0.04803462326526642, 0.057016823440790176, 0..."
3,mom,dad,muscle,"[-0.013820691034197807, -0.0153655419126153, 0...","[-0.04803462326526642, 0.057016823440790176, 0...","[-0.06318749487400055, -0.059611767530441284, ..."


In [88]:
util.pytorch_cos_sim(df['input_embs'][i], df['word_embs'][i])[0].numpy().max()

0.22855799