In [2]:
# Load the stimulus annotations (TextGrids + TR files) for every story and
# turn them into word- and phoneme-level DataSequences.
from stimulus_utils import load_grids_for_stories, load_generic_trfiles
from dsutils import make_word_ds, make_phoneme_ds

# Training ("Regression") stories: used to fit the models
Rstories = [
    'alternateithicatom',
    'avatar',
    'howtodraw',
    'legacy',
    'life',
    'myfirstdaywiththeyankees',
    'naked',
    'odetostepfather',
    'souls',
    'undertheinfluence',
]

# Test ("Prediction") stories (currently a single story): used to evaluate the models
Pstories = ['wheretheressmoke']

allstories = [*Rstories, *Pstories]

# TextGrids and TR files, loaded for all stories at once
grids = load_grids_for_stories(allstories)
trfiles = load_generic_trfiles(allstories)

# Dictionaries of {storyname : DataSequence}
wordseqs = make_word_ds(grids, trfiles)     # word-level
phonseqs = make_phoneme_ds(grids, trfiles)  # phoneme-level

In [3]:
# Quick sanity check on one story's word DataSequence
naked = wordseqs["naked"]

# naked.data holds every word of the story; materialize it to count them
print(
    "There are %d words in the story called 'naked'"
    % len(list(naked.data))
)

There are 3218 words in the story called 'naked'


In [4]:
import nltk  

# Only run once: This just downloads the punkt tokenizer, which is necessary for the word_tokenize function 
# nltk.download('punkt')

In [5]:

# Add the parent directory to the module search path; this must run before
# the `LMMS` imports below (presumably the LMMS package lives there -- verify
# against the repository layout).
import sys
sys.path.append('../')

import numpy as np
from LMMS.transformers_encoder import TransformersEncoder
from LMMS.vectorspace import SensesVSM

# spaCy pipeline shared by the cells below
import spacy
en_nlp = spacy.load('en_core_web_sm')  # required for lemmatization and POS-tagging

from LMMS.wn_utils import WN_Utils
wn_utils = WN_Utils()  # WordNet auxiliary methods (just for describing results)

  from .autonotebook import tqdm as notebook_tqdm


In [6]:
# NLM/LMMS paths and parameters
import os

# Root of the LMMS data directory. The default is the location used when this
# notebook was written; set the LMMS_DATA_DIR environment variable to run the
# notebook on another machine without editing this cell.
LMMS_DATA_DIR = os.environ.get(
    'LMMS_DATA_DIR', '/Users/thomasmcgall/Desktop/research/LMMS/data')

vecs_path = os.path.join(
    LMMS_DATA_DIR, 'vectors', 'lmms-sp-wsd.albert-xxlarge-v2.vectors.txt')
wsd_encoder_cfg = {
    'model_name_or_path': 'albert-xxlarge-v2',
    'min_seq_len': 0,
    'max_seq_len': 512,
    'layers': [-n for n in range(1, 12 + 1)],  # all layers, with reversed indices
    'layer_op': 'ws',
    'weights_path': os.path.join(
        LMMS_DATA_DIR, 'weights', 'lmms-sp-wsd.albert-xxlarge-v2.weights.txt'),
    'subword_op': 'mean'
}

print('Loading NLM and sense embeddings ...')  # (takes a while)
wsd_encoder = TransformersEncoder(wsd_encoder_cfg)
senses_vsm = SensesVSM(vecs_path, normalize=True)
print('Done')

Loading NLM and sense embeddings ...
Done


In [8]:
# Hugging Face seq2seq model used to re-insert punctuation into raw word streams
from transformers import (
    T5Tokenizer,
    TFT5ForConditionalGeneration,
    T5ForConditionalGeneration,
)

# Tokenizer and (TensorFlow) model are loaded from the same checkpoint
PUNCTUATION_CHECKPOINT = 'SJ-Ray/Re-Punctuate'

punctuation_tokenizer = T5Tokenizer.from_pretrained(PUNCTUATION_CHECKPOINT)
punctuation_model = TFT5ForConditionalGeneration.from_pretrained(PUNCTUATION_CHECKPOINT)

All model checkpoint layers were used when initializing TFT5ForConditionalGeneration.

All the layers of TFT5ForConditionalGeneration were initialized from the model checkpoint at SJ-Ray/Re-Punctuate.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFT5ForConditionalGeneration for predictions without further training.


In [1]:
def get_surrounding_words(index, context_size, words):
    """Return a window of `context_size` words around `words[index]`.

    The window is centred on `index` where possible. Near either end of
    `words` it is shifted (not shrunk) so that it still holds `context_size`
    words. A `context_size` larger than `len(words)` is reduced to
    `len(words)`.

    Parameters
    ----------
    index : int
        Position of the word of interest in `words`.
    context_size : int
        Number of words in the window (including the word of interest).
    words : list of str
        All words of the text.

    Returns
    -------
    (str, int)
        The window joined with single spaces, and the position of the word
        of interest inside that window.
    """
    total = len(words)
    window = min(context_size, total)

    # Ideal (centred) start, then clamp so the window stays inside `words`
    centered_start = index - window // 2
    window_start = max(0, min(centered_start, total - window))
    window_end = window_start + window

    return ' '.join(words[window_start:window_end]), index - window_start

In [30]:
def _align_words_to_tokens(context, doc):
    """For each whitespace-delimited word of `context`, list the indices of the `doc` tokens inside it."""
    # Character span [start, end) of every whitespace-delimited word
    spans = []
    cursor = 0
    for word in context.split():
        start = context.index(word, cursor)
        cursor = start + len(word)
        spans.append((start, cursor))

    token_idxs_per_word = [[] for _ in spans]
    word_no = 0
    for token in doc:
        # Advance to the first word whose span does not end before this token
        while word_no < len(spans) and token.idx >= spans[word_no][1]:
            word_no += 1
        if word_no == len(spans):
            break
        # Tokens that start between words (e.g. whitespace tokens) are skipped
        if token.idx >= spans[word_no][0]:
            token_idxs_per_word[word_no].append(token.i)
    return token_idxs_per_word


def _analyze_context(context):
    """Run spaCy and the WSD encoder once over `context`; return (words, doc, ctx_embeddings, token_idxs_per_word)."""
    doc = en_nlp(context)
    tokens = [t.text for t in doc]
    # One (token, embedding) entry per token is assumed, as in the original indexing
    ctx_embeddings = wsd_encoder.token_embeddings([tokens])[0]
    return context.split(), doc, ctx_embeddings, _align_words_to_tokens(context, doc)


def _sense_for_word(word_idx, analysis):
    """Disambiguate the `word_idx`-th whitespace word of an analyzed context; return (word, sense, embedding)."""
    words, doc, ctx_embeddings, token_idxs_per_word = analysis
    word = words[word_idx]

    token_idxs = token_idxs_per_word[word_idx]
    if not token_idxs:
        return (word, None, None)

    # spaCy may split one whitespace word into several tokens ("don't" -> "do" + "n't",
    # "mom," -> "mom" + ","). Use the first non-punctuation token as the target so the
    # lemma, POS tag and embedding all belong to the word that is reported.
    content_idxs = [i for i in token_idxs if not doc[i].is_punct]
    target_idxs = (content_idxs or token_idxs)[:1]

    target_lemma = '_'.join([doc[i].lemma_ for i in target_idxs])
    target_pos = doc[target_idxs[0]].pos_

    # contextual embedding of the target token, L2-normalized
    target_embedding = np.array([ctx_embeddings[i][1] for i in target_idxs]).mean(axis=0)
    target_embedding = target_embedding / np.linalg.norm(target_embedding)

    # nearest-neighbor sense embeddings, candidates restricted by lemma and part-of-speech
    matches = senses_vsm.match_senses(target_embedding, lemma=target_lemma, postag=target_pos, topn=3)
    if len(matches) == 0:
        return (word, None, None)

    best_sense = matches[0][0]
    return (word, best_sense, senses_vsm.get_vec(best_sense))


def generate_sense_embeddings(input_text, context_length = 'sentence', verbose = False):
    '''
    Assign a sense and a sense embedding to every word of `input_text`.

    Parameters
    ----------
    input_text : string
        The text for which to generate sense embeddings.
    context_length : int or string('sentence')
        The number of words to use as context for each word in the input text.
        If 'sentence', the text is first punctuated and split into sentences,
        and the entire sentence is used as context for each of its words.
    verbose : bool
        If True, print each sentence as it is processed ('sentence' mode only).

    Returns
    -------
    list of (word, sense, sense_embedding) tuples, one per whitespace-delimited word

    word: str:
        The word for which a sense was looked up. In 'sentence' mode the words
        come from the punctuated text, so they can carry attached punctuation.
    sense: str or None:
        The best-matching sense of the word, or None if no sense matched.
        ex)'grow%2:30:04::'
    sense_embedding: np.array or None
        The embedding of that sense, or None if no sense matched.

    Raises
    ------
    AssertionError
        If `context_length` is neither 'sentence' nor an int.
    ValueError
        If `context_length` is an int smaller than 1.
    '''
    sense_embeddings = []

    # context length can be 'sentence' or int(words) where words is the number of words to use as context
    if context_length == 'sentence':
        # Punctuate the raw text
        inputs = punctuation_tokenizer.encode("punctuate: " + input_text, return_tensors="tf")
        # NOTE(review): generate() is called with its default length limit; long inputs may
        # come back truncated unless the checkpoint config raises it -- confirm and pass an
        # explicit max_length / max_new_tokens if needed.
        result = punctuation_model.generate(inputs)
        punctuated_output = punctuation_tokenizer.decode(result[0], skip_special_tokens=True)

        # Split into sentences at the inserted punctuation
        for sentence in nltk.sent_tokenize(punctuated_output):
            if verbose:
                print(sentence)
            if not sentence.split():
                continue
            # The context is the same for every word of the sentence, so the
            # spaCy pass and the encoder forward pass are done once per sentence.
            analysis = _analyze_context(sentence)
            for word_idx in range(len(analysis[0])):
                sense_embeddings.append(_sense_for_word(word_idx, analysis))

        return sense_embeddings

    assert type(context_length) is int, "context_length must be an integer or 'sentence'"
    if context_length < 1:
        raise ValueError("context_length must be at least 1")

    words = input_text.split()
    cached_context, analysis = None, None
    for word_idx in range(len(words)):
        # context is centered around the word of interest (shifted at the edges)
        context, relative_idx = get_surrounding_words(word_idx, context_length, words)

        # Consecutive words can share a window (at the edges, or when the window
        # covers the whole text); reuse the analysis instead of re-encoding.
        if context != cached_context:
            cached_context, analysis = context, _analyze_context(context)

        sense_embeddings.append(_sense_for_word(relative_idx, analysis))

    return sense_embeddings
            
        

In [31]:
# Example usage: sense-tag the opening of the story "life"
life = wordseqs["life"]

# Join the first 20 words of the story back into one plain string
firstWords = ' '.join(life.data[0:20])
print(firstWords)

# Each word is disambiguated using a 5-word window around it
embeddings = generate_sense_embeddings(
    firstWords,
    context_length=5,
)



so i get a phone call from my mom and she tells me that my father is about to get


In [32]:
# One line per word: its sense key and the shape of the sense embedding,
# with placeholders for words that received no sense.
for word, sense, embedding in embeddings:
    if embedding is not None and embedding.size > 0:
        embedding_info = embedding.shape
    else:
        embedding_info = "No embedding"

    sense_info = sense or "No sense"
    print(f"{word} : {sense_info}, {embedding_info}")

so : so%4:02:09::, (4096,)
i : No sense, No embedding
get : get%2:39:00::, (4096,)
a : No sense, No embedding
phone : phone%1:06:00::, (4096,)
call : call%1:10:01::, (4096,)
from : No sense, No embedding
my : No sense, No embedding
mom : mom%1:18:00::, (4096,)
and : No sense, No embedding
she : No sense, No embedding
tells : tell%2:32:04::, (4096,)
me : No sense, No embedding
that : No sense, No embedding
my : No sense, No embedding
father : father%1:18:00::, (4096,)
is : No sense, No embedding
about : about%5:00:00:active:01, (4096,)
to : No sense, No embedding
get : get%2:33:00::, (4096,)
