In [74]:
import nltk  
from nltk.tokenize import wordpunct_tokenize
from string import punctuation
from collections import defaultdict
import warnings
from deepmultilingualpunctuation import PunctuationModel
from nltk.corpus import stopwords
from nltk.corpus import stopwords
from tqdm import tqdm



In [2]:
# Loading the textgrids

# Rstories are the names of the training (or Regression) stories, which we will use to fit our models
Rstories = ['alternateithicatom', 'avatar', 'howtodraw', 'legacy', 
            'life', 'myfirstdaywiththeyankees', 'naked', 
            'odetostepfather', 'souls', 'undertheinfluence']

# Pstories are the test (or Prediction) stories (well, story), which we will use to test our models
Pstories = ['wheretheressmoke']

allstories = Rstories + Pstories

# Load TextGrids
from stimulus_utils import load_grids_for_stories
grids = load_grids_for_stories(allstories)

# Load TRfiles
from stimulus_utils import load_generic_trfiles
trfiles = load_generic_trfiles(allstories)

# Make word and phoneme datasequences
from dsutils import make_word_ds, make_phoneme_ds
wordseqs = make_word_ds(grids, trfiles) # dictionary of {storyname : word DataSequence}
phonseqs = make_phoneme_ds(grids, trfiles) # dictionary of {storyname : phoneme DataSequence}

In [3]:
naked = wordseqs["naked"]

# naked.data is a list of all the words in the story
print ("There are %d words in the story called 'naked'" % len(list(naked.data)))

There are 3218 words in the story called 'naked'


In [47]:

# Only run these once: This just downloads the punkt tokenizer, which is necessary for the word_tokenize function and the list of stopwords

# nltk.download('punkt')
# nltk.download('stopwords')


[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/thomasmcgall/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


True

In [4]:

import sys
sys.path.append('../')

import numpy as np
from LMMS.transformers_encoder import TransformersEncoder
from LMMS.vectorspace import SensesVSM

import spacy
en_nlp = spacy.load('en_core_web_sm')  # required for lemmatization and POS-tagging

from LMMS.wn_utils import WN_Utils
wn_utils = WN_Utils()  # WordNet auxilliary methods (just for describing results)

  from .autonotebook import tqdm as notebook_tqdm


In [5]:
# NLM/LMMS paths and parameters
vecs_path = '/Users/thomasmcgall/Desktop/research/LMMS/data/vectors/lmms-sp-wsd.albert-xxlarge-v2.vectors.txt'
wsd_encoder_cfg = {
    'model_name_or_path': 'albert-xxlarge-v2',
    'min_seq_len': 0,
    'max_seq_len': 512,
    'layers': [-n for n in range(1, 12 + 1)],  # all layers, with reversed indices
    'layer_op': 'ws',
    'weights_path': '/Users/thomasmcgall/Desktop/research/LMMS/data/weights/lmms-sp-wsd.albert-xxlarge-v2.weights.txt'
,
    'subword_op': 'mean'
}

print('Loading NLM and sense embeddings ...')  # (takes a while)
wsd_encoder = TransformersEncoder(wsd_encoder_cfg)
senses_vsm = SensesVSM(vecs_path, normalize=True)
print('Done')

Loading NLM and sense embeddings ...
Done


In [11]:
input_text = ' '.join(naked.data[:500])

In [18]:

# https://huggingface.co/oliverguhr/fullstop-punctuation-multilang-large

model = PunctuationModel()
text = "My name is Clara and I live in Berkeley California Ist das eine Frage Frau Müller"
result = model.restore_punctuation(text)
print(result)


KeyboardInterrupt: 

In [None]:
# Can also just ask ChatGpt to punctuate but need paid API key
import requests

API_KEY = '<API_KEY>'

# The OpenAI API URL for ChatGPT
API_URL = 'https://api.openai.com/v1/chat/completions'

headers = {
    'Authorization': f'Bearer {API_KEY}',
    'Content-Type': 'application/json',
}


data = {
    'model': 'gpt-3.5-turbo',  
    'messages': [
        {'role': 'user', 'content': f"Punctuate the following text: {input_text}"}
    ],
}

response = requests.post(API_URL, headers=headers, json=data)

if response.status_code == 200:
    response_data = response.json()
    if response_data['choices']:
        answer = response_data['choices'][0]['message']['content']
        print("Answer from ChatGPT:", answer.strip())
    else:
        print("No answer received from ChatGPT.")
else:
    print("Failed to get a response from the API. Status code:", response.status_code)

In [33]:
# Helper functions

#get the surrounding words of a given index
def get_surrounding_words(index, context_size, words):
    # If context size is larger than the number of words, set context size to the number of words
    if context_size > len(words):
        context_size = len(words)

    half_window = context_size // 2
    word_count = len(words)
    
    start_index = index - half_window
    end_index = index + half_window   + context_size % 2  
    
    if start_index < 0:
        surrounding_words = words[0:context_size] 
        relative_word_idx = index
    elif end_index > word_count:
        surrounding_words = words[word_count - context_size:word_count]
        relative_word_idx = index - (word_count - context_size)

    else:
        surrounding_words = words[start_index:end_index]
        relative_word_idx = context_size//2 

    return ' '.join(surrounding_words), relative_word_idx

# Ensure that this model is not adding/removing any words
def check_word_count(input_text,output_text):
    # Tokenize the string into words and punctuation
    tokens_input = wordpunct_tokenize(input_text)
    # Remove tokens that are entirely punctuation, mixed(punctuation and word) tokens may include datees, emails, contractions, ...
    words = [token for token in tokens_input if not all(char in punctuation for char in token)]
    # get number of word tokens
    input_num= len(words)

    tokens_output = wordpunct_tokenize(output_text)

    words = [token for token in tokens_output if not all(char in punctuation for char in token)]
    output_num= len(words)

    if(input_num != output_num):
        warnings.warn("Model may have introduced or removed words. Please check the model.")
    

In [80]:
def generate_sense_embeddings(input_text, context_length = 'sentence'):
    '''
    Parameters
    ----------
    input_text : string
        The text for which to generate sense embeddings.
    context_length : int or string('sentence')
        The number of words to use as context for each word in the input text. 
        If 'sentence', the entire sentence will be used as context for each word.
    
    Returns
    -------
    sense_embeddings, missing_sense_embeddings, sentences

    sense_embeddings: list of tuples
        tuple - (word, sense, sense_embedding)
            word: str:
                The word in the input text for which embeddings were generated.
            sense: str:
                The sense of the word in the input text.
                ex)'grow%2:30:04::'
            sense_embedding: np.array
                The sense embedding of the word in the input text.
    missing_sense_embeddings: dict
        A dictionary of non-stopwords that do not have sense embeddings
        dictionary - {word: count of apperances in input text}
    sentences - list
        list of sentences in the input text. 


    '''
    #list of stopwords from nltk for reference
    stop_words = set(stopwords.words('english'))

    #dictionary that maps from word(missing sense embedding) to the number of times it appears in the input text
    missing_sense_embeddings = defaultdict(int)

    # context length can be 'sentence' or int(words) where words is the number of words to use as context 
    if context_length == 'sentence':
        # May need to split text into chunks before feeding the punctuator?

        # Punctuate Sentences
        punctuation_model = PunctuationModel()
        punctuated_output = punctuation_model.restore_punctuation(input_text)

        # Will raise warning if number of non-punctuation tokens in input text is not equal to output text
        # NOTE: This is not a perfect check as new tokens may be generated that are a mix of punctuation and existing words
        check_word_count(input_text,punctuated_output)

        # Split sentences by Punctuation
        sentences = nltk.sent_tokenize(punctuated_output)  

        sense_embeddings = []

        for idx,sentence in enumerate(sentences) :
            print(f"{idx+1}/{len(sentences)}")
            for word_idx in range(0,len(sentence.split())):
                context = sentence
                target_idxs = [word_idx]  # for 'mouse'

                # use spacy to automatically determine lemma and POS (replace with your favorite NLP toolkit)
                doc = en_nlp(context)
                target_lemma = '_'.join([doc[i].lemma_ for i in target_idxs])
                target_pos = doc[target_idxs[0]].pos_

                # retrieve contextual embedding for target token/span
                tokens = [t.text for t in doc]
                ctx_embeddings = wsd_encoder.token_embeddings([tokens])[0]
                target_embedding = np.array([ctx_embeddings[i][1] for i in target_idxs]).mean(axis=0)
                target_embedding = target_embedding / np.linalg.norm(target_embedding)


                # find sense embeddings that are nearest-neighbors to the target contextual embedding
                # candidates restricted by lemma and part-of-speech
                matches = senses_vsm.match_senses(target_embedding, lemma=target_lemma, postag=target_pos, topn=3)

                word = context.split()[word_idx]
                if len(matches) == 0:
                    # No sense embeddings found, append (,)
                    sense_embeddings.append((word,None,None))
                    #increment count for missing embedding
                    if word not in stop_words:
                        missing_sense_embeddings[word] += 1
                    continue
                else:
                    senses_vsm.get_vec(matches[0][0])
                    sense_embeddings.append((word,matches[0][0],senses_vsm.get_vec(matches[0][0])))

        return sense_embeddings, missing_sense_embeddings,sentences


    else:
        assert type(context_length) is int, "context_length must be an integer or 'sentence'"
        sense_embeddings = []

        number_of_words_input = len(input_text.split())
        for word_idx in tqdm(range(0, number_of_words_input), desc="Processing"):   
            # assuming context is centered around the word of interest
            context,relative_idx = get_surrounding_words(word_idx, context_length, input_text.split())

            # print(context, relative_idx)
            
            target_idxs = [relative_idx] 

            # use spacy to automatically determine lemma and POS (replace with your favorite NLP toolkit)
            doc = en_nlp(context)
            target_lemma = '_'.join([doc[i].lemma_ for i in target_idxs])
            target_pos = doc[target_idxs[0]].pos_


            # retrieve contextual embedding for target token/span
            tokens = [t.text for t in doc]
            ctx_embeddings = wsd_encoder.token_embeddings([tokens])[0]
            target_embedding = np.array([ctx_embeddings[i][1] for i in target_idxs]).mean(axis=0)
            target_embedding = target_embedding / np.linalg.norm(target_embedding)


            # find sense embeddings that are nearest-neighbors to the target contextual embedding
            # candidates restricted by lemma and part-of-speech
            matches = senses_vsm.match_senses(target_embedding, lemma=target_lemma, postag=target_pos, topn=3)

            # print(len(matches))
            word = context.split()[relative_idx]
            if len(matches) == 0:
                # No sense embeddings found, append (,)
                sense_embeddings.append((word,None,None))
                #increment count for missing embedding
                if word not in stop_words:
                    missing_sense_embeddings[word] += 1
                continue
            else:
                senses_vsm.get_vec(matches[0][0])
                sense_embeddings.append((word,matches[0][0],senses_vsm.get_vec(matches[0][0])))

        return sense_embeddings, missing_sense_embeddings,sentences
            
        

In [None]:
#example usage 
life = wordseqs["life"]
len(life.data)

#get first 20 words of the story
input_text = ' '.join(life.data)

embeddings,missing_embeddings,sentences = generate_sense_embeddings(input_text, context_length = 'sentence')


In [92]:
for word, sense, embedding in embeddings[:10]:
    embedding_info = embedding.shape if embedding is not None and embedding.size > 0 else "No embedding"
    sense_info = sense if sense else "No sense"
    print(f"{word} : {sense_info}, {embedding_info}")

so : so%4:02:07::, (4096,)
i : No sense, No embedding
get : get%2:30:03::, (4096,)
a : No sense, No embedding
phone : phone%1:06:00::, (4096,)
call : call%1:10:01::, (4096,)
from : No sense, No embedding
my : No sense, No embedding
mom : mom%1:18:00::, (4096,)
and : No sense, No embedding


In [91]:
print(list(missing_embeddings.keys())[:10])

['montana', 'kinda', 'telling', 'worry,', 'gonna', 'ok,', "we're", 'this,', 'alright.', 'something']


In [93]:
for sentence in sentences[:10]:
    print(sentence)

so i get a phone call from my mom and she tells me that my father is about to get on an emergency life flight from our home in montana to go to denver to get an emergency liver transplant.
my mom is kinda perennially optimistic and she's telling me: don't worry, it's gonna be ok, we're gonna pull through this, it's gonna be alright.
but i know something is is really wrong.
so i get the next flight i can to go from where i'm living here in new york, hoping that i get there before my father dies, and i'm really glad i i got that flight as fast as i did because i was able to spend a couple hours with my father before he passed away.
and before i know it i'm at this side of his hospital bed with my mom and we're sobbing because he's passed.
um, my dad was a. he was a strong, silent type.
he was um, he was a grew up on a farm and he was the town one of one of two town eye doctors.
so he could fix anything.
you know, he could fix tractors or eyes.
he could, you know, no matter what, and he w