In [1]:
from datasets import load_dataset
import numpy as np
import spacy
import torch
# import en_core_web_trf
import en_core_web_sm
from collections import Counter


  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Load the Spacy model (Small model for CPU usage)
nlp = spacy.load("en_core_web_sm")  # Smaller model suitable for CPU

# Load the corpus dataset
dataset = load_dataset("generics_kb", "generics_kb_best")

In [26]:
dataset['train'][:1000]

{'source': ['Waterloo',
  'ARC',
  'ARC',
  'Waterloo',
  'WordNet3.0',
  'ARC',
  'ARC',
  'ARC',
  'ARC',
  'ARC',
  'ARC',
  'Waterloo',
  'ConceptNet',
  'ConceptNet',
  'ARC',
  'ARC',
  'ARC',
  'ARC',
  'ARC',
  'ARC',
  'ARC',
  'ARC',
  'ARC',
  'ConceptNet',
  'ARC',
  'ARC',
  'ARC',
  'Waterloo',
  'SimpleWikipedia',
  'ARC',
  'ARC',
  'ConceptNet',
  'Waterloo',
  'Waterloo',
  'ConceptNet',
  'ARC',
  'ARC',
  'ARC',
  'ARC',
  'ARC',
  'ARC',
  'Waterloo',
  'ARC',
  'Waterloo',
  'ConceptNet',
  'Waterloo',
  'TupleKB',
  'Waterloo',
  'TupleKB',
  'TupleKB',
  'TupleKB',
  'TupleKB',
  'ARC',
  'SimpleWikipedia',
  'TupleKB',
  'Waterloo',
  'TupleKB',
  'TupleKB',
  'TupleKB',
  'TupleKB',
  'TupleKB',
  'ARC',
  'ARC',
  'ARC',
  'ARC',
  'TupleKB',
  'TupleKB',
  'TupleKB',
  'TupleKB',
  'WordNet3.0',
  'WordNet3.0',
  'WordNet3.0',
  'WordNet3.0',
  'WordNet3.0',
  'WordNet3.0',
  'WordNet3.0',
  'WordNet3.0',
  'TupleKB',
  'WordNet3.0',
  'TupleKB',
  'WordNet3

In [27]:
dataset['train'] = dataset["train"][:1000]

In [3]:
# Choose a subset of sentences in the corpus, and make a bank of words 
# labeled by syntactic category.

# small subset of sentences for lighter load 
np.random.seed(42)
sentences_to_use = np.random.choice(range(len(dataset['train'])), 10000, replace=False)
sentence_pipe = nlp.pipe(dataset['train'][sentences_to_use[0:3000]]['generic_sentence']) #only used 3000 to avoid OOM error
word_bank = [(w.text, w.pos_, w.tag_, w.dep_, w.is_stop) for x in list(sentence_pipe) for w in x]
'''
w.text: The actual word (text).
w.pos_: The coarse-grained part of speech (e.g., 'NOUN', 'VERB').
w.tag_: The fine-grained part of speech tag (e.g., 'NNP' for proper singular noun).
w.dep_: The syntactic dependency relation showing the word's relation to other words (e.g., 'nsubj' for nominal subject).
w.is_stop: A boolean indicating whether the word is a stop word (common word like 'and', 'the', etc., which are often filtered out in NLP tasks).
'''

"\nw.text: The actual word (text).\nw.pos_: The coarse-grained part of speech (e.g., 'NOUN', 'VERB').\nw.tag_: The fine-grained part of speech tag (e.g., 'NNP' for proper singular noun).\nw.dep_: The syntactic dependency relation showing the word's relation to other words (e.g., 'nsubj' for nominal subject).\nw.is_stop: A boolean indicating whether the word is a stop word (common word like 'and', 'the', etc., which are often filtered out in NLP tasks).\n"

In [12]:
list(sentence_pipe)[:3]

[]

In [7]:
# Iterate through processed sentences and print the first two
for i, doc in enumerate(sentence_pipe):
    print(f"Sentence {i+1}: {doc.text}")
    if i >= 1:  # Since i starts at 0, this will print Sentence 1 and Sentence 2
        break


In [8]:

# Load the Pereira sentences.  We replace "x-ray" with "scan" since the Spacy
# tokenizer makes "x-ray" into 3 tokens, which makes generated sentences become 
# extremely unlikely to have the correct subtree structure.
pereira_sentences = open('/Users/thomasmcgall/Desktop/research/speechmodeltutorial/sentences_ordered.txt', 'r').read()
pereira_sentences = pereira_sentences.replace('x-ray', 'scan')

In [9]:
# Now we get a list of all of the words in the Pereira sentences in their original order, 
# each in a tuple with the other variables defining the word's syntactic category.
# For convenience, we also create a list just with the syntactic category information.
pereira_sentence_pipe = nlp.pipe([x.strip() for x in pereira_sentences.split('\n')[:-1]])
pereira_words = [(w.text, w.pos_, w.tag_, w.dep_, w.is_stop) for x in list(pereira_sentence_pipe) for w in x]
pereira_word_types = [x[1:] for x in pereira_words]

In [11]:
print(pereira_word_types[:10])

[('NOUN', 'NNS', 'nsubj', False), ('VERB', 'VBP', 'ROOT', False), ('DET', 'DT', 'det', True), ('ADJ', 'JJ', 'amod', False), ('NOUN', 'NN', 'npadvmod', False), ('ADP', 'IN', 'prep', True), ('PRON', 'PRP$', 'poss', True), ('NOUN', 'NNS', 'pobj', False), ('PUNCT', '.', 'punct', False), ('ADJ', 'JJ', 'nsubjpass', True)]


In [None]:
# Now comes the main loop. First, we initialize equivalence_sum, which will
# be a running tally of how many syntactically equivalent sentences we've 
# generated for each original sentence in Pereira. Throughout the following
# comments, the word "iteration" will be used to refer to a pass through the
# while loop, whereas the term "cycle" will be used for the set of while loop
# passes that occur until a new syntactically equivalent sentence with correct
# subtrees has been generated for every sentence.
np.random.seed(42)
equivalence_sum = np.zeros(627)
n_sent_gens = 78
i=0
sentence_list = [[] for k in range(627)]
while np.min(equivalence_sum) < n_sent_gens:
    i+=1
    print('iteration ' + str(i))

    # First, for each syntactic category of token occurring in the Pereira dataset,
    # we sample from the tokens in our corpus as many of such tokens as are in 
    # the Pereira dataset. We only do this for tokens that are not stop words, not "$",
    # and not punctuation.
    sample_dict = {}
    for word_type in [x for x in Counter(pereira_word_types).keys() if x[3]==False]:
        if word_type[0] == 'PUNCT' or word_type == ('SYM', '$', 'quantmod', False):
            pass
        else:
            sample_dict[word_type] = np.random.choice(word_dict[word_type],Counter(pereira_word_types)[word_type])

    
    # Then, we go through the tokens in the original Pereira dataset, keeping stop words,
    # "$", and punctuation from the original sentences, and populating the rest of the 
    # tokens in the sentences with those that we just sampled.
    
    new_sents = []
    sample_counter_dict = {word_type: 0 for word_type in [x for x in Counter(pereira_word_types).keys() if x[3]==False]}
    for word in pereira_words:
        if word[4]==True or word[1] == 'PUNCT' or word[1:] == ('SYM', '$', 'quantmod', False):
            new_sents.append(word[0])
        else:
            new_sents.append(sample_dict[word[1:]][sample_counter_dict[word[1:]]])
            sample_counter_dict[word[1:]] += 1

    # Now we ensure that the first word of each sentence is capitalized,
    # other words are lower-cased (except for "I"), and "a" and "an" are used correctly.
    for word_idx in range(len(new_sents)):
        if new_sents[word_idx] != 'I':
            if new_sents[word_idx-1] != '.':
                new_sents[word_idx] = new_sents[word_idx].lower()
            else:
                new_sents[word_idx] = new_sents[word_idx].capitalize()
            if new_sents[word_idx][0] in ['a','e','i','o','u']:
                if new_sents[word_idx-1] == 'A':
                    new_sents[word_idx-1] = 'An'
                if new_sents[word_idx-1] == 'a':
                    new_sents[word_idx-1] = 'an'
            else:
                if new_sents[word_idx-1] == 'An':
                    new_sents[word_idx-1] = 'A'
                if new_sents[word_idx-1] == 'an':
                    new_sents[word_idx-1] = 'a'
    
    # Up to this point, new_sents has just been a single list of all of the tokens in all 
    # of the sentences we will generate in this iteration.  The next line does some 
    # string operations to turn this into a list of sentences.
    new_sents = [x.strip() + '.' for x in ' '.join(new_sents).replace(" '", "'").replace(" ,", ",").replace(' - ', '-').split('.')]
    new_sents = new_sents[:-1] #the last index ends up being just a period, so we remove it.
    
    # In some rare cases (around 1 in 2000), something goes wrong and we end up with
    # a different number of sentences than we should. In these cases, we quit the iteration 
    # and try again.
    if len(new_sents) != 627:
        continue
    
    # Next, we identify the sentences for which a syntactically equivalent sentence still
    # has not been generated in this cycle.
    idxs_to_update = np.argwhere(equivalence_sum==min(equivalence_sum))[:,0]

    # For these sentences, we take the putative sentences from this iteration and put 
    # them into a spacy parser.  We also do this for the corresponding sentences in
    # the original Pereira dataset.

    gen_pipe = nlp.pipe([str(x) for x in np.array(new_sents)[idxs_to_update]])
    pereira_sentence_pipe = nlp.pipe([str(x) for x in np.array([x.strip() for x in pereira_sentences.split('\n')[:-1]])[idxs_to_update]])

    # Now, we go through the pairs of generated and original sentences and check whether
    # they have matching subtrees for all words.  If they do, we append the generated
    # sentence to its corresponding list in sentence_list.  We update equivalence_sum,
    # which is a running tally of how many sentences have been generated for each sentence 
    # in the original Pereira dataset
    equivalence_list = np.zeros(627)
    for a,(x,y) in enumerate(zip([[[j.i for j in list(w.subtree)] for w in x] for x in gen_pipe],
                   [[[j.i for j in list(w.subtree)] for w in x] for x in pereira_sentence_pipe])):
        equivalence_list[idxs_to_update[a]] = x==y
        if x==y:
            sentence_list[idxs_to_update[a]].append(new_sents[idxs_to_update[a]])
    equivalence_sum += np.array(equivalence_list)

    print('idxs to update ' + str(len(idxs_to_update)))
    print('n sents each ' + str(np.min(equivalence_sum)))
    print('\n')