In [1]:
from datasets import load_dataset
import numpy as np
import spacy
import torch
# import en_core_web_trf
import spacy
import en_core_web_sm
from collections import Counter


from nltk.tokenize import wordpunct_tokenize
from string import punctuation
from collections import defaultdict
from torch.nn.functional import cosine_similarity



  from .autonotebook import tqdm as notebook_tqdm


In [2]:
from transformers import DistilBertModel, DistilBertTokenizer
from transformers import DistilBertTokenizer, DistilBertForMaskedLM
import torch

# Generating syntactically similar models

After trying a number of things, this approach has been the most successful which is as follows. For each word in a target sentnece, a LLM(BERT for its bidirectional properties) predicts a replacement for that word given the rest of the sentence as context. Its predictions are filtered to match the tags of the original word(POS,dep,...). Then from the X most likely predicitons that match the 'form' of the original word, a random one is chosen to replace the original. 
This process of replacing every word in the sentnece is repeated for a number of cycles. Alternative sentences are collected continuously.

This process allows the sentence to maintain its original sytactic structure as each word is generated given the rest of the sentence. The incremental and random nature of the process allows the sentence to slowly trend away from the semantics of the original sentence while not introducing a large enough change that causes the prediction model to fall into confusion and fail to predict sensical replacements.

Eventually, generated sentences can be filtered for dependency tree stucture of the original sentence and semantic dissimilarity from the original sentence. 

Possible issues:
1. speed
2. generated sentences appear to have random semantic distribution but this might not be the case


Question:
How much flexibility can there be in change words like 'are' to 'is'



In [3]:
# load nlp model
nlp = spacy.load("en_core_web_sm")  # Smaller model suitable for CP

# Bert model
tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-uncased')
model = DistilBertForMaskedLM.from_pretrained('distilbert-base-uncased')

In [56]:
# alternatives = np.empty((len(sentence_split), 100), dtype=object)

def generate_syntactic_similar_alternatives(sentences,num_alternatives_per_sent):
    #replace first, then second, then third word iteratively(once)
    all_alternatives = []

    with torch.no_grad():
        iteration = 0

        for sent_idx,sentence in enumerate(sentences):
            print(f'Finding alternatives for sentence: {" ".join(sentence)}')
            sentence_split = sentence.copy()



            #include original sentence in list of alternatives
            sentence_alternatives = [sentence_split]
            
            cycle = 0
            # every word in the sentence will be replaced once per cycle
            while len(sentence_alternatives) < num_alternatives_per_sent + 1:
                # print(len(sentence_alternatives))

                # replace words in random order
                random_order = np.random.permutation(len(sentence_split))

                for word_idx in random_order:
                    original_word = nlp(' '.join(sentence_split))[word_idx]

                    masked_sentence = sentence_split.copy()
                    masked_sentence[word_idx] = tokenizer.mask_token
                    masked_sentence = ' '.join(masked_sentence)

                    inputs = tokenizer(masked_sentence, return_tensors="pt")
                    mask_position = inputs.input_ids[0].tolist().index(tokenizer.mask_token_id)

                    outputs = model(**inputs)
                    predictions = outputs.logits


                    possible_predicted = []
                    top_k_to_consider = 100

                    predicted_index = torch.topk(predictions[0, mask_position], k=top_k_to_consider).indices.tolist()

                    # get top predictions, it seems only the top few make sense in many cases
                    predicted_tokens = [tokenizer.decode([idx]) for idx in predicted_index]

                    # filter by pos, dep, etc
                    for token in predicted_tokens:
                        temp = sentence_split.copy()
                        temp[word_idx] = token
                        word = nlp(' '.join(temp))[word_idx]
                        # if not (word.pos_ == original_word.pos_ and word.dep_ == original_word.dep_ and word.text.lower() != original_word.text.lower() and word.is_stop == original_word.is_stop):
                        #     continue
                        if not (word.pos_ == original_word.pos_ and word.tag_ == original_word.tag_ and word.dep_ == original_word.dep_ and word.text.lower() != original_word.text.lower() and word.is_stop == original_word.is_stop):
                            continue
                        # check for similar subtree
                        # if not ([j.i for j in list(word.subtree)] == [j.i for j in list(original_word.subtree)]):
                        #     continue

                        if word.text in sentence_split:
                            continue

                        possible_predicted.append(token)

                    # If there are no other possible words, just keep the original word, in this case it is likely it does not have a strong semantic contribution
                    if len(possible_predicted) == 0:
                        possible_predicted.append(original_word.text)
                    
                    # choose a random word from the filtered top k preds
                    sentence_split[word_idx] = np.random.choice([token for token in possible_predicted[:min(5, len(possible_predicted))]])

                save_every = 3 # save alternative sentence every 3 cycles
                if cycle%save_every == 0:
                         
                    # ensure subtrees are the same
                    original_subtrees = [[j.i for j in list(word.subtree)] for word in nlp(" ".join(sentence))]
                    alternative_subtrees = [[j.i for j in list(word.subtree)] for word in nlp(" ".join(sentence_split))]

                    if(original_subtrees == alternative_subtrees):
                        #subtrees are same so add to alternatives
                        sentence_alternatives.append(sentence_split.copy())
                        print(f'SAVING: {" ".join(sentence_split)}')
                    else:
                        sentence_alternatives.append(sentence_split.copy())
                        print(f'SAVING: {" ".join(sentence_split)}') 
                        print("Failed subtree match")
                # print(sentence_split)
                cycle += 1

           
            
            all_alternatives.append(sentence_alternatives)

    return all_alternatives


In [57]:
sentences = [
    "dogs are the best pets because they are loyal",
]
sentences = [sentence.split(' ') for sentence in sentences]

In [58]:
all_alternatives = generate_syntactic_similar_alternatives(sentences, 3)

Finding alternatives for sentence: dogs are the best pets because they are loyal
SAVING: rabbits am a best wishes as i are safe
Failed subtree match
SAVING: giants are those strongest spirits that you keep safe
Failed subtree match
SAVING: miracles are a deepest things whatever they say eternal
Failed subtree match


In [37]:
for target_idx,alternatives in enumerate(all_alternatives):
    for idx,alternative in enumerate(alternatives):
        if idx ==0:
            print(f'Orignal: {" ".join(alternative)}')
        else:
            print(f'Alternative: {" ".join(alternative)}')

Orignal: solutions are these hardest problems because others are different
Alternative: chances are these smallest moments while i am dead
Alternative: things are no worst losers until he are satisfied
Alternative: solutions are these hardest problems because others are different


In [59]:
pereira_sentences = open(
    '/Users/thomasmcgall/Desktop/research/research_push/ThomasCodeforKao/sentences_ordered.txt', 'r').read()
pereira_sentences = np.random.choice(pereira_sentences.split('\n'), 1)
pereira_sentences = [sentence.split(' ') for sentence in pereira_sentences]

for sent in pereira_sentences:
    print(" ".join(sent))


There are usually chairs around a table for people to sit.


In [60]:
all_alternatives = generate_syntactic_similar_alternatives(pereira_sentences, 3)

Finding alternatives for sentence: There are usually chairs around a table for people to sit.
SAVING: There become usually seats across every room that people to study
Failed subtree match
SAVING: There see increasingly trends towards an option which individuals to consider
Failed subtree match
SAVING: There seem increasingly regulations towards the item which consumers to avoid
Failed subtree match


In [61]:
for target_idx,alternatives in enumerate(all_alternatives):
    for idx,alternative in enumerate(alternatives):
        if idx ==0:
            print(f'Orignal: {" ".join(alternative)}')
        else:
            print(f'Alternative: {" ".join(alternative)}')

Orignal: There seem increasingly regulations towards the item which consumers to avoid
Alternative: There become usually seats across every room that people to study
Alternative: There see increasingly trends towards an option which individuals to consider
Alternative: There seem increasingly regulations towards the item which consumers to avoid


In [None]:
# perplexity, cross entropy loss of the sentence, lower perplexity is better

In [None]:
'''
first mask all the content words in the sentence
then generate predictions for each masked word, and from all of the predicitons select the most common one 
iteratively fill these in with some randomness
'''

# hugging face .generate ,(can take restrictions) and edit this to 

https://huggingface.co/docs/transformers/v4.40.0/en/main_classes/text_generation#transformers.GenerationConfig

change so instead of word token match constraintes it is syntax tag match

also right now it is so that each word in the constraint must appear in order at some point in the string
each time a word in the sequence is seen, it 'advances' and is eventually completed

Instead match is generated on syntax tags, and the constraints are the tags of the complete sentence. In this case the model must update on every single word generation. 