# Final version of lexical simplification pipeline

-Swarupa Hardikar

MA Linguistics

Note: this notebook was originally created in Google Colaboratory.

In [None]:
## Mounting drive for file access
# Colab-only step: makes Google Drive available under /content/drive/ so the
# input data and output files used later in the notebook can be read and written.
from google.colab import drive
drive.mount('/content/drive/')

Mounted at /content/drive/


In [None]:
## installations for AL
# Dependencies for the Alpaca-LoRA ("al") model. `-q` only silences pip's output.
!pip install bitsandbytes
!pip install -q datasets loralib sentencepiece
# transformers is installed from a fork pinned to commit c3dc391.
!pip install -q git+https://github.com/zphang/transformers@c3dc391
# peft is installed from the repository's default branch (no version pin).
!pip install -q git+https://github.com/huggingface/peft.git

## installations for Orca
# Dependencies for the Orca ("oa") model.
!pip install auto-gptq
# NOTE(review): a transformers fork is already installed above; confirm which
# transformers version is actually active after this second install command.
!pip install transformers

Collecting bitsandbytes
  Downloading bitsandbytes-0.41.1-py3-none-any.whl (92.6 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m92.6/92.6 MB[0m [31m8.8 MB/s[0m eta [36m0:00:00[0m
[?25h[31mERROR: Operation cancelled by user[0m[31m
[0mTraceback (most recent call last):
  File "/usr/local/lib/python3.10/dist-packages/pip/_vendor/pkg_resources/__init__.py", line 3108, in _dep_map
    return self.__dep_map
  File "/usr/local/lib/python3.10/dist-packages/pip/_vendor/pkg_resources/__init__.py", line 2901, in __getattr__
    raise AttributeError(attr)
AttributeError: _DistInfoDistribution__dep_map

During handling of the above exception, another exception occurred:

Traceback (most recent call last):
  File "/usr/local/lib/python3.10/dist-packages/pip/_internal/cli/base_command.py", line 169, in exc_logging_wrapper
    status = run_func(*args)
  File "/usr/local/lib/python3.10/dist-packages/pip/_internal/cli/req_command.py", line 242, in wrapper
    return func(s

In [None]:
## Loading Alpaca LoRA
## source for cell: https://colab.research.google.com/drive/1eWAmesrW99p7e1nah5bipn0zikMb8XYC
# Creates the two module-level objects used by evaluate(): `tokenizer` and `model_al`.
# NOTE(review): the `LLaMATokenizer` / `LLaMAForCausalLM` spellings presumably come from
# the transformers fork installed earlier in the notebook — confirm before changing versions.


from peft import PeftModel
from transformers import LLaMATokenizer, LLaMAForCausalLM, GenerationConfig

# Tokenizer and base model are both loaded from the same checkpoint name.
tokenizer = LLaMATokenizer.from_pretrained("decapoda-research/llama-7b-hf")
model_al = LLaMAForCausalLM.from_pretrained(
    "decapoda-research/llama-7b-hf",
    load_in_8bit=True,
    device_map="auto",
)
# Wrap the base model with the "tloen/alpaca-lora-7b" adapter; the name `model_al`
# is rebound to the wrapped model.
model_al = PeftModel.from_pretrained(model_al, "tloen/alpaca-lora-7b")



Loading checkpoint shards:   0%|          | 0/33 [00:00<?, ?it/s]

Downloading (…)/adapter_config.json:   0%|          | 0.00/399 [00:00<?, ?B/s]

OutOfMemoryError: ignored

In [None]:
## Setup for AL
## source for cell: https://colab.research.google.com/drive/1eWAmesrW99p7e1nah5bipn0zikMb8XYC

def generate_prompt(instruction, input=None):
    """Build the Alpaca-style prompt text for one request.

    Args:
        instruction (str): Text placed under the "### Instruction:" heading.
        input (str, optional): Extra context placed under the "### Input:" heading.
            When it is None or empty, the instruction-only template is used.

    Returns:
        str: The prompt, ending with the "### Response:" heading.
    """
    # Guard clause: no (or empty) input selects the shorter template.
    if not input:
        return f"""Below is an instruction that describes a task. Write a response that appropriately completes the request.

### Instruction:
{instruction}

### Response:"""

    return f"""Below is an instruction that describes a task, paired with an input that provides further context. Write a response that appropriately completes the request.

### Instruction:
{instruction}

### Input:
{input}

### Response:"""

In [None]:
## source for cell: https://colab.research.google.com/drive/1eWAmesrW99p7e1nah5bipn0zikMb8XYC
# Decoding settings passed to model_al.generate() by evaluate().
# NOTE(review): `do_sample` is not set here; confirm whether `temperature` and `top_p`
# have any effect in combination with `num_beams=4`.
generation_config = GenerationConfig(
    temperature=0.1,
    top_p=0.75,
    num_beams=4,
)

def evaluate(instruction, input=None):
    """Run the Alpaca-LoRA model on one instruction/input pair.

    Args:
        instruction (str): Instruction text handed to generate_prompt().
        input (str, optional): Additional context handed to generate_prompt().

    Returns:
        str or None: The decoded text that follows the "### Response:" marker in the
        first generated sequence, stripped of surrounding whitespace. None when the
        model returns no sequences.

    Raises:
        IndexError: If the decoded text does not contain "### Response:".
    """
    encoded = tokenizer(generate_prompt(instruction, input), return_tensors="pt")
    generated = model_al.generate(
        input_ids=encoded["input_ids"].cuda(),
        generation_config=generation_config,
        return_dict_in_generate=True,
        output_scores=True,
        max_new_tokens=256,
    )

    # Only the first returned sequence is ever used.
    first_sequence = next(iter(generated.sequences), None)
    if first_sequence is None:
        return None

    decoded = tokenizer.decode(first_sequence)
    return decoded.split("### Response:")[1].strip()

In [None]:
## imports

import os
import re
import time
from collections import defaultdict

import nltk
import requests

nltk.download('stopwords')
from nltk.corpus import stopwords

## Setup for similarity model
similarity_API_URL = "https://api-inference.huggingface.co/models/sentence-transformers/all-MiniLM-L6-v2"
# Read the token from the HF_TOKEN environment variable so the secret does not have
# to be stored in the notebook; the placeholder is kept as the fallback value.
api_token = os.environ.get("HF_TOKEN", "[TOKEN]")
headers = {"Authorization": f"Bearer {api_token}"}

## Setup for n-gram frequency model
# Frequency lookups are sent to f"{base_url}/{corpus}/search".
base_url = "https://api.ngrams.dev"
corpus = "eng"

def similarity_query(payload, timeout=120):
    """POST a payload to the sentence-similarity endpoint and return the decoded JSON.

    Args:
        payload (dict): JSON-serialisable request body.
        timeout (float, optional): Seconds to wait for the server before giving up.
            Without a timeout a stalled connection would block the run indefinitely.

    Returns:
        The parsed JSON body of the response. The callers in this file index the
        result with [0]; an error reply may be shaped differently, so check the
        type before indexing.

    Raises:
        requests.exceptions.Timeout: If the server does not answer within `timeout`.
    """
    response = requests.post(
        similarity_API_URL,
        headers=headers,
        json=payload,
        timeout=timeout,
    )
    return response.json()

class LexicalSimplification:
    """Generate and rank simpler substitutes for a target word in a sentence.

    The pipeline reads tab-separated "sentence<TAB>word" lines, prompts a generative
    model for candidate substitutes, cleans the answer, ranks the candidates and
    appends one tab-separated line per input line to the output file.
    """

    # Values accepted for `model_name` and `ranker`.
    SUPPORTED_MODELS = ("al", "oa")
    SUPPORTED_RANKERS = ("similarity", "frequency", "nil")
    # Maximum number of substitutes kept at every ranking step.
    MAX_SUBSTITUTES = 10

    def __init__(self, data_file, model_name, prompt_template, output_file, ranker):
        """
        Initialize the LexicalSimplification object.

        Args:
            data_file (str): Path to the data file containing sentences and target words.
            model_name (str): Model used for generating substitutes: "al" or "oa".
            prompt_template (str): Template string for generating prompts, containing the
                placeholders [SENTENCE] and [WORD].
            output_file (str): Path to the output file where the ranked substitutes are appended.
            ranker (str): Method of substitute ranking: "similarity", "frequency" or "nil".
        """
        self.data = data_file
        self.model = model_name
        self.template = prompt_template
        self.output = output_file
        self.ranker = ranker

    def read_data(self):
        """
        Read the data file and return its lines.

        Returns:
            list: The lines of the data file, each still carrying its line terminator.
        """
        # Explicit encoding: the sentences contain non-ASCII characters.
        with open(self.data, 'r', encoding='utf-8') as sf:
            return sf.readlines()

    def gen_prompt(self, data):
        """
        Generate a prompt by filling the template with the provided data.

        Args:
            data (str): One data line in the form "sentence<TAB>word".

        Returns:
            str: The template with [WORD] and [SENTENCE] replaced.

        Raises:
            ValueError: If the line does not contain exactly two tab-separated fields.
        """
        sentence, word = data.strip().split('\t')
        return self.template.replace('[WORD]', word).replace('[SENTENCE]', sentence)

    @staticmethod
    def _clean_text(text):
        """Lower-case text and keep only letters, whitespace and word-internal hyphens."""
        return re.sub(r'(?<![a-zA-Z])-|[^a-zA-Z\s-]', '', text.lower())

    def answer_parser(self, answer, prompt):
        """
        Parse the answer and extract the clean substitutes.

        Args:
            answer (str): Answer obtained from the model.
            prompt (str): Prompt that produced the answer; every word occurring in it
                (including the target word and echoed instructions) is rejected.

        Returns:
            list: Clean substitutes in the order the model produced them
            (duplicates are not removed here).
        """
        stop_words = set(stopwords.words('english'))

        ## Catching noise from OA model
        if "<|assistant|>" in answer:
            answer = answer.split("<|assistant|>")[1].replace("</s>", "")
        if "Note:" in answer:
            answer = answer.split("Note:")[0]

        # Keep only the text after the last colon (e.g. after "Answer:").
        if ":" in answer:
            answer = answer.split(":")[-1]

        candidates = self._clean_text(answer).split()

        # The prompt is cleaned exactly like the answer. Comparing against raw prompt
        # tokens such as "context:" or "understand." never matched the cleaned
        # candidates, so words echoed from the prompt leaked into the substitutes.
        excluded = stop_words | set(self._clean_text(prompt).split())

        ## Removing function words and words taken from the prompt
        return [word for word in candidates if word not in excluded]

    def _similarity_scores(self, sentence, complex_word, substitutes, verbose=False):
        """Score each substitute by the similarity of the rewritten sentence to the original."""
        word2score = defaultdict(float)
        for substitute in substitutes:
            payload = {
                "inputs": {
                    "source_sentence": sentence,
                    "sentences": [sentence.replace(complex_word, substitute)]
                },
                "options": {
                    "wait_for_model": True,
                    "use_cache": True
                }
            }
            result = similarity_query(payload)
            if verbose:
                print(result)
            if isinstance(result, list) and result:
                word2score[substitute] = result[0]
            else:
                # Anything that is not a non-empty list (e.g. an error object) cannot be
                # indexed with [0]; rank the candidate last instead of aborting the run.
                print(f"Similarity query failed for '{substitute}': {result}")
                word2score[substitute] = float('-inf')
        return word2score

    def _frequency_counts(self, substitutes):
        """Look up the n-gram match count of each substitute (0 when unavailable)."""
        word2freq = defaultdict(int)
        url = f"{base_url}/{corpus}/search"
        for substitute in substitutes:
            response = requests.get(url, params={"query": substitute}, timeout=30)
            count = 0
            if response.status_code == 200:
                result = response.json()
                if "ngrams" in result and result["ngrams"]:
                    count = result["ngrams"][0]["absTotalMatchCount"]
            # A failed lookup keeps the candidate with count 0 rather than dropping it.
            word2freq[substitute] = count
        return word2freq

    def _top_ranked(self, scores):
        """Return the keys of `scores`, highest value first, capped at MAX_SUBSTITUTES."""
        return sorted(scores, key=scores.get, reverse=True)[:self.MAX_SUBSTITUTES]

    def generate_rank_substitutes(self):
        """
        Generate substitutes for the target words in the data file using the model,
        rank them and append the result to the output file.

        Each output line has the form "sentence<TAB>word<TAB>sub1<TAB>sub2...".
        Blank input lines are skipped. When the model yields no usable substitute,
        the single placeholder "error" is written instead.

        Raises:
            ValueError: If the configured model or ranker is not supported.
        """
        if self.model not in self.SUPPORTED_MODELS:
            raise ValueError(f"Unknown model '{self.model}'; expected one of {self.SUPPORTED_MODELS}")
        if self.ranker not in self.SUPPORTED_RANKERS:
            raise ValueError(f"Unknown ranker '{self.ranker}'; expected one of {self.SUPPORTED_RANKERS}")

        # Reading the data
        lines = self.read_data()

        # Iterating over the data and generating substitutes
        with open(self.output, 'a', encoding='utf-8') as output_file:
            for i, line in enumerate(lines):
                if not line.strip():
                    continue

                sentence, complex_word = line.strip().split('\t')
                prompt = self.gen_prompt(line)
                print(f"{i+1})")

                ## Generation of substitutes
                if self.model == "al":
                    ## Model: Alpaca-LoRA
                    answer = evaluate("Instruction: ", prompt)
                else:
                    ## Model: Orca
                    # NOTE(review): generate_answer is not defined in the visible part of
                    # the notebook; it must be defined before the "oa" model is used.
                    answer = generate_answer(prompt)
                print(answer)
                substitutes_list = self.answer_parser(answer, prompt)

                ## Catching blank lists
                if not substitutes_list:
                    substitutes_list.append("error")

                ## Removing duplicates while keeping the model's order, so ranking ties
                ## and the "nil" ranker give the same result on every run.
                substitutes = list(dict.fromkeys(substitutes_list))

                ## Ranking the substitutes...
                if self.ranker == "similarity":
                    ## Ranking by sentence similarity
                    word2score = self._similarity_scores(sentence, complex_word, substitutes, verbose=True)
                    print(word2score)
                    ranked_subs = self._top_ranked(word2score)

                elif self.ranker == "frequency":
                    ## Ranking by word frequency: shortlist by similarity, then order by frequency
                    word2score = self._similarity_scores(sentence, complex_word, substitutes)
                    print(word2score)
                    shortlist = self._top_ranked(word2score)

                    word2freq = self._frequency_counts(shortlist)
                    print(word2freq)
                    ranked_subs = self._top_ranked(word2freq)

                else:
                    ## Ranking by the generative model itself
                    ranked_subs = substitutes[:self.MAX_SUBSTITUTES]

                ranked_subs_str = '\t'.join(ranked_subs)
                output_file.write(sentence + '\t' + complex_word + '\t' + ranked_subs_str + '\n')
                # Pause between items to avoid hammering the remote APIs.
                time.sleep(1)

        print("Substitute ranking complete. Output saved in:", self.output)


[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


In [None]:
def main():
    """Configure one lexical-simplification run and execute it.

    Builds the one-shot prompt template, derives the output file name from the model
    and ranker names, and runs LexicalSimplification.generate_rank_substitutes().
    """
    # Defining the file paths and model details
    data_file = "[INPUT_PATH]"
    model_name = "al" # "oa"
    # prompt_template = "Context:'[SENTENCE]'\n Given the above context, list 10 simpler substitutes for the word '[WORD]'."
    # One-shot prompt assembled from adjacent string literals so that source indentation
    # and tab characters do not leak into the text sent to the model. The target word is
    # quoted on both sides ('[WORD]'); the earlier template was missing the opening quote.
    prompt_template = (
        "Context: A local witness said a separate group of attackers disguised in burqas — "
        "the head-to-toe robes worn by conservative Afghan women — then tried to storm the compound.\n\n"
        "Question: Given the above context, list ten alternative words for “disguised” "
        "that are easier to understand.\n\n"
        "Answer:\n1. concealed\n2. dressed\n3. hidden\n4. camouflaged\n5. changed\n"
        "6. covered\n7. masked\n8. unrecognizable\n9. converted\n10. impersonated\n\n"
        "Context: '[SENTENCE]'\n\n"
        "Question: Given the above context, list ten alternatives for '[WORD]' "
        "that are easier to understand.\n"
    )
    ranker = "similarity" # "frequency" # "nil"
    output_file = f"[OUTPUT_PATH]{model_name}_{ranker}_test_wcontext_1shot_p2.tsv"

    lexsimp = LexicalSimplification(data_file, model_name, prompt_template, output_file, ranker)

    # Generate and rank substitutes and save them to the output file
    lexsimp.generate_rank_substitutes()

# Execute the main function
if __name__ == "__main__":
    main()


1)
Instruction:
Instruction: 

### Input:
Context: A local witness said a separate group of attackers disguised in burqas — the head-to-toe robes worn by conservative Afghan women — then tried to storm the compound.
				
                        Question: Given the above context, list ten alternative words	for “disguised” that are easier to understand. 
				
                        Answer:
1. concealed
2. dressed
3. hidden
4. camouflaged
5. changed
6. covered
7. masked
8. unrecognizable
9. converted
10. impersonated

Context: 'This discovery helped to establish yet another spectral class even cooler than L dwarfs, known as `` T dwarfs '', for which Gliese 229B is the prototype.'
				
                        Question: Given the above context, list ten alternatives for prototype' that are easier to understand.
[0.9722535610198975]
[0.97263503074646]
defaultdict(<class 'float'>, {'context': 0.9722535610198975, 'understand': 0.97263503074646})
2)
Instruction:
Instruction: 

### Input:
Conte

KeyError: ignored