In [None]:
from google.colab import drive
# Mount Google Drive at /content/drive so files stored there can be read by path.
drive.mount('/content/drive')

In [None]:
from datasets import load_dataset

In [None]:
# Load the JSON file from the mounted Drive and keep its "train" split;
# later cells read the "Question" and "Answer" fields of each record.
dataset = load_dataset('json', data_files="/content/drive/MyDrive/all_faqs.json", split="train")

In [None]:
import nltk
import random
import json
from nltk.corpus import wordnet
from transformers import pipeline

# Fetch the NLTK resources used for synonym lookup: the WordNet corpus itself
# and the Open Multilingual WordNet data ('omw-1.4').
for _resource in ('wordnet', 'omw-1.4'):
    nltk.download(_resource)

# Build the text2text pipeline around the T5 paraphrasing checkpoint.
# device=0 selects the first GPU.
paraphraser = pipeline(
    "text2text-generation",
    model="Vamsi/T5_Paraphrase_Paws",
    device=0,
)

def get_synonyms(word):
    """Return the distinct WordNet lemma names found for *word*.

    Every lemma of every synset returned by ``wordnet.synsets(word)`` is
    collected, with underscores in multi-word lemma names replaced by spaces.
    The result is a list in arbitrary (set) order, and it is empty when
    WordNet has no synsets for the word.
    """
    lemma_names = {
        lemma.name().replace('_', ' ')
        for synset in wordnet.synsets(word)
        for lemma in synset.lemmas()
    }
    return list(lemma_names)

def synonym_replacement(sentence):
    """Return *sentence* with each word swapped for a random synonym.

    The sentence is split on whitespace. Each token that has at least one
    entry from ``get_synonyms`` is replaced by a randomly chosen entry;
    tokens without any entry are kept as they are. The tokens are re-joined
    with single spaces.
    """
    replaced = []
    for token in sentence.split():
        candidates = get_synonyms(token)
        # Only draw from the RNG when there is something to choose from.
        replaced.append(random.choice(candidates) if candidates else token)
    return ' '.join(replaced)

# def add_noise(sentence):
#     words = list(sentence)
#     idx = random.randint(0, len(words)-1)
#     words[idx] = random.choice('abcdefghijklmnopqrstuvwxyz')
#     return ''.join(words)

def reorder_question(sentence):
    """Return *sentence* with its whitespace-separated words in random order.

    The words are shuffled in place with the module-level ``random``
    generator and re-joined with single spaces, so runs of whitespace in the
    input are collapsed.
    """
    tokens = sentence.split()
    random.shuffle(tokens)
    shuffled_sentence = ' '.join(tokens)
    return shuffled_sentence

def negate_question(sentence):
    """Return *sentence* with the word "not" inserted after its first word.

    The sentence is split on whitespace and re-joined with single spaces.
    If it already contains the standalone word "not" in any letter case
    (e.g. "Not", "NOT"), it is returned without a second negation. Empty or
    whitespace-only input yields an empty string rather than a bare "not".

    Args:
        sentence: The question text to negate.

    Returns:
        The negated question as a single-space-separated string.
    """
    words = sentence.split()
    if not words:
        # Nothing to negate; avoid fabricating the one-word question "not".
        return ''
    already_negated = any(word.lower() == "not" for word in words)
    if not already_negated:
        # Position 1 places "not" right after the leading word; for a
        # one-word sentence this appends it at the end.
        words.insert(1, "not")
    return ' '.join(words)

def paraphrase_question(question):
    """Generate three paraphrases of *question* with the module-level pipeline.

    The input is prefixed with ``"paraphrase: "``, the task prefix shown in
    the ``Vamsi/T5_Paraphrase_Paws`` model card; the original code passed the
    raw question without it.

    Args:
        question: The question text to paraphrase.

    Returns:
        A list with the ``generated_text`` of each returned sequence.
    """
    prompt = "paraphrase: " + question
    # Beam search with 5 beams is used so that 3 distinct sequences can be
    # returned for a single input.
    paraphrased = paraphraser(prompt, max_length=60, num_return_sequences=3, num_beams=5)
    return [p['generated_text'] for p in paraphrased]

def augment_question(question, answer):
    """Build the augmented question/answer records for one QA pair.

    The question variants are produced in this order: every paraphrase from
    ``paraphrase_question``, then the synonym-replaced, word-reordered and
    negated forms. Each variant is paired with the unchanged *answer*.

    NOTE(review): the negated variant keeps the original answer as well;
    confirm that this is intended for the downstream use of the data.

    Returns:
        A list of dicts with the keys "Question" and "Answer".
    """
    # The list display is evaluated left to right, so the helpers run in the
    # order listed. Character-noise augmentation is currently disabled.
    question_variants = [
        *paraphrase_question(question),
        synonym_replacement(question),
        reorder_question(question),
        negate_question(question),
    ]
    return [
        {"Question": variant, "Answer": answer}
        for variant in question_variants
    ]

# Expand every record of the loaded dataset into its augmented variants and
# collect them all in one flat list.
augmented_dataset = []
for item in dataset:
    augmented_dataset += augment_question(item["Question"], item["Answer"])

# Pretty-print the full augmented set as JSON on standard output.
print(json.dumps(augmented_dataset, indent=4))