In [1]:
import json

import pandas as pd
from datasets import load_dataset

from CDA.substitutor import Substitutor
from CDA.utils import load_json_pairs

# Word-pair resources handed to the Substitutor. Both paths are relative to
# the directory the notebook is run from.
BASE_PAIRS_PATH = 'CDA/cda_default_pairs.json'
NAME_PAIRS_PATH = 'CDA/names_pairs_1000_scale.json'

base_pairs = load_json_pairs(BASE_PAIRS_PATH)
name_pairs = load_json_pairs(NAME_PAIRS_PATH)

# Single shared instance; every augmentation step below calls
# substitutor.invert_document(...) on it.
substitutor = Substitutor(base_pairs, name_pairs=name_pairs)

# !pip3 install datasets

2024-03-06 15:31:35,210 : INFO : Loading spaCy model...
2024-03-06 15:31:36,282 : INFO : Done.


#### Helper functions

In [2]:
# Module-level accumulator: every record produced in this notebook is appended
# here and the whole list is written to disk at the end.
instruction_data = list()
def read_file(path, is_sep, column_names=None):
    """Load a delimited text file into a DataFrame.

    Args:
        path: Location of the file to read.
        is_sep: Field delimiter, forwarded to ``pd.read_csv`` as ``sep``.
        column_names: Optional list of column names. When given it is
            forwarded as ``names``; when omitted ``pd.read_csv`` is called
            without ``names`` and determines the columns itself.

    Returns:
        The DataFrame produced by ``pd.read_csv``.
    """
    read_options = {"sep": is_sep}
    # Only pass ``names`` when the caller supplied one, so the no-names call
    # is exactly ``pd.read_csv(path, sep=is_sep)``.
    if column_names is not None:
        read_options["names"] = column_names
    return pd.read_csv(path, **read_options)

In [3]:
def create_json(instruction, sentence, sentiment):
    """Build one single-input instruction record.

    Args:
        instruction: Text stored under the "instruction" key.
        sentence: Value stored under the "input" key.
        sentiment: Value stored under the "output" key.

    Returns:
        A dict with the keys "instruction", "input" and "output", in that
        order.
    """
    return dict(instruction=instruction, input=sentence, output=sentiment)

def create_json_new(instruction, sentence1, sentence2, sentiment, key1, key2):
    """Build one instruction record whose input is a pair of texts.

    Args:
        instruction: Text stored under the "instruction" key.
        sentence1: First text, stored in the input mapping under ``key1``.
        sentence2: Second text, stored in the input mapping under ``key2``.
        sentiment: Value stored under the "output" key.
        key1: Name used for ``sentence1`` inside the input mapping.
        key2: Name used for ``sentence2`` inside the input mapping.

    Returns:
        A dict with the keys "instruction", "input" and "output", where
        "input" is itself a dict holding the two texts.
    """
    paired_input = {key1: sentence1, key2: sentence2}
    return dict(instruction=instruction, input=paired_input, output=sentiment)

In [4]:
def process_data(dataframe, instruction):
    """Append instruction records for every row of ``dataframe``.

    Each row contributes one record built from its 'sentence' and 'label'
    columns. The sentence is then passed to ``substitutor.invert_document``;
    when the flag it returns is truthy, a second record carrying the returned
    text and the same label is appended as well.

    Args:
        dataframe: Frame with 'sentence' and 'label' columns.
        instruction: Prompt text stored in every record.

    Returns:
        None. Records accumulate in the module-level ``instruction_data``
        list, so calling this twice on the same frame appends its records
        twice.
    """
    for _, row in dataframe.iterrows():
        text, label = row['sentence'], row['label']
        instruction_data.append(create_json(instruction, text, label))

        # invert_document yields (text, flag); the flag presumably signals
        # that at least one substitution was made -- confirm in CDA.substitutor.
        swapped_text, was_changed = substitutor.invert_document(text)
        if not was_changed:
            continue
        instruction_data.append(create_json(instruction, swapped_text, label))

#### Reading the STS-B dataset from Hugging Face

In [5]:
# Fetch the GLUE "stsb" configuration and convert its train split to pandas.
stsb_dataset = load_dataset("glue", "stsb")
df = stsb_dataset["train"].to_pandas()

# Keep only the three columns used below, limited to the first 2000 rows.
train_stsb = df.loc[:, ['sentence1', 'sentence2', 'label']].iloc[:2000]

#### Reading the QNLI dataset from Hugging Face

In [6]:
# Fetch the GLUE "qnli" configuration and convert its train split to pandas.
qnli_dataset = load_dataset("glue", "qnli")
df = qnli_dataset["train"].to_pandas()

# Keep only the three columns used below, limited to the first 2000 rows.
train_qnli = df.loc[:, ['question', 'sentence', 'label']].iloc[:2000]

#### Reading the SST-2 and CoLA datasets

In [7]:
# Column names applied to the CoLA TSV, which is read with explicit names.
column_names = ['Source', 'label', 'Author', 'sentence']

# SST-2 is read with whatever columns pd.read_csv finds; process_data later
# expects 'sentence' and 'label' among them.
train_sst2 = read_file('data/sst-2/train.csv', ',')

# Load every CoLA column first, then keep only the two that process_data reads.
cola_all_columns = read_file('data/cola/in_domain_train.tsv', '\t', column_names)
train_cola = cola_all_columns[['sentence', 'label']]

In [8]:
# Single-sentence tasks: pair each frame with its prompt and append the
# resulting records to instruction_data (SST-2 first, then CoLA).
for task_frame, task_instruction in (
    (train_sst2, "Predict the sentiment of the given sentence"),
    (train_cola, "Predict if the sentence is grammatically correct or not"),
):
    process_data(task_frame, task_instruction)

In [9]:
# Build STS-B records: one per row, plus a second one when the substitutor
# reports a change in either sentence.
#
# The prompt states a 0-5 range because STS-B gold scores start at 0; the
# previous wording ("between 1 to 5") contradicted low-similarity rows.
# NOTE(review): any prompt built elsewhere for inference or evaluation should
# use the same wording -- confirm before regenerating the data.
instruction="Find the Semantic Text Similarity between sentence1 and sentence2 with a similarity score between 0 to 5. Can be a float number"
for _, row in train_stsb.iterrows():
    sentence1 = row['sentence1']
    sentence2 = row['sentence2']
    score = row['label']
    instruction_data.append(
        create_json_new(instruction, sentence1, sentence2, score, "sentence1", "sentence2")
    )

    flipped1, isreversed1 = substitutor.invert_document(sentence1)
    flipped2, isreversed2 = substitutor.invert_document(sentence2)
    # One changed sentence is enough: the pair is emitted with both returned
    # texts and the similarity score is carried over unchanged.
    if isreversed1 or isreversed2:
        instruction_data.append(
            create_json_new(instruction, flipped1, flipped2, score, "sentence1", "sentence2")
        )


In [11]:
# Build QNLI records: one per row, plus a second one when the substitutor
# reports a change in either the question or the sentence.
# NOTE(review): the dataset label is copied through unchanged and the prompt
# does not say which label value means "contains the answer" -- confirm.
instruction="Find if the sentence contains answer to the question."
for _, row in train_qnli.iterrows():
    question, sentence, label = row['question'], row['sentence'], row['label']
    instruction_data.append(
        create_json_new(instruction, question, sentence, label, "question", "sentence")
    )

    flipped_question, question_changed = substitutor.invert_document(question)
    flipped_sentence, sentence_changed = substitutor.invert_document(sentence)
    if not (question_changed or sentence_changed):
        continue
    # The label is carried over unchanged to the substituted pair.
    instruction_data.append(
        create_json_new(instruction, flipped_question, flipped_sentence, label, "question", "sentence")
    )



In [12]:
# Report how many records were generated, then persist them as one JSON array.
print(len(instruction_data))
# An explicit encoding keeps the output file independent of the platform's
# default text encoding.
with open("instruction_data.json", "w", encoding="utf-8") as json_file:
    json.dump(instruction_data, json_file, indent=4)

27875
