In [1]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


## Prepare Referece files using TFIDF for retrieving attributes


In [2]:
%cd "/content/drive/MyDrive/riya"

/content/drive/.shortcut-targets-by-id/1TCvtUjcTYKWgalUeZbJk_aFhLp1aLjr6/riya


In [5]:
import pandas as pd
from tqdm import tqdm, trange
import numpy as np
import time
import torch
from sklearn.feature_extraction.text import TfidfVectorizer

In [6]:
def read_file(path):
    with open(path) as fp:
        lines = fp.read().splitlines()
    return lines

In [7]:
def clean_text(text):
    return text.replace("<POS>","").replace("<NEG>","").replace("<CON_START>","").replace("<START>","").replace("<END>","").strip()


In [8]:
train0_org = read_file("/content/drive/MyDrive/riya/data/yelp/sentiment.train.0") # Training data of negative sentiment
train1_org = read_file("/content/drive/MyDrive/riya/data/yelp/sentiment.train.1") # Training data of positive sentiment
ref0_processed = read_file("/content/drive/MyDrive/riya/data/yelp/processed_files_with_bert_with_best_head/reference_0.txt") # Reference data for delete_generate model
ref1_processed = read_file("/content/drive/MyDrive/riya/data/yelp/processed_files_with_bert_with_best_head/reference_1.txt") # Reference data for delete_generate model
ref0_org = read_file("/content/drive/MyDrive/riya/data/yelp/reference.0") # Original Refrence_0 data
ref1_org = read_file("/content/drive/MyDrive/riya/data/yelp/reference.1") # Original Refrence_1 data
train0_processed = read_file("/content/drive/MyDrive/riya/data/yelp/processed_files_with_bert_with_best_head/delete_retrieve_edit_model/sentiment_train_0_all_attrs.txt") # training data with content and attributes seperation
train1_processed = read_file("/content/drive/MyDrive/riya/data/yelp/processed_files_with_bert_with_best_head/delete_retrieve_edit_model/sentiment_train_1_all_attrs.txt") # training data with content and attributes seperation

In [9]:
# Get the Original Reference Sentence
ref0_org = [x.split("\t")[0] for x in ref0_org]
ref1_org = [x.split("\t")[0] for x in ref1_org]

In [10]:
# Get the Content of the Reference Sentences
ref0_con = [clean_text(x) for x in ref0_processed]
ref1_con = [clean_text(x) for x in ref1_processed]

In [11]:
ref0_org[:4], ref0_con[:4]

(["ever since joes has changed hands it 's just gotten worse and worse .",
  'there is definitely not enough room in that part of the venue .',
  'so basically tasted watered down .',
  "she said she 'd be back and disappeared for a few minutes ."],
 ["everes changed hands just gotten worse and worse.\tEver since joes has changed hands it's gotten better and better.",
  'there is not room in of the.There is so much room in that part of the venue',
  "so basically watered.\tIt didn't taste down at all.",
  "she'be back a few minutes.\tShe said she'd be back, and didn't disappear at all."])

In [12]:
def get_train_content(text):
    return text.split("<START>")[0].split("<CON_START>")[1].strip()

In [13]:
def get_train_attrs(text):
    return text.split("<CON_START>")[0].replace("<ATTR_WORDS>","").strip().split()

In [14]:
get_train_attrs(train0_processed[0])

['mistaken', '.']

In [15]:
train0_processed[:4], train1_processed[:4]

(['<ATTR_WORDS> mistaken . <CON_START> i was sadly mistaken. <START> i was sadly mistaken . <END>',
  '<ATTR_WORDS> to hoagies , is mill . <CON_START> so on theagies, the italian general run of the <START> so on to the hoagies , the italian is general run of the mill . <END>',
  '<ATTR_WORDS> meat a lettuce . <CON_START> minimal and ton of shredded lettuce. <START> minimal meat and a ton of shredded lettuce . <END>',
  '<ATTR_WORDS> special not tag . <CON_START> nothing really & worthy of the $ _num_ price tag. <START> nothing really special & not worthy of the $ _num_ price tag . <END>'],
 ['<ATTR_WORDS> food . <CON_START> excellent food. <START> excellent food . <END>',
  '<ATTR_WORDS> service . <CON_START> superb customer service. <START> superb customer service . <END>',
  '<ATTR_WORDS> have specials good . <CON_START> they also daily and ice cream which is really good. <START> they also have daily specials and ice cream which is really good . <END>',
  "<ATTR_WORDS> it 's a good h

In [16]:
# get content
train0_con = [get_train_content(x) for x in train0_processed]
train1_con = [get_train_content(x) for x in train1_processed]

In [17]:
train0_con[:4], train1_con[:4]

(['i was sadly mistaken.',
  'so on theagies, the italian general run of the',
  'minimal and ton of shredded lettuce.',
  'nothing really & worthy of the $ _num_ price tag.'],
 ['excellent food.',
  'superb customer service.',
  'they also daily and ice cream which is really good.',
  "it'a toasted hoagie."])

In [18]:
# Fatch attributes from the training data
attrs_neg = [get_train_attrs(x) for x in train0_processed]
attrs_pos = [get_train_attrs(x) for x in train1_processed]

In [19]:
# Get TFIDF vectors for Training and Reference
tfidf = TfidfVectorizer()
conts_vecs = tfidf.fit_transform(train0_con + train1_con)
conts_pos_vecs = conts_vecs[:len(train1_con)]
conts_neg_vecs = conts_vecs[len(train1_con):len(train1_con)+len(train0_con)]
conts_from_pos_ref_vecs = tfidf.transform(ref1_con)
conts_from_neg_ref_vecs = tfidf.transform(ref0_con)

#### AnnoyIndex is used to store the TFIDF vectors of training set and retrieve nearest neighbours of the reference content 

In [20]:
!pip install annoy

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting annoy
  Downloading annoy-1.17.2.tar.gz (647 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m647.4/647.4 kB[0m [31m19.8 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: annoy
  Building wheel for annoy (setup.py) ... [?25l[?25hdone
  Created wheel for annoy: filename=annoy-1.17.2-cp39-cp39-linux_x86_64.whl size=582240 sha256=85b9534e2d633ac7d30f040c8604898dff63b1063f4e7e300368db29f3c8a2a5
  Stored in directory: /root/.cache/pip/wheels/f2/2e/e4/f3ae385c375b87982a2a70055061d4a6330ef4f60817e717e3
Successfully built annoy
Installing collected packages: annoy
Successfully installed annoy-1.17.2


In [21]:
from annoy import AnnoyIndex

In [22]:
train0_tree = AnnoyIndex(conts_neg_vecs.shape[-1])
train1_tree = AnnoyIndex(conts_pos_vecs.shape[-1])

  train0_tree = AnnoyIndex(conts_neg_vecs.shape[-1])
  train1_tree = AnnoyIndex(conts_pos_vecs.shape[-1])


In [23]:
# We have randomly selected training samples to control the memory usage
neg_idxs = np.random.choice(conts_neg_vecs.shape[0], size=50000)
pos_idxs = np.random.choice(conts_pos_vecs.shape[0], size=50000)

In [24]:
#for i in trange(conts_neg_vecs.shape[0]):
for i in trange(len(neg_idxs)):
    np_array = conts_neg_vecs[neg_idxs[i]].toarray()[0]
    train0_tree.add_item(i,np_array)

100%|██████████| 50000/50000 [02:12<00:00, 377.73it/s]


In [25]:
train0_tree.build(50)
train0_tree.save('tfidf_train0.ann')

True

In [26]:
ref1_con[0:3], " ".join(attrs_neg[neg_idxs[0]])

(["it'small they make right at.it's small yet they make you feel like a stranger.",
  "i will going and enjoying place!i't be going back and suffering at this terrible place!",
  'the drinks affordable a good pour.\tthe were expensive and half.'],
 'crap .')

In [28]:
with open("/content/drive/MyDrive/riya/data/yelp/processed_files_with_bert_with_best_head/delete_retrieve_edit_model/tfidf/reference_1.txt", "w") as out_fp:
    for i in range(conts_from_pos_ref_vecs.shape[0]):
        x = conts_from_pos_ref_vecs[i].toarray()[0]
        inx,dis = train0_tree.get_nns_by_vector(x, 1, include_distances=True)
        ref_sen = ref1_con[i]
        #ref_sen = processed_ref0[i].replace("<POS>","").replace("<NEG>","").replace("<CON_START>","").replace("<START>","")
        #print(dis,"\t",ref0_org[i], "\t" ,train1_data[inx[0]], train1_attr[inx[0]])
        out_str = "<ATTR_WORDS> " + " ".join(attrs_neg[neg_idxs[inx[0]]]) + " <CON_START> " + ref_sen.strip() + " <START>" + "\n"
        print(out_str)
        out_fp.write(out_str)

<ATTR_WORDS> exciting . <CON_START> it'small they make right at.it's small yet they make you feel like a stranger. <START>

<ATTR_WORDS> and unhappy . <CON_START> i will going and enjoying place!i't be going back and suffering at this terrible place! <START>

<ATTR_WORDS> does dq exist ? <CON_START> the drinks affordable a good pour.	the were expensive and half. <START>

<ATTR_WORDS> there a . <CON_START> my husband aen sandwich loved it	my husband got reuben sandwich, he hated it. <START>

<ATTR_WORDS> could n't tell difference . <CON_START> i signed for email and got coupon.I up for email and got spam. <START>

<ATTR_WORDS> wo n't staying again . <CON_START> i'definitely giving them a try.	'd recommend not a try. <START>

<ATTR_WORDS> course . <CON_START> i highly e m painting.	I highly avoiding e & m. <START>

<ATTR_WORDS> horrible . <CON_START> otherwise great and we will again.other a terrible and we will not go again. <START>

<ATTR_WORDS> tax and delivery , _num_ . <CON_START> g

In [29]:
#for i in trange(conts_neg_vecs.shape[0]):
for i in trange(len(pos_idxs)):
    np_array = conts_pos_vecs[pos_idxs[i]].toarray()[0]
    train1_tree.add_item(i,np_array)

100%|██████████| 50000/50000 [02:22<00:00, 350.14it/s]


In [30]:
train1_tree.build(50)
train1_tree.save('tfidf_train1.ann')

True

In [31]:
with open("/content/drive/MyDrive/riya/data/yelp/processed_files_with_bert_with_best_head/delete_retrieve_edit_model/tfidf/reference_0.txt", "w") as out_fp:
    for i in range(conts_from_neg_ref_vecs.shape[0]):
        x = conts_from_neg_ref_vecs[i].toarray()[0]
        inx,dis = train1_tree.get_nns_by_vector(x, 1, include_distances=True)
        ref_sen = ref0_con[i]
        out_str = "<ATTR_WORDS> " + " ".join(attrs_pos[pos_idxs[inx[0]]]) + " <CON_START> " + ref_sen.strip() + " <START>" + "\n"
        print(i, out_str)
        out_fp.write(out_str)

0 <ATTR_WORDS> express ! <CON_START> everes changed hands just gotten worse and worse.	Ever since joes has changed hands it's gotten better and better. <START>

1 <ATTR_WORDS> , nice staff , catering , food ! <CON_START> there is not room in of the.There is so much room in that part of the venue <START>

2 <ATTR_WORDS> wow ! <CON_START> so basically watered.	It didn't taste down at all. <START>

3 <ATTR_WORDS> place ! <CON_START> she'be back a few minutes.	She said she'd be back, and didn't disappear at all. <START>

4 <ATTR_WORDS> fish . <CON_START> i ca't how inconsiderate this pharmacy.This pharmacy considerate. <START>

5 <ATTR_WORDS> store amazing ! <CON_START> just left took off the bill.	just and it on. <START>

6 <ATTR_WORDS> , to to , doctor ! <CON_START> it is't, but't good either.	it isn't perfect, but it is very good. <START>

7 <ATTR_WORDS> my restaurants pittsburgh ! <CON_START> definitely that could not birthday gift	initely not disappointed that i could use my birthday 

In [32]:
ata = data2 = ""
 
# Reading data from file1
with open('/content/drive/MyDrive/riya/data/yelp/processed_files_with_bert_with_best_head/delete_retrieve_edit_model/sentiment_train_0.txt') as fp:
    data = fp.read()
 
# Reading data from file2
with open('/content/drive/MyDrive/riya/data/yelp/processed_files_with_bert_with_best_head/delete_retrieve_edit_model/sentiment_train_1.txt') as fp:
    data2 = fp.read()
 
# Merging 2 files
# To add the data of file2
# from next line
data += "\n"
data += data2
 
with open ('/content/drive/MyDrive/riya/data/yelp/processed_files_with_bert_with_best_head/delete_retrieve_edit_model/sentiment_train.txt', 'w+') as fp:
    fp.write(data)

In [33]:
ata = data2 = ""
 
# Reading data from file1
with open('/content/drive/MyDrive/riya/data/yelp/processed_files_with_bert_with_best_head/delete_retrieve_edit_model/sentiment_dev_0.txt') as fp:
    data = fp.read()
 
# Reading data from file2
with open('/content/drive/MyDrive/riya/data/yelp/processed_files_with_bert_with_best_head/delete_retrieve_edit_model/sentiment_dev_1.txt') as fp:
    data2 = fp.read()
 
# Merging 2 files
# To add the data of file2
# from next line
data += "\n"
data += data2
 
with open ('/content/drive/MyDrive/riya/data/yelp/processed_files_with_bert_with_best_head/delete_retrieve_edit_model/sentiment_dev.txt', 'w') as fp:
    fp.write(data)

In [37]:
DG_TRAIN_DATA= "/content/drive/MyDrive/riya/data/yelp/processed_files_with_bert_with_best_head/delete_retrieve_edit_model/sentiment_train.txt"
DG_EVAL_DATA= "/content/drive/MyDrive/riya/data/yelp/processed_files_with_bert_with_best_head/delete_retrieve_edit_model/sentiment_dev.txt"
DG_MODEL_OUT= "/content/drive/MyDrive/riya/model_output"

In [38]:
!python "/content/drive/MyDrive/riya/openai_gpt_delete_and_generate.py"\
--model_name openai-gpt \
--do_train \
--do_eval \
--train_dataset $DG_TRAIN_DATA \
--eval_dataset $DG_EVAL_DATA \
--train_batch_size 32 \
--eval_batch_size 32 \
--max_seq_length 85 \
--output_dir $DG_MODEL_OUT 

[1;30;43mStreaming output truncated to the last 5000 lines.[0m
Training loss: 1.90e-01 lr: 2.25e-05:  64% 8858/13852 [1:33:59<53:27,  1.56it/s][A
Training loss: 2.05e-01 lr: 2.25e-05:  64% 8859/13852 [1:34:00<53:21,  1.56it/s][A
Training loss: 2.17e-01 lr: 2.25e-05:  64% 8860/13852 [1:34:00<53:18,  1.56it/s][A
Training loss: 2.09e-01 lr: 2.25e-05:  64% 8861/13852 [1:34:01<53:35,  1.55it/s][A
Training loss: 2.24e-01 lr: 2.25e-05:  64% 8862/13852 [1:34:01<53:31,  1.55it/s][A
Training loss: 2.37e-01 lr: 2.25e-05:  64% 8863/13852 [1:34:02<53:28,  1.55it/s][A
Training loss: 2.14e-01 lr: 2.25e-05:  64% 8864/13852 [1:34:03<53:25,  1.56it/s][A
Training loss: 1.96e-01 lr: 2.25e-05:  64% 8865/13852 [1:34:03<53:07,  1.56it/s][A
Training loss: 2.12e-01 lr: 2.25e-05:  64% 8866/13852 [1:34:04<52:53,  1.57it/s][A
Training loss: 2.03e-01 lr: 2.25e-05:  64% 8867/13852 [1:34:05<52:54,  1.57it/s][A
Training loss: 1.94e-01 lr: 2.25e-05:  64% 8868/13852 [1:34:05<52:41,  1.58it/s][A
Training lo

In [35]:
!pip install boto3

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting boto3
  Downloading boto3-1.26.120-py3-none-any.whl (135 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m135.6/135.6 kB[0m [31m5.0 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting s3transfer<0.7.0,>=0.6.0
  Downloading s3transfer-0.6.0-py3-none-any.whl (79 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m79.6/79.6 kB[0m [31m9.6 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting jmespath<2.0.0,>=0.7.1
  Downloading jmespath-1.0.1-py3-none-any.whl (20 kB)
Collecting botocore<1.30.0,>=1.29.120
  Downloading botocore-1.29.120-py3-none-any.whl (10.7 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m10.7/10.7 MB[0m [31m82.2 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: jmespath, botocore, s3transfer, boto3
Successfully installed boto3-1.26.120 botocore-1.29.120 jmespath-1.0.1 s3transfer-0.6.0
