## Prepare Referece files using TFIDF for retrieving attributes


In [1]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
! pip install boto3

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting boto3
  Downloading boto3-1.26.118-py3-none-any.whl (135 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m135.6/135.6 kB[0m [31m4.2 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting botocore<1.30.0,>=1.29.118
  Downloading botocore-1.29.118-py3-none-any.whl (10.7 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m10.7/10.7 MB[0m [31m91.2 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting s3transfer<0.7.0,>=0.6.0
  Downloading s3transfer-0.6.0-py3-none-any.whl (79 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m79.6/79.6 kB[0m [31m10.0 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting jmespath<2.0.0,>=0.7.1
  Downloading jmespath-1.0.1-py3-none-any.whl (20 kB)
Installing collected packages: jmespath, botocore, s3transfer, boto3
Successfully installed boto3-1.26.118 botocore-1.29.118 jmespath-1.0.1 s3transfer-0.6.0


In [3]:
%cd /content/drive/MyDrive/transformer-drg-style-transfer-master/data/yelp/processed_files_with_t5_with_best_head

/content/drive/.shortcut-targets-by-id/1gvgdEyQQFFN43xnL2_DdI4rUtMI5gnmU/transformer-drg-style-transfer-master/data/yelp/processed_files_with_t5_with_best_head


In [4]:
import pandas as pd
from tqdm import tqdm, trange
import numpy as np
import time
import torch
from sklearn.feature_extraction.text import TfidfVectorizer

In [5]:
def read_file(path):
    with open(path) as fp:
        lines = (line.rstrip() for line in fp.readlines()) 
        lines = list(line for line in lines if line)
        # lines = fp.read().splitlines()
    return lines

In [6]:
def clean_text(text):
    return text.replace("<POS>","").replace("<NEG>","").replace("<CON_START>","").replace("<START>","").replace("<END>","").strip()


In [7]:
train0_org = read_file("sentiment.train.0") # Training data of negative sentiment
train1_org = read_file("sentiment.train.1") # Training data of positive sentiment
ref0_processed = read_file("reference.0") # Reference data for delete_generate model
ref1_processed = read_file("reference.1") # Reference data for delete_generate model
ref0_org = read_file("../reference.0") # Original Refrence_0 data
ref1_org = read_file("../reference.1") # Original Refrence_1 data
train0_processed = read_file("sentiment_train_0_all_attrs.txt") # training data with content and attributes seperation
train1_processed = read_file("sentiment_train_1_all_attrs.txt") # training data with content and attributes seperation

In [8]:
# Get the Original Reference Sentence
ref0_org = [x.split("\t")[0] for x in ref0_org]
ref1_org = [x.split("\t")[0] for x in ref1_org]

In [9]:
# Get the Content of the Reference Sentences
ref0_con = [clean_text(x) for x in ref0_processed]
ref1_con = [clean_text(x) for x in ref1_processed]

In [10]:
ref0_org[:4], ref0_con[:4]

(["ever since joes has changed hands it 's just gotten worse and worse .",
  'there is definitely not enough room in that part of the venue .',
  'so basically tasted watered down .',
  "she said she 'd be back and disappeared for a few minutes ."],
 ["ever since joes has changed hands it's just gotten worse and worse. Ever since joes has changed hands it's gotten better and better.",
  'there is not room in that part of the venue. There is much room in that part of the venue',
  'so tasted watered down. It didnt taste watered down all.',
  "she said she 'd be back and disappeared fora few minutes. She said shed be back, and didnt disappear at all."])

In [11]:
def get_train_content(text):
    return text.split("<START>")[0].split("<CON_START>")[1].strip()

In [12]:
def get_train_attrs(text):
    return text.split("<CON_START>")[0].replace("<ATTR_WORDS>","").strip().split()

In [13]:
get_train_attrs(train0_processed[0])

['sadly', 'mistaken', '.']

In [14]:
train0_processed[:4], train1_processed[:4]

(['<ATTR_WORDS> sadly mistaken . <CON_START> i was  mistaken. <START> i was sadly mistaken . <END>',
  '<ATTR_WORDS> , of mill . <CON_START> so on to the hoagies the italian is general run the mill. <START> so on to the hoagies , the italian is general run of the mill . <END>',
  '<ATTR_WORDS> meat and a of lettuce . <CON_START> minimal anda ton shredded lettuce. <START> minimal meat and a ton of shredded lettuce . <END>',
  '<ATTR_WORDS> & _num_ tag . <CON_START> nothing really special  not worthy of the $ num price tag. <START> nothing really special & not worthy of the $ _num_ price tag . <END>'],
 ['<ATTR_WORDS> food . <CON_START> excellent food. <START> excellent food . <END>',
  '<ATTR_WORDS> service . <CON_START> superb customer service. <START> superb customer service . <END>',
  '<ATTR_WORDS> and ice is good . <CON_START> they also have daily specials andice cream which really good. <START> they also have daily specials and ice cream which is really good . <END>',
  "<ATTR_WOR

In [15]:
# get content
train0_con = [get_train_content(x) for x in train0_processed]
train1_con = [get_train_content(x) for x in train1_processed]

In [16]:
train0_con[:4], train1_con[:4]

(['i was  mistaken.',
  'so on to the hoagies the italian is general run the mill.',
  'minimal anda ton shredded lettuce.',
  'nothing really special  not worthy of the $ num price tag.'],
 ['excellent food.',
  'superb customer service.',
  'they also have daily specials andice cream which really good.',
  'itsa good toasted hoagie.'])

In [17]:
# Fatch attributes from the training data
attrs_neg = [get_train_attrs(x) for x in train0_processed]
attrs_pos = [get_train_attrs(x) for x in train1_processed]

In [18]:
# Get TFIDF vectors for Training and Reference
tfidf = TfidfVectorizer()
conts_vecs = tfidf.fit_transform(train0_con + train1_con)
conts_pos_vecs = conts_vecs[:len(train1_con)]
conts_neg_vecs = conts_vecs[len(train1_con):len(train1_con)+len(train0_con)]
conts_from_pos_ref_vecs = tfidf.transform(ref1_con)
conts_from_neg_ref_vecs = tfidf.transform(ref0_con)

#### AnnoyIndex is used to store the TFIDF vectors of training set and retrieve nearest neighbours of the reference content 

In [19]:
! pip install annoy

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting annoy
  Downloading annoy-1.17.2.tar.gz (647 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m647.4/647.4 kB[0m [31m10.8 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: annoy
  Building wheel for annoy (setup.py) ... [?25l[?25hdone
  Created wheel for annoy: filename=annoy-1.17.2-cp39-cp39-linux_x86_64.whl size=582197 sha256=b90654ebb854e4e0c5dd8077bc48e7bd1d6690fe1c6092610f243ef2ec081708
  Stored in directory: /root/.cache/pip/wheels/f2/2e/e4/f3ae385c375b87982a2a70055061d4a6330ef4f60817e717e3
Successfully built annoy
Installing collected packages: annoy
Successfully installed annoy-1.17.2


In [20]:
from annoy import AnnoyIndex

In [21]:
train0_tree = AnnoyIndex(conts_neg_vecs.shape[-1])
train1_tree = AnnoyIndex(conts_pos_vecs.shape[-1])

  train0_tree = AnnoyIndex(conts_neg_vecs.shape[-1])
  train1_tree = AnnoyIndex(conts_pos_vecs.shape[-1])


In [22]:
# We have randomly selected training samples to control the memory usage
neg_idxs = np.random.choice(conts_neg_vecs.shape[0], size=50000, replace=False)
pos_idxs = np.random.choice(conts_pos_vecs.shape[0], size=50000, replace=False)

In [23]:
#for i in trange(conts_neg_vecs.shape[0]):
for i in trange(len(neg_idxs)):
    np_array = conts_neg_vecs[neg_idxs[i]].toarray()[0]
    train0_tree.add_item(i,np_array)

100%|██████████| 50000/50000 [02:38<00:00, 315.25it/s]


In [24]:
train0_tree.build(50)
train0_tree.save('tfidf_train0.ann')

True

In [25]:
ref1_con[0:3], " ".join(attrs_neg[neg_idxs[0]])

(["it's small yet they make you feel right at home.s small yet they make you feel likea stranger.",
  'i will be going back and enjoying this great place!i wont be going back and suffering at this terrible place!',
  'drinks were affordable anda good pour. drinks were expensive and half full.'],
 'twice .')

In [33]:
with open("/content/drive/MyDrive/transformer-drg-style-transfer-master/outputs_drg_yelp/reference_1.txt", "w") as out_fp:
    for i in range(conts_from_pos_ref_vecs.shape[0]):
        x = conts_from_pos_ref_vecs[i].toarray()[0]
        inx,dis = train0_tree.get_nns_by_vector(x, 1, include_distances=True)
        ref_sen = ref1_con[i]
        #ref_sen = processed_ref0[i].replace("<POS>","").replace("<NEG>","").replace("<CON_START>","").replace("<START>","")
        #print(dis,"\t",ref0_org[i], "\t" ,train1_data[inx[0]], train1_attr[inx[0]])
        out_str = "<ATTR_WORDS> " + " ".join(attrs_neg[neg_idxs[inx[0]]]) + " <CON_START> " + ref_sen.strip() + " <START>" + "\n"
        print(out_str)
        out_fp.write(out_str)

<ATTR_WORDS> n't like a is . <CON_START> it's small yet they make you feel right at home.s small yet they make you feel likea stranger. <START>

<ATTR_WORDS> _num_ , pathetic looking . <CON_START> i will be going back and enjoying this great place!i wont be going back and suffering at this terrible place! <START>

<ATTR_WORDS> wrong . <CON_START> drinks were affordable anda good pour. drinks were expensive and half full. <START>

<ATTR_WORDS> the and waiting . <CON_START> my husband gota ruben sandwich, he loved it. my husband gota reuben sandwich, he hated it. <START>

<ATTR_WORDS> n't difference . <CON_START> i signed up for their email and gota coupon. signed up for their email and got spam. <START>

<ATTR_WORDS> it , the poor . <CON_START> i 'd definitely recommend giving thema try.'d definitely recommend not giving thema try. <START>

<ATTR_WORDS> coffee , , benedict . <CON_START> i highly recommend e & m painting. highly recommend avoiding e & m painting. <START>

<ATTR_WORDS> di

In [34]:
#for i in trange(conts_neg_vecs.shape[0]):
for i in trange(len(pos_idxs)):
    np_array = conts_pos_vecs[pos_idxs[i]].toarray()[0]
    train1_tree.add_item(i,np_array)

  0%|          | 0/50000 [00:00<?, ?it/s]


Exception: ignored

In [None]:
train1_tree.build(50)
train1_tree.save('tfidf_train1.ann')

In [35]:
with open("/content/drive/MyDrive/transformer-drg-style-transfer-master/outputs_drg_yelp/reference_0.txt", "w") as out_fp:
    for i in range(conts_from_neg_ref_vecs.shape[0]):
        x = conts_from_neg_ref_vecs[i].toarray()[0]
        inx,dis = train1_tree.get_nns_by_vector(x, 1, include_distances=True)
        ref_sen = ref0_con[i]
        out_str = "<ATTR_WORDS> " + " ".join(attrs_pos[pos_idxs[inx[0]]]) + " <CON_START> " + ref_sen.strip() + " <START>" + "\n"
        print(i, out_str)
        out_fp.write(out_str)

0 <ATTR_WORDS> the the was wonderful . <CON_START> ever since joes has changed hands it's just gotten worse and worse. Ever since joes has changed hands it's gotten better and better. <START>

1 <ATTR_WORDS> the the is great . <CON_START> there is not room in that part of the venue. There is much room in that part of the venue <START>

2 <ATTR_WORDS> really place . <CON_START> so tasted watered down. It didnt taste watered down all. <START>

3 <ATTR_WORDS> definitely back ! <CON_START> she said she 'd be back and disappeared fora few minutes. She said shed be back, and didnt disappear at all. <START>

4 <ATTR_WORDS> place . <CON_START> i ca nt believe how inconsider this pharmacy is. This pharmacy is really considerate. <START>

5 <ATTR_WORDS> very good clean . <CON_START> just and took it the bill. just left and put it on the bill. <START>

6 <ATTR_WORDS> the are very good . <CON_START> is nt terrible, but it isn't very good either. isn't perfect, but it is very good. <START>

7 <ATTR

In [None]:
ata = data2 = ""
 
# Reading data from file1
with open('/content/drive/MyDrive/transformer-drg-style-transfer-master/data/yelp/processed_files_with_bert_with_best_head/delete_retrieve_edit_model/sentiment_train_0.txt') as fp:
    data = fp.read()
 
# Reading data from file2
with open('/content/drive/MyDrive/transformer-drg-style-transfer-master/data/yelp/processed_files_with_bert_with_best_head/delete_retrieve_edit_model/sentiment_train_1.txt') as fp:
    data2 = fp.read()
 
# Merging 2 files
# To add the data of file2
# from next line
data += "\n"
data += data2
 
with open ('/content/drive/MyDrive/transformer-drg-style-transfer-master/data/yelp/processed_files_with_bert_with_best_head/delete_retrieve_edit_model/sentiment_train.txt', 'w') as fp:
    fp.write(data)

In [None]:
ata = data2 = ""
 
# Reading data from file1
with open('/content/drive/MyDrive/transformer-drg-style-transfer-master/data/yelp/processed_files_with_bert_with_best_head/delete_retrieve_edit_model/sentiment_dev_0.txt') as fp:
    data = fp.read()
 
# Reading data from file2
with open('/content/drive/MyDrive/transformer-drg-style-transfer-master/data/yelp/processed_files_with_bert_with_best_head/delete_retrieve_edit_model/sentiment_dev_1.txt') as fp:
    data2 = fp.read()
 
# Merging 2 files
# To add the data of file2
# from next line
data += "\n"
data += data2
 
with open ('/content/drive/MyDrive/transformer-drg-style-transfer-master/data/yelp/processed_files_with_bert_with_best_head/delete_retrieve_edit_model/sentiment_dev.txt', 'w') as fp:
    fp.write(data)

In [10]:
DG_TRAIN_DATA= "/content/drive/MyDrive/transformer-drg-style-transfer-master/data/yelp/processed_files_with_t5_with_best_head/sentiment_train.txt"
DG_EVAL_DATA= "/content/drive/MyDrive/transformer-drg-style-transfer-master/data/yelp/processed_files_with_t5_with_best_head/sentiment_dev.txt"
DG_MODEL_OUT= "/content/drive/MyDrive/transformer-drg-style-transfer-master/t5_tfidf/model_output/"

In [11]:
!python "/content/drive/MyDrive/transformer-drg-style-transfer-master/openai_gpt_delete_and_generate.py"\
--model_name openai-gpt \
--do_train \
--do_eval \
--train_dataset $DG_TRAIN_DATA \
--eval_dataset $DG_EVAL_DATA \
--train_batch_size 32 \
--eval_batch_size 32 \
--max_seq_length 85 \
--output_dir $DG_MODEL_OUT 

[1;30;43mStreaming output truncated to the last 5000 lines.[0m
Training loss: 2.41e-01 lr: 2.25e-05:  64% 8858/13852 [1:42:11<57:26,  1.45it/s][A
Training loss: 2.59e-01 lr: 2.25e-05:  64% 8859/13852 [1:42:12<57:25,  1.45it/s][A
Training loss: 2.91e-01 lr: 2.25e-05:  64% 8860/13852 [1:42:12<57:25,  1.45it/s][A
Training loss: 2.51e-01 lr: 2.25e-05:  64% 8861/13852 [1:42:13<57:22,  1.45it/s][A
Training loss: 2.42e-01 lr: 2.25e-05:  64% 8862/13852 [1:42:14<57:15,  1.45it/s][A
Training loss: 2.83e-01 lr: 2.25e-05:  64% 8863/13852 [1:42:14<57:10,  1.45it/s][A
Training loss: 2.58e-01 lr: 2.25e-05:  64% 8864/13852 [1:42:15<57:20,  1.45it/s][A
Training loss: 2.43e-01 lr: 2.25e-05:  64% 8865/13852 [1:42:16<57:16,  1.45it/s][A
Training loss: 2.47e-01 lr: 2.25e-05:  64% 8866/13852 [1:42:16<57:10,  1.45it/s][A
Training loss: 2.61e-01 lr: 2.25e-05:  64% 8867/13852 [1:42:17<57:12,  1.45it/s][A
Training loss: 2.53e-01 lr: 2.25e-05:  64% 8868/13852 [1:42:18<57:13,  1.45it/s][A
Training lo

In [None]:
DRG_TRAIN_DATA = "/content/drive/MyDrive/transformer-drg-style-transfer-master/data/yelp/processed_files_with_bert_with_best_head/delete_retrieve_edit_model/sentiment_train.txt"
DRG_EVAL_DATA = "/content/drive/MyDrive/transformer-drg-style-transfer-master/data/yelp/processed_files_with_bert_with_best_head/delete_retrieve_edit_model/sentiment_dev.txt"
DG_MODEL_OUT = "/content/drive/MyDrive/transformer-drg-style-transfer-master/model_output/pytorch_model_zero_grad_1.bin"

In [2]:
!python "/content/drive/MyDrive/transformer-drg-style-transfer-master/openai_gpt_delete_retrive_and_generate.py"\
--model_name openai-gpt \
--do_train \
--do_eval \
--train_dataset "/content/drive/MyDrive/transformer-drg-style-transfer-master/data/yelp/processed_files_with_t5_with_best_head/sentiment_train.txt" \
--eval_dataset "/content/drive/MyDrive/transformer-drg-style-transfer-master/data/yelp/processed_files_with_t5_with_best_head/sentiment_dev.txt" \
--train_batch_size 32 \
--eval_batch_size 32 \
--max_seq_length 85 \
--output_dir "/content/drive/MyDrive/transformer-drg-style-transfer-master/model_output/drg_output"

python3: can't open file '/content/drive/MyDrive/transformer-drg-style-transfer-master/openai_gpt_delete_retrive_and_generate.py': [Errno 2] No such file or directory
