### import packages

In [None]:
from google.colab import drive
drive.mount('/content/drive')
# store yelp.csv and shakespeare.txt in '/content/drive/MyDrive/Deep Learning Project/'
# (that is the directory the data-loading cell below reads them from)

Mounted at /content/drive


In [None]:
pip install transformers

Collecting transformers
  Downloading transformers-4.17.0-py3-none-any.whl (3.8 MB)
[K     |████████████████████████████████| 3.8 MB 7.9 MB/s 
Collecting pyyaml>=5.1
  Downloading PyYAML-6.0-cp37-cp37m-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl (596 kB)
[K     |████████████████████████████████| 596 kB 60.6 MB/s 
Collecting huggingface-hub<1.0,>=0.1.0
  Downloading huggingface_hub-0.4.0-py3-none-any.whl (67 kB)
[K     |████████████████████████████████| 67 kB 7.1 MB/s 
[?25hCollecting tokenizers!=0.11.3,>=0.11.1
  Downloading tokenizers-0.11.6-cp37-cp37m-manylinux_2_12_x86_64.manylinux2010_x86_64.whl (6.5 MB)
[K     |████████████████████████████████| 6.5 MB 54.8 MB/s 
[?25hCollecting sacremoses
  Downloading sacremoses-0.0.47-py2.py3-none-any.whl (895 kB)
[K     |████████████████████████████████| 895 kB 67.5 MB/s 
Installing collected packages: pyyaml, tokenizers, sacremoses, huggingface-hub, transformers
  Attempting uninstall: pyyaml
  

In [None]:
import pandas as pd
import numpy as np
import random
import torch
import os
from torch.utils.data import Dataset, DataLoader
from transformers import GPT2Tokenizer, GPT2LMHeadModel, AdamW, get_linear_schedule_with_warmup, pipeline
from tqdm import tqdm, trange
import torch.nn.functional as F
import nltk

### Prepare datasets

In [None]:
# read the dataset and text file and clean
yelp = pd.read_csv('/content/drive/MyDrive/Deep Learning Project/yelp.csv')

# Read the whole text file in one call; the `with` block closes the file on exit.
with open('/content/drive/MyDrive/Deep Learning Project/shakespeare.txt','r') as f:
  text_str = f.read()

# Passages are separated by blank lines. Line breaks inside a passage become
# spaces, and whatever follows the last blank line is dropped ([:-1]).
# The frame is built once from a list instead of being grown row by row.
shakespeares = pd.DataFrame(
  [' '.join(passage.split('\n')) for passage in text_str.split('\n\n')[:-1]],
  columns=['text'],
)

In [None]:
# Replace every blank-line paragraph break inside a review with a single space.
cleaned_text = [' '.join(review.split('\n\n')) for review in yelp['text']]
yelp['text'] = cleaned_text

In [None]:
# sanity check: GPT2 can only deal with sequences of fewer than 1024 tokens.
# Length is approximated here by the number of space-separated words, not by tokenizer tokens.
yelp_short_mask = yelp['text'].apply(lambda review: len(review.split(' ')) < 1024)
shakespeares_short_mask = shakespeares['text'].apply(lambda passage: len(passage.split(' ')) < 1024)
yelp = yelp[yelp_short_mask]
shakespeares = shakespeares[shakespeares_short_mask]

In [None]:
# Positive subset: reviews rated 4 or 5 stars.
yelp_pos = yelp[yelp['stars'].isin([4, 5])]

In [None]:
# tokenize yelp reviews
class YelpReview(Dataset):
    """Yelp reviews encoded as GPT-2 token-id tensors.

    Each item is a 1-D tensor with the token ids of
    ``<|control_code|>`` + review text + ``<|endoftext|>``.

    Args:
        control_code: short string label placed at the start of every sample.
            A non-string value (for example a pandas Series passed by mistake)
            is never interpolated into the samples; ``DEFAULT_CONTROL_CODE``
            is used instead.
        gpt2_type: name handed to ``GPT2Tokenizer.from_pretrained``.
        max_length: number of leading *characters* of each review that are
            kept before encoding (this is a character cut, not a token cut).
        texts: optional iterable of review strings. When omitted, the
            module-level ``yelp['text']`` column is encoded.
    """

    DEFAULT_CONTROL_CODE = "yelp"

    def __init__(self, control_code, gpt2_type="gpt2", max_length=1024, texts=None):
        # Formatting a Series into the f-string below would prepend its printed
        # representation to every sample, so only real strings are accepted as the code.
        if not isinstance(control_code, str):
            control_code = self.DEFAULT_CONTROL_CODE
        if texts is None:
            texts = yelp['text']

        self.tokenizer = GPT2Tokenizer.from_pretrained(gpt2_type)
        self.reviews = [
            torch.tensor(
                self.tokenizer.encode(f"<|{control_code}|>{row[:max_length]}<|endoftext|>")
            )
            for row in tqdm(texts)
        ]
        self.reviews_count = len(self.reviews)

    def __len__(self):
        return self.reviews_count

    def __getitem__(self, item):
        return self.reviews[item]

yelp_review = YelpReview("yelp", gpt2_type="gpt2")

class YelpReviewPos(Dataset):
    """Positive Yelp reviews encoded as GPT-2 token-id tensors.

    Each item is a 1-D tensor with the token ids of
    ``<|control_code|>`` + review text + ``<|endoftext|>``.

    Args:
        control_code: short string label placed at the start of every sample.
            A non-string value (for example a pandas Series passed by mistake)
            is never interpolated into the samples; ``DEFAULT_CONTROL_CODE``
            is used instead.
        gpt2_type: name handed to ``GPT2Tokenizer.from_pretrained``.
        max_length: number of leading *characters* of each review that are
            kept before encoding (this is a character cut, not a token cut).
        texts: optional iterable of review strings. When omitted, the
            module-level ``yelp_pos['text']`` column is encoded.
    """

    DEFAULT_CONTROL_CODE = "yelp_pos"

    def __init__(self, control_code, gpt2_type="gpt2", max_length=1024, texts=None):
        # Formatting a Series into the f-string below would prepend its printed
        # representation to every sample, so only real strings are accepted as the code.
        if not isinstance(control_code, str):
            control_code = self.DEFAULT_CONTROL_CODE
        if texts is None:
            texts = yelp_pos['text']

        self.tokenizer = GPT2Tokenizer.from_pretrained(gpt2_type)
        self.reviews = [
            torch.tensor(
                self.tokenizer.encode(f"<|{control_code}|>{row[:max_length]}<|endoftext|>")
            )
            for row in tqdm(texts)
        ]
        self.reviews_count = len(self.reviews)

    def __len__(self):
        return self.reviews_count

    def __getitem__(self, item):
        return self.reviews[item]

yelp_review_pos = YelpReviewPos("yelp_pos", gpt2_type="gpt2")

100%|██████████| 10000/10000 [00:37<00:00, 266.06it/s]
100%|██████████| 6863/6863 [00:26<00:00, 257.10it/s]


In [None]:
# tokenize shakespeares
class Shakespeares(Dataset):
    """Shakespeare passages encoded as GPT-2 token-id tensors.

    Each item is a 1-D tensor with the token ids of
    ``<|control_code|>`` + passage text + ``<|endoftext|>``.

    Args:
        control_code: short string label placed at the start of every sample.
            A non-string value (for example a pandas Series passed by mistake)
            is never interpolated into the samples; ``DEFAULT_CONTROL_CODE``
            is used instead.
        gpt2_type: name handed to ``GPT2Tokenizer.from_pretrained``.
        max_length: number of leading *characters* of each passage that are
            kept before encoding (this is a character cut, not a token cut).
        texts: optional iterable of passage strings. When omitted, the
            module-level ``shakespeares['text']`` column is encoded.
    """

    DEFAULT_CONTROL_CODE = "shakespeare"

    def __init__(self, control_code, gpt2_type="gpt2", max_length=1024, texts=None):
        # Formatting a Series into the f-string below would prepend its printed
        # representation to every sample, so only real strings are accepted as the code.
        if not isinstance(control_code, str):
            control_code = self.DEFAULT_CONTROL_CODE
        if texts is None:
            texts = shakespeares['text']

        self.tokenizer = GPT2Tokenizer.from_pretrained(gpt2_type)
        self.poems = [
            torch.tensor(
                self.tokenizer.encode(f"<|{control_code}|>{row[:max_length]}<|endoftext|>")
            )
            for row in tqdm(texts)
        ]
        self.poems_count = len(self.poems)

    def __len__(self):
        return self.poems_count

    def __getitem__(self, item):
        return self.poems[item]

shake_poems = Shakespeares("shakespeare", gpt2_type="gpt2")

100%|██████████| 212/212 [00:00<00:00, 249.38it/s]


### Fine tune

In [None]:
# Pretrained GPT-2 tokenizer and language-model head; these are the objects
# handed to train() in the fine-tuning cells below.
tokenizer = GPT2Tokenizer.from_pretrained('gpt2')
model = GPT2LMHeadModel.from_pretrained('gpt2')

# Sequence packing: several short samples are concatenated into one longer
# sequence so that a single forward pass carries more tokens.
def pack_tensor(new_tensor, packed_tensor, max_seq_len):
    """Try to add ``new_tensor`` to the running ``packed_tensor``.

    Both tensors are indexed as (batch, sequence); lengths are read from dim 1.

    Returns a triple ``(tensor, carry_on, remainder)``:
      * nothing packed yet      -> ``(new_tensor, True, None)``
      * combined length fits    -> ``(new + packed[:, 1:], True, None)``; the
        new sample goes in front and the first position of the old pack is dropped
      * combined length too big -> ``(packed_tensor, False, new_tensor)``
    """
    if packed_tensor is None:
        return new_tensor, True, None

    combined_len = new_tensor.size(1) + packed_tensor.size(1)
    if combined_len <= max_seq_len:
        merged = torch.cat((new_tensor, packed_tensor[:, 1:]), dim=1)
        return merged, True, None

    return packed_tensor, False, new_tensor


Downloading:   0%|          | 0.00/523M [00:00<?, ?B/s]

In [None]:
# fine tune function
def train(
    dataset, model, tokenizer,
    batch_size=16, epochs=5, lr=2e-5,
    max_seq_len=400, warmup_steps=200,
    gpt2_type="gpt2", output_dir=".", output_prefix="yelp",
    test_mode=False, save_model_on_epoch=False,
):
    """Fine-tune ``model`` on ``dataset`` with sequence packing and gradient accumulation.

    Samples are drawn one at a time in shuffled order and concatenated by
    ``pack_tensor`` into sequences of at most 768 tokens. Every packed sequence
    is fed to the model as both input and labels, and the optimizer steps once
    per ``batch_size`` packed sequences.

    Args:
        dataset: map-style dataset whose items are tensors of token ids.
        model: model called as ``model(ids, labels=ids)``; its first output is the loss.
        tokenizer: not used by this function; kept in the signature for callers.
        batch_size: number of packed sequences accumulated per optimizer step.
        epochs: number of passes over ``dataset``.
        lr: peak learning rate for AdamW.
        max_seq_len: not used; the packing length is fixed at 768 (see ``pack_len``).
        warmup_steps: optimizer steps of linear learning-rate warmup.
        gpt2_type, output_dir, test_mode: not used; kept in the signature for callers.
        output_prefix: checkpoint file-name prefix.
        save_model_on_epoch: when true, save the state dict after every epoch to
            '/content/drive/MyDrive/<output_prefix>-<epoch>.pt'.

    Returns:
        The trained model, on the device that was used for training.
    """
    pack_len = 768  # maximum length of one packed training sequence
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    model = model.to(device)  # also works on CPU-only runtimes
    model.train()

    train_dataloader = DataLoader(dataset, batch_size=1, shuffle=True)
    last_idx = len(train_dataloader) - 1

    optimizer = AdamW(model.parameters(), lr=lr)
    # Upper bound on the number of optimizer steps (packing can only reduce the
    # number of forward passes). A negative total would drive the linear
    # schedule's learning rate to zero as soon as warmup ends.
    steps_per_epoch = (len(train_dataloader) + batch_size - 1) // batch_size
    total_steps = max(1, epochs * steps_per_epoch)
    scheduler = get_linear_schedule_with_warmup(
        optimizer, num_warmup_steps=warmup_steps, num_training_steps=total_steps
    )

    loss=0
    accumulating_batch_count = 0

    for epoch in range(epochs):

        print(f"Training epoch {epoch}")
        print(loss)
        input_tensor = None
        for idx, entry in tqdm(enumerate(train_dataloader)):
            (input_tensor, carry_on, remainder) = pack_tensor(entry, input_tensor, pack_len)
            is_last = idx == last_idx

            # Keep packing until the pack is full or the data runs out.
            if carry_on and not is_last:
                continue

            # Sequences to train on now: the finished pack and, on the final
            # iteration, also the sample that did not fit into it.
            ready = [input_tensor]
            if is_last and remainder is not None:
                ready.append(remainder)
                remainder = None

            for batch in ready:
                batch = batch.to(device)
                outputs = model(batch, labels=batch)
                loss = outputs[0]
                loss.backward()

                accumulating_batch_count += 1
                if (accumulating_batch_count % batch_size) == 0:
                    optimizer.step()
                    scheduler.step()
                    optimizer.zero_grad()

            # The sample that did not fit starts the next pack instead of being dropped.
            input_tensor = remainder
        if save_model_on_epoch:
            torch.save(
                model.state_dict(),
                '/content/drive/MyDrive/'+f"{output_prefix}-{epoch}.pt"
            )
    return model

In [None]:
# Two alternative Yelp fine-tuning runs; only the positive-review run is active,
# the all-reviews run is kept commented out.
# fine tune with yelp reviews
# model = train(yelp_review, model, tokenizer, epochs = 5, save_model_on_epoch = True)
# fine tune with positive yelp reviews
model = train(yelp_review_pos, model, tokenizer, epochs = 5, save_model_on_epoch = True, output_prefix="pos_yelp")



Training epoch 0
0


6863it [17:29,  6.54it/s]


Training epoch 1
tensor(0.3428, device='cuda:0', grad_fn=<NllLossBackward0>)


6863it [17:26,  6.56it/s]


Training epoch 2
tensor(0.3815, device='cuda:0', grad_fn=<NllLossBackward0>)


6863it [17:27,  6.55it/s]


Training epoch 3
tensor(1.0359, device='cuda:0', grad_fn=<NllLossBackward0>)


6863it [17:22,  6.58it/s]


Training epoch 4
tensor(0.8242, device='cuda:0', grad_fn=<NllLossBackward0>)


6863it [17:15,  6.63it/s]


In [None]:
# fine tune with shakespeares's sonnets
# `model` is whatever the name is currently bound to; when the cells are run in
# order that is the model returned by the Yelp fine-tuning call above.
# With save_model_on_epoch=True, checkpoints are written as shakes-0.pt ... shakes-9.pt.
model = train(shake_poems, model, tokenizer, epochs = 10, save_model_on_epoch = True, output_prefix="shakes")



Training epoch 0
0


212it [00:30,  6.96it/s]


Training epoch 1
tensor(1.6088, device='cuda:0', grad_fn=<NllLossBackward0>)


212it [00:29,  7.14it/s]


Training epoch 2
tensor(1.6515, device='cuda:0', grad_fn=<NllLossBackward0>)


212it [00:30,  6.93it/s]


Training epoch 3
tensor(2.4888, device='cuda:0', grad_fn=<NllLossBackward0>)


212it [00:30,  6.92it/s]


Training epoch 4
tensor(2.4200, device='cuda:0', grad_fn=<NllLossBackward0>)


212it [00:30,  6.92it/s]


Training epoch 5
tensor(2.2236, device='cuda:0', grad_fn=<NllLossBackward0>)


212it [00:29,  7.08it/s]


Training epoch 6
tensor(2.9465, device='cuda:0', grad_fn=<NllLossBackward0>)


212it [00:30,  7.02it/s]


Training epoch 7
tensor(1.4781, device='cuda:0', grad_fn=<NllLossBackward0>)


212it [00:30,  6.87it/s]


Training epoch 8
tensor(2.2045, device='cuda:0', grad_fn=<NllLossBackward0>)


212it [00:30,  6.91it/s]


Training epoch 9
tensor(2.1925, device='cuda:0', grad_fn=<NllLossBackward0>)


212it [00:30,  6.98it/s]


### Load tuned model

In [None]:
# load the model so dont need to train model again
model_loaded = GPT2LMHeadModel.from_pretrained('gpt2')
# map_location='cpu' lets a checkpoint that was saved from a CUDA model load on a CPU-only runtime as well
model_loaded.load_state_dict(torch.load('/content/drive/MyDrive/shakes-9.pt', map_location='cpu'))

<All keys matched successfully>

### Generate Shakespeare-style reviews with guidance (positive or negative)

In [None]:
# generate text
def generate(
    model,
    tokenizer,
    prompt,
    entry_count=10,
    entry_length=30, # maximum number of tokens sampled per entry
    top_p=0.8,
    temperature=1.,
):
    """Sample ``entry_count`` continuations of ``prompt`` with nucleus (top-p) sampling.

    Args:
        model: language model called as ``model(ids)``; its first output holds the logits.
        tokenizer: object providing ``encode(text)`` and ``decode(ids)``.
        prompt: text every entry starts from.
        entry_count: number of independent entries to generate.
        entry_length: maximum number of tokens sampled per entry.
        top_p: cumulative-probability cut-off for the nucleus filter.
        temperature: logits are divided by this value when it is positive.

    Returns:
        A list of ``entry_count`` strings with the ``<|endoftext|>`` marker
        removed. An entry that hits ``entry_length`` before the end marker is
        cut back to just before its last '.', when it contains one.
    """
    model.eval()
    device = next(model.parameters()).device  # sample on whatever device the model lives on
    eos_text = '<|endoftext|>'
    eos_ids = set(tokenizer.encode(eos_text))  # encoded once instead of once per sampled token
    prompt_ids = tokenizer.encode(prompt)
    filter_value = -float("Inf")
    generated_list = []

    with torch.no_grad():

        for entry_idx in trange(entry_count):

            entry_finished = False
            generated = torch.tensor(prompt_ids, device=device).unsqueeze(0)

            for i in range(entry_length):
                # Only the logits are needed, so no labels are passed and no loss is computed.
                logits = model(generated)[0]
                logits = logits[:, -1, :] / (temperature if temperature > 0 else 1.0)

                sorted_logits, sorted_indices = torch.sort(logits, descending=True)
                cumulative_probs = torch.cumsum(F.softmax(sorted_logits, dim=-1), dim=-1)

                # Mark everything past the top_p mass for removal; the mask is shifted
                # right by one so the token that crosses the threshold is kept, and the
                # most likely token is always kept.
                sorted_indices_to_remove = cumulative_probs > top_p
                sorted_indices_to_remove[..., 1:] = sorted_indices_to_remove[
                    ..., :-1
                ].clone()
                sorted_indices_to_remove[..., 0] = 0

                indices_to_remove = sorted_indices[sorted_indices_to_remove]
                logits[:, indices_to_remove] = filter_value

                next_token = torch.multinomial(F.softmax(logits, dim=-1), num_samples=1)
                generated = torch.cat((generated, next_token), dim=1)

                if next_token.item() in eos_ids:
                    entry_finished = True
                    break

            output_text = tokenizer.decode(generated[0].tolist())
            # str.replace returns a new string, so the result has to be assigned.
            output_text = output_text.replace(eos_text, '')

            if not entry_finished:
                # Ran out of budget mid-sentence: keep only the text before the
                # last '.'; text without any '.' is kept whole.
                head, dot, _ = output_text.rpartition('.')
                if dot:
                    output_text = head

            generated_list.append(output_text)

    return generated_list

In [None]:
def generate_with_guidance(model, prompt, target, max_tries=10):
  """Generate one entry whose predicted sentiment label equals ``target``.

  An entry is generated with ``generate`` (using the module-level ``tokenizer``)
  and classified with the default "sentiment-analysis" pipeline; this repeats
  until the predicted label equals ``target`` or ``max_tries`` attempts were made.

  Args:
    model: language model passed to ``generate``.
    prompt: text the generated entry starts from.
    target: label to aim for, e.g. 'POSITIVE' or 'NEGATIVE'.
    max_tries: upper bound on the number of attempts, so the loop always ends.

  Returns:
    The list returned by ``generate`` for the last attempt: the first entry
    whose label matched, or the final attempt if none matched (``None`` when
    ``max_tries`` is less than 1).
  """
  sentiment_pipeline = pipeline("sentiment-analysis")
  s = None
  for _ in range(max_tries):
    s = generate(model=model,tokenizer=tokenizer,prompt=prompt,entry_length=150,entry_count=1)
    label = sentiment_pipeline(s)[0]['label']
    if label == target:
      break
  return s

### Generating cases

In [None]:
# model_loaded holds the shakes-9.pt checkpoint, i.e. the last of the 10 Shakespeare fine-tuning epochs
generate(model=model_loaded,tokenizer=tokenizer,prompt='Last month I went to the Marriott hotel fro business purpose',entry_length=100,entry_count=1)

100%|██████████| 1/1 [00:39<00:00, 39.04s/it]


["Last month I went to the Marriott hotel fro business purpose, my sense being that it has changed my mind on how to spend my vacations. This is no different.\n\nIt's been difficult, but it is true that you need to do your own research, and one thing I do know is that if you take my advice I'll come back to you here, I'll be a little unsure as to why I'm giving you that old retainer's chair, or why your entourage would have picked me.<|endoftext|>"]

In [None]:
generate_with_guidance(model=model_loaded, prompt='Last month I went to the Marriott hotel in Seattle.', target='POSITIVE')

No model was supplied, defaulted to distilbert-base-uncased-finetuned-sst-2-english (https://huggingface.co/distilbert-base-uncased-finetuned-sst-2-english)
100%|██████████| 1/1 [01:25<00:00, 85.16s/it]


["Last month I went to the Marriott hotel in Seattle. I was told it was a great location for long-term stay, so I was glad I got to visit. I was also happy to hear that the fitness center in San Francisco was recently upgraded to upscale fitness center in a new building. I'm glad the new facility is better and it's definitely better than the old one! Also, the new home of all the other Elite Fitness Sports venues is giving us a free gift: a lifetime membership to the Club of The Discovery"]

In [None]:
generate_with_guidance(model=model_loaded, prompt='Last month I went to the Marriott hotel in Seattle.', target='NEGATIVE')

No model was supplied, defaulted to distilbert-base-uncased-finetuned-sst-2-english (https://huggingface.co/distilbert-base-uncased-finetuned-sst-2-english)
100%|██████████| 1/1 [00:21<00:00, 21.98s/it]


['Last month I went to the Marriott hotel in Seattle. I saw many people making mistakes.\n\nThere was an enormous window for people to view the view of the horizon as the stars moved in the sky. This is really fine if you have the right equipment. But for the common beginner to come out, there are lots of unknowns and most people find it easy to be like the devil.<|endoftext|>']

# Swapping Sentiments of Reviews

In [None]:
# The WordNet corpus backs the nltk.corpus.wordnet antonym lookup used below.
nltk.download('wordnet')

[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Unzipping corpora/wordnet.zip.


True

In [None]:
from transformers import BertTokenizer, BertForSequenceClassification
import torch
from torch.autograd import Variable

# Sentiment classifier that get_sentiment / swap_sentiment are called with below.
# NOTE(review): this rebinds `tokenizer` and `model`, which earlier cells bound to
# GPT-2 objects; generation cells re-run after this one would receive the BERT objects.
model_name = "nlptown/bert-base-multilingual-uncased-sentiment"
tokenizer = BertTokenizer.from_pretrained(model_name)
model = BertForSequenceClassification.from_pretrained(model_name)

Downloading:   0%|          | 0.00/851k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/112 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/39.0 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/953 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/638M [00:00<?, ?B/s]

In [None]:
from nltk.corpus import wordnet

def get_antonyms(word):
  """Return the set of antonym names WordNet lists for ``word``.

  For every lemma of every synset of ``word`` that has antonyms, the name of
  its first antonym is collected. The set is empty when nothing is found.
  """
  found = set()
  for synset in wordnet.synsets(word):
      for lemma in synset.lemmas():
          opposites = lemma.antonyms()
          if not opposites:
              continue
          found.add(opposites[0].name())
  return found

def get_sentiment(model, tokenizer, input):
  """Return the number of stars this review most likely gave.

  ``input`` is tokenized, the token ids are passed to ``model``, and the
  1-based position of the largest value in the first output row is returned.
  """
  inputs = tokenizer(input, return_tensors="pt")['input_ids']
  # Pure inference: no autograd graph is needed, which saves memory when this
  # is called repeatedly.
  with torch.no_grad():
    output = model(inputs)[0].tolist()
  return np.array(output[0]).argmax() + 1


def swap_sentiment(model, tokenizer, input_string):
  """Try to flip the predicted sentiment of ``input_string`` by swapping words for antonyms.

  Words are ranked by the gradient magnitude of the summed classifier outputs
  with respect to their embeddings. Starting from the highest-ranked word, each
  WordNet antonym is tried as a whole-word replacement; as soon as the predicted
  star rating differs from the original by at least 2, that review is returned.
  Otherwise the antonym that moved the rating furthest is kept and the next
  word is tried.

  Args:
    model: BERT sequence classifier exposing ``bert.embeddings``,
      ``bert.encoder`` and ``classifier``.
    tokenizer: the tokenizer matching ``model``.
    input_string: the review text.

  Returns:
    The modified review, or ``None`` when no replacement flipped the sentiment.
  """
  import re

  inputs = tokenizer(input_string, return_tensors="pt")['input_ids'] # Convert our string into tokens
  # Embedding output as a detached leaf tensor so gradients can be taken with respect to it
  emb = model.bert.embeddings(inputs).detach().requires_grad_(True)
  # Classifier head applied to every token's encoder output, summed into one scalar
  output = model.classifier(model.bert.encoder(emb)[0]).sum()
  grads = torch.autograd.grad(output, emb)[0] # Calculate gradients

  # One gradient magnitude per token (including special tokens and word pieces)
  mags = torch.linalg.norm(grads, dim=-1)[0].tolist()
  tokens = tokenizer.convert_ids_to_tokens(inputs.tolist()[0])

  # Rebuild whole words so that every word is paired with its own magnitude:
  # special tokens are skipped and '##' continuation pieces are merged into the
  # preceding word, which keeps the largest magnitude among its pieces.
  special_tokens = set(tokenizer.all_special_tokens)
  scored_words = []
  for token, mag in zip(tokens, mags):
    if token in special_tokens:
      continue
    if token.startswith('##') and scored_words:
      scored_words[-1][0] += token[2:]
      scored_words[-1][1] = max(scored_words[-1][1], mag)
    else:
      scored_words.append([token, mag])

  # Most significant word first, each distinct word once
  sorted_words = []
  for word, _ in sorted(scored_words, key=lambda pair: pair[1], reverse=True):
    if word not in sorted_words:
      sorted_words.append(word)

  original_sentiment = get_sentiment(model, tokenizer, input_string)

  # Replace the most significant word until sentiment has flipped
  review = input_string
  for w in sorted_words:
    antonyms = get_antonyms(w)
    if not antonyms:
      continue
    # Whole-word, case-insensitive match: the tokens may be lower-cased, and a
    # plain substring replace would also rewrite the inside of longer words.
    pattern = re.compile(r'\b' + re.escape(w) + r'\b', flags=re.IGNORECASE)
    best_replacement = None
    best_antonym_distance = 0
    for ant in sorted(antonyms): # sorted so the result does not depend on set order
      replacement = ant.replace('_', ' ') # WordNet joins multi-word names with '_'
      new_review = pattern.sub(lambda _match, text=replacement: text, review)
      new_sentiment = get_sentiment(model, tokenizer, new_review)
      sentiment_distance = abs(new_sentiment - original_sentiment)
      if sentiment_distance >= 2: # At least 2 stars apart, consider that flipped
        print(new_sentiment, original_sentiment)
        return new_review
      elif sentiment_distance > best_antonym_distance:
        best_antonym_distance = sentiment_distance
        best_replacement = replacement
    if best_replacement is not None:
      # Keep the antonym that created the largest sentiment distance
      review = pattern.sub(lambda _match, text=best_replacement: text, review)
  return None

swap_sentiment(model, tokenizer, "This place is the worst")
# swap_sentiment(model, tokenizer, "Last month I went to the Marriott hotel in Seattle. I was told it was a great location for long-term stay, so I was glad I got to visit. I was also happy to hear that the fitness center was recently upgraded to an upscale fitness center in a new building. I'm glad the new facility is better and it's definitely better than the old one! Also, the new home of all the other Elite Fitness Sports venues is giving us a free gift: a lifetime membership to the Club of The Discovery")

5 1


'This place is the good'

In [None]:
swap_sentiment(model, tokenizer, "I love this resturant, the food is so good")

1 5


'I hate this resturant, the food is so good'