<a href="https://colab.research.google.com/github/verycumbersome/DeboBot/blob/master/CS499_Final_Assignment.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Init

In [None]:
!pip install -q git+https://github.com/huggingface/transformers.git

import os
import torch
import pandas as pd

from torch.utils.data import Dataset, DataLoader, random_split, RandomSampler, SequentialSampler
torch.manual_seed(42)

from transformers import GPT2LMHeadModel,  GPT2Tokenizer, GPT2Config, GPT2LMHeadModel
from transformers import AdamW, get_linear_schedule_with_warmup

# Mount google drive to access data
from google.colab import drive
drive.mount("/content/drive")

# Use the GPU when the runtime has one, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# GPT-2 ships without BOS/PAD tokens of its own; register explicit ones so
# every training text can be framed and padded to a fixed length.
tokenizer = GPT2Tokenizer.from_pretrained(
    "gpt2",
    bos_token="<|startoftext|>",
    eos_token="<|endoftext|>",
    pad_token="<|pad|>",
)

# Keep the model's special-token ids in sync with the tokenizer above. The
# previous version reused EOS as PAD and left BOS at the stock GPT-2 value, so
# prompt-less `generate()` calls started from a different token than the
# `<|startoftext|>` marker the training texts begin with.
model = GPT2LMHeadModel.from_pretrained(
    "gpt2",
    bos_token_id=tokenizer.bos_token_id,
    eos_token_id=tokenizer.eos_token_id,
    pad_token_id=tokenizer.pad_token_id,
)

# The tokenizer now has more entries than the pretrained embedding matrix;
# grow the matrix right away so the new ids are valid inputs.
model.resize_token_embeddings(len(tokenizer))


  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
    Preparing wheel metadata ... [?25l[?25hdone
[K     |████████████████████████████████| 890kB 13.2MB/s 
[K     |████████████████████████████████| 2.9MB 34.9MB/s 
[?25h  Building wheel for transformers (PEP 517) ... [?25l[?25hdone
  Building wheel for sacremoses (setup.py) ... [?25l[?25hdone
Mounted at /content/drive


HBox(children=(FloatProgress(value=0.0, description='Downloading', max=1042301.0, style=ProgressStyle(descript…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=456318.0, style=ProgressStyle(descripti…

Special tokens have been added in the vocabulary, make sure the associated word embedding are fine-tuned or trained.





HBox(children=(FloatProgress(value=0.0, description='Downloading', max=665.0, style=ProgressStyle(description_…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=548118077.0, style=ProgressStyle(descri…




In [None]:
#data_path = "/content/drive/MyDrive/Data/CS499 Final Project/Tweet data"
#
#data = []
#for f in os.listdir(data_path):
#    data.append(pd.read_csv(os.path.join(data_path, f)))
#
#data = pd.concat(data, axis=0, ignore_index=True).dropna()
#data["text"] = data["text"].astype(str)

# Tab-separated restaurant reviews with a "Review" text column and a binary
# "Liked" label column.
data_path = "/content/drive/MyDrive/Data/CS499 Final Project/Restaurant_Reviews.tsv"
data = pd.read_csv(data_path, sep="\t", engine="python").astype({"Review": str})

# Split by label; the label column is dropped because each split is
# single-class.
liked_mask = data["Liked"] == 1
disliked_mask = data["Liked"] == 0
pos_data = data.loc[liked_mask].drop(columns="Liked")
neg_data = data.loc[disliked_mask].drop(columns="Liked")



In [None]:
class GPT2Dataset(Dataset):
    """Pre-tokenized review texts for language-model fine-tuning.

    Every value of the ``"Review"`` column is wrapped in the
    ``<|startoftext|>`` / ``<|endoftext|>`` markers and tokenized once at
    construction time, truncated/padded to ``max_length``.

    Args:
        text_data: pandas DataFrame with a ``"Review"`` column.
        tokenizer: Callable tokenizer; it is invoked with the wrapped text and
            ``truncation``, ``max_length``, ``padding`` and ``return_tensors``
            keyword arguments and must return a mapping with ``"input_ids"``
            and ``"attention_mask"`` entries.
        gpt2_type: Unused; kept so existing callers keep working.
        max_length: Length every encoded text is truncated/padded to.
    """

    def __init__(self, text_data, tokenizer, gpt2_type="gpt2", max_length=768):
        self.text_data = text_data
        self.tokenizer = tokenizer
        self.input_ids = []
        self.attention_mask = []

        # Walk the column directly rather than building a row Series through
        # `iloc` for every position.
        for review in self.text_data["Review"]:
            framed = "<|startoftext|>" + str(review) + "<|endoftext|>"
            encodings = self.tokenizer(
                framed,
                truncation=True,
                max_length=max_length,
                padding="max_length",
                return_tensors="pt",
            )
            # Tensors stay on the CPU here; the training loop moves each
            # item to the target device.
            self.input_ids.append(encodings["input_ids"])
            self.attention_mask.append(encodings["attention_mask"])

    def __getitem__(self, index):
        """Return the raw text and its encoding for position ``index``.

        Raises:
            IndexError: If ``index`` is out of range. Plain ``for`` loops over
                the dataset rely on this to terminate.
        """
        return {
            "text": self.text_data["Review"].iloc[index],
            # presumably shape (1, max_length) with a real tokenizer, i.e.
            # already a batch of one -- TODO confirm
            "input_ids": self.input_ids[index],
            "attention_mask": self.attention_mask[index],
        }

    def __len__(self):
        """Return the number of rows in the underlying DataFrame."""
        return len(self.text_data)


# Tokenize both sentiment splits up front; each one is used to fine-tune its
# own model.
pos_dataset, neg_dataset = (
    GPT2Dataset(split, tokenizer) for split in (pos_data, neg_data)
)


In [None]:
# The tokenizer was extended with extra special tokens, so the embedding
# matrix has to grow to the tokenizer's size.
model.resize_token_embeddings(len(tokenizer))

# Optimisation hyperparameters.
epochs, learning_rate, warmup_steps, epsilon = 5, 5e-4, 1e2, 1e-8

# Intended interval (in steps) between printed sample generations.
sample_every = 100

# AdamW here is the Hugging Face implementation, not torch.optim's.
optimizer = AdamW(model.parameters(), lr=learning_rate, eps=epsilon)

# Schedule length: rows in the positive split times the epoch count
# (presumably one optimizer step per row -- confirm against the training cell).
total_steps = len(pos_data) * epochs

# Linear warm-up for `warmup_steps` steps, then linear decay until
# `total_steps`.
scheduler = get_linear_schedule_with_warmup(
    optimizer,
    num_warmup_steps=warmup_steps,
    num_training_steps=total_steps,
)


# Training

In [None]:
model.to(device)

# NOTE(review): the scheduler from the set-up cell is sized from `epochs = 5`
# and `len(pos_data)`, while this cell runs 3 epochs over `neg_dataset` --
# confirm the intended schedule length.
epochs = 3

# Every training text starts with <|startoftext|>, so samples are drawn by
# prompting with that same token instead of the model config's default BOS.
sample_prompt = torch.tensor([[tokenizer.bos_token_id]], device=device)

for epoch in range(epochs):
    model.train()

    # The dataset is iterated directly (no DataLoader): each item is treated
    # as one training batch.
    for batch_num, batch in enumerate(neg_dataset):
        batch_input_ids = batch["input_ids"].to(device)
        batch_masks = batch["attention_mask"].to(device)

        # Labels are the inputs themselves, but padding positions are set to
        # -100 so the loss ignores them. Otherwise the fixed-length padding
        # dominates the loss and the model mostly learns to emit <|pad|>.
        batch_labels = batch_input_ids.masked_fill(batch_masks == 0, -100)

        model.zero_grad()

        outputs = model(
            batch_input_ids,
            labels=batch_labels,
            attention_mask=batch_masks,
        )

        # With `labels` supplied, the first output is the LM loss.
        loss = outputs[0]

        if batch_num % sample_every == 0:
            print("Batch num:", batch_num)

            # Sample with dropout switched off, then return to train mode.
            # Only one sequence is requested because only one is printed.
            model.eval()
            sample_out = model.generate(
                sample_prompt,
                attention_mask=torch.ones_like(sample_prompt),
                do_sample=True,
                max_length=50,
                top_p=0.95,
                top_k=50,
                temperature=0.7,
                num_return_sequences=1,
            )
            model.train()

            print("Original:", batch["text"])
            print("Generated:", tokenizer.decode(sample_out[0], skip_special_tokens=True))
            print("Training Loss:", loss.item())
            print("END")

        loss.backward()

        optimizer.step()

        scheduler.step()

model.save_pretrained("/content/drive/MyDrive/Data/CS499 Final Project/negmodel")

Batch num: 0
Original: Crust is not good.
Generated: The owner of the restaurant was very nice and even brought two small boys and a baby back to eat there.
Training Loss: tensor(0.0458, device='cuda:0', grad_fn=<NllLossBackward>)
END
Batch num: 100
Original: Hopefully this bodes for them going out of business and someone who can cook can come in.
Generated:  food was not good.
Training Loss: tensor(0.1378, device='cuda:0', grad_fn=<NllLossBackward>)
END
Batch num: 200
Original: I am far from a sushi connoisseur but I can definitely tell the difference between good food and bad food and this was certainly bad food.
Generated:  and an extensive

I have no complaints.
Training Loss: tensor(0.1444, device='cuda:0', grad_fn=<NllLossBackward>)
END
Batch num: 300
Original: Your staff spends more time talking to themselves than me.
Generated:  food was lacking at the end.
Training Loss: tensor(0.0765, device='cuda:0', grad_fn=<NllLossBackward>)
END
Batch num: 400
Generated:  food is not good,

In [None]:

pos_model = GPT2LMHeadModel.from_pretrained("/content/drive/MyDrive/Data/CS499 Final Project/posmodel")
neg_model = GPT2LMHeadModel.from_pretrained("/content/drive/MyDrive/Data/CS499 Final Project/negmodel")


def _print_samples(lm, label, count=20):
    """Sample ``count`` reviews from ``lm`` and print them, numbered from 1.

    Args:
        lm: Fine-tuned language model to sample from.
        label: Word printed before "review:" (e.g. "Positive").
        count: Number of reviews to generate.
    """
    lm.eval()

    # Prompt with the <|startoftext|> marker that every training text begins
    # with, on whichever device the model lives on.
    prompt = torch.tensor([[tokenizer.bos_token_id]], device=lm.device)

    # One batched call yields all samples; the previous loop generated five
    # sequences per iteration and discarded four of them.
    outputs = lm.generate(
        prompt,
        max_length=25,
        do_sample=True,
        top_k=50,
        top_p=0.95,
        temperature=0.7,
        num_return_sequences=count,
        # Sequences that finish early are padded; pad tokens are stripped
        # again by skip_special_tokens below.
        pad_token_id=tokenizer.pad_token_id,
    )

    for i, sequence in enumerate(outputs):
        output = tokenizer.decode(sequence, skip_special_tokens=True)
        print(str(i + 1) + ". " + label + " review:", output)


_print_samples(pos_model, "Positive")
_print_samples(neg_model, "Negative")

1. Positive review: The food came out on time, great service, good food.
2. Positive review: I'm going to review this place twice - once hereas a tribute to the place and once as a place to
3. Positive review: I love the fact that their chips and salsa was so fresh and fresh.
4. Positive review: This place is pretty good.
5. Positive review: The Han Han Han Chicken was deliciously seasoned with a delicate touch and moist mouth.
6. Positive review: The pizza was absolutely amazing.
7. Positive review: The chef was very nice and even brought the guest out to his table.
8. Positive review: The nachos are a MUST HAVE!
9. Positive review: The Veggitarian Pita is a MUST HAVE!
10. Positive review: This is an unbelievable experience and I highly recommend it.
11. Positive review: The staff is super nice and efficient.
12. Positive review: I love the decor and the decor.
13. Positive review: I can assure you that you won be happy you did.
14. Positive review: This is an excellent restaurant by 