# Libraries

In [2]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import torch
import logging
from tqdm import tqdm
import math
import argparse
import os

## Weights and Biases Integration

In [None]:
! pip install wandb

In [None]:
import wandb
wandb.init()

In [None]:
# Log in to your W&B account
wandb.login()

In [None]:
wandb.watch(model, log_freq=100)

# Import transformers 

In [3]:
!git clone https://github.com/huggingface/transformers
!pip install transformers/
from transformers import GPT2Tokenizer, GPT2LMHeadModel
from transformers.optimization import AdamW, get_linear_schedule_with_warmup

Cloning into 'transformers'...
remote: Enumerating objects: 88534, done.[K
remote: Counting objects: 100% (387/387), done.[K
remote: Compressing objects: 100% (235/235), done.[K
remote: Total 88534 (delta 178), reused 276 (delta 115), pack-reused 88147[K
Receiving objects: 100% (88534/88534), 71.56 MiB | 14.77 MiB/s, done.
Resolving deltas: 100% (63678/63678), done.
Processing ./transformers
[33m  DEPRECATION: A future pip version will change local packages to be built in-place without first copying to a temporary directory. We recommend you use --use-feature=in-tree-build to test your packages with this new behavior before it becomes the default.
   pip 21.3 will remove support for this functionality. You can find discussion regarding this at https://github.com/pypa/pip/issues/7555.[0m
  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
    Preparing wheel metadata ... [?25l[?25hdone
Collecting huggingface-hub<1.0,>=0

#Getting data

In [4]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


# Cleaning text and generating train and test dataset

In [5]:
DATAPATH='/content/drive/MyDrive/CMPE_297_Project/story-text'
def combinetext(prompt, story):
    fp=open(os.path.join(DATAPATH,prompt),encoding='utf8')
    fs=open(os.path.join(DATAPATH,story),encoding='utf8')
    prompts=fp.readlines()
    stories=fs.readlines()
    assert len(prompts)==len(stories)
    combine=[]
    for i in range(len(prompts)):
        combine.append(prompts[i].rstrip()+' <sep> '+" ".join(stories[i].split()[:300]))
    return combine

#do a littel text clean with punctuations
def cleanpunctuation(s):
    for p in '!,.:;?':
        s=s.replace(' '+p,p)
    s=s.replace(' '+'n\'t','n\'t')
    s=s.replace(' '+'\'s','\'s')
    s=s.replace(' '+'\'re','\'re')
    s=s.replace(' '+'\'ve','\'ve')
    s=s.replace(' '+'\'ll','\'ll')
    s=s.replace(' '+'\'am','\'am')
    s=s.replace(' '+'\'m','\'m')
    s=s.replace(' '+'\' m','\'m')
    s=s.replace(' '+'\'m','\'m')
    s=s.replace(' '+'\' ve','\'ve')
    s=s.replace(' '+'\' s','\'s')
    s=s.replace('<newline>','\n')
    return s   

train_text=combinetext('valid.wp_source', 'valid.wp_target')
train_text=list(map(cleanpunctuation,train_text))
valid_text=combinetext('test.wp_source', 'test.wp_target')
valid_text=list(map(cleanpunctuation,valid_text))

# Sample train data

In [6]:
train_text[6]

"[ WP ] Everyone in the world has magic with various levels of mastery over it. You are extremely powerful with almost no control so you find a demon that's very weak but extremely good at controlling his powers. <sep> `` Imagine you're in a field. '' Green extends in all directions. `` You're alone, the earth is flat, and the blue sky touches the horizon. '' Blue shoots from the ground, arcing overhead. `` The sun appears, tiny in the sky. '' There's a bright light, rays casting shadow behind me. `` What color is it? '' \n \n `` Yellow. '' It burns so brightly, winking playfully. \n \n `` Good. '' She licks her chapped lips, the sound distorting my tiny sun's light. `` Look ahead of you. There's a sheep. '' Something soft and downy wanders across the green, its shadow stretching far beyond the horizon. `` What color is it? '' \n \n My brows crease. `` Uh- '' \n \n `` What color is it? '' \n \n The green wavers. Baa baa black sheep, have you any wool? `` Uh. '' Mary had a little lamb, 

# Tokenizer and GPT model

In [7]:
tokenizer = GPT2Tokenizer.from_pretrained('gpt2')
tokenizer.pad_token=tokenizer.eos_token

inputs_train = tokenizer(train_text, padding=True,truncation=True,max_length=512)
inputs_valid=tokenizer(valid_text, padding=True,truncation=True,max_length=512)

Downloading:   0%|          | 0.00/0.99M [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/446k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/1.29M [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/665 [00:00<?, ?B/s]

# Masking

In [8]:
def create_labels(inputs):
    labels=[]
    for ids,attention_mask in zip(inputs['input_ids'],inputs['attention_mask']):
        label=ids.copy()
        real_len=sum(attention_mask)
        padding_len=len(attention_mask)-sum(attention_mask)
        label[:]=label[:real_len]+[-100]*padding_len
        labels.append(label)
    inputs['labels']=labels
    
create_labels(inputs_train)
create_labels(inputs_valid)

In [9]:
print(inputs_train['input_ids'][6])
print(inputs_train['attention_mask'][6])
print(inputs_train['labels'][6])

[58, 28993, 2361, 11075, 287, 262, 995, 468, 5536, 351, 2972, 2974, 286, 30677, 625, 340, 13, 921, 389, 4457, 3665, 351, 2048, 645, 1630, 523, 345, 1064, 257, 3222, 326, 338, 845, 4939, 475, 4457, 922, 379, 12755, 465, 5635, 13, 1279, 325, 79, 29, 7559, 18450, 345, 821, 287, 257, 2214, 13, 10148, 3469, 14582, 287, 477, 11678, 13, 7559, 921, 821, 3436, 11, 262, 4534, 318, 6228, 11, 290, 262, 4171, 6766, 18105, 262, 17810, 13, 10148, 4518, 20611, 422, 262, 2323, 11, 610, 2259, 16965, 13, 7559, 383, 4252, 3568, 11, 7009, 287, 262, 6766, 13, 10148, 1318, 338, 257, 6016, 1657, 11, 24823, 13092, 9082, 2157, 502, 13, 7559, 1867, 3124, 318, 340, 30, 10148, 220, 198, 220, 198, 7559, 12550, 13, 10148, 632, 20246, 523, 35254, 11, 266, 8040, 711, 2759, 13, 220, 198, 220, 198, 7559, 4599, 13, 10148, 1375, 300, 3378, 607, 442, 6320, 11914, 11, 262, 2128, 1233, 24707, 616, 7009, 4252, 338, 1657, 13, 7559, 6803, 4058, 286, 345, 13, 1318, 338, 257, 15900, 13, 10148, 13742, 2705, 290, 866, 88, 11569, 36

# Creating dataset

In [10]:
class StoryDataset:
    def __init__(self, inputs):
        self.ids = inputs['input_ids']
        self.attention_mask = inputs['attention_mask']
        self.labels=inputs['labels']

    def __len__(self):
        return len(self.ids)

    def __getitem__(self, item):

        return [torch.tensor(self.ids[item], dtype=torch.long),
                torch.tensor(self.attention_mask[item], dtype=torch.long),
                torch.tensor(self.labels[item], dtype=torch.long)]

In [11]:
train_batch_size=4
valid_batch_size=4
traindata=StoryDataset(inputs_train)
train_dataloader = torch.utils.data.DataLoader(
    traindata,
    shuffle=False,
    batch_size=train_batch_size)

validdata=StoryDataset(inputs_valid)
valid_dataloader = torch.utils.data.DataLoader(
    validdata,
    shuffle=False,
    batch_size=valid_batch_size)

# Downloading pretrained model

In [12]:
model = GPT2LMHeadModel.from_pretrained('gpt2')

Downloading:   0%|          | 0.00/523M [00:00<?, ?B/s]

# Ecvaluating on validation dataset

In [13]:
model.to('cuda')
model.eval()
eval_loss=[]
for inputs in tqdm(valid_dataloader, desc="eval"):
    d1,d2,d3=inputs
    d1=d1.to('cuda')        
    d2=d2.to('cuda')
    d3=d3.to('cuda')

    with torch.no_grad():
        output = model(input_ids=d1, attention_mask=d2,labels=d3)
        batch_loss=output[0]
    eval_loss+=[batch_loss.cpu().item()]
    del batch_loss
eval_loss=np.mean(eval_loss)
perplexity=math.exp(eval_loss)
print(f'The average perplexity for valid dataset before fine-tuning is {perplexity}') 

eval: 100%|██████████| 3785/3785 [07:21<00:00,  8.58it/s]

The average perplexity for valid dataset before fine-tuning is 39.2788028702746





# Function to generate story from prompt

In [14]:
prompt=valid_text[300][:valid_text[300].find('<sep>')]
target=valid_text[300][valid_text[300].find('<sep>')+5:]

def generate_story(prompt,target,k=0,p=0.9,output_length=300,temperature=1,num_return_sequences=3,repetition_penalty=1.0):
    print("====prompt====\n")
    print(prompt+"\n")
    print('====target story is as below===\n')
    print(target+"\n")
    encoded_prompt = tokenizer.encode(prompt, add_special_tokens=False, return_tensors="pt")
    model.to('cpu')
    model.eval()
    output_sequences = model.generate(
        input_ids=encoded_prompt,
        max_length=output_length,
        temperature=temperature,
        top_k=k,
        top_p=p,
        repetition_penalty=repetition_penalty,
        do_sample=True,
        num_return_sequences=num_return_sequences
    )
    if len(output_sequences.shape) > 2:
        output_sequences.squeeze_()
    for generated_sequence_idx, generated_sequence in enumerate(output_sequences):
        print("=== GENERATED SEQUENCE {} ===".format(generated_sequence_idx + 1))
        generated_sequence = generated_sequence.tolist()
        # Decode text
        text = tokenizer.decode(generated_sequence, clean_up_tokenization_spaces=True)
        # Remove all text after eos token
        text = text[: text.find(tokenizer.eos_token)]
        print(text)

generate_story(prompt,target)

Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


====prompt====

Children's logic dictates the way the world works. [ WP ] 

====target story is as below===

 “ That ’ s not an option I ’ m currently willing to exercise. ” 
 
 I pinch the bridge of my nose to stave off the headache building behind my eyes. If this goes on much longer, I ’ m gon na have to start to start cutting back on the vegetables. 
 
 “ She ’ s dangerous, Jimmy. You know that. You ’ ve seen it. Dealt with it first hand. She just doesn ’ t play by anyone ’ s rules. ” 
 
 Ali finished off her sucker and unwrapped a fresh one, offering it to me. I declined. I ’ d sworn off the things after my third cavity scare. That one saw me at the dentist for the third time in as many months. I don ’ t care what my dad says, I know that guy is evil. Who owns a drill like that? A murderer, that ’ s who. I still hear the damn thing in my nightmares. 
 
 While she savored the smooth flavor of blue-raspberry, I pondered her words. We both knew she was right. The situation was spiral

# Hyperparamters

In [15]:
lr = 5e-5
epochs = 1
warmup = 0.1

# Training model

In [16]:
num_train_epochs = epochs
training_steps_per_epoch=len(train_dataloader)
total_num_training_steps = int(training_steps_per_epoch*num_train_epochs)
weight_decay=0
learning_rate=lr
adam_epsilon=1e-8
warmup_steps=int(total_num_training_steps*warmup)
no_decay = ["bias", "LayerNorm.weight"]
optimizer_grouped_parameters = [
    {
        "params": [p for n, p in model.named_parameters() if not any(nd in n for nd in no_decay)],
        "weight_decay": weight_decay,
    },
    {
        "params": [p for n, p in model.named_parameters() if any(nd in n for nd in no_decay)],
        "weight_decay": 0.0,
    },
]
optimizer = AdamW(optimizer_grouped_parameters, lr=learning_rate, eps=adam_epsilon)
scheduler = get_linear_schedule_with_warmup(
    optimizer, num_warmup_steps=warmup_steps, num_training_steps=total_num_training_steps
)

In [17]:
print("***** Running training *****")
print("  Total_num_training_step = {}".format(total_num_training_steps))
print("  Num Epochs = {}".format(num_train_epochs))
print(f"  Train_batch_size per device = {train_batch_size}")
print(f"  Valid_batch_size per device = {valid_batch_size}")
model.to('cuda')
for epoch in range(num_train_epochs):
    print(f"Start epoch{epoch+1} of {num_train_epochs}")
    train_loss=0
    epoch_iterator = tqdm(train_dataloader,desc='Iteration')
    model.train()
    model.zero_grad()    
    for _, inputs in enumerate(epoch_iterator):        
        d1,d2,d3=inputs
        d1=d1.to('cuda')
        d2=d2.to('cuda')
        d3=d3.to('cuda')
        output = model(input_ids=d1, attention_mask=d2,labels=d3)
        batch_loss=output[0]
        batch_loss.backward()
        optimizer.step()
        scheduler.step()
        model.zero_grad()
        train_loss+=batch_loss.item()
        epoch_iterator.set_description('(batch loss=%g)' % batch_loss.item())
        del batch_loss
    avg_train_loss = train_loss/training_steps_per_epoch
    for i in range (10):
        wandb.log({"train_loss": avg_train_loss})
    print(f'Average train loss per example={avg_train_lossh} in epoch{epoch+1}')    
    print(f'Starting evaluate after epoch {epoch+1}')
    eval_loss=[]    
    model.eval()    
    for inputs in tqdm(valid_dataloader, desc="eval"):
        d1,d2,d3=inputs
        d1=d1.to('cuda')        
        d2=d2.to('cuda')
        d3=d3.to('cuda')
        with torch.no_grad():
            output = model(input_ids=d1, attention_mask=d2,labels=d3)
            batch_loss=output[0]
        eval_loss+=[batch_loss.cpu().item()]
        del batch_loss
    eval_loss=np.mean(eval_loss)
    perplexity=math.exp(eval_loss)
    for i in range (10):
        wandb.log({"eval_loss": eval_loss})
    print(f'Average valid loss per example={eval_loss} in epoch{epoch+1}')    
    print(f'Perplextiy for valid dataset in epoch{epoch+1} is {perplexity}')

***** Running training *****
  Total_num_training_step = 3905
  Num Epochs = 1
  Train_batch_size per device = 4
  Valid_batch_size per device = 4
Start epoch1 of 1


(batch loss=2.8727): 100%|██████████| 3905/3905 [22:35<00:00,  2.88it/s]


Average train loss per example=3.284487036400965 in epoch1
Starting evaluate after epoch 1


eval: 100%|██████████| 3785/3785 [07:21<00:00,  8.58it/s]

Average valid loss per example=3.18296861717849 in epoch1
Perplextiy for valid dataset in epoch1 is 24.118245220941436





### Complete Wandb training

In [None]:
wandb.finish()

# Testing

In [18]:
prompt=valid_text[300][:valid_text[300].find('<sep>')]
target=valid_text[300][valid_text[300].find('<sep>')+5:]
generate_story(prompt,target)

====prompt====

Children's logic dictates the way the world works. [ WP ] 

====target story is as below===

 “ That ’ s not an option I ’ m currently willing to exercise. ” 
 
 I pinch the bridge of my nose to stave off the headache building behind my eyes. If this goes on much longer, I ’ m gon na have to start to start cutting back on the vegetables. 
 
 “ She ’ s dangerous, Jimmy. You know that. You ’ ve seen it. Dealt with it first hand. She just doesn ’ t play by anyone ’ s rules. ” 
 
 Ali finished off her sucker and unwrapped a fresh one, offering it to me. I declined. I ’ d sworn off the things after my third cavity scare. That one saw me at the dentist for the third time in as many months. I don ’ t care what my dad says, I know that guy is evil. Who owns a drill like that? A murderer, that ’ s who. I still hear the damn thing in my nightmares. 
 
 While she savored the smooth flavor of blue-raspberry, I pondered her words. We both knew she was right. The situation was spiral

Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


=== GENERATED SEQUENCE 1 ===
Children's logic dictates the way the world works. [ WP ] 
 - You are the greatest body in the universe. You have more life, more intelligence, more power and ability than any other human. You are immortal. All of the previous power the world had, you are now barely immortal. <sep> `` No, 'keeper *Jesus*! '' The button clicked. Steve immediately tightened his grip. He didn't want to be trapped in this body; he was a body in control, a body that only enjoyed controlling a few tiny hairs on the back of his neck. 
 Steve opened the door and leaned in, resting his hand on the brass knob of his desk. He pulled down the scotch. It was heavy, and every nerve that ran through it was squished into a tiny concave hole. So he sat down. `` Welcome to earth. '' He commanded and Steve glanced at him. 
 
 `` Why is God chosen for it? Why not.... '' 
 
 Steve looked up at him, holding his hand open. `` Maybe he doesn't like me. '' 
 
 The words came out of Steve's mouth. `