# Automatic Elon Musk: Generating Synthetic Tweets Using GPT2

GPT2 Finetuning using PyTorch and HuggingFace

In [1]:
!pip install transformers



In [2]:
import torch.nn as nn
import transformers
from torch.utils.data import DataLoader, Dataset, random_split, RandomSampler, SequentialSampler
import pandas as pd
import numpy as np
import random 
from google.colab import drive
import matplotlib.image as mpimg
import matplotlib.pyplot as plt

import transformers
from transformers import GPT2LMHeadModel,  GPT2Tokenizer, GPT2Config, GPT2LMHeadModel
from transformers import AdamW, get_linear_schedule_with_warmup
from transformers import Trainer, TrainingArguments

In [3]:
# empty cache to clear space for training
import torch
torch.cuda.empty_cache()

In [4]:
# mount drive
drive.mount('/gdrive')
drive_root = '/gdrive/My Drive/'

Drive already mounted at /gdrive; to attempt to forcibly remount, call drive.mount("/gdrive", force_remount=True).


In [5]:
%cd ..
%cd gdrive/MyDrive/elon_bot

/
/gdrive/MyDrive/elon_bot


In [6]:
# refer to elon_bot_lstm.ipynb 
init_tweets = pd.read_csv('tweets.csv')
init_tweets = init_tweets['0']

In [7]:
# ampersand bug
tweets = []
for tweet in init_tweets:
  tweets.append(tweet.replace('&amp', '&'))

tweets = pd.Series(tweets)

In [8]:
!pip install transformers



In [9]:
# define batch size and load in pretrained tokenizer
# adding bos, eos and pad token
BATCH_SIZE=4
tokenizer = GPT2Tokenizer.from_pretrained('gpt2', bos_token='<|startoftext|>', eos_token='<|endoftext|>', pad_token='<|pad|>') 

Special tokens have been added in the vocabulary, make sure the associated word embedding are fine-tuned or trained.


In [10]:
# define Dataset class
class TorchDataset(Dataset):

  def __init__(self, tweets, tokenizer, max_length):

    self.tokenizer = tokenizer
    self.input_ids = []
    self.attn_mask = []

    for tweet in tweets:
      # tokenizing on a word level
      # encodings is dictionary with two keys:
      # one are the tokenized inputs, and other are attention_masks (what to pay attention to)
      # all tweets are padded to length of 'max_length' w/ the pad token
      # padded tokens are defaulted w/ attention 0
      encodings = tokenizer('<|startoftext|>'+ tweet + '<|endoftext|>', truncation=True, max_length=max_length, padding="max_length")
      self.input_ids.append(torch.tensor(encodings['input_ids']))
      self.attn_mask.append(torch.tensor(encodings['attention_mask']))

  def __len__(self):
    return len(self.input_ids)

  def __getitem__(self, idx):
    return {'input_ids': self.input_ids[idx], 'attn_mask': self.attn_mask[idx]}

In [11]:
# train test split
dataset = TorchDataset(tweets, tokenizer, max_length=300)

TRAIN_SIZE = int(0.85 * len(dataset))
VAL_SIZE = len(dataset) - TRAIN_SIZE

train_ds, val_ds = random_split(dataset, [TRAIN_SIZE, VAL_SIZE])

In [12]:
# take look at one output
tokenizer.decode((list(train_ds))[0]['input_ids'])

'<|startoftext|> I never used this guy. He gave a talk at SpaceX once.<|endoftext|> <|pad|> <|pad|> <|pad|> <|pad|> <|pad|> <|pad|> <|pad|> <|pad|> <|pad|> <|pad|> <|pad|> <|pad|> <|pad|> <|pad|> <|pad|> <|pad|> <|pad|> <|pad|> <|pad|> <|pad|> <|pad|> <|pad|> <|pad|> <|pad|> <|pad|> <|pad|> <|pad|> <|pad|> <|pad|> <|pad|> <|pad|> <|pad|> <|pad|> <|pad|> <|pad|> <|pad|> <|pad|> <|pad|> <|pad|> <|pad|> <|pad|> <|pad|> <|pad|> <|pad|> <|pad|> <|pad|> <|pad|> <|pad|> <|pad|> <|pad|> <|pad|> <|pad|> <|pad|> <|pad|> <|pad|> <|pad|> <|pad|> <|pad|> <|pad|> <|pad|> <|pad|> <|pad|> <|pad|> <|pad|> <|pad|> <|pad|> <|pad|> <|pad|> <|pad|> <|pad|> <|pad|> <|pad|> <|pad|> <|pad|> <|pad|> <|pad|> <|pad|> <|pad|> <|pad|> <|pad|> <|pad|> <|pad|> <|pad|> <|pad|> <|pad|> <|pad|> <|pad|> <|pad|> <|pad|> <|pad|> <|pad|> <|pad|> <|pad|> <|pad|> <|pad|> <|pad|> <|pad|> <|pad|> <|pad|> <|pad|> <|pad|> <|pad|> <|pad|> <|pad|> <|pad|> <|pad|> <|pad|> <|pad|> <|pad|> <|pad|> <|pad|> <|pad|> <|pad|> <|pad|> <|pa

In [13]:
# DataLoader similar to tf dataset
# train_ds = train_ds.shuffle(BUFFER_SIZE).batch(BATCH_SIZE) 
# for train, shuffle and batch randomly
train_dataloader = DataLoader(train_ds, 
                              sampler = RandomSampler(train_ds), 
                              batch_size = BATCH_SIZE)

# for validation can just batch sequentially.
val_dataloader = DataLoader(val_ds,
            sampler = SequentialSampler(val_ds),
            batch_size = BATCH_SIZE)

In [14]:
from transformers import GPT2Tokenizer, TFGPT2Model
configuration = GPT2Config.from_pretrained('gpt2', output_hidden_states=False)

# load pretrained model
model = GPT2LMHeadModel.from_pretrained("gpt2", config=configuration)

# added bos_token and eos_token to embeddings
# need to resize otherwise tokenizer and model tensors won't match up
model.resize_token_embeddings(len(tokenizer))

device = torch.device("cuda")
model.cuda()

# reproducability
random.seed(0)
np.random.seed(0)
torch.manual_seed(0)
torch.cuda.manual_seed_all(0)

In [15]:
warmup_steps = 100
epochs = 2
# optmize model paramters with AdamW
optimizer = AdamW(model.parameters(), lr = 0.001)

In [16]:
total_steps = len(train_dataloader) * epochs

# change learning rate as throughout training loop
scheduler = get_linear_schedule_with_warmup(optimizer, 
                                            num_warmup_steps = warmup_steps, 
                                            num_training_steps = total_steps)

In [17]:
def train_epoch(model, dataloader, optimizer):
  sample_every = 200
  total_train_loss = 0
  model.train()
  for step, batch in enumerate(dataloader):

    b_input_ids = batch['input_ids'].to(device)
    b_labels = batch['input_ids'].to(device)
    b_masks = batch['attn_mask'].to(device)

    # zero gradients after each batch 
    # https://stackoverflow.com/questions/48001598/why-do-we-need-to-call-zero-grad-in-pytorch 
    model.zero_grad()        

    outputs = model(  b_input_ids,
                      labels=b_labels, 
                      attention_mask = b_masks,
                      token_type_ids=None
                    )

    loss = outputs[0]  
    batch_loss = loss.item()
    total_train_loss += batch_loss

    # generate sentence evey n sample steps
    if step % sample_every == 0 and not step == 0:

        model.eval()

        # https://huggingface.co/blog/how-to-generate
        # top k: keep only top k words in distribution, and redistribute distribution to those words
        # top p: filter words that exceed a probability p, then redistribute (dynamic!)
        sample_outputs = model.generate(
                                bos_token_id=random.randint(1,30000),
                                do_sample=True,   
                                top_k=50, 
                                max_length = 200,
                                top_p=0.95, 
                                num_return_sequences=1
                            )
        for i, sample_output in enumerate(sample_outputs):
              print("{}: {}".format(i, tokenizer.decode(sample_output, skip_special_tokens=True)))
        
        model.train()

    # calc loss gradients
    loss.backward()
    # update
    optimizer.step()
    scheduler.step()

  # average loss
  return total_train_loss / len(dataloader)       

In [18]:
def eval_epoch(model, dataloader):
  model.eval()
  val_loss = 0

  # evaluate data for one epoch
  for batch in dataloader:
      
      b_input_ids = batch['input_ids'].to(device)
      b_labels = batch['input_ids'].to(device)
      b_masks = batch['attn_mask'].to(device)
      
      # disable gradient calculation for evaluation
      with torch.no_grad():        

          outputs  = model(b_input_ids, attention_mask = b_masks, labels=b_labels)
          loss = outputs[0]  
          
      batch_loss = loss.item()
      val_loss += batch_loss        

  return val_loss / len(dataloader)

In [19]:
# train and generate every 200 steps
model = model.to(device)

for epoch_i in range(0, epochs):

    print("")
    print('======== Epoch {:} / {:} ========'.format(epoch_i + 1, epochs))
    train_loss = train_epoch(model, train_dataloader, optimizer)

    print("")
    print("  Average training loss: {0:.2f}".format(train_loss))

    val_loss = eval_epoch(model, val_dataloader)
    print("  Validation Loss: {0:.2f}".format(val_loss))




Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


0:  invitesThere is a long lasting lesson in a few weeks


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


0:  CCTesla Model X, in Japan.


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


0:  barbarAnd, as soon as we get the upper stage (or the main engine) at night we will be at the top.


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


0: ramerThat's why. It’s like a Tesla. Aiming at real, product vs market, but just an increase in production.

  Average training loss: 0.45
  Validation Loss: 0.24



Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


0:  NeuroIt is better to be good as a product than as a service.


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


0:  hardSpaceX is trying to make Falcon 9 reusable by end of the year, but we still need 3. Starship has a rocket?


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


0:  enthusI will be reviewing the latest Tesla Model S/X/X SUV unveil on Tesla website every Monday at 3pm California time. Also a few things to clarify to all that's coming.


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


0:  BryantTbh like that

  Average training loss: 0.18
  Validation Loss: 0.24


In [20]:
# print out text
model.eval()
# start string of start token
prompt = "<|startoftext|>"

generated = torch.tensor(tokenizer.encode(prompt)).unsqueeze(0)
generated = generated.to(device)

print(generated)

sample_outputs = model.generate(
                                generated, 
                                do_sample=True,   
                                top_k=50, 
                                max_length = 300,
                                num_return_sequences=3
                                )

for i, sample_output in enumerate(sample_outputs):
  print("{}: {}\n\n".format(i, tokenizer.decode(sample_output, skip_special_tokens=True)))

Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


tensor([[50257]], device='cuda:0')
0: Thanks, we can't wait to see thousands of Model 3 cars rolling out at launch. Launch dates would be April 1st and April 31st.


1: Dogecoin is coming all over again. Selling at this rate, so much more incentive flow to incent purchase.


2: yes




## References:


*   HuggingFace https://huggingface.co/transformers/model_doc/gpt2.html
*   Rey Farhan http://reyfarhan.com/posts/easy-gpt2-finetuning-huggingface/ (heavily inspired and sourced from his tutorial)
*   PyTorch Datasets https://pytorch.org/tutorials/beginner/data_loading_tutorial.html

