# Automatic Elon Musk: Generating Synthetic Tweets Using GPT2

GPT2 Finetuning using PyTorch and HuggingFace

In [1]:
!pip install transformers

Collecting transformers
[?25l  Downloading https://files.pythonhosted.org/packages/50/0c/7d5950fcd80b029be0a8891727ba21e0cd27692c407c51261c3c921f6da3/transformers-4.1.1-py3-none-any.whl (1.5MB)
[K     |████████████████████████████████| 1.5MB 12.7MB/s 
Collecting sacremoses
[?25l  Downloading https://files.pythonhosted.org/packages/7d/34/09d19aff26edcc8eb2a01bed8e98f13a1537005d31e95233fd48216eed10/sacremoses-0.0.43.tar.gz (883kB)
[K     |████████████████████████████████| 890kB 51.7MB/s 
Collecting tokenizers==0.9.4
[?25l  Downloading https://files.pythonhosted.org/packages/0f/1c/e789a8b12e28be5bc1ce2156cf87cb522b379be9cadc7ad8091a4cc107c4/tokenizers-0.9.4-cp36-cp36m-manylinux2010_x86_64.whl (2.9MB)
[K     |████████████████████████████████| 2.9MB 54.4MB/s 
Building wheels for collected packages: sacremoses
  Building wheel for sacremoses (setup.py) ... [?25l[?25hdone
  Created wheel for sacremoses: filename=sacremoses-0.0.43-cp36-none-any.whl size=893261 sha256=8f707a3063687f9269

In [2]:
import torch.nn as nn
import transformers
from torch.utils.data import DataLoader, Dataset, random_split, RandomSampler, SequentialSampler
import pandas as pd
import numpy as np
import random 
from google.colab import drive
import matplotlib.image as mpimg
import matplotlib.pyplot as plt

import transformers
from transformers import GPT2LMHeadModel,  GPT2Tokenizer, GPT2Config, GPT2LMHeadModel
from transformers import AdamW, get_linear_schedule_with_warmup
from transformers import Trainer, TrainingArguments

In [3]:
# empty cache to clear space for training
import torch
torch.cuda.empty_cache()

In [4]:
# mount drive
drive.mount('/gdrive')
drive_root = '/gdrive/My Drive/'

Mounted at /gdrive


In [5]:
%cd ..
%cd gdrive/MyDrive/elon_bot

/
/gdrive/MyDrive/elon_bot


In [6]:
# refer to elon_bot_lstm.ipynb 
init_tweets = pd.read_csv('tweets.csv')
init_tweets = init_tweets['0']

In [7]:
# ampersand bug
tweets = []
for tweet in init_tweets:
  tweets.append(tweet.replace('&amp', '&'))

tweets = pd.Series(tweets)

In [8]:
!pip install transformers



In [9]:
# define batch size and load in pretrained tokenizer
# adding bos, eos and pad token
BATCH_SIZE=4
tokenizer = GPT2Tokenizer.from_pretrained('gpt2', bos_token='<|startoftext|>', eos_token='<|endoftext|>', pad_token='<|pad|>') 

HBox(children=(FloatProgress(value=0.0, description='Downloading', max=1042301.0, style=ProgressStyle(descript…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=456318.0, style=ProgressStyle(descripti…

Special tokens have been added in the vocabulary, make sure the associated word embedding are fine-tuned or trained.





In [10]:
# define Dataset class
class TorchDataset(Dataset):

  def __init__(self, tweets, tokenizer, max_length):

    self.tokenizer = tokenizer
    self.input_ids = []
    self.attn_mask = []

    for tweet in tweets:
      # tokenizing on a word level
      # encodings is dictionary with two keys:
      # one are the tokenized inputs, and other are attention_masks (what to pay attention to)
      # all tweets are padded to length of 'max_length' w/ the pad token
      # padded tokens are defaulted w/ attention 0
      encodings = tokenizer('<|startoftext|>'+ tweet + '<|endoftext|>', truncation=True, max_length=max_length, padding="max_length")
      self.input_ids.append(torch.tensor(encodings['input_ids']))
      self.attn_mask.append(torch.tensor(encodings['attention_mask']))

  def __len__(self):
    return len(self.input_ids)

  def __getitem__(self, idx):
    return self.input_ids[idx], self.attn_mask[idx] 

In [11]:
# train test split
dataset = TorchDataset(tweets, tokenizer, max_length=300)

TRAIN_SIZE = int(0.85 * len(dataset))
VAL_SIZE = len(dataset) - TRAIN_SIZE

train_ds, val_ds = random_split(dataset, [TRAIN_SIZE, VAL_SIZE])

In [12]:
# take look at one output
tokenizer.decode((list(train_ds))[0][0])

'<|startoftext|> Token of appreciation for those who lined up coming via mail. Thought maybe 20-30 people per store would line up, not 800. Gifts on order.<|endoftext|> <|pad|> <|pad|> <|pad|> <|pad|> <|pad|> <|pad|> <|pad|> <|pad|> <|pad|> <|pad|> <|pad|> <|pad|> <|pad|> <|pad|> <|pad|> <|pad|> <|pad|> <|pad|> <|pad|> <|pad|> <|pad|> <|pad|> <|pad|> <|pad|> <|pad|> <|pad|> <|pad|> <|pad|> <|pad|> <|pad|> <|pad|> <|pad|> <|pad|> <|pad|> <|pad|> <|pad|> <|pad|> <|pad|> <|pad|> <|pad|> <|pad|> <|pad|> <|pad|> <|pad|> <|pad|> <|pad|> <|pad|> <|pad|> <|pad|> <|pad|> <|pad|> <|pad|> <|pad|> <|pad|> <|pad|> <|pad|> <|pad|> <|pad|> <|pad|> <|pad|> <|pad|> <|pad|> <|pad|> <|pad|> <|pad|> <|pad|> <|pad|> <|pad|> <|pad|> <|pad|> <|pad|> <|pad|> <|pad|> <|pad|> <|pad|> <|pad|> <|pad|> <|pad|> <|pad|> <|pad|> <|pad|> <|pad|> <|pad|> <|pad|> <|pad|> <|pad|> <|pad|> <|pad|> <|pad|> <|pad|> <|pad|> <|pad|> <|pad|> <|pad|> <|pad|> <|pad|> <|pad|> <|pad|> <|pad|> <|pad|> <|pad|> <|pad|> <|pad|> <|pad|>

In [13]:
# DataLoader similar to shuffle and batch for tfdata
# train_ds = train_ds.shuffle(BUFFER_SIZE).batch(BATCH_SIZE) 
# for train, shuffle and batch randomly
train_dataloader = DataLoader(train_ds, 
                              sampler = RandomSampler(train_ds), 
                              batch_size = BATCH_SIZE)

# for validation can just batch sequentially.
val_dataloader = DataLoader(val_ds,
            sampler = SequentialSampler(val_ds),
            batch_size = BATCH_SIZE)

In [14]:
from transformers import GPT2Tokenizer, TFGPT2Model
configuration = GPT2Config.from_pretrained('gpt2', output_hidden_states=False)

# load pretrained model
model = GPT2LMHeadModel.from_pretrained("gpt2", config=configuration)

# added bos_token and eos_token to embeddings
# need to resize otherwise tokenizer and model tensors won't match up
model.resize_token_embeddings(len(tokenizer))

device = torch.device("cuda")
model.cuda()

# reproducability
random.seed(0)
np.random.seed(0)
torch.manual_seed(0)
torch.cuda.manual_seed_all(0)

HBox(children=(FloatProgress(value=0.0, description='Downloading', max=665.0, style=ProgressStyle(description_…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=548118077.0, style=ProgressStyle(descri…




In [15]:
warmup_steps = 100
epochs = 2
# optmize model paramters with AdamW
optimizer = AdamW(model.parameters(), lr = 0.001)

In [16]:
total_steps = len(train_dataloader) * epochs

# change learning rate as throughout training loop
scheduler = get_linear_schedule_with_warmup(optimizer, 
                                            num_warmup_steps = warmup_steps, 
                                            num_training_steps = total_steps)

In [17]:
# train and generate every 200 steps

sample_every = 200
training_stats = []
model = model.to(device)

for epoch_i in range(0, epochs):

    print("")
    print('======== Epoch {:} / {:} ========'.format(epoch_i + 1, epochs))
    total_train_loss = 0
    model.train()

    for step, batch in enumerate(train_dataloader):

        b_input_ids = batch[0].to(device)
        b_labels = batch[0].to(device)
        b_masks = batch[1].to(device)

        # zero gradients after each batch 
        # great read: https://stackoverflow.com/questions/48001598/why-do-we-need-to-call-zero-grad-in-pytorch 
        model.zero_grad()        

        outputs = model(  b_input_ids,
                          labels=b_labels, 
                          attention_mask = b_masks,
                          token_type_ids=None
                        )

        loss = outputs[0]  
        batch_loss = loss.item()
        total_train_loss += batch_loss

        # generate sentence evey n sample steps
        if step % sample_every == 0 and not step == 0:

            model.eval()

            # https://huggingface.co/blog/how-to-generate
            # top k: keep only top k words in distribution, and redistribute distribution to those words
            # top p: filter words that exceed a probability p, then redistribute (dynamic!)
            sample_outputs = model.generate(
                                    bos_token_id=random.randint(1,30000),
                                    do_sample=True,   
                                    top_k=50, 
                                    max_length = 200,
                                    top_p=0.95, 
                                    num_return_sequences=1
                                )
            for i, sample_output in enumerate(sample_outputs):
                  print("{}: {}".format(i, tokenizer.decode(sample_output, skip_special_tokens=True)))
            
            model.train()

        # calc loss gradients
        loss.backward()
        # update
        optimizer.step()
        scheduler.step()

    # average loss
    train_loss = total_train_loss / len(train_dataloader)       

    print("")
    print("  Average training loss: {0:.2f}".format(train_loss))

    model.eval()
    val_loss = 0

    # evaluate data for one epoch
    for batch in val_dataloader:
        
        b_input_ids = batch[0].to(device)
        b_labels = batch[0].to(device)
        b_masks = batch[1].to(device)
        
        # disable gradient calculation
        with torch.no_grad():        

            outputs  = model(b_input_ids, attention_mask = b_masks, labels=b_labels)
            loss = outputs[0]  
            
        batch_loss = loss.item()
        val_loss += batch_loss        

    avg_val_loss = val_loss / len(val_dataloader)

    print("  Validation Loss: {0:.2f}".format(avg_val_loss))




Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


0:  invitesYeah


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


0:  CCA lot of work is needed for this to develop, so we are working very hard in engineering


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


0:  barbarNo need to use Model 3 as an electric car :)


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


0: ramerA few days after I read the article, I found the wrong link

  Average training loss: 0.34
  Validation Loss: 0.23



Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


0:  NeuroHighly recommend, but a slow release would launch tomorrow. Will feel right on your palate right away.


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


0:  hardTesla is $2X less than other companies, says SESLA contract and company should keep up. We are all about doing better.


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


0:  enthusModel 3 orders online at time of order. No prior engineering experience required.


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


0:  BryantModel X at 1100C (55 ft)

  Average training loss: 0.18
  Validation Loss: 0.24


In [19]:
# fun part, print out text
model.eval()
# start string of start token
prompt = "<|startoftext|>"

generated = torch.tensor(tokenizer.encode(prompt)).unsqueeze(0)
generated = generated.to(device)

print(generated)

sample_outputs = model.generate(
                                generated, 
                                do_sample=True,   
                                top_k=50, 
                                max_length = 300,
                                num_return_sequences=3
                                )

for i, sample_output in enumerate(sample_outputs):
  print("{}: {}\n\n".format(i, tokenizer.decode(sample_output, skip_special_tokens=True)))

Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


tensor([[50257]], device='cuda:0')
0: Thanks for the great experience we have with Model 3


1: I would recommend placing your faith in the Tesla Model S


2: Just wanted to say thanks to everyone who sent in their suggestions, suggestions & criticisms. Love :)




## References:


*   HuggingFace https://huggingface.co/transformers/model_doc/gpt2.html
*   Rey Farhan http://reyfarhan.com/posts/easy-gpt2-finetuning-huggingface/ (heavily inspired and sourced from his tutorial)
*   PyTorch Datasets https://pytorch.org/tutorials/beginner/data_loading_tutorial.html

