<a href="https://colab.research.google.com/github/vir-k01/ML-and-DL/blob/main/Techsoc_Conditional_Text_Generation_with_GPT_2.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

Related article: https://www.ivanlai.project-ds.net/post/conditional-text-generation-by-fine-tuning-gpt-2

Training functions and utilities taken from the HuggingFace library. 


### Install and import libraries

In [None]:
from google.colab import drive
drive.mount('/content/gdrive', force_remount=True)

Mounted at /content/gdrive


In [None]:

%%time
%%capture
!pip install transformers

CPU times: user 48.7 ms, sys: 10.8 ms, total: 59.5 ms
Wall time: 5.95 s


In [None]:
!nvidia-smi

Sat May  1 05:10:18 2021       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 465.19.01    Driver Version: 460.32.03    CUDA Version: 11.2     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  Tesla T4            Off  | 00000000:00:04.0 Off |                    0 |
| N/A   39C    P8     9W /  70W |      0MiB / 15109MiB |      0%      Default |
|                               |                      |                  N/A |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Proces

In [None]:
import os
import io
import requests
import numpy as np
import pandas as pd
import re
import zipfile
import random
import time
import csv
import datetime
from itertools import compress
from collections import Counter, defaultdict
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity

from transformers import AutoTokenizer, AutoConfig, AutoModelForPreTraining, \
                         AdamW, get_linear_schedule_with_warmup, \
                         TrainingArguments, BeamScorer, Trainer

import torch
from torch.utils.data import Dataset, random_split, DataLoader, \
                             RandomSampler, SequentialSampler

from IPython.display import clear_output

print(f"PyTorch version: {torch.__version__}")

PyTorch version: 1.8.1+cu101


In [None]:

import io, os, sys
import pandas as pd
import numpy as np
import requests
import zipfile
import time
import csv
import re
import multiprocessing
from multiprocessing import Pool
from sklearn.feature_extraction.text import TfidfVectorizer

import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


In [None]:
!pip install kaggle
!pip uninstall -y kaggle
!pip install --upgrade pip
!pip install kaggle==1.5.12

In [None]:
!mkdir -p ~/.kaggle
!cp kaggle.json ~/.kaggle/
!ls ~/.kaggle
!chmod 600 /root/.kaggle/kaggle.json

In [None]:
!kaggle competitions download -c dl-hack-track-2-nlp

In [None]:
!unzip /content/dl-hack-track-2-nlp.zip

In [None]:
!python -m spacy download en_core_web_lg

### Configurations

In [None]:
# Global configuration for fine-tuning and generation. Everything a reader
# might want to tune lives in this cell.
DEBUG           = False

# Mixed-precision settings; USE_APEX also selects the batch-size pair below.
USE_APEX        = True
APEX_OPT_LEVEL  = 'O1'

MODEL           = 'gpt2' #{gpt2, gpt2-medium, gpt2-large, gpt2-xl}

UNFREEZE_LAST_N = 6 #The last N layers to unfreeze for training

# Extra tokens registered with the tokenizer; BOS/SEP/EOS delimit
# "<BOS>title<SEP><SEP>text<EOS>" in both training samples and prompts.
SPECIAL_TOKENS  = { "bos_token": "<|BOS|>",
                    "eos_token": "<|EOS|>",
                    "unk_token": "<|UNK|>",
                    "pad_token": "<|PAD|>",
                    "sep_token": "<|SEP|>"}

# Maximum number of tokens per sample (used for truncation/padding and as the
# generation length limit). Every GPT-2 checkpoint has a 1024-token context
# window, so this must stay <= 1024 regardless of MODEL; 768/1024/1280/1600
# are the hidden sizes of the four variants, not valid sequence lengths.
MAXLEN          = 768

TRAIN_SIZE      = 0.8

# Both branches give the same effective batch of 64 samples per optimizer step
# (TRAIN_BATCHSIZE * BATCH_UPDATE).
if USE_APEX:
    TRAIN_BATCHSIZE = 4
    BATCH_UPDATE    = 16
else:
    TRAIN_BATCHSIZE = 2
    BATCH_UPDATE    = 32

EPOCHS          = 4
LR              = 5e-4
EPS             = 1e-8
# An integer step count (was the float 1e2); TrainingArguments.warmup_steps
# is declared as an int.
WARMUP_STEPS    = 100

SEED            = 2020

In [None]:
def seed_everything(seed):
    """Seed every random number generator the notebook relies on.

    Seeds Python's `random`, NumPy and PyTorch (CPU and all CUDA devices),
    exports PYTHONHASHSEED, and configures cuDNN for reproducible behaviour.

    Args:
        seed: Integer seed applied to all generators.
    """
    random.seed(seed)
    os.environ['PYTHONHASHSEED'] = str(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    # Seeds every visible GPU; silently ignored when CUDA is unavailable.
    torch.cuda.manual_seed_all(seed)
    torch.backends.cudnn.deterministic = True
    # The autotuner may pick a different convolution algorithm on each run,
    # which defeats `deterministic = True`; it has to be off for
    # reproducible results.
    torch.backends.cudnn.benchmark = False

seed_everything(SEED)

### Load dataset

In [None]:
columns = ['title', 'text']

df = pd.read_csv("train.csv", index_col = None, header = 0, names= columns )
print(f"df size: {len(df) :,}")

df.head()

df size: 146,381


Unnamed: 0,title,text
0,Interpretation of 3D CNNs for Brain MRI Data C...,Deep learning shows high potential for many ...
1,Scientific Calculator for Designing Trojan Det...,This work presents a web-based interactive n...
2,Proposal Flow,Finding image correspondences remains a chal...
3,Cloud-based or On-device: An Empirical Study o...,Modern mobile applications are benefiting si...
4,Retrofitting Structure-aware Transformer Langu...,We consider retrofitting structure-aware Tra...


In [None]:
# Map row number -> [title, text]. A single pass over the two columns avoids
# one Series lookup per row (slow on ~146k rows) and no longer shadows the
# builtin `id` at module scope.
# NOTE(review): keys are row positions 0..len(df)-1; this matches the original
# label-based lookup as long as df's index is 0..len(df)-1 — confirm.
data = {
    row_id: [row_title, row_text]
    for row_id, (row_title, row_text) in enumerate(zip(df.title, df.text))
}

In [None]:
data[1]

['Scientific Calculator for Designing Trojan Detectors in Neural Networks',
 '  This work presents a web-based interactive neural network (NN) calculator and a NN inefficiency measurement that has been investigated for the purpose of detecting trojans embedded in NN models. This NN Calculator is designed on top of TensorFlow Playground with in-memory storage of data and NN graphs plus coefficients. It is "like a scientific calculator" with analytical, visualization, and output operations performed on training datasets and NN architectures. The prototype is aaccessible at https://pages.nist.gov/nn-calculator. The analytical capabilities include a novel measurement of NN inefficiency using modified Kullback-Liebler (KL) divergence applied to histograms of NN model states, as well as a quantification of the sensitivity to variables related to data and NNs. Both NN Calculator and KL divergence are used to devise a trojan detector approach for a variety of trojan embeddings. Experimental re

### Datasets and loaders

In [None]:
class myDataset(Dataset):
    """Dataset of "<BOS>title<SEP><SEP>text<EOS>" language-modelling samples.

    Args:
        data: Mapping of id -> [title, text]; only the values are used, in
            the mapping's iteration order.
        tokenizer: Callable tokenizer invoked as
            `tokenizer(text, truncation=True, max_length=MAXLEN,
            padding="max_length")` and returning a mapping with
            'input_ids' and 'attention_mask'.
        randomize: Stored on the instance but not used by this class.
    """

    def __init__(self, data, tokenizer, randomize=True):
        records = list(data.values())

        self.randomize = randomize
        self.tokenizer = tokenizer
        self.title     = [record[0] for record in records]
        self.text      = [record[1] for record in records]

    def __len__(self):
        """Return the number of samples."""
        return len(self.text)

    def __getitem__(self, i):
        """Tokenize sample `i` and return it as tensors.

        Returns:
            Dict with 'label', 'input_ids' (both built from the same token
            ids) and 'attention_mask', each a `torch.Tensor`.
        """
        sample = SPECIAL_TOKENS['bos_token'] + self.title[i] + \
                 SPECIAL_TOKENS['sep_token'] + SPECIAL_TOKENS['sep_token'] + \
                 self.text[i] + SPECIAL_TOKENS['eos_token']

        # Use the tokenizer given to this dataset, not whatever global
        # `tokenizer` happens to exist in the notebook at call time.
        encodings_dict = self.tokenizer(sample,
                                        truncation=True,
                                        max_length=MAXLEN,
                                        padding="max_length")

        input_ids = encodings_dict['input_ids']
        attention_mask = encodings_dict['attention_mask']

        # NOTE(review): 'label' is an unmasked copy of input_ids, so padded
        # positions are presumably included in the loss; masking them
        # (e.g. with -100) may be desirable — confirm before changing.
        return {'label': torch.tensor(input_ids),
                'input_ids': torch.tensor(input_ids),
                'attention_mask': torch.tensor(attention_mask)}

In [None]:
def split_data(data, S=TRAIN_SIZE):
    """Randomly partition `data` into training and validation dictionaries.

    Args:
        data: Mapping of id -> sample.
        S: Fraction of samples assigned to the training split.

    Returns:
        `(train_data, val_data)`, two dicts holding the first
        `int(S * len(data))` shuffled ids and the remaining ids respectively.
        The shuffle uses the global `random` state.
    """
    shuffled_keys = list(data.keys())
    random.shuffle(shuffled_keys)

    cutoff = int(S * len(data))

    train_data = {key: data[key] for key in shuffled_keys[:cutoff]}
    val_data = {key: data[key] for key in shuffled_keys[cutoff:]}

    return train_data, val_data

### Loading Tokenizer, Config and Model

In [None]:
def get_tokenier(special_tokens=None):
    """Load the pretrained tokenizer for MODEL, optionally extending it.

    Args:
        special_tokens: Optional mapping passed to
            `tokenizer.add_special_tokens`; skipped when falsy.

    Returns:
        The tokenizer returned by `AutoTokenizer.from_pretrained(MODEL)`.
    """
    tokenizer = AutoTokenizer.from_pretrained(MODEL) #GPT2Tokenizer

    if not special_tokens:
        return tokenizer

    tokenizer.add_special_tokens(special_tokens)
    print("Special tokens added")
    return tokenizer

def get_model(tokenizer, special_tokens=None, load_model_path='/content/gdrive/MyDrive/pytorch_model_2.bin/pytorch_model.bin'):
    """Build the MODEL network, optionally restore a checkpoint, move to GPU.

    Args:
        tokenizer: Tokenizer whose token ids (and length, when special tokens
            are used) configure the model.
        special_tokens: When truthy, the BOS/EOS/SEP/PAD ids of `tokenizer`
            are written into the config and the embedding matrix is resized
            to `len(tokenizer)`. Otherwise PAD is mapped to the EOS id.
        load_model_path: Path of a state-dict file to load into the model.
            NOTE: the default points at a Google Drive checkpoint, so calling
            this without the argument loads fine-tuned weights and requires
            Drive to be mounted; pass `None` to keep the pretrained weights.

    Returns:
        The model, after `model.cuda()`.
    """
    #GPT2LMHeadModel
    if special_tokens:
        token_ids = dict(bos_token_id=tokenizer.bos_token_id,
                         eos_token_id=tokenizer.eos_token_id,
                         sep_token_id=tokenizer.sep_token_id,
                         pad_token_id=tokenizer.pad_token_id)
    else:
        token_ids = dict(pad_token_id=tokenizer.eos_token_id)

    config = AutoConfig.from_pretrained(MODEL,
                                        output_hidden_states=False,
                                        **token_ids)

    #----------------------------------------------------------------#
    model = AutoModelForPreTraining.from_pretrained(MODEL, config=config)

    if special_tokens:
        #Special tokens added, model needs to be resized accordingly
        model.resize_token_embeddings(len(tokenizer))

    if load_model_path:
        # The model is still on the CPU here. Mapping the checkpoint to CPU
        # keeps a GPU-saved state dict from being materialised on the GPU
        # first, which would hold a second copy of the weights in GPU memory.
        state_dict = torch.load(load_model_path, map_location="cpu")
        model.load_state_dict(state_dict)

    model.cuda()
    return model

In [None]:
%%time

tokenizer = get_tokenier(special_tokens=SPECIAL_TOKENS)
model = get_model(tokenizer, 
                  special_tokens=SPECIAL_TOKENS,
                #   load_model_path='pytorch_model.bin'
                 )

HBox(children=(FloatProgress(value=0.0, description='Downloading', max=665.0, style=ProgressStyle(description_…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=1042301.0, style=ProgressStyle(descript…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=456318.0, style=ProgressStyle(descripti…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=1355256.0, style=ProgressStyle(descript…


Special tokens added


HBox(children=(FloatProgress(value=0.0, description='Downloading', max=548118077.0, style=ProgressStyle(descri…


CPU times: user 17.2 s, sys: 3.54 s, total: 20.8 s
Wall time: 43.5 s


In [None]:
# - Freeze selective layers:
# - Freeze all layers except last n:
for parameter in model.parameters():
    parameter.requires_grad = False

# Take the block count from the model itself rather than hard-coding 12,
# which is only correct for the base 'gpt2' checkpoint; the larger variants
# listed next to MODEL have more blocks.
first_trainable = max(len(model.transformer.h) - UNFREEZE_LAST_N, 0)

for i, m in enumerate(model.transformer.h):
    #Only un-freeze the last n transformer blocks
    if i >= first_trainable:
        for parameter in m.parameters():
            parameter.requires_grad = True

# The final layer norm and the LM head are always trained.
for parameter in model.transformer.ln_f.parameters():
    parameter.requires_grad = True

for parameter in model.lm_head.parameters():
    parameter.requires_grad = True

In [None]:
train_data, val_data = split_data(data)

train_dataset = myDataset(train_data, tokenizer)
val_dataset = myDataset(val_data, tokenizer, randomize=False)

f'There are {len(train_dataset) :,} samples for training, and {len(val_dataset) :,} samples for validation testing'

'There are 117,104 samples for training, and 29,277 samples for validation testing'

### Fine-tune GPT2 using Trainer

In [None]:
%%time

# Fine-tune with the Hugging Face Trainer.
# - Gradients are accumulated over BATCH_UPDATE steps of TRAIN_BATCHSIZE
#   samples per device before each optimizer update.
# - NOTE(review): num_train_epochs is hard-coded to 0.5; the EPOCHS constant
#   from the configuration cell is not used here.
# - NOTE(review): fp16 is always enabled, independent of USE_APEX; only
#   APEX_OPT_LEVEL is read from the configuration cell.
training_args = TrainingArguments(
    output_dir="/content/",
    num_train_epochs=0.5,
    per_device_train_batch_size=TRAIN_BATCHSIZE,
    per_device_eval_batch_size=TRAIN_BATCHSIZE,
    gradient_accumulation_steps=BATCH_UPDATE,
    evaluation_strategy="epoch",
    fp16=True,
    fp16_opt_level=APEX_OPT_LEVEL,
    warmup_steps=WARMUP_STEPS,    
    learning_rate=LR,
    adam_epsilon=EPS,
    weight_decay=0.01,        
    save_total_limit=1,
    load_best_model_at_end=True,     
)

#---------------------------------------------------#
# Only parameters left with requires_grad=True by the freezing cell receive
# gradient updates.
trainer = Trainer(
    model=model,
    args=training_args,    
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    tokenizer=tokenizer
)

#---------------------------------------------------#
# Run training, then write the final model to output_dir.
trainer.train()
trainer.save_model()    

Epoch,Training Loss,Validation Loss,Runtime,Samples Per Second
0,0.8445,0.853542,2877.2992,10.175


CPU times: user 3h 52min 33s, sys: 2h 16min 19s, total: 6h 8min 52s
Wall time: 3h 12min 35s


In [None]:
model.save_pretrained('pytorch_model_2.bin')

In [None]:
# Save to G-Drive ----------------------------------#
!cp -r 'pytorch_model_2.bin' '/content/gdrive/MyDrive'

### Generating text with Fine-tuned GPT-2 model

In [None]:
 !cp -r '/content/gdrive/MyDrive/pytorch_model_2.bin' 'pytorch_model.bin'

In [None]:
tokenizer = get_tokenier(special_tokens=SPECIAL_TOKENS)
model = get_model(tokenizer, 
                  special_tokens=SPECIAL_TOKENS,
                  #load_model_path='/content/pytorch_model.bin/pytorch_model.bin'
                  )

Special tokens added


In [None]:
title = "Scientific Calculator for Designing Trojan Detectors in Neural Networks"
#keywords = ['train', 'lads', 'drinking', 'picture', 'funny', 'instagram']
#kw = myDataset.join_keywords(keywords, randomize=False)

prompt = SPECIAL_TOKENS['bos_token'] + title + \
         SPECIAL_TOKENS['sep_token'] + SPECIAL_TOKENS['sep_token']
         
generated = torch.tensor(tokenizer.encode(prompt)).unsqueeze(0)
device = torch.device("cuda")
generated = generated.to(device)

model.eval();

In [None]:
data[1]

['Scientific Calculator for Designing Trojan Detectors in Neural Networks',
 '  This work presents a web-based interactive neural network (NN) calculator and a NN inefficiency measurement that has been investigated for the purpose of detecting trojans embedded in NN models. This NN Calculator is designed on top of TensorFlow Playground with in-memory storage of data and NN graphs plus coefficients. It is "like a scientific calculator" with analytical, visualization, and output operations performed on training datasets and NN architectures. The prototype is aaccessible at https://pages.nist.gov/nn-calculator. The analytical capabilities include a novel measurement of NN inefficiency using modified Kullback-Liebler (KL) divergence applied to histograms of NN model states, as well as a quantification of the sensitivity to variables related to data and NNs. Both NN Calculator and KL divergence are used to devise a trojan detector approach for a variety of trojan embeddings. Experimental re

In [None]:
# Top-p (nucleus) text generation (10 samples):
sample_outputs = model.generate(generated, 
                                do_sample=True,   
                                min_length=50, 
                                max_length=MAXLEN,
                                top_k=30,                                 
                                top_p=0.7,        
                                temperature=0.9,
                                repetition_penalty=2.0,
                                num_return_sequences=10
                                )

for i, sample_output in enumerate(sample_outputs):
    text = tokenizer.decode(sample_output, skip_special_tokens=True)
    a = len(title) 
    print("{}: {}\n\n".format(i+1,  text[a:]))

1:   The paper describes a new mathematical model that allows to calculate the probability of detecting an unknown program by analyzing its parameters. In this approach, we propose three novel algorithms: (i) Linear Programming and Evolutionary Algorithms; their computational complexity is proportional with respect {1/2} {\log(T)} times \sqrt{N})$, which are more efficient than other machine learning techniques on binary classification problems but not necessarily as accurate or faster compared against standard methods suchas Random Forest-based classifiers when applied within linear programming frameworks like BERT. We also provide empirical evidence showing how these approaches can be used successfully both inside neural networks trained using backpropagation based models while avoiding some known weaknesses present during training procedures from conventional ones - eigenspace, overfitting etc.; they may lead us towards better solutions if further improvements need being made before

In [None]:
# Beam-search multinomial sampling: num_beams=5 combined with do_sample=True
# samples tokens within the beams, so the output differs between runs.
# (Deterministic beam search would need do_sample=False.)
sample_outputs = model.generate(generated, 
                                do_sample=True,   
                                max_length=MAXLEN,                                                      
                                num_beams=5,
                                repetition_penalty=5.0,
                                early_stopping=True,      
                                num_return_sequences=1
                                )

for i, sample_output in enumerate(sample_outputs):
    text = tokenizer.decode(sample_output, skip_special_tokens=True)
    # Drop the first len(title) characters so only the generated part is
    # printed. NOTE(review): assumes the decoded text starts with the title
    # verbatim — confirm.
    a = len(title)
    print("{}: {}\n\n".format(i+1,  text[a:]))

1:   In this paper, we propose a novel computer-aided design algorithm based on the Scientific Calculator (SCO) that can be used to automatically determine whether or not an attack has been launched. The SCO is composed of two components: 1) a multi-layer neural network and 2) a stochastic gradient descent method. Each component consists of three steps: first, it calculates the probability density function using Gaussian mixture models; second, it computes the mean squared error by computing the square root of the distance between the input training samples and the target model; and third, it extracts features from the test data via convolutional neural networks. Extensive experiments have been carried out with both synthetic and real adversarial examples. Experimental results show that the SCO significantly outperforms other state-of-the-art techniques such as DNNs and LSTMs when compared to existing methods. 




### Make predictions on test set 

In [None]:
test = pd.read_csv('test.csv')

In [None]:
test.head()

Unnamed: 0,title
0,Multi-factorial Optimization for Large-scale V...
1,Dialogue Act Classification with Context-Aware...
2,Kernel Additive Principal Components
3,Sample Complexity of Learning Mixtures of Spar...
4,Joint Coarse-And-Fine Reasoning for Deep Optic...


In [None]:
def generate(test):
  """Generate one sampled abstract per title using top-k / top-p sampling.

  Args:
      test: Object with a `title` attribute that iterates over title strings
          (e.g. the test DataFrame).

  Returns:
      List with one string per title: the decoded generation with its first
      `len(title)` characters removed, followed by two newlines.

  Relies on the notebook-level `tokenizer`, `model`, `SPECIAL_TOKENS` and
  `MAXLEN`, and requires a CUDA device.
  """
  device = torch.device("cuda")  # loop-invariant, so built once
  output = []

  for i, title in enumerate(test.title):
    # Same "<BOS>title<SEP><SEP>" prefix the training samples start with.
    prompt = SPECIAL_TOKENS['bos_token'] + title + \
          SPECIAL_TOKENS['sep_token'] + SPECIAL_TOKENS['sep_token']

    generated = torch.tensor(tokenizer.encode(prompt)).unsqueeze(0)
    generated = generated.to(device)

    sample_outputs = model.generate(generated,
                                do_sample=True,
                                min_length=50,
                                max_length=MAXLEN,
                                top_k=45,
                                top_p=0.7,
                                temperature=0.9,
                                repetition_penalty=2.0,
                                num_return_sequences=1
                                )

    text = tokenizer.decode(sample_outputs[0], skip_special_tokens=True)
    # NOTE(review): slicing by len(title) assumes the decoded text starts
    # with the title verbatim — confirm.
    a = len(title)
    out = "{}\n\n".format(text[a:])
    output.append(out)

    # Report progress once every 100 titles. The previous `if i%100:` was
    # inverted and printed on every iteration except multiples of 100.
    if i % 100 == 0:
      print('Done_'+ str(i))
  return output



In [None]:
def generate_beam(test):
  """Generate one abstract per title using 5 beams with sampling enabled.

  Args:
      test: Object with a `title` attribute that iterates over title strings
          (e.g. the test DataFrame).

  Returns:
      List with one string per title: the decoded generation with its first
      `len(title)` characters removed, followed by two newlines.

  Relies on the notebook-level `tokenizer`, `model`, `SPECIAL_TOKENS` and
  `MAXLEN`, and requires a CUDA device.
  """
  device = torch.device("cuda")  # loop-invariant, so built once
  output = []

  for i, title in enumerate(test.title):
    # Same "<BOS>title<SEP><SEP>" prefix the training samples start with.
    prompt = SPECIAL_TOKENS['bos_token'] + title + \
          SPECIAL_TOKENS['sep_token'] + SPECIAL_TOKENS['sep_token']

    generated = torch.tensor(tokenizer.encode(prompt)).unsqueeze(0)
    generated = generated.to(device)

    # NOTE(review): do_sample=True together with num_beams=5 samples within
    # the beams instead of running deterministic beam search — confirm this
    # is intended.
    sample_outputs = model.generate(generated,
                                    do_sample=True,
                                    max_length=MAXLEN,
                                    num_beams=5,
                                    repetition_penalty=5.0,
                                    early_stopping=True,
                                    num_return_sequences=1
                                    )

    text = tokenizer.decode(sample_outputs[0], skip_special_tokens=True)
    # NOTE(review): slicing by len(title) assumes the decoded text starts
    # with the title verbatim — confirm.
    a = len(title)
    out = "{}\n\n".format(text[a:])
    output.append(out)

    # Report progress once every 100 titles. The previous `if i%100:` was
    # inverted and printed on every iteration except multiples of 100.
    if i % 100 == 0:
      print('Done_'+ str(i))
  return output



In [None]:
output = generate(test)

In [None]:
output_1 = generate_beam(test)

In [None]:
dum = pd.DataFrame(output)
dum.columns = ['abstract']
df2 = pd.concat([test.title, dum.abstract], axis =1)
df2.to_csv('submission.csv')
!cp -r '/content/submission.csv' '/content/gdrive/MyDrive/submission.csv'

In [None]:
df2.head()

Unnamed: 0,title,abstract
0,Multi-factorial Optimization for Large-scale V...,We study the problem of large scale virtual m...
1,Dialogue Act Classification with Context-Aware...,Dialogue act classification (CAS) is a cruci...
2,Kernel Additive Principal Components,Kernel approximation is a fundamental techni...
3,Sample Complexity of Learning Mixtures of Spar...,We introduce a new formulation for learning ...
4,Joint Coarse-And-Fine Reasoning for Deep Optic...,We present a new method to solve the problem...


In [None]:
import pandas as pd
from typing import Dict, List
import json
import requests
import numpy as np
import sys
import os
from sklearn.decomposition import PCA

# DO NOT MODIFY
pca = PCA(n_components=32)
URL = "https://model-apis.semanticscholar.org/specter/v1/invoke"
MAX_BATCH_SIZE = 16


def chunkify(test_csv, chunk_size=MAX_BATCH_SIZE):
    """Yield the rows of `test_csv` as consecutive batches of paper records.

    Args:
        test_csv: Table supporting `len()` and `test_csv[column][row]`
            lookups for the "title" and "abstract" columns.
        chunk_size: Maximum number of records per batch.

    Yields:
        Lists of dicts with keys "paper_id" (the row number), "title" and
        "abstract"; the last list may be shorter than `chunk_size`.
    """
    for start in range(0, len(test_csv), chunk_size):
        stop = min(start + chunk_size, len(test_csv))
        yield [
            {
                "paper_id": row,
                "title": test_csv["title"][row],
                "abstract": test_csv["abstract"][row],
            }
            for row in range(start, stop)
        ]
def submit(test_csv):
    """Embed every (title, abstract) pair and write the submission file.

    Reads the CSV at `test_csv`, posts its rows to URL in batches produced by
    `chunkify`, reduces the returned embeddings with the module-level `pca`,
    and writes columns f_0..f_31 plus `id` to "submission.csv" in the
    working directory.

    Args:
        test_csv: Path of a CSV file with "title" and "abstract" columns.

    Raises:
        RuntimeError: If any request returns a status code other than 200.
    """
    papers = pd.read_csv(test_csv)
    total = len(papers)

    vectors = []
    for batch in chunkify(papers):
        reply = requests.post(URL, json=batch)
        if reply.status_code != 200:
            raise RuntimeError("Sorry, something went wrong, please try later!")
        for prediction in reply.json()["preds"]:
            # Progress counter: number of embeddings collected before this one.
            print(f"[{len(vectors)}/{total}]", end="\r")
            vectors.append(prediction["embedding"])
    print("Done... Creating submission file")

    reduced = pca.fit_transform(np.array(vectors))

    column_names = {i: f"f_{i}" for i in range(32)}
    frame = pd.DataFrame.from_records(reduced).rename(columns=column_names)
    frame["id"] = np.arange(len(reduced))
    frame.to_csv("submission.csv", index=False)
    print("Submission file created at ./submission.csv")

In [None]:
submit('submission.csv')

Done... Creating submission file
Submission file created at ./submission.csv
