Fine-tune GPT-2 on the shopping products dataset so that it can be used as a teacher model for simpler models.

In [2]:
# ml
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score
from sklearn.manifold import TSNE, MDS, Isomap, LocallyLinearEmbedding
from sklearn.decomposition import PCA, KernelPCA, FastICA
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.model_selection import train_test_split

# nlp
import nltk

# deep learning
import torch
import torch.optim as optim
from torchinfo import summary
from torch.utils.data import DataLoader, Dataset
from torch import nn
from datasets import load_dataset
import datasets
from transformers import AutoTokenizer
from transformers import Trainer, TrainingArguments, AutoModel
from transformers import TextDataset, DataCollatorForLanguageModeling
from transformers import GPT2Tokenizer, GPT2LMHeadModel
from transformers import PreTrainedTokenizerFast, GPT2TokenizerFast

# local imports
import importlib
# add '../../deep_learning/src/' to path
import sys
sys.path.insert(1, '../../')
import deep_learning.src.nlp.models as nlp_models
_ = importlib.reload(nlp_models)

import deep_learning.src.nlp.training as nlp_training
_ = importlib.reload(nlp_training)

import deep_learning.src.nlp.preprocessing as local_preprocessing
_ = importlib.reload(local_preprocessing)

import deep_learning.src.nlp.analysis as dl_analysis
_ = importlib.reload(dl_analysis)

# basic data analysis
import pandas as pd
import numpy as np
import re, string, os, pickle
from IPython.display import display, HTML
from tqdm.notebook import tqdm
import seaborn as sns
import matplotlib.pyplot as plt
from collections import Counter

In [4]:
# Load the cached dataset info dictionary from disk; the empty dict is only a
# placeholder that the load below replaces.
# NOTE(review): presumably this pickle is written by another notebook in this project — confirm.
dataset_to_info = {}
# NOTE(review): pickle.load can execute arbitrary code; only open pickle files you trust.
with open('local/model_comparison/dataset_to_info.pkl', 'rb') as f:
    dataset_to_info = pickle.load(f)

In [3]:
# GPT-2 tokenizer from the pretrained 'gpt2' checkpoint
tokenizer = GPT2Tokenizer.from_pretrained('gpt2')

# Register '[PAD]' as the padding token when the tokenizer does not define one.
# NOTE(review): the recorded output shows the vocab growing to 50258 while the
# logits inspected later in this notebook have 50257 entries, so the new pad id
# appears to have no embedding row — confirm it never reaches the model.
if tokenizer.pad_token is None:
    print ("Adding padding token")
    tokenizer.add_special_tokens({'pad_token': '[PAD]'})

# token -> id mapping, plus the inverse used to turn predicted ids back into tokens
vocab = tokenizer.get_vocab()
print (f"Vocab size: {len(vocab)}")
id_to_token = dict(zip(vocab.values(), vocab.keys()))

Adding padding token
Vocab size: 50258


In [14]:
def load_dataset(file_path, tokenizer, block_size = 40):
    """Wrap a plain-text file in a `TextDataset`.

    Note: this definition rebinds the name `load_dataset` that the import
    cell brings in from the `datasets` package.

    Args:
        file_path: path of the text file to read.
        tokenizer: tokenizer handed to `TextDataset`.
        block_size: block size handed to `TextDataset` (default 40).

    Returns:
        The constructed `TextDataset`.
    """
    return TextDataset(tokenizer=tokenizer, file_path=file_path, block_size=block_size)

def load_data_collator(tokenizer, mlm = False):
    """Create a `DataCollatorForLanguageModeling` for the given tokenizer.

    Args:
        tokenizer: tokenizer handed to the collator.
        mlm: forwarded to the collator's `mlm` argument (default False).

    Returns:
        The constructed collator.
    """
    return DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=mlm)



In [22]:
def dump_gpt2_text_samples(dataset, split, dataset_to_info):
    """Write the language-model target strings of one dataset split to a text file.

    The strings are read from
    `dataset_to_info[dataset]['data'][split]['raw_data']['lm_prediction_targets']`
    and written, one per line, to
    `local/gpt2_finetuning/<dataset>/<split>/lm_prediction_targets.txt`
    (relative to the current working directory). Missing directories are
    created; an existing file is overwritten.

    Args:
        dataset: dataset key in `dataset_to_info`.
        split: split key under `dataset_to_info[dataset]['data']`.
        dataset_to_info: nested dict holding the raw data.

    Raises:
        KeyError: if the dataset, split or target list is missing.
    """
    lm_prediction_targets = dataset_to_info[dataset]['data'][split]['raw_data']['lm_prediction_targets']

    output_dir = os.path.join('local', 'gpt2_finetuning', dataset, split)
    os.makedirs(output_dir, exist_ok=True)
    output_path = os.path.join(output_dir, 'lm_prediction_targets.txt')

    # Write explicitly as UTF-8 so the file content does not depend on the
    # platform's locale encoding (targets may contain non-ASCII characters).
    with open(output_path, 'w', encoding='utf-8') as f:
        for target in lm_prediction_targets:
            f.write(target + '\n')

In [23]:
# Write one text file for every (dataset, split) pair found in dataset_to_info
for dataset in dataset_to_info:
    for split in dataset_to_info[dataset]['data']:
        dump_gpt2_text_samples(dataset, split, dataset_to_info)

In [33]:
# Dataset to fine-tune on, and where its text dumps / checkpoints live
dataset = 'amazon'
root_dir = f'local/gpt2_finetuning/{dataset}/'
train_output_dir = f'{root_dir}/train/model_checkpoints/'

# Load the three splits (in train, val, test order) from their dumped text files
train_dataset, val_dataset, test_dataset = (
    load_dataset(f'{root_dir}/{split_name}/lm_prediction_targets.txt', tokenizer)
    for split_name in ('train', 'val', 'test')
)



In [45]:
# Save the tokenizer files under the checkpoint directory so the same tokenizer
# can be reloaded together with a trained checkpoint
tokenizer.save_pretrained(train_output_dir+'/tokenizer/')

('local/gpt2_finetuning/amazon//train/model_checkpoints//tokenizer/tokenizer_config.json',
 'local/gpt2_finetuning/amazon//train/model_checkpoints//tokenizer/special_tokens_map.json',
 'local/gpt2_finetuning/amazon//train/model_checkpoints//tokenizer/vocab.json',
 'local/gpt2_finetuning/amazon//train/model_checkpoints//tokenizer/merges.txt',
 'local/gpt2_finetuning/amazon//train/model_checkpoints//tokenizer/added_tokens.json')

In [34]:
# Collator for training; load_data_collator defaults to mlm=False
data_collator = load_data_collator(tokenizer)

In [35]:
# Start from the pretrained 'gpt2' weights with the language-modeling head.
# NOTE(review): the tokenizer cell reports a vocab of 50258 after adding '[PAD]',
# while the logits inspected later have 50257 entries, so the pad id has no
# embedding row. Confirm it never reaches the model, or call
# model.resize_token_embeddings(len(tokenizer)) here.
model = GPT2LMHeadModel.from_pretrained('gpt2')

In [40]:
# Short, step-capped fine-tuning run that evaluates and checkpoints at step 100.
training_args = TrainingArguments(
    output_dir=train_output_dir,
    overwrite_output_dir=False,
    per_device_train_batch_size=8,
    # The run length is set by `max_steps` alone. A `num_train_epochs` value is
    # overridden whenever `max_steps` is given (the Trainer warns about it), so
    # it is not passed here.
    max_steps=100,
    eval_strategy="steps",
    eval_steps=100,
    save_steps=100,
    learning_rate=5e-5,
)

In [41]:
# Wire the model, training arguments, collator and train/validation datasets
# into a Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    data_collator=data_collator,
)

max_steps is given, it will override any value given in num_train_epochs


In [42]:
# Run fine-tuning; the returned TrainOutput is shown as the cell result
trainer.train()

  0%|          | 0/100 [00:00<?, ?it/s]

  0%|          | 0/1371 [00:00<?, ?it/s]

{'eval_loss': 4.3516130447387695, 'eval_runtime': 54.8732, 'eval_samples_per_second': 199.77, 'eval_steps_per_second': 24.985, 'epoch': 0.02}
{'train_runtime': 74.7645, 'train_samples_per_second': 10.7, 'train_steps_per_second': 1.338, 'train_loss': 4.630201416015625, 'epoch': 0.02}


TrainOutput(global_step=100, training_loss=4.630201416015625, metrics={'train_runtime': 74.7645, 'train_samples_per_second': 10.7, 'train_steps_per_second': 1.338, 'total_flos': 16330752000000.0, 'train_loss': 4.630201416015625, 'epoch': 0.018145527127563055})

In [47]:
# generate text from trained checkpoint

In [111]:
def load_model(model_dir, checkpoint=None, device='cuda'):
    """Load a fine-tuned `GPT2LMHeadModel` from a checkpoint directory.

    Args:
        model_dir: directory that contains `checkpoint-<step>` sub-directories.
        checkpoint: step number of the checkpoint to load. When None, the
            checkpoint with the highest step number is used.
        device: device the model is moved to (default 'cuda').

    Returns:
        The loaded model, already moved to `device`.

    Raises:
        ValueError: if `checkpoint` is None and `model_dir` contains no
            `checkpoint-<step>` entry.
    """
    prefix = 'checkpoint-'
    if checkpoint is None:
        # Keep only entries whose suffix is a step number; other entries that
        # merely share the prefix (e.g. 'checkpoint-best') are ignored instead
        # of breaking the integer parse.
        candidates = [
            (int(name[len(prefix):]), name)
            for name in os.listdir(model_dir)
            if name.startswith(prefix) and name[len(prefix):].isdecimal()
        ]
        if not candidates:
            raise ValueError(f"No '{prefix}<step>' entries found in {model_dir!r}")
        # Use the real directory name rather than rebuilding it from the parsed step
        _, checkpoint_name = max(candidates)
    else:
        checkpoint_name = f'{prefix}{checkpoint}'
    model_path = os.path.join(model_dir, checkpoint_name)
    model = GPT2LMHeadModel.from_pretrained(model_path)
    model.to(device)
    return model

In [112]:
# Reload the model from a checkpoint under train_output_dir (no explicit
# checkpoint number is given, so load_model chooses one itself)
model = load_model(train_output_dir)
# Reload the tokenizer saved next to the checkpoints; this rebinds `tokenizer`
tokenizer = GPT2Tokenizer.from_pretrained(train_output_dir+'/tokenizer/')

In [60]:
def generate_text(sequence, max_length, model, tokenizer):
    """Sample a continuation of `sequence` from `model`.

    Sampling uses top-k (k=50) together with nucleus (top-p=0.95) filtering,
    and the model's end-of-sequence id doubles as the padding id.

    Args:
        sequence: prompt text (formatted into a string before tokenizing).
        max_length: value passed to `model.generate` as `max_length`.
        model: model exposing `generate`, `config.eos_token_id` and `device`.
        tokenizer: tokenizer used to encode the prompt and decode the output.

    Returns:
        Tuple `(final_outputs, output_text)`: the raw result of
        `model.generate` and the decoded text of its first sequence with
        special tokens removed.
    """
    inputs = tokenizer(f'{sequence}', return_tensors='pt', padding=True)
    attention_mask = inputs['attention_mask']
    print (attention_mask)

    # The tokenizer returns CPU tensors; move them to wherever the model lives
    # so generation also works for a model that was loaded onto a GPU.
    device = model.device
    ids = inputs['input_ids'].to(device)
    attention_mask = attention_mask.to(device)

    final_outputs = model.generate(
        ids,
        do_sample=True,
        max_length=max_length,
        attention_mask=attention_mask,
        pad_token_id=model.config.eos_token_id,
        top_k=50,
        top_p=0.95,
    )
    output_text = tokenizer.decode(final_outputs[0], skip_special_tokens=True)
    return final_outputs, output_text

In [64]:
# Prompt with only the tokenizer's end-of-sequence token, i.e. no conditioning text
sequence = tokenizer.eos_token
max_len = 60  # forwarded to generate_text as max_length
final_outputs, output_text = generate_text(sequence, max_len, model, tokenizer)

tensor([[1]])


Measure perplexities on train, validation and test sets using the vocab used by the n-gram model.

In [83]:
# Split whose token-id sequences are inspected in the cells below
eval_split = 'val'
# presumably one list of token ids per example (the batching cell slices and
# extends each entry like a list) — TODO confirm
lm_target_input_ids = dataset_to_info[dataset]['data'][eval_split]['lm_target_input_ids']

In [105]:
# Group the token-id sequences into fixed-size batches where every sequence is
# truncated / right-padded to exactly `maxlen` ids. Padding uses the tokenizer's
# end-of-sequence id and is marked with 0 in the attention mask.
batch_size = 32
# NOTE(review): the +2 presumably accounts for boundary tokens — TODO confirm
maxlen = dataset_to_info[dataset]['configs']['maxlen'] + 2

batches, attention_masks = [], []
for i in range(0, len(lm_target_input_ids), batch_size):
    batch, attention_mask = [], []
    for token_ids in lm_target_input_ids[i:i+batch_size]:
        token_ids = token_ids[:maxlen]
        n_pad = maxlen - len(token_ids)
        batch.append(token_ids + [tokenizer.eos_token_id] * n_pad)
        attention_mask.append([1] * len(token_ids) + [0] * n_pad)
    batches.append(batch)
    attention_masks.append(attention_mask)

print (len(batches))

755


In [117]:
# Pick the first batch, and the first sequence inside it, for manual inspection
batch = batches[0]
batch_attention_mask = attention_masks[0]
sentence = batch[0]
sentence_attention_mask = batch_attention_mask[0]

In [118]:
# Show the attention mask, then the decoded text (padding included) as the cell result
print (sentence_attention_mask)
tokenizer.decode(sentence)

[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]


'<|endoftext|>J B Fashion Tops for Women | Tops for Women Tops for Women Stylish | top for Girls | top for Women Stylish Latest (J-464)<|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|>'

In [134]:
# Forward pass over the selected batch to get a next-token distribution for
# every position of every sequence.
next_token_probs = None
model.eval()
with torch.no_grad():
    # Create the inputs on the model's own device instead of hard-coding 'cuda',
    # so the cell follows whatever device the model was loaded onto.
    ids = torch.tensor(batch, device=model.device)
    attention_mask = torch.tensor(batch_attention_mask, device=model.device)
    outputs = model(ids, attention_mask=attention_mask)
    logits = outputs.logits
    # Normalise over the vocabulary (last) dimension
    next_token_probs = torch.nn.functional.softmax(logits, dim=-1)

In [137]:
# Convert the probabilities to NumPy and keep only the first sequence of the batch
sentence_next_token_probs = next_token_probs.cpu().numpy()[0]
sentence_next_token_probs.shape

(52, 50257)

In [139]:
# For every position of the inspected sequence, print the context seen so far
# and the top_k highest-probability next tokens
top_k = 5
for i, position_probs in enumerate(sentence_next_token_probs):
    # ids sorted by descending probability, truncated to the best top_k
    top_k_indices = np.argsort(position_probs)[::-1][:top_k]
    top_k_probs = position_probs[top_k_indices]
    top_k_tokens = [id_to_token[token_id] for token_id in top_k_indices]
    context = tokenizer.decode(batch[0][:i+1])
    print (f"Context: {context}")
    print (f"top-{top_k} tokens with their probabilities:")
    for j in range(top_k):
        token, prob = top_k_tokens[j], top_k_probs[j]
        print (f"\t{token}: {prob}")
    print ('\n\n')

Context: <|endoftext|>
top-5 tokens with their probabilities:
	Ċ: 0.0430183932185173
	A: 0.03331897035241127
	The: 0.019795283675193787
	S: 0.0194003414362669
	T: 0.01738286018371582



Context: <|endoftext|>J
top-5 tokens with their probabilities:
	ACK: 0.04394003003835678
	AN: 0.033528704196214676
	ER: 0.027945753186941147
	.: 0.02778715267777443
	AK: 0.01632264256477356



Context: <|endoftext|>J B
top-5 tokens with their probabilities:
	ĠK: 0.03272165358066559
	.: 0.03002089262008667
	ĠS: 0.026225674897432327
	Ċ: 0.022505376487970352
	ON: 0.014905023388564587



Context: <|endoftext|>J B Fashion
top-5 tokens with their probabilities:
	ĠMen: 0.1776643544435501
	ĠWomen: 0.08563017845153809
	Ċ: 0.04325247183442116
	ĠMens: 0.024437248706817627
	ĠShoes: 0.02308136224746704



Context: <|endoftext|>J B Fashion T
top-5 tokens with their probabilities:
	-: 0.6529910564422607
	shirt: 0.11977624893188477
	ops: 0.04356880486011505
	shirts: 0.04006678983569145
	ĠShirt: 0.03316032513976097



C

In [133]:
# NOTE(review): scratch shape check. Its execution count (133) is lower than that
# of the forward-pass cell (134), so the recorded output may be stale.
next_token_probs.shape

torch.Size([50257])

In [78]:
# NOTE(review): scratch shape check with an old execution count (78); the recorded
# shape (1, 36, 50257) does not match the 52-long padded batch used above, so it
# looks stale.
logits.shape

torch.Size([1, 36, 50257])

In [79]:
# NOTE(review): scratch shape check with an old execution count (79); the recorded
# shape (1, 36) does not match the 52-long padded batch used above, so it looks stale.
ids.shape

torch.Size([1, 36])