Anaconda environment setup modules.

```
module load cuda10.2/toolkit/10.2.89
module load CUDA/10.2.89-GCC-6.4.0-2.28
```

Retreive Amazon review data.

TODO: Save large files to disk and train model against them in batches.

In [1]:
#DATA_URL = 'http://deepyeti.ucsd.edu/jianmo/amazon/categoryFiles/Electronics.json.gz'
#METADATA_URL = 'http://deepyeti.ucsd.edu/jianmo/amazon/metaFiles/meta_Electronics.json.gz'
DATA_URL = 'http://deepyeti.ucsd.edu/jianmo/amazon/categoryFiles/Gift_Cards.json.gz'
METADATA_URL = 'http://deepyeti.ucsd.edu/jianmo/amazon/metaFiles/meta_Gift_Cards.json.gz'
SAMPLE_SIZE = 60000

MODEL_SAVE_PATH = '/data/user/cren1/bert2gpt'
MODEL_TEMP_PATH = '/data/user/cren1/temp'

In [2]:
import json
import os
import gzip
from urllib.request import urlopen
from tqdm.auto import tqdm

def get_data(url):
    data = []
    with gzip.open(urlopen(url)) as f:
        for l in f:
            data.append(json.loads(l.strip()))
    return data

data = get_data(DATA_URL)
metadata = get_data(METADATA_URL)

Create product ASIN mapping to link products and their reviews.

In [3]:
import re

# Remove HTML tags from text
def remove_html_tags(text):
    return re.sub(r'<.*?>', '', text)

product_dataset = {}
for product in metadata:
    asin = product['asin']
    category = ' '.join(product['category'])
    title = product['title']
    description = ' '.join(product['description'])
    
    # Remove HTML tags
    filtered_description = remove_html_tags(description)
    
    product_dataset[asin] = {
        'category': category,
        'title': title,
        'description': filtered_description
    }

Initialize tokenizer and model.

In [4]:
from transformers import BertTokenizer, GPT2Tokenizer, EncoderDecoderModel

RATING_TOKEN = '[RAT]'
CATEGORY_TOKEN = '[CAT]'
TITLE_TOKEN = '[TTL]'
DESCRIPTION_TOKEN = '[DES]'
LABEL_MASK_TOKEN_ID = -100

ENCODER_MAX_LENGTH = 512
DECODER_MAX_LENGTH = 128

encoder_tokenizer = BertTokenizer(
    'vocab.txt',
    additional_special_tokens=[
        RATING_TOKEN,
        CATEGORY_TOKEN,
        TITLE_TOKEN,
        DESCRIPTION_TOKEN,
    ],
)
encoder_tokenizer.bos_token = encoder_tokenizer.cls_token
encoder_tokenizer.eos_token = encoder_tokenizer.sep_token

def build_inputs_with_special_tokens(self, token_ids_0, token_ids_1=None):
    outputs = [self.bos_token_id] + token_ids_0 + [self.eos_token_id]
    return outputs

GPT2Tokenizer.build_inputs_with_special_tokens = build_inputs_with_special_tokens
decoder_tokenizer = GPT2Tokenizer.from_pretrained('gpt2')
decoder_tokenizer.pad_token = decoder_tokenizer.unk_token

model = EncoderDecoderModel.from_encoder_decoder_pretrained('bert-base-cased', 'gpt2')
model.config.vocab_size = model.config.encoder.vocab_size

model.config.decoder_start_token_id = decoder_tokenizer.bos_token_id
model.config.eos_token_id = decoder_tokenizer.eos_token_id
model.config.pad_token_id = decoder_tokenizer.eos_token_id

model.config.max_length = 142
model.config.min_length = 56
model.config.no_repeat_ngram_size = 3
model.early_stopping = True
model.length_penalty = 2.0
model.num_beams = 4

Some weights of GPT2LMHeadModel were not initialized from the model checkpoint at gpt2 and are newly initialized: ['h.0.crossattention.bias', 'h.0.crossattention.masked_bias', 'h.0.crossattention.c_attn.weight', 'h.0.crossattention.c_attn.bias', 'h.0.crossattention.q_attn.weight', 'h.0.crossattention.q_attn.bias', 'h.0.crossattention.c_proj.weight', 'h.0.crossattention.c_proj.bias', 'h.0.ln_cross_attn.weight', 'h.0.ln_cross_attn.bias', 'h.1.crossattention.bias', 'h.1.crossattention.masked_bias', 'h.1.crossattention.c_attn.weight', 'h.1.crossattention.c_attn.bias', 'h.1.crossattention.q_attn.weight', 'h.1.crossattention.q_attn.bias', 'h.1.crossattention.c_proj.weight', 'h.1.crossattention.c_proj.bias', 'h.1.ln_cross_attn.weight', 'h.1.ln_cross_attn.bias', 'h.2.crossattention.bias', 'h.2.crossattention.masked_bias', 'h.2.crossattention.c_attn.weight', 'h.2.crossattention.c_attn.bias', 'h.2.crossattention.q_attn.weight', 'h.2.crossattention.q_attn.bias', 'h.2.crossattention.c_proj.weight'

Map data to model inputs.

In [5]:
# map data to input
def generate_input_sequence(rating, category, title, description):
    rating_sequence = '{} {}'.format(RATING_TOKEN, rating)
    category_sequence = '{} {}'.format(CATEGORY_TOKEN, category)
    title_sequence = '{} {}'.format(TITLE_TOKEN, title)
    description_sequence = '{} {}'.format(DESCRIPTION_TOKEN, description)
    return ' '.join([rating_sequence, category_sequence, title_sequence, description_sequence])

# tokenize input for encoder
def preprocess_encoder_input(sequence):
    tokenized_input = encoder_tokenizer.tokenize(sequence)
    encoded_input = encoder_tokenizer.convert_tokens_to_ids(tokenized_input)
    prepared_input = encoder_tokenizer.prepare_for_model(
        encoded_input,
        max_length=ENCODER_MAX_LENGTH,
        truncation=True,
        padding='max_length',
        return_attention_mask=True,
    )
    return prepared_input['input_ids'], prepared_input['attention_mask']

# tokenize input for decoder
def preprocess_decoder_input(sequence):
    tokenized_input = decoder_tokenizer.tokenize(sequence)
    encoded_input = decoder_tokenizer.convert_tokens_to_ids(tokenized_input)
    prepared_input = decoder_tokenizer.prepare_for_model(
        encoded_input,
        max_length=DECODER_MAX_LENGTH,
        truncation=True,
        padding='max_length',
        return_attention_mask=True,
    )
    return prepared_input['input_ids'], prepared_input['attention_mask']

model_inputs = []
for review in tqdm(data):
    product_asin = review['asin']
    review_rating = review['overall']
    review_text = review.get('reviewText')
    
    product = product_dataset.get(product_asin)
    
    # filter reviews with missing information
    has_text_review = review_text is not None
    product_exists = product is not None
    product_has_description = product_exists and product['description'] is not []
    
    if has_text_review and product_has_description:
        product_combined = generate_input_sequence(
            review_rating,
            product['category'],
            product['title'],
            product['description'],
        )
        
        input_ids, attention_mask = preprocess_encoder_input(product_combined)
        decoder_input_ids, decoder_attention_mask = preprocess_decoder_input(review_text)
        
        # mask pad tokens in label
        labels = []
        for index in decoder_input_ids:
            label = index
            if index is decoder_tokenizer.pad_token_id:
                label = LABEL_MASK_TOKEN_ID
            labels.append(label)
    
        model_inputs.append({
            'input_ids': input_ids,
            'attention_mask': attention_mask,
            'decoder_input_ids': decoder_input_ids,
            'decoder_attention_mask': decoder_attention_mask,
            'labels': labels,
        })

HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=147194.0), HTML(value='')))




Split into training and testing inputs. Initialize datasets.

In [6]:
!pip install datasets==1.0.2
!pip install transformers

Defaulting to user installation because normal site-packages is not writeable
Defaulting to user installation because normal site-packages is not writeable


In [7]:
from datasets import Dataset
import pandas as pd

data_frame = pd.DataFrame.from_records(model_inputs)
dataset = Dataset.from_pandas(data_frame).select(range(SAMPLE_SIZE))
dataset.set_format(
    type="torch", columns=["input_ids", "attention_mask", "decoder_input_ids", "decoder_attention_mask", "labels"],
)
dataset_dict = dataset.train_test_split()
train_dataset = dataset_dict['train']
test_dataset = dataset_dict['test']

Install seq2seq trainer prerequisites

In [8]:
!pip install rouge_score

!rm seq2seq_trainer.py
!rm seq2seq_training_args.py
!wget https://raw.githubusercontent.com/huggingface/transformers/v3.5.1/examples/seq2seq/seq2seq_trainer.py
!wget https://raw.githubusercontent.com/huggingface/transformers/v3.5.1/examples/seq2seq/seq2seq_training_args.py

Defaulting to user installation because normal site-packages is not writeable
--2020-11-16 07:43:46--  https://raw.githubusercontent.com/huggingface/transformers/master/examples/seq2seq/seq2seq_trainer.py
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 151.101.192.133, 151.101.0.133, 151.101.128.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|151.101.192.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 9810 (9.6K) [text/plain]
Saving to: ‘seq2seq_trainer.py’


2020-11-16 07:43:46 (48.0 MB/s) - ‘seq2seq_trainer.py’ saved [9810/9810]

--2020-11-16 07:43:47--  https://raw.githubusercontent.com/huggingface/transformers/master/examples/seq2seq/seq2seq_training_args.py
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 151.101.64.133, 151.101.128.133, 151.101.192.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|151.101.64.133|:443... connected.
HTTP request sent, awaiting 

Train model.

In [9]:
from datasets import load_metric

rouge = load_metric("rouge")

def compute_metrics(outputs):
    predictions_ids = outputs.predictions
    labels_ids = outputs.label_ids

    predictions = decoder_tokenizer.batch_decode(predictions_ids, skip_special_tokens=True)
    labels_ids[labels_ids == LABEL_MASK_TOKEN_ID] = decoder_tokenizer.eos_token_id
    labels = decoder_tokenizer.batch_decode(labels_ids, skip_special_tokens=True)

    rouge_output = rouge.compute(
        predictions=predictions,
        references=labels,
        rouge_types=["rouge2"]
    )["rouge2"].mid

    return {
        "rouge2_precision": round(rouge_output.precision, 4),
        "rouge2_recall": round(rouge_output.recall, 4),
        "rouge2_fmeasure": round(rouge_output.fmeasure, 4),
    }

In [10]:
from seq2seq_trainer import Seq2SeqTrainer
from seq2seq_training_args import Seq2SeqTrainingArguments

BATCH_SIZE = 4

training_args = Seq2SeqTrainingArguments(
    predict_with_generate=True,
    output_dir=os.path.join(MODEL_TEMP_PATH, 'results'),
    overwrite_output_dir=True,
    #save_steps=10,
    save_steps=500,
    evaluation_strategy='steps',
    #eval_steps=4,
    eval_steps=7500,
    logging_dir=os.path.join(MODEL_TEMP_PATH, 'runs'),
    #logging_steps=2,
    logging_steps=1000,
    per_device_train_batch_size=BATCH_SIZE,
    per_device_eval_batch_size=BATCH_SIZE,
    fp16=True,
    warmup_steps=2000,
    save_total_limit=3,
)

trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    compute_metrics=compute_metrics,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
)
trainer.train()

  return torch.tensor(x, **format_kwargs)


Step,Training Loss,Validation Loss,Rouge2 Precision,Rouge2 Recall,Rouge2 Fmeasure
7500,3.127674,3.061293,0.0123,0.0775,0.0191


KeyboardInterrupt: 

Save model to disk.

In [11]:
model.save_pretrained(MODEL_SAVE_PATH)

Load model from disk

In [12]:
model = EncoderDecoderModel.from_pretrained(MODEL_SAVE_PATH)

Predict from input

In [13]:
import torch

product = list(product_dataset.values())[0]
product_category = product['category']
product_title = product['title']
product_description = product['description']
product_combined = generate_input_sequence(5, product_category, product_title, product_description)

input_ids, _ = preprocess_encoder_input(product_combined)
input_ids = torch.tensor(input_ids).unsqueeze(0)

output_ids = model.generate(
    input_ids,
    decoder_start_token_id=model.config.decoder.pad_token_id,
    temperature=1.3,
    top_k=9,
    top_p=0.9,
    repetition_penalty=1.4
)

In [14]:
print('CATEGORY:', product_category)
print('TITLE:', product_title)
print('DESCRIPTION:', product_description)

print('PREPROCESSED_INPUT:', encoder_tokenizer.decode(input_ids.squeeze(0)))
print('OUTPUT:', decoder_tokenizer.decode(output_ids.squeeze(0)))

CATEGORY: Gift Cards Gift Cards
TITLE: Serendipity 3 $100.00 Gift Card
DESCRIPTION: Gift card for the purchase of goods or services at Serendipity 3 in New York City only. Not valid for online purchases. Statements regarding dietary supplements have not been evaluated by the FDA and are not intended to diagnose, treat, cure, or prevent any disease or health condition.
PREPROCESSED_INPUT: [CLS] [RAT] 5 [CAT] gift cards gift cards [TTL] serendipity 3 $ 100. 00 gift card [DES] gift card for the purchase of goods or services at serendipity 3 in new york city only. not valid for online purchases. statements regarding dietary supplements have not been evaluated by the fda and are not intended to diagnose, treat, cure, or prevent any disease or health condition. [SEP] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] 