In [None]:
#from google.colab import drive
#drive.mount('/content/drive/')
#!pip install transformers

In [None]:
#DATA_URL = 'http://deepyeti.ucsd.edu/jianmo/amazon/categoryFiles/Electronics.json.gz'
#METADATA_URL = 'http://deepyeti.ucsd.edu/jianmo/amazon/metaFiles/meta_Electronics.json.gz'
DATA_URL = 'http://deepyeti.ucsd.edu/jianmo/amazon/categoryFiles/Gift_Cards.json.gz'
METADATA_URL = 'http://deepyeti.ucsd.edu/jianmo/amazon/metaFiles/meta_Gift_Cards.json.gz'
SAMPLE_SIZE = 60000

MODEL_SAVE_PATH = '/data/user/jprob/bert2gpt'
MODEL_TEMP_PATH = '/data/user/jprob/temp'

!pip install datasets==1.0.2
!pip install transformers
!pip install rouge_score
!rm seq2seq_trainer.py
!rm seq2seq_training_args.py
!wget https://raw.githubusercontent.com/huggingface/transformers/v3.5.1/examples/seq2seq/seq2seq_trainer.py
!wget https://raw.githubusercontent.com/huggingface/transformers/v3.5.1/examples/seq2seq/seq2seq_training_args.py

In [None]:
import json
import os
import gzip
from urllib.request import urlopen
from tqdm.auto import tqdm
import re
from transformers import BertTokenizer, GPT2Tokenizer, EncoderDecoderModel
from datasets import Dataset
import pandas as pd
from datasets import load_metric
from seq2seq_trainer import Seq2SeqTrainer
from seq2seq_training_args import Seq2SeqTrainingArguments

# Create each cache directory independently. With one shared try/except an
# already existing "training_data" raised before "training_metadata" was
# attempted, leaving the second directory missing.
for cache_dir in ("training_data", "training_metadata"):
    os.makedirs(cache_dir, exist_ok=True)

# Number of records stored in each cached batch file (used as the chunk size
# when the downloaded records are split up).
# n = int(input("How many batches? Enter int: "))
n = 20

def get_data(url):
    """Download a gzip-compressed JSON-lines resource and parse it.

    Args:
        url: Location of the ``.json.gz`` resource; opened with ``urlopen``.

    Returns:
        A list with one decoded JSON value per non-empty line.
    """
    records = []
    # Both handles are context-managed: the gzip wrapper does not close a file
    # object it was handed, so the response has to be closed on its own.
    with urlopen(url) as response, gzip.open(response) as f:
        for line in f:
            line = line.strip()
            if line:  # skip blank lines instead of failing in json.loads
                records.append(json.loads(line))
    return records

# Ask whether the raw dumps still have to be downloaded and cached on disk.
u_input = input("First time? [y] for yes, anything else for no\n")

if u_input.lower() == "y":
    data = get_data(DATA_URL)
    metadata = get_data(METADATA_URL)

    # Cut both record lists into consecutive chunks of n entries.
    # https://stackoverflow.com/questions/312443/how-do-you-split-a-list-into-evenly-sized-chunks
    data_batches = [data[start:start + n] for start in range(0, len(data), n)]
    metadata_batches = [metadata[start:start + n] for start in range(0, len(metadata), n)]

    # One file per chunk, named after the chunk's position; the chunk is
    # stored as a single JSON array.
    for batch_number, batch in enumerate(data_batches):
        with open(f"training_data/{batch_number}.txt", "w") as batch_file:
            json.dump(batch, batch_file)
    for batch_number, batch in enumerate(metadata_batches):
        with open(f"training_metadata/{batch_number}.txt", "w") as batch_file:
            json.dump(batch, batch_file)



In [None]:
td_listdir = os.listdir("training_data")
md_listdir = os.listdir("training_metadata")

# Files consumed per training round: one twentieth of the cached data files,
# but never 0, otherwise a directory with fewer than 20 files makes no progress.
stop_gap = max(1, len(td_listdir)//20)

# Names of the files already consumed, tracked separately per directory.
used_td = []
used_md = []


def _load_next_files(directory, filenames, used, limit):
    """Read up to ``limit`` not-yet-used files and return their records.

    Args:
        directory: Folder that holds the cached batch files.
        filenames: Candidate file names inside ``directory``.
        used: Names that were consumed before; extended in place.
        limit: Maximum number of files to read in this call.

    Returns:
        A flat list with the records of every file that was read.
    """
    records = []
    loaded = 0
    for name in filenames:
        if loaded >= limit:
            break
        if name in used:
            continue
        with open(os.path.join(directory, name), "r") as f:
            for line in f:
                line = line.strip()
                if not line:
                    continue
                parsed = json.loads(line)
                # A cached file stores a whole batch as one JSON array per
                # line; flatten it so callers get individual records.
                if isinstance(parsed, list):
                    records.extend(parsed)
                else:
                    records.append(parsed)
        used.append(name)
        loaded += 1
    return records


continue_question = "y"
while continue_question == "y":

    # NOTE(review): review files and metadata files are consumed independently,
    # so a review whose product sits in a metadata file that was not loaded
    # this round finds no product entry further down - confirm this is intended.
    data = _load_next_files("training_data", td_listdir, used_td, stop_gap)
    # Metadata comes from its own directory listing and bookkeeping list; the
    # previous code walked td_listdir/used_td here as well.
    metadata = _load_next_files("training_metadata", md_listdir, used_md, stop_gap)
            
    
    
    # Strip everything that looks like an HTML tag from a string
    def remove_html_tags(text):
        """Return ``text`` with every ``<...>`` span removed (non-greedy match)."""
        tag_pattern = re.compile(r'<.*?>')
        return tag_pattern.sub('', text)

    # Index the loaded product metadata by ASIN so reviews can look up their
    # product. Each entry keeps the category, title and tag-free description.
    product_dataset = {}
    for product in metadata:
        asin = product.get('asin')
        if asin is None:
            # Without an ASIN the record cannot be matched to any review.
            continue

        # Fall back to empty values when a field is absent (or None) so that a
        # single incomplete record does not abort the whole round, mirroring
        # the tolerant `.get('reviewText')` lookup used for reviews.
        category = ' '.join(product.get('category') or [])
        title = product.get('title') or ''
        description = ' '.join(product.get('description') or [])

        # Remove HTML tags
        filtered_description = remove_html_tags(description)

        product_dataset[asin] = {
            'category': category,
            'title': title,
            'description': filtered_description
        }
    # Marker tokens that introduce each field of the encoder input text
    RATING_TOKEN, CATEGORY_TOKEN = '[RAT]', '[CAT]'
    TITLE_TOKEN, DESCRIPTION_TOKEN = '[TTL]', '[DES]'
    # Value written into the labels where a position is masked
    LABEL_MASK_TOKEN_ID = -100

    # Lengths that encoder / decoder sequences are padded or truncated to
    ENCODER_MAX_LENGTH, DECODER_MAX_LENGTH = 512, 128

    # The four field markers are registered as additional special tokens
    field_marker_tokens = [RATING_TOKEN, CATEGORY_TOKEN, TITLE_TOKEN, DESCRIPTION_TOKEN]
    encoder_tokenizer = BertTokenizer(
        'vocab.txt', #'/content/drive/My Drive/Colab-Notebooks/vocab.txt'
        additional_special_tokens=field_marker_tokens,
    )
    # Expose the tokenizer's cls/sep tokens under the bos/eos names as well
    encoder_tokenizer.bos_token = encoder_tokenizer.cls_token
    encoder_tokenizer.eos_token = encoder_tokenizer.sep_token

    def build_inputs_with_special_tokens(self, token_ids_0, token_ids_1=None):
        """Wrap ``token_ids_0`` in the tokenizer's bos and eos ids.

        ``token_ids_1`` is accepted for signature compatibility and ignored.
        """
        leading = [self.bos_token_id]
        trailing = [self.eos_token_id]
        return leading + token_ids_0 + trailing

    # Patch the GPT-2 tokenizer class so prepared sequences are wrapped in
    # bos/eos ids, then load the tokenizer and use its unknown token for padding.
    GPT2Tokenizer.build_inputs_with_special_tokens = build_inputs_with_special_tokens
    decoder_tokenizer = GPT2Tokenizer.from_pretrained('gpt2')
    decoder_tokenizer.pad_token = decoder_tokenizer.unk_token

    # NOTE(review): the model is rebuilt from the pretrained checkpoints on
    # every pass of the surrounding loop, so weights trained in an earlier
    # round are not carried over - confirm whether that is intended.
    model = EncoderDecoderModel.from_encoder_decoder_pretrained('bert-base-cased', 'gpt2')
    # Generated tokens come from the decoder, so the top-level vocabulary size
    # has to be the decoder's (beam search uses it to split flattened scores).
    model.config.vocab_size = model.config.decoder.vocab_size
    # NOTE(review): encoder_tokenizer registers four additional special tokens;
    # if vocab.txt does not already contain them the encoder embeddings
    # presumably need resizing - TODO confirm.

    model.config.decoder_start_token_id = decoder_tokenizer.bos_token_id
    model.config.eos_token_id = decoder_tokenizer.eos_token_id
    model.config.pad_token_id = decoder_tokenizer.eos_token_id

    # Generation defaults are read from model.config; the last three used to be
    # assigned to the model object itself, where generate() never looks.
    model.config.max_length = 142
    model.config.min_length = 56
    model.config.no_repeat_ngram_size = 3
    model.config.early_stopping = True
    model.config.length_penalty = 2.0
    model.config.num_beams = 4
    # map data to input
    def generate_input_sequence(rating, category, title, description):
        """Build the encoder text: each field prefixed by its marker token.

        The four "<marker> <value>" parts are joined with single spaces in the
        order rating, category, title, description.
        """
        tagged_fields = (
            (RATING_TOKEN, rating),
            (CATEGORY_TOKEN, category),
            (TITLE_TOKEN, title),
            (DESCRIPTION_TOKEN, description),
        )
        return ' '.join('{} {}'.format(marker, value) for marker, value in tagged_fields)

    # tokenize input for encoder
    def preprocess_encoder_input(sequence):
        """Convert a text sequence into fixed-length encoder inputs.

        Args:
            sequence: Text to feed to the encoder tokenizer.

        Returns:
            Tuple ``(input_ids, attention_mask)`` taken from the tokenizer's
            prepared output.
        """
        token_ids = encoder_tokenizer.convert_tokens_to_ids(
            encoder_tokenizer.tokenize(sequence)
        )
        # Pad/truncate to ENCODER_MAX_LENGTH and ask for the attention mask
        model_ready = encoder_tokenizer.prepare_for_model(
            token_ids,
            return_attention_mask=True,
            padding='max_length',
            truncation=True,
            max_length=ENCODER_MAX_LENGTH,
        )
        return tuple(model_ready[key] for key in ('input_ids', 'attention_mask'))

    # tokenize input for decoder
    def preprocess_decoder_input(sequence):
        """Convert a text sequence into fixed-length decoder inputs.

        Args:
            sequence: Text to feed to the decoder tokenizer.

        Returns:
            Tuple ``(input_ids, attention_mask)`` taken from the tokenizer's
            prepared output.
        """
        token_ids = decoder_tokenizer.convert_tokens_to_ids(
            decoder_tokenizer.tokenize(sequence)
        )
        # Pad/truncate to DECODER_MAX_LENGTH and ask for the attention mask
        model_ready = decoder_tokenizer.prepare_for_model(
            token_ids,
            return_attention_mask=True,
            padding='max_length',
            truncation=True,
            max_length=DECODER_MAX_LENGTH,
        )
        return tuple(model_ready[key] for key in ('input_ids', 'attention_mask'))

    # Turn every usable review into one training example: the encoder sees the
    # rating plus product fields, the decoder/labels hold the review text.
    model_inputs = []
    for review in tqdm(data):
        product_asin = review['asin']
        review_rating = review['overall']
        review_text = review.get('reviewText')

        product = product_dataset.get(product_asin)

        # filter reviews with missing information
        has_text_review = review_text is not None
        product_exists = product is not None
        # The stored description is a string; test that it has content.
        # (`is not []` compared identity with a fresh list and was always True.)
        product_has_description = product_exists and bool(product['description'].strip())

        if has_text_review and product_has_description:
            product_combined = generate_input_sequence(
                review_rating,
                product['category'],
                product['title'],
                product['description'],
            )

            input_ids, attention_mask = preprocess_encoder_input(product_combined)
            decoder_input_ids, decoder_attention_mask = preprocess_decoder_input(review_text)

            # mask pad tokens in label
            # Padding is identified through the attention mask. `is` on ints
            # tests object identity rather than value, and the decoder's pad
            # token is its unknown token, so matching on the id could also hit
            # real tokens that share that id.
            labels = [
                token_id if mask == 1 else LABEL_MASK_TOKEN_ID
                for token_id, mask in zip(decoder_input_ids, decoder_attention_mask)
            ]

            model_inputs.append({
                'input_ids': input_ids,
                'attention_mask': attention_mask,
                'decoder_input_ids': decoder_input_ids,
                'decoder_attention_mask': decoder_attention_mask,
                'labels': labels,
            })


    # Build the dataset from the collected examples and split it for
    # training / evaluation.
    data_frame = pd.DataFrame.from_records(model_inputs)
    # Keep at most SAMPLE_SIZE rows; selecting range(SAMPLE_SIZE) directly
    # indexes past the end whenever a round yields fewer examples.
    sample_count = min(SAMPLE_SIZE, len(data_frame))
    dataset = Dataset.from_pandas(data_frame).select(range(sample_count))
    dataset.set_format(
        type="torch", columns=["input_ids", "attention_mask", "decoder_input_ids", "decoder_attention_mask", "labels"],
    )
    dataset_dict = dataset.train_test_split()
    train_dataset = dataset_dict['train']
    test_dataset = dataset_dict['test']
    rouge = load_metric("rouge")

    def compute_metrics(outputs):
        """Score generated sequences against the references with ROUGE-2.

        Args:
            outputs: Object exposing ``predictions`` and ``label_ids`` arrays
                of token ids. ``label_ids`` is modified in place.

        Returns:
            Dict with the mid ROUGE-2 precision, recall and F-measure, each
            rounded to four decimals.
        """
        reference_ids = outputs.label_ids

        decoded_predictions = decoder_tokenizer.batch_decode(
            outputs.predictions, skip_special_tokens=True
        )

        # Masked label positions are overwritten with the eos id before decoding
        masked_positions = reference_ids == LABEL_MASK_TOKEN_ID
        reference_ids[masked_positions] = decoder_tokenizer.eos_token_id
        decoded_references = decoder_tokenizer.batch_decode(
            reference_ids, skip_special_tokens=True
        )

        all_scores = rouge.compute(
            predictions=decoded_predictions,
            references=decoded_references,
            rouge_types=["rouge2"],
        )
        mid_score = all_scores["rouge2"].mid

        precision = round(mid_score.precision, 4)
        recall = round(mid_score.recall, 4)
        fmeasure = round(mid_score.fmeasure, 4)
        return {
            "rouge2_precision": precision,
            "rouge2_recall": recall,
            "rouge2_fmeasure": fmeasure,
        }


    # Per-device batch size, shared by training and evaluation
    BATCH_SIZE = 4

    # Trainer output and log locations both live below MODEL_TEMP_PATH
    results_dir = os.path.join(MODEL_TEMP_PATH, 'results')
    runs_dir = os.path.join(MODEL_TEMP_PATH, 'runs')

    # Step intervals in use: save 500 / eval 7500 / logging 1000
    # (much smaller values tried before: save 10 / eval 4 / logging 2).
    training_args = Seq2SeqTrainingArguments(
        # where things are written
        output_dir=results_dir,
        overwrite_output_dir=True,
        logging_dir=runs_dir,
        # checkpointing, evaluation and logging cadence
        save_steps=500,
        save_total_limit=3,
        evaluation_strategy='steps',
        eval_steps=7500,
        logging_steps=1000,
        predict_with_generate=True,
        # optimisation settings
        per_device_train_batch_size=BATCH_SIZE,
        per_device_eval_batch_size=BATCH_SIZE,
        warmup_steps=2000,
        fp16=True,
    )

    # Train on this round's split, then persist the model and load it back
    # from MODEL_SAVE_PATH.
    trainer = Seq2SeqTrainer(
        model=model,
        args=training_args,
        compute_metrics=compute_metrics,
        train_dataset=train_dataset,
        eval_dataset=test_dataset,
    )
    trainer.train()
    model.save_pretrained(MODEL_SAVE_PATH)
    model = EncoderDecoderModel.from_pretrained(MODEL_SAVE_PATH)

    if len(used_td) >= len(td_listdir):
        # Every cached data file has been consumed; another round would have
        # nothing to load, so leave the loop instead of asking.
        print("All cached training files have been used.")
        continue_question = "n"
    else:
        # Accept the answer case-insensitively, as the "First time?" prompt
        # does, so that "Y" also continues.
        continue_question = input(
            "Continue and train on more? [y] to train on more, anything else to get output\n"
        ).strip().lower()

In [None]:
import torch

# Generate one sample review (rating 5) for the first product of the metadata
# that was loaded in the last training round.
product = next(iter(product_dataset.values()))
product_category = product['category']
product_title = product['title']
product_description = product['description']
product_combined = generate_input_sequence(5, product_category, product_title, product_description)

# Keep the attention mask: the encoder input is padded to a fixed length and
# the padded positions should not be attended to.
input_ids, encoder_attention_mask = preprocess_encoder_input(product_combined)
input_ids = torch.tensor(input_ids).unsqueeze(0)
encoder_attention_mask = torch.tensor(encoder_attention_mask).unsqueeze(0)

output_ids = model.generate(
    input_ids,
    attention_mask=encoder_attention_mask,
    # Use the start token configured on the model during setup;
    # model.config.decoder.pad_token_id is never set in this notebook.
    decoder_start_token_id=model.config.decoder_start_token_id,
    # temperature / top_k / top_p only take effect when sampling is enabled.
    do_sample=True,
    temperature=1.3,
    top_k=9,
    top_p=0.9,
    repetition_penalty=1.4
)

In [None]:
# Echo the product fields that were fed to the model
for label, value in (
    ('CATEGORY:', product_category),
    ('TITLE:', product_title),
    ('DESCRIPTION:', product_description),
):
    print(label, value)

# Decode the encoder input and the generated ids back to text
preprocessed_text = encoder_tokenizer.decode(input_ids.squeeze(0))
print('PREPROCESSED_INPUT:', preprocessed_text)
generated_text = decoder_tokenizer.decode(output_ids.squeeze(0))
print('OUTPUT:', generated_text)