In [None]:
import torch
# Ask PyTorch to release cached, currently unused GPU memory before the rest of the
# notebook runs (helps when the cells are re-executed in an already-used kernel).
torch.cuda.empty_cache()

In [None]:
#PREPARE DATASET

import pandas as pd
import numpy as np

#MOBILE DATA

# Columns pulled from the raw Mobile Electronics reviews file.
selected_cols = [
    'product_id',
    'star_rating',
    'helpful_votes',
    'total_votes',
    'review_headline',
    'review_body',
]

# Load the TSV; malformed lines are skipped rather than aborting the read.
df = pd.read_csv(
    'amazon_reviews_us_Mobile_Electronics_v1_00.tsv',
    sep='\t',
    on_bad_lines='skip',
    usecols=selected_cols,
)

# Keep reviews that a majority of voters found helpful and that received more
# than one helpful vote, then take the first 800 of them.
df = (
    df[(df.helpful_votes > df.total_votes / 2) & (df.helpful_votes > 1)]
    .iloc[:800]
    .reset_index(drop=True)
)

# Review bodies as plain strings; rows without a body are dropped.
pred_texts = df['review_body'].dropna().astype('str').tolist()

# The frame itself is no longer needed once the texts are extracted.
del df

In [None]:
### AUTOMOTIVE DATA ###
import pandas as pd
import numpy as np

# Columns pulled from the raw Automotive reviews file.
selected_cols = [
    'product_id',
    'star_rating',
    'helpful_votes',
    'total_votes',
    'review_body',
]

# Load the TSV; malformed lines are skipped rather than aborting the read.
df2 = pd.read_csv(
    'amazon_reviews_us_Automotive_v1_00.tsv',
    sep='\t',
    on_bad_lines='skip',
    usecols=selected_cols,
)

# Keep reviews that a majority of voters found helpful and that received more
# than one helpful vote, then take the first 700 of them.
df2 = (
    df2[(df2.helpful_votes > df2.total_votes / 2) & (df2.helpful_votes > 1)]
    .iloc[:700]
    .reset_index(drop=True)
)

# Preprocess: review bodies as plain strings; rows without a body are dropped.
pred_texts2 = df2['review_body'].dropna().astype('str').tolist()

# The frame itself is no longer needed once the texts are extracted.
del df2

In [None]:
from datasets import Dataset,load_dataset, load_metric
from transformers import AutoTokenizer,DataCollatorForSeq2Seq
from transformers import AutoModelForSeq2SeqLM, Seq2SeqTrainingArguments, Seq2SeqTrainer
import evaluate

# Tokenize the reviews  - called inside fine_tune()
def preprocess_function(batch, max_input_length=512, max_target_length=95):
    """Tokenize a batch of reviews and their target summaries for seq2seq training.

    Uses the module-level ``tokenizer``, which must be defined before this
    function is called.

    Args:
        batch: Mapping with a "review" entry (source texts) and a "target"
            entry (reference summaries), e.g. the batches produced by
            ``Dataset.map(..., batched=True)``.
        max_input_length: Token limit for each review; longer reviews are
            truncated. Defaults to 512.
        max_target_length: Token limit for each summary; longer summaries are
            truncated. Defaults to 95.

    Returns:
        The tokenizer output for the reviews, with an additional "labels"
        entry holding the token ids of the tokenized summaries.
    """
    model_inputs = tokenizer(
        list(batch["review"]),
        max_length=max_input_length,
        truncation=True,
    )

    # `text_target` makes the tokenizer encode the summaries as decoder targets.
    labels = tokenizer(
        text_target=batch["target"],
        max_length=max_target_length,
        truncation=True,
    )

    model_inputs["labels"] = labels["input_ids"]
    return model_inputs

In [None]:
#Function to calculate metrics - called inside fine_tune()
def compute_metrics(eval_pred):
    """Compute ROUGE scores and the mean generated length for an evaluation run.

    Uses the module-level ``tokenizer`` and ``rouge`` objects, which must be
    defined before this function is called.

    Args:
        eval_pred: Pair ``(predictions, labels)`` of token-id arrays. The
            predictions may also arrive wrapped in a tuple, in which case the
            first element is used.

    Returns:
        Dict with the values returned by ``rouge.compute`` plus "gen_len"
        (mean number of non-padding tokens per prediction), all rounded to
        four decimals.
    """
    predictions, labels = eval_pred
    if isinstance(predictions, tuple):
        predictions = predictions[0]

    pad_id = tokenizer.pad_token_id

    # -100 is a masking value, not a vocabulary id, so it must never reach the
    # tokenizer. Predictions are cleaned the same way as the labels because they
    # can contain -100 padding too.
    predictions = np.where(predictions != -100, predictions, pad_id)
    labels = np.where(labels != -100, labels, pad_id)

    decoded_preds = tokenizer.batch_decode(predictions, skip_special_tokens=True)
    decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)

    result = rouge.compute(predictions=decoded_preds, references=decoded_labels, use_stemmer=True)

    # Count real tokens only; measured on the sanitized ids so that -100 padding
    # is not mistaken for generated content.
    prediction_lens = [np.count_nonzero(pred != pad_id) for pred in predictions]
    result["gen_len"] = np.mean(prediction_lens)

    return {k: round(v, 4) for k, v in result.items()}

In [None]:
from huggingface_hub import notebook_login

# The model is fine-tuned with the Trainer API from Hugging Face.
# The target summaries are loaded from "Mobile_Electronics_summaries.txt", which was created by the "Create_Summaries_Openai" function.

# Two such text files were created, corresponding to the two dataframes above.

# #THIS IS REQUIRED TO UPLOAD MODEL TO HUGGING FACE
# notebook_login()


# Build the (target, review) training frame: the summaries file is read into a
# single "target" column and paired positionally with the first 500 mobile reviews.
# NOTE(review): this assumes the file holds exactly 500 summaries in the same
# order as `pred_texts` -- confirm against the summary-generation step.
training = pd.read_csv('Mobile_Electronics_summaries.txt', delimiter='\t', header=None, names=["target"], encoding='unicode_escape')
training["review"] = pred_texts[:500]

# Convert the dataframe to a Hugging Face Dataset
dataset = Dataset.from_pandas(training)

# Hold out 10% for evaluation. The fixed seed keeps the split, and therefore the
# reported metrics, reproducible across kernel restarts.
dataset = dataset.train_test_split(test_size=0.1, seed=42)

# Load the tokenizer and the model (once)
checkpoint = "facebook/bart-large-cnn"
tokenizer = AutoTokenizer.from_pretrained(checkpoint)
model = AutoModelForSeq2SeqLM.from_pretrained(checkpoint)

# Tokenize reviews and summaries batch-wise
tokenized_dataset = dataset.map(preprocess_function, batched=True)

# Collation: the collator expects the model object itself (not the checkpoint
# name) so that it can use the model to prepare the decoder inputs from the labels.
data_collator = DataCollatorForSeq2Seq(tokenizer=tokenizer, model=model)

# Define metric
rouge = evaluate.load("rouge")


In [None]:
# Define the training configuration, build the trainer and fine-tune the model.
training_args = Seq2SeqTrainingArguments(
    output_dir="Summarization-Product-Reviews",
    evaluation_strategy="epoch",  # evaluate after every epoch
    learning_rate=2.3e-5,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    weight_decay=0.008,
    # NOTE(review): save_strategy is left at its default -- confirm that checkpoints
    # are actually written for a run this short, otherwise this limit has no effect.
    save_total_limit=3,
    num_train_epochs=3,
    # Evaluate on generated summaries so compute_metrics can decode them for ROUGE.
    predict_with_generate=True,
    # Mixed precision needs a CUDA device; requesting it unconditionally makes
    # this cell fail on a machine without a GPU.
    fp16=torch.cuda.is_available(),
    # Uploading requires being logged in to the Hugging Face Hub
    # (see the commented-out notebook_login() call).
    push_to_hub=True,
    optim="adamw_torch",
)

trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset["train"],
    eval_dataset=tokenized_dataset["test"],
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
)

trainer.train()
# trainer.push_to_hub()