In [None]:
# Mount Google Drive: the dataset files and the output directory used later in
# this notebook all live under /content/drive/MyDrive.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
!pip uninstall -y transformers accelerate evaluate rouge_score
!pip install transformers accelerate evaluate rouge_score
!pip install sacrebleu
!pip install meteor


In [None]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import os

import evaluate
import tensorflow as tf
import matplotlib.pyplot as plt

import torch
from torch.utils.data import Dataset, DataLoader


from transformers import LineByLineTextDataset
from transformers import DataCollatorForLanguageModeling
from transformers import GPT2Tokenizer, GPT2LMHeadModel
from transformers import Trainer, TrainingArguments

In [None]:
# --- Data and checkpoint locations (Google Drive) ---------------------------
output_dir = '/content/drive/MyDrive/AESLC-master/'
train_file_path = "/content/drive/MyDrive/AESLC-master/EmailSubjectTrain.txt"
eval_file_path = "/content/drive/MyDrive/AESLC-master/EmailSubjectEval.txt"

# --- Model and training hyper-parameters ------------------------------------
model_name = 'gpt2'
num_train_epochs = 1
per_device_train_batch_size = 1
save_steps = 1000
overwrite_output_dir = False

# --- Evaluation metrics ------------------------------------------------------
rouge = evaluate.load('rouge')
sacrebleu = evaluate.load("sacrebleu")
#meteor = evaluate.load('meteor')

In [None]:
# Emails whose body has more whitespace-separated words than this are left out
# of the evaluation set.
_MAX_EMAIL_WORDS = 400


def _parse_eval_line(line):
    """Split one line of the evaluation file into its email and references.

    The line is expected to look like
    ``<email> Subject : <subject> ann0 : <a0> ann1 : <a1> ann2 : <a2>``.
    (ann0/ann1/ann2 are presumably extra annotator-written subjects - confirm
    against the dataset description.)

    Args:
        line: One raw line read from the evaluation file.

    Returns:
        A ``(email, references)`` tuple. ``email`` is the text before the first
        ``"Subject :"`` marker. ``references`` is the list
        ``[subject, ann0, ann1, ann2]``, or ``None`` when any of the markers is
        missing from the line.
    """
    fields = line.replace("\t", " ").replace("\n", " ").split("Subject :")
    email = fields[0]
    try:
        # Each split is done once and indexed, instead of re-splitting the
        # same string for every field.
        subject_parts = fields[1].split("ann0 :")
        ann0_parts = subject_parts[1].split("ann1 :")
        ann1_parts = ann0_parts[1].split("ann2 :")
        references = [subject_parts[0], ann0_parts[0], ann1_parts[0], ann1_parts[1]]
    except IndexError:
        # A marker was absent, so one of the splits produced a single element.
        return email, None
    return email, references


list_of_email = []
list_of_subject = []
list_of_ann0 = []
list_of_ann1 = []
list_of_ann2 = []
list_of_references = []
skipped_malformed_lines = 0

with open(eval_file_path, "r", encoding="utf-8") as file:
    for line in file:
        email, references = _parse_eval_line(line)
        if len(email.split()) > _MAX_EMAIL_WORDS:
            continue
        if references is None:
            # Only the expected "missing marker" failure is tolerated; any
            # other exception now propagates instead of being swallowed.
            skipped_malformed_lines += 1
            continue
        # Both lists are appended together so they stay index-aligned.
        list_of_email.append(email)
        list_of_references.append(references)

print(
    f"Loaded {len(list_of_email)} evaluation examples; "
    f"skipped {skipped_malformed_lines} malformed lines."
)





In [None]:
def load_dataset(file_path, tokenizer):
    """Build a line-by-line language-modelling dataset from a text file.

    Args:
        file_path: Path of the text file to read.
        tokenizer: Tokenizer handed to ``LineByLineTextDataset``.

    Returns:
        A ``LineByLineTextDataset`` created with ``block_size=512``.
    """
    return LineByLineTextDataset(tokenizer=tokenizer, file_path=file_path, block_size=512)


def load_data_collator(tokenizer, mlm = False):
    """Create the batch collator used for language-model training.

    Args:
        tokenizer: Tokenizer handed to ``DataCollatorForLanguageModeling``.
        mlm: Forwarded as the collator's ``mlm`` flag; defaults to ``False``.

    Returns:
        The configured ``DataCollatorForLanguageModeling`` instance.
    """
    return DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=mlm)


def postprocess_text(preds, labels):
    """Strip surrounding whitespace from predictions and labels.

    TODO: isolate only the subject part of each string. After post-processing,
    ``preds`` should hold just the generated subjects and ``labels`` just the
    reference subjects - no email text and no separators.

    Args:
        preds: Iterable of predicted strings.
        labels: Iterable of reference strings.

    Returns:
        ``(preds, labels)`` where ``preds`` is a list of stripped strings and
        ``labels`` is a list of one-element lists holding the stripped label.
    """
    cleaned_preds = []
    for text in preds:
        cleaned_preds.append(text.strip())

    wrapped_labels = []
    for text in labels:
        # Each label is wrapped in its own single-item list.
        wrapped_labels.append([text.strip()])

    return cleaned_preds, wrapped_labels


def preprocess_logits_for_metrics(logits, labels):
    """Reduce logits to arg-max ids before they are kept for metric computation.

    Workaround for a possible memory leak in the stock Trainer: only the index
    of the largest logit along the last dimension is retained, so fewer large
    tensors have to be stored.

    Args:
        logits: Tensor of model scores; the arg-max is taken over its last axis.
        labels: Returned unchanged.

    Returns:
        ``(pred_ids, labels)``.
    """
    return logits.argmax(dim=-1), labels


def compute_metrics(eval_preds):
    """Generate a subject for every evaluation email and score it.

    For each entry of the module-level ``list_of_email`` the prompt
    ``<email> + ' Subject : '`` is fed to the module-level ``model`` and up to
    five new tokens are sampled. The generated subjects are compared with
    ``list_of_references`` using ROUGE and SacreBLEU.

    Args:
        eval_preds: Ignored. It is accepted only so the function has the
            ``compute_metrics`` signature that ``Trainer`` expects.

    Returns:
        dict with the keys ``"R1"``, ``"R2"``, ``"RL"``, ``"RLsum"`` and
        ``"bleu"``, each rounded to four decimals.

    Note:
        Generation uses sampling (``do_sample=True``), so the scores vary
        between calls unless the random seed is fixed by the caller.
    """
    # Loop-invariant, so done once: place the model on the GPU when there is
    # one, otherwise keep working on whatever device the model already uses.
    if torch.cuda.is_available():
        model.to(torch.device("cuda:0"))
    device = model.device

    # Generate with dropout disabled, then restore the previous mode so a
    # training run that calls this function is not affected.
    was_training = model.training
    model.eval()

    list_of_prediction = []
    try:
        for email in list_of_email:
            inputs = tokenizer(email + ' Subject : ', return_tensors="pt")
            input_ids = inputs['input_ids'].to(device)
            attention_mask = inputs['attention_mask'].to(device)

            outputs = model.generate(
                input_ids,
                attention_mask=attention_mask,
                max_new_tokens=5,
                do_sample=True,
                top_k=30,
                top_p=0.95,
            )

            # The output holds the prompt followed by the continuation. Decode
            # only the continuation: splitting the decoded text on
            # 'Subject : ' returns the wrong span when the email itself
            # contains that marker.
            generated_ids = outputs[0][input_ids.shape[1]:]
            prediction = tokenizer.decode(generated_ids, skip_special_tokens=True)
            list_of_prediction.append(prediction)
    finally:
        model.train(was_training)

    result = rouge.compute(predictions=list_of_prediction, references=list_of_references)
    results_sacrebleu = sacrebleu.compute(predictions=list_of_prediction, references=list_of_references, lowercase = True)
    #results_meteor = meteor.compute(predictions=decoded_preds, references=decoded_labels)
    return {
        "R1": round(result["rouge1"], 4),
        "R2": round(result["rouge2"], 4),
        "RL": round(result["rougeL"], 4),
        "RLsum": round(result["rougeLsum"], 4),
        "bleu": round(results_sacrebleu["score"], 4)
    }


# Load tokenizer and model from the same checkpoint name so that changing
# `model_name` in the configuration cell cannot leave the two out of sync.
tokenizer = GPT2Tokenizer.from_pretrained(model_name)
# A '[PAD]' token is registered on the tokenizer here.
tokenizer.add_special_tokens({'pad_token': '[PAD]'})

model = GPT2LMHeadModel.from_pretrained(model_name)
# The embedding matrix must grow to cover the token added above.
model.resize_token_embeddings(len(tokenizer))
# Saves the resized, not-yet-fine-tuned model; trainer.save_model() at the end
# writes to the same output_dir again.
model.save_pretrained(output_dir)

train_dataset = load_dataset(train_file_path, tokenizer)
eval_dataset = load_dataset(eval_file_path, tokenizer)
data_collator = load_data_collator(tokenizer)

tokenizer.save_pretrained(output_dir)

# Evaluation during training is currently switched off; the commented-out
# arguments below are what would need to be restored to enable it.
training_args = TrainingArguments(
          output_dir=output_dir,
#           evaluation_strategy = "epoch",
#           eval_steps = 500,
          learning_rate=1e-5,
          save_strategy = "epoch",
          overwrite_output_dir=overwrite_output_dir,
          per_device_train_batch_size=per_device_train_batch_size,
          per_device_eval_batch_size=1,
          num_train_epochs=num_train_epochs
      )

trainer = Trainer(
          model=model,
          args=training_args,
          data_collator=data_collator,
          train_dataset=train_dataset,
#           eval_dataset=eval_dataset,
#           preprocess_logits_for_metrics=preprocess_logits_for_metrics,
#           compute_metrics=compute_metrics
)

trainer.train()
trainer.save_model()