In [None]:
!pip uninstall -y transformers accelerate evaluate rouge_score
!pip install transformers accelerate evaluate rouge_score
!pip install sacrebleu
!pip install -U ray
!pip install bert_score
!pip install wandb

In [None]:
# pip install wandb

In [None]:
# import wandb

# wandb.init(project="Conversation")
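# # (what appears to be a W&B API key was removed here; never store credentials in a notebook -- use `wandb login` or the WANDB_API_KEY environment variable)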

In [None]:
# Mount Google Drive; the dataset paths used below live under this mount point.
from google.colab import drive

DRIVE_MOUNT_POINT = '/content/drive'
drive.mount(DRIVE_MOUNT_POINT)

Mounted at /content/drive


In [None]:
import os

import evaluate
import numpy as np
import pandas as pd
import tensorflow as tf
import matplotlib.pyplot as plt

import torch
from torch.utils.data import Dataset, DataLoader

from transformers import LineByLineTextDataset
from transformers import DataCollatorForLanguageModeling
from transformers import GPT2Tokenizer, GPT2LMHeadModel
from transformers import Trainer, TrainingArguments

In [None]:
# Training split, stored as JSON Lines (one JSON object per line).
train_jsonl_path = '/content/drive/MyDrive/AESLC-master/conversation-qa/train.jsonl'
df_train = pd.read_json(train_jsonl_path, lines=True)
df_train.head()

In [None]:
# Development split, stored as JSON Lines (one JSON object per line).
dev_jsonl_path = '/content/drive/MyDrive/AESLC-master/conversation-qa/dev.jsonl'
df_dev = pd.read_json(dev_jsonl_path, lines=True)
df_dev.head()

In [None]:
# Test split, stored as JSON Lines (one JSON object per line).
test_jsonl_path = '/content/drive/MyDrive/AESLC-master/conversation-qa/test.jsonl'
df_test = pd.read_json(test_jsonl_path, lines=True)
df_test.head()

In [None]:
# Number of single-space-delimited pieces in every training prompt / response.
prompt_word_count_list = [len(text.split(" ")) for text in df_train['prompt']]

response_word_count_list = [len(text.split(" ")) for text in df_train['response']]

In [None]:
# Histogram of prompt lengths (in words) over the training split.
fig, ax = plt.subplots()
ax.hist(prompt_word_count_list, bins=10, color='red')
ax.set_title('Prompt word count')
ax.set_xlabel('Prompt')
ax.set_ylabel('Frequency')
plt.show()

In [None]:
# Histogram of response lengths (in words) over the training split.
fig, ax = plt.subplots()
ax.hist(response_word_count_list, bins=10, color='red')
ax.set_title('Response word count')
ax.set_xlabel('Response')
ax.set_ylabel('Frequency')
plt.show()

In [None]:
# Serialise the training pairs, one "Prompt: ... Response: ..." example per line.
# The encoding is pinned to UTF-8 so the file's bytes do not depend on the
# platform's default locale encoding.
with open('/content/train.txt', 'w', encoding='utf-8') as file:
    for prompt, response in zip(df_train['prompt'], df_train['response']):
      file.write("Prompt: " + prompt + " Response: " + response + "\n")

In [None]:
# Serialise the development pairs, one "Prompt: ... Response: ..." example per line.
# The encoding is pinned to UTF-8 so the file's bytes do not depend on the
# platform's default locale encoding.
with open('/content/dev.txt', 'w', encoding='utf-8') as file:
    for prompt, response in zip(df_dev['prompt'], df_dev['response']):
      file.write("Prompt: " + prompt + " Response: " + response + "\n")

In [None]:
# with open('/content/test.txt', 'w') as file:
#     for prompt, response in zip(df_test['prompt'], df_test['response']):
#       file.write("Prompt: " + prompt + " Response: " + response + "\n")

In [None]:
# --- Model and file locations -------------------------------------------------
# NOTE(review): the export cells above write train.txt / dev.txt under /content/,
# whereas these paths point at Google Drive -- confirm the files were copied there.
model_name = 'gpt2'
train_file_path = "/content/drive/MyDrive/convertational/train.txt"
eval_file_path = "/content/drive/MyDrive/convertational/dev.txt"
output_dir = '/content/output'
overwrite_output_dir = False

# --- Training hyper-parameters ------------------------------------------------
num_train_epochs = 10
per_device_train_batch_size = 8
save_steps = 10000

# --- Evaluation metrics, loaded through the `evaluate` library ----------------
rouge = evaluate.load('rouge')
sacrebleu = evaluate.load("sacrebleu")
bertscore = evaluate.load("bertscore")
#meteor = evaluate.load('meteor')

In [None]:
# Number of examples in the development split.
df_dev['prompt'].shape[0]

3750

In [None]:
import warnings

# Install a blanket "ignore" filter for every warning category ...
warnings.simplefilter("ignore")

# (code that may generate warnings would go here)

# ... then clear the whole filter list again. Net effect of this cell: no
# warning filter stays installed for the cells that follow.
warnings.resetwarnings()


In [None]:
def load_dataset(file_path, tokenizer, block_size=512):
    """Build a line-by-line language-modelling dataset from a text file.

    Args:
        file_path: Path of the text file handed to ``LineByLineTextDataset``.
        tokenizer: Tokenizer handed to ``LineByLineTextDataset``.
        block_size: Forwarded as ``block_size``. Defaults to 512, the value
            that used to be hard-coded, so existing two-argument calls behave
            exactly as before.

    Returns:
        The constructed ``LineByLineTextDataset``.
    """
    return LineByLineTextDataset(
        tokenizer=tokenizer,
        file_path=file_path,
        block_size=block_size,
    )


def load_data_collator(tokenizer, mlm = False):
    """Create the language-modelling collator for ``tokenizer``.

    ``mlm`` is forwarded unchanged to ``DataCollatorForLanguageModeling``;
    it defaults to ``False``.
    """
    return DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=mlm)


def postprocess_text(preds, labels):
    """Strip surrounding whitespace from predictions and references.

    Returns:
        A ``(preds, labels)`` tuple: ``preds`` is a flat list of stripped
        strings, and ``labels`` holds one single-element list per stripped
        reference.
    """
    # TODO (carried over): isolate only the subject part of each string, so that
    # preds contains just the generated subjects and labels just the reference
    # subjects -- no email body or separators should survive post-processing.
    stripped_preds = []
    for text in preds:
        stripped_preds.append(text.strip())

    wrapped_labels = []
    for text in labels:
        wrapped_labels.append([text.strip()])

    return stripped_preds, wrapped_labels


def preprocess_logits_for_metrics(logits, labels):
    """Collapse logits to arg-max ids over the last dimension.

    Intended as a workaround for the Trainer holding on to more tensors than
    the metrics need: only the arg-max indices are handed on, together with
    the untouched ``labels``.

    Returns:
        ``(pred_ids, labels)`` where ``pred_ids = argmax(logits, dim=-1)``.
    """
    return torch.argmax(logits, dim=-1), labels

def compute_metrics(eval_preds):
    """Generate a response for every dev prompt and score it against the reference.

    The ``eval_preds`` argument supplied by the Trainer is intentionally not
    used: instead of scoring teacher-forced arg-max ids, this function samples
    a fresh continuation for each prompt in the module-level ``df_dev`` using
    the module-level ``model`` and ``tokenizer``, then scores the generations
    with the module-level ``rouge``, ``sacrebleu`` and ``bertscore`` metrics.
    Because generation uses sampling, the scores vary from run to run.

    Args:
        eval_preds: Ignored; present only to satisfy the Trainer callback
            signature.

    Returns:
        A dict of rounded metrics. ``R1``/``R2``/``RL``/``RLsum`` are ROUGE
        scores and ``bleu`` is the SacreBLEU score. ``precision1``,
        ``precision2``, ``recall1``, ``recall2``, ``f1-score1`` and
        ``f1-score2`` are the BERTScore values of the first and second dev
        examples only (kept for backward compatibility; NaN if that example
        does not exist). ``bert_precision``, ``bert_recall`` and ``bert_f1``
        are the BERTScore means over all dev examples.
    """
    progress_every = 250

    # Fall back to CPU instead of crashing when no GPU is present, and move the
    # model once rather than on every loop iteration.
    device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
    model.to(device)

    # Plain lists avoid label-based indexing surprises with pandas Series.
    prompts = df_dev['prompt'].tolist()
    references = df_dev['response'].tolist()

    list_of_prediction = []
    for position, prompt in enumerate(prompts):
        if position % progress_every == 0:
            print(f"generated {position}/{len(prompts)}")
        list_of_prediction.append(_generate_response(prompt, device))

    result = rouge.compute(predictions=list_of_prediction, references=references)
    results_sacrebleu = sacrebleu.compute(predictions=list_of_prediction, references=references, lowercase = True)
    results_bert = bertscore.compute(predictions=list_of_prediction, references=references, lang="en")

    return {
        "R1": round(result["rouge1"], 4),
        "R2": round(result["rouge2"], 4),
        "RL": round(result["rougeL"], 4),
        "RLsum": round(result["rougeLsum"], 4),
        "bleu": round(results_sacrebleu["score"], 4),
        "precision1": _score_at(results_bert["precision"], 0),
        "precision2": _score_at(results_bert["precision"], 1),
        "recall1": _score_at(results_bert["recall"], 0),
        "recall2": _score_at(results_bert["recall"], 1),
        "f1-score1": _score_at(results_bert["f1"], 0),
        "f1-score2": _score_at(results_bert["f1"], 1),
        # Corpus-level aggregates: the per-example entries above only describe
        # the first two dev examples.
        "bert_precision": _mean_score(results_bert["precision"]),
        "bert_recall": _mean_score(results_bert["recall"]),
        "bert_f1": _mean_score(results_bert["f1"]),
    }


def _generate_response(prompt, device, max_new_tokens=15):
    """Sample one continuation for ``prompt`` and return only the generated text."""
    encoded = tokenizer("Prompt: " + prompt + ' Response: ', return_tensors="pt")
    input_ids = encoded['input_ids'].to(device)
    attention_mask = encoded['attention_mask'].to(device)

    outputs = model.generate(
        input_ids,
        attention_mask=attention_mask,
        max_new_tokens=max_new_tokens,
        do_sample=True,
        top_k=30,
        top_p=0.95,
        pad_token_id=tokenizer.pad_token_id,
    )

    # The returned sequence starts with the prompt tokens; slicing them off is
    # robust even when the prompt itself contains the text 'Response: ', which
    # a string split on that marker is not.
    new_tokens = outputs[0][input_ids.shape[1]:]
    return tokenizer.decode(new_tokens, skip_special_tokens=True)


def _score_at(values, index):
    """Return ``values[index]`` rounded to 4 places, or NaN when out of range."""
    if index >= len(values):
        return float("nan")
    return round(values[index], 4)


def _mean_score(values):
    """Return the mean of ``values`` rounded to 4 places, or NaN when empty."""
    values = list(values)
    if not values:
        return float("nan")
    return round(sum(values) / len(values), 4)


# End-to-end fine-tuning script: build the tokenizer and model, load the
# line-by-line datasets, configure the Trainer, train, and save.
# Statement order matters: the tokenizer is extended before the model's
# embeddings are resized, and compute_metrics reads the module-level
# `tokenizer` and `model` names defined here.

# GPT-2 tokenizer with an added '[PAD]' token, registered as the pad token.
tokenizer = GPT2Tokenizer.from_pretrained('gpt2')
tokenizer.add_special_tokens({'pad_token': '[PAD]'})
tokenizer.pad_token_id = tokenizer.convert_tokens_to_ids('[PAD]')

# Load the pretrained model and resize its token embeddings to len(tokenizer),
# which now counts the added '[PAD]' token.
model = GPT2LMHeadModel.from_pretrained(model_name)
model.resize_token_embeddings(len(tokenizer))
# Saved before any training has happened, so this snapshot is the resized
# pretrained model, not a fine-tuned one.
model.save_pretrained(output_dir)

train_dataset = load_dataset(train_file_path, tokenizer)
eval_dataset = load_dataset(eval_file_path, tokenizer)
# mlm is left at the helper's default (False).
data_collator = load_data_collator(tokenizer)

tokenizer.save_pretrained(output_dir)

# Evaluate and checkpoint once per epoch.
# NOTE(review): newer transformers releases rename `evaluation_strategy` to
# `eval_strategy` -- confirm against the installed version, since the install
# cell does not pin one.
training_args = TrainingArguments(
          output_dir=output_dir,
          evaluation_strategy = "epoch",
 #         eval_steps = 5000,
          learning_rate=1e-5,
          save_strategy = "epoch",
          overwrite_output_dir=overwrite_output_dir,
          per_device_train_batch_size=per_device_train_batch_size,
          per_device_eval_batch_size=1,
          num_train_epochs=num_train_epochs
      )

# preprocess_logits_for_metrics reduces logits to arg-max ids before they are
# accumulated; compute_metrics ignores its argument and generates from df_dev.
trainer = Trainer(
          model=model,
          args=training_args,
          data_collator=data_collator,
          train_dataset=train_dataset,
          eval_dataset=eval_dataset,
          preprocess_logits_for_metrics=preprocess_logits_for_metrics,
          compute_metrics=compute_metrics
)

trainer.train()
# No path is passed; presumably this saves to training_args.output_dir -- TODO confirm.
trainer.save_model()