<a href="https://www.kaggle.com/code/khunanonr/bart-with-text-rank?scriptVersionId=131230070" target="_blank"><img align="left" alt="Kaggle" title="Open in Kaggle" src="https://kaggle.com/static/images/open-in-kaggle.svg"></a>

# Sources

* Paper source: Automatic Pull Request Title Generation
* Dataset: https://github.com/soarsmu/PRTiger/raw/main/data/PRTiger.zip
* Model source: https://colab.research.google.com/github/elsanns/xai-nlp-notebooks/blob/master/fine_tune_bart_summarization_two_langs.ipynb

# Load dataset

In [None]:
!pip install gdown

In [None]:
!gdown 1afDEBUClq5Oq7cSvQXLIftypYSsG8X5z
!gdown 1Ue4U5cwz8Kt26go_X0a1x-1d_r1cPUim
!gdown 10D00QF27gGgOaVTwzhdrQ11iqq72oV5U

# Prepare data

In [None]:
import pandas as pd

In [None]:
df_train = pd.read_csv('/kaggle/working/preprocessed_train.csv')
df_valid = pd.read_csv('/kaggle/working/preprocessed_validation.csv')
df_test = pd.read_csv('/kaggle/working/preprocessed_test.csv')
df_train.head()

# Import
---

In [None]:
! pip --q install transformers
! pip --q install datasets
! pip --q install sentencepiece
! pip --q install rouge_score
# ! pip install wandb

In [None]:
import torch
import numpy as np
import datasets

from transformers import (
    AutoModelForSeq2SeqLM,
    AutoTokenizer,
    Seq2SeqTrainingArguments,
    Seq2SeqTrainer,
    DataCollatorForSeq2Seq,
)

from tabulate import tabulate
import nltk
from datetime import datetime

import datasets
from datasets import Dataset, DatasetDict

# First model

## Model and tokenizer

---

In [None]:
# Load the pretrained BART checkpoint and its tokenizer for the first model.
model_name = "facebook/bart-base"

model = AutoModelForSeq2SeqLM.from_pretrained(model_name)
tokenizer = AutoTokenizer.from_pretrained(model_name)
# Use the GPU when one is present, otherwise stay on the CPU instead of
# raising on a runtime without CUDA.
device = "cuda" if torch.cuda.is_available() else "cpu"
model.to(device)

# tokenization
encoder_max_length = 512  # demo
decoder_max_length = 64

In [None]:
extra_token = "<extra>"
end_extra_token = "</extra>"
list_tokens = [ end_extra_token, extra_token]

tokenizer.add_tokens(list_tokens, special_tokens=True) ##This line is updated
tokenizer.additional_special_tokens= list_tokens
tokenizer.additional_special_tokens
model.resize_token_embeddings(len(tokenizer))
# model.resize_token_embeddings(len(tokenizer))

## Prepare data

---

In [None]:
# Pack data
def flatten(example):
    """Return a new dict holding only the "document" and "summary" fields of *example*."""
    kept_fields = ("document", "summary")
    return {field: example[field] for field in kept_fields}

df_custom_train = Dataset.from_pandas(df_train)
df_custom_valid = Dataset.from_pandas(df_valid)
df_custom_test = Dataset.from_pandas(df_test)

df_custom_train = df_custom_train.map(flatten)
df_custom_valid = df_custom_valid.map(flatten)
df_custom_test = df_custom_test.map(flatten)

## Preprocess and tokenize

In [None]:
def batch_tokenize_preprocess(batch, tokenizer, max_source_length, max_target_length):
    """
    Tokenize a batch of documents and summaries into model features.

    :param batch: mapping with "document" and "summary" entries
    :param tokenizer: callable tokenizer exposing ``pad_token_id``
    :param max_source_length: padding/truncation length for the documents
    :param max_target_length: padding/truncation length for the summaries
    :return: dict with the tokenized document fields plus "labels", where
        every padding id in the tokenized summaries is replaced by -100
    """
    documents, summaries = batch["document"], batch["summary"]
    common_options = {"padding": "max_length", "truncation": True}
    encoded_documents = tokenizer(documents, max_length=max_source_length, **common_options)
    encoded_summaries = tokenizer(summaries, max_length=max_target_length, **common_options)

    features = dict(encoded_documents.items())

    # -100 marks positions that must not contribute to the loss
    label_rows = []
    for token_ids in encoded_summaries["input_ids"]:
        masked_row = []
        for token_id in token_ids:
            masked_row.append(-100 if token_id == tokenizer.pad_token_id else token_id)
        label_rows.append(masked_row)
    features["labels"] = label_rows
    return features


train_data = df_custom_train.map( lambda batch: batch_tokenize_preprocess(
        batch, tokenizer, encoder_max_length, decoder_max_length
    ),
    batched=True,
    remove_columns=df_custom_train.column_names,
)

validation_data = df_custom_valid.map(lambda batch: batch_tokenize_preprocess(
        batch, tokenizer, encoder_max_length, decoder_max_length
    ),
    batched=True,
    remove_columns=df_custom_valid.column_names,
)


test_data = df_custom_test.map(lambda batch: batch_tokenize_preprocess(
        batch, tokenizer, encoder_max_length, decoder_max_length
    ),
    batched=True,
    remove_columns=df_custom_test.column_names,
)

## Training

---

### Metrics

In [None]:
# Borrowed from https://github.com/huggingface/transformers/blob/master/examples/seq2seq/run_summarization.py

nltk.download("punkt", quiet=True)

metric = datasets.load_metric("rouge")


def postprocess_text(preds, labels):
    """
    Strip each prediction/label and put every sentence on its own line.

    :param preds: list of decoded prediction strings
    :param labels: list of decoded reference strings
    :return: tuple ``(preds, labels)`` of the reformatted lists
    """
    stripped_preds = [text.strip() for text in preds]
    stripped_labels = [text.strip() for text in labels]

    def one_sentence_per_line(texts):
        # rougeLSum expects newline after each sentence
        return ["\n".join(nltk.sent_tokenize(text)) for text in texts]

    return one_sentence_per_line(stripped_preds), one_sentence_per_line(stripped_labels)


def compute_metrics(eval_preds):
    """
    Compute ROUGE F-measures (scaled to 0-100) and the mean generated length.

    Relies on the module-level ``tokenizer``, ``metric`` and
    ``postprocess_text``.

    :param eval_preds: pair ``(preds, labels)`` of token-id arrays; ``preds``
        may be a tuple, in which case its first element is used
    :return: dict mapping each metric key to its mid F-measure * 100, plus
        "gen_len", all rounded to 4 decimals
    """
    preds, labels = eval_preds
    if isinstance(preds, tuple):
        preds = preds[0]

    # Replace -100 (ignore index used as padding) by the pad id: it cannot be
    # decoded, and it would otherwise be counted as a generated token below.
    preds = np.where(preds != -100, preds, tokenizer.pad_token_id)
    labels = np.where(labels != -100, labels, tokenizer.pad_token_id)

    decoded_preds = tokenizer.batch_decode(preds, skip_special_tokens=True)
    decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)

    # Some simple post-processing
    decoded_preds, decoded_labels = postprocess_text(decoded_preds, decoded_labels)

    result = metric.compute(
        predictions=decoded_preds, references=decoded_labels, use_stemmer=True
    )
    # Extract a few results from ROUGE
    result = {key: value.mid.fmeasure * 100 for key, value in result.items()}

    # Generated length = number of non-padding ids per prediction
    prediction_lens = [
        np.count_nonzero(pred != tokenizer.pad_token_id) for pred in preds
    ]
    result["gen_len"] = np.mean(prediction_lens)
    return {k: round(v, 4) for k, v in result.items()}

### Training arguments

In [None]:
training_args = Seq2SeqTrainingArguments(
    output_dir="/NLP_project/bart_1",
    seed = 42,
    data_seed = 42,
    num_train_epochs=4,  # demo
    do_train=True,
    do_eval=True,
    per_device_train_batch_size=4,  # demo
    per_device_eval_batch_size=4,
    warmup_steps=500,
    weight_decay=0.1,
    label_smoothing_factor=0.1,
    predict_with_generate=True,
    logging_dir="logs",
    logging_steps=6000,
    evaluation_strategy="steps",
    save_total_limit = 5,
    save_strategy = "steps",
    save_steps = 6000,
    load_best_model_at_end=True,
)


data_collator = DataCollatorForSeq2Seq(tokenizer, model=model)

trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    data_collator=data_collator,
    train_dataset=train_data,
    eval_dataset=validation_data,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
)
model.device

### Train

In [None]:
WANDB_INTEGRATION = False

In [None]:
print(trainer.evaluate())

In [None]:
trainer.train()

## Evaluate validation

In [None]:
print(trainer.evaluate())

## Save model

In [None]:
trainer.save_model("/kaggle/working/NLP_project/best_1_model")

## Evaluate Test

In [None]:
tester = Seq2SeqTrainer(
    model=model,
    args=training_args,
    data_collator=data_collator,
    train_dataset=train_data,
    eval_dataset=test_data,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
)

tester.evaluate()

## Gen output

In [None]:
from torch.utils.data import DataLoader

def gen_output(model, df_custom, df_new, name, batch_size=8):
    """
    Generate text for every "document" in *df_custom* and store it in *df_new*.

    Uses the module-level ``tokenizer`` and ``encoder_max_length``.

    :param model: model whose ``generate`` method produces the output ids
    :param df_custom: dataset iterated in order, in batches, via DataLoader
    :param df_new: frame that receives the generated strings (modified in place)
    :param name: column name under which the strings are stored
    :param batch_size: number of rows per generation batch
    :return: *df_new* with the new column assigned
    """
    loader = DataLoader(df_custom, batch_size=batch_size, shuffle=False)
    generated_texts = []
    for batch in loader:
        encoded = tokenizer(
            batch["document"],
            padding="max_length",
            truncation=True,
            max_length=encoder_max_length,
            return_tensors="pt",
        )
        generated_ids = model.generate(
            encoded.input_ids.to(model.device),
            attention_mask=encoded.attention_mask.to(model.device),
        )
        generated_texts.extend(
            tokenizer.batch_decode(generated_ids, skip_special_tokens=True)
        )
    df_new[name] = generated_texts
    return df_new

# add new cols
df_train = gen_output(model, df_custom_train, df_train, "first_output")
df_valid = gen_output(model, df_custom_valid, df_valid, "first_output")
df_test = gen_output(model, df_custom_test, df_test, "first_output")
df_train

### Sample output validation


In [None]:
def generate_summary(test_samples, model):
    """
    Generate summaries for the "document" entries of *test_samples*.

    Uses the module-level ``tokenizer`` and ``encoder_max_length``.

    :param test_samples: samples providing a "document" field
    :param model: model used for generation
    :return: tuple ``(generated ids, decoded strings)``
    """
    encoded = tokenizer(
        test_samples["document"],
        padding="max_length",
        truncation=True,
        max_length=encoder_max_length,
        return_tensors="pt",
    )
    generated_ids = model.generate(
        encoded.input_ids.to(model.device),
        attention_mask=encoded.attention_mask.to(model.device),
    )
    decoded = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)
    return generated_ids, decoded


# model_before_tuning = AutoModelForSeq2SeqLM.from_pretrained(model_name)

validation_samples = df_custom_valid.select(range(5))

# summaries_before_tuning = generate_summary(validation_samples, model_before_tuning)[1]
summaries_after_tuning = generate_summary(validation_samples, model)[1]

In [None]:
# Side-by-side view of the generated summaries and the reference summaries
# for the sampled validation rows. The third column holds the references
# (validation_samples["summary"]), so it is labelled as the target.
print(
    tabulate(
        zip(
            range(len(summaries_after_tuning)),
            summaries_after_tuning,
            validation_samples["summary"],
        ),
        headers=["Id", "Summary after", "Summary target"],
    )
)
print("\nTarget summaries:\n")
print(
    tabulate(list(enumerate(validation_samples["summary"])), headers=["Id", "Target summary"])
)
print("\nSource documents:\n")
print(tabulate(list(enumerate(validation_samples["document"])), headers=["Id", "Document"]))

### Sample output Test

In [None]:
def generate_summary(test_samples, model):
    """
    Tokenize the "document" field of *test_samples*, run ``model.generate``
    on it and decode the result.

    Reads the module-level ``tokenizer`` and ``encoder_max_length``.

    :param test_samples: samples providing a "document" field
    :param model: model used for generation
    :return: tuple ``(generated ids, decoded strings)``
    """
    tokenizer_options = dict(
        padding="max_length",
        truncation=True,
        max_length=encoder_max_length,
        return_tensors="pt",
    )
    model_inputs = tokenizer(test_samples["document"], **tokenizer_options)
    ids_on_device = model_inputs.input_ids.to(model.device)
    mask_on_device = model_inputs.attention_mask.to(model.device)
    generated = model.generate(ids_on_device, attention_mask=mask_on_device)
    return generated, tokenizer.batch_decode(generated, skip_special_tokens=True)

test_samples =  df_custom_test.select(range(5))

test_summaries_after_tuning = generate_summary(test_samples, model)[1]

In [None]:
print(
    tabulate(
        zip(
            range(len(test_summaries_after_tuning)),
            test_summaries_after_tuning,
            test_samples["summary"],
        ),
        headers=["Id", "Summary predict", "Summary target"],
    )
)

print("\nSource documents:\n")
print(tabulate(list(enumerate(test_samples["document"])), headers=["Id", "Document"]))

# TextRank

In [None]:
import nltk
nltk.download('punkt')
nltk.download('stopwords')

In [None]:
!wget http://nlp.stanford.edu/data/glove.6B.zip
!unzip glove*.zip

In [None]:
# Extract word vectors: each line of the GloVe file is "<word> <v1> <v2> ...".
# NOTE(review): word_embeddings is not referenced by the TextRank code visible
# in this notebook - confirm it is still needed before paying for the download.
word_embeddings = {}
# The context manager closes the file even if a line fails to parse.
with open('glove.6B.100d.txt', encoding='utf-8') as f:
    for line in f:
        values = line.split()
        word = values[0]
        coefs = np.asarray(values[1:], dtype='float32')
        word_embeddings[word] = coefs
len(word_embeddings)

In [None]:
import re

import numpy as np
from nltk import sent_tokenize, word_tokenize

from nltk.cluster.util import cosine_distance

# One or more consecutive whitespace characters.
MULTIPLE_WHITESPACE_PATTERN = re.compile(r"\s+", re.UNICODE)


def normalize_whitespace(text):
    """
    Collapse every run of whitespace in *text* into a single character.

    A run that contains a line break (LF or CR) becomes one LF;
    any other run becomes one space.
    """
    return MULTIPLE_WHITESPACE_PATTERN.sub(_replace_whitespace, text)


def _replace_whitespace(match):
    """Return the replacement character for one matched whitespace run."""
    chunk = match.group()
    has_line_break = any(marker in chunk for marker in "\n\r")
    return "\n" if has_line_break else " "


def is_blank(string):
    """
    Tell whether *string* is empty (or otherwise falsy) or made up of
    white-space characters only.
    """
    if not string:
        return True
    return string.isspace()


def get_symmetric_matrix(matrix):
    """
    Add a matrix to its transpose, counting the diagonal only once.

    :param matrix: square matrix
    :return: ``matrix + matrix.T`` with one copy of the diagonal subtracted
    """
    diagonal_part = np.diag(matrix.diagonal())
    mirrored_sum = matrix + matrix.T
    return mirrored_sum - diagonal_part


def core_cosine_similarity(vector1, vector2):
    """
    Measure cosine similarity between two vectors.

    :param vector1: first vector (sequence of numbers)
    :param vector2: second vector, same length as ``vector1``
    :return: cosine similarity as a float; between 0 and 1 inclusive for
        non-negative vectors such as word counts. Returns 0.0 when either
        vector has zero norm (including empty vectors), where the cosine is
        undefined and a plain division would yield NaN.
    """
    v1 = np.asarray(vector1, dtype=float)
    v2 = np.asarray(vector2, dtype=float)

    denominator = np.sqrt(np.dot(v1, v1)) * np.sqrt(np.dot(v2, v2))
    if denominator == 0:
        # A NaN here would spread through the whole similarity matrix.
        return 0.0
    return float(np.dot(v1, v2) / denominator)


'''
Note: This is not a summarization algorithm. This algorithm picks the top-ranked sentences irrespective of the order in which they appeared.
'''


class TextRank4Sentences():
    """
    Rank the sentences of a text by running a PageRank-style iteration over a
    graph whose edge weights are bag-of-words cosine similarities.

    Call :meth:`analyze` first, then :meth:`get_top_sentences`.
    """

    def __init__(self):
        self.damping = 0.85  # damping coefficient, usually is .85
        self.min_diff = 1e-5  # convergence threshold
        self.steps = 100  # iteration steps
        self.text_str = None  # raw text given to analyze()
        self.sentences = None  # sentences produced by sent_tokenize
        self.pr_vector = None  # one rank score per sentence

    def _sentence_similarity(self, sent1, sent2, stopwords=None):
        """
        Cosine similarity between the word-count vectors of two sentences.

        :param sent1: list of tokens of the first sentence
        :param sent2: list of tokens of the second sentence
        :param stopwords: container of lower-case words to leave out
        :return: similarity value; 0.0 when either sentence has no counted word
        """
        if stopwords is None:
            stopwords = []

        sent1 = [w.lower() for w in sent1]
        sent2 = [w.lower() for w in sent2]

        # word -> vector position; a dict lookup replaces repeated list.index
        positions = {w: i for i, w in enumerate(dict.fromkeys(sent1 + sent2))}

        vector1 = [0] * len(positions)
        vector2 = [0] * len(positions)

        # build the vector for the first sentence
        for w in sent1:
            if w not in stopwords:
                vector1[positions[w]] += 1

        # build the vector for the second sentence
        for w in sent2:
            if w not in stopwords:
                vector2[positions[w]] += 1

        # The cosine of a zero vector is undefined (0/0 -> NaN); treat such
        # sentences as sharing nothing with any other sentence.
        if not any(vector1) or not any(vector2):
            return 0.0

        return core_cosine_similarity(vector1, vector2)

    def _build_similarity_matrix(self, sentences, stopwords=None):
        """
        Build the column-normalized sentence similarity matrix.

        :param sentences: list of tokenized sentences
        :param stopwords: forwarded to :meth:`_sentence_similarity`
        :return: square array; every column with a non-zero sum is scaled to
            sum to 1, columns whose sum is zero stay all-zero
        """
        count = len(sentences)
        sm = np.zeros([count, count])

        # The similarity is symmetric, so score each pair once and mirror it.
        # Column normalization below makes this equivalent to filling both
        # halves and symmetrizing afterwards.
        for idx1 in range(count):
            for idx2 in range(idx1 + 1, count):
                score = self._sentence_similarity(
                    sentences[idx1], sentences[idx2], stopwords=stopwords
                )
                sm[idx1][idx2] = score
                sm[idx2][idx1] = score

        # Normalize matrix by column. `out` must be given: with `where` alone,
        # entries of zero-sum columns would be left uninitialized.
        norm = np.sum(sm, axis=0)
        sm_norm = np.divide(sm, norm, out=np.zeros_like(sm), where=norm != 0)

        return sm_norm

    def _run_page_rank(self, similarity_matrix):
        """
        Iterate the damped rank update until the total score stabilizes.

        :param similarity_matrix: square matrix from
            :meth:`_build_similarity_matrix`
        :return: array with one score per row of the matrix
        """
        pr_vector = np.ones(len(similarity_matrix))

        # Stop once the summed score changes by less than min_diff,
        # or after self.steps iterations.
        previous_total = 0
        for _ in range(self.steps):
            pr_vector = (1 - self.damping) + self.damping * np.matmul(similarity_matrix, pr_vector)
            total = np.sum(pr_vector)
            if abs(previous_total - total) < self.min_diff:
                break
            previous_total = total

        return pr_vector

    def _get_sentence(self, index):
        """Return ``self.sentences[index]``, or "" when the index is out of range."""
        try:
            return self.sentences[index]
        except IndexError:
            return ""

    def get_top_sentences(self, number=5, concate=True):
        """
        Return the highest-scored sentences, best first.

        :param number: maximum number of sentences to return
        :param concate: when true, join the sentences with "," into one string
        :return: a string if ``concate`` is true, otherwise a list of strings;
            empty when :meth:`analyze` has not been run
        """
        top_sentences = []

        if self.pr_vector is not None:
            # argsort is ascending; reverse it to get the best scores first
            ranked = np.argsort(self.pr_vector)[::-1]
            limit = max(0, min(number, len(self.sentences)))
            top_sentences = [
                normalize_whitespace(self.sentences[idx]) for idx in ranked[:limit]
            ]

        if concate:
            return ",".join(top_sentences)
        return top_sentences

    def analyze(self, text, stop_words=None):
        """
        Split *text* into sentences and compute their rank scores.

        Stores the text, its sentences and the score vector on the instance.

        :param text: text to analyze
        :param stop_words: words left out when comparing sentences
        """
        self.text_str = text
        self.sentences = sent_tokenize(self.text_str)

        tokenized_sentences = [word_tokenize(sent) for sent in self.sentences]

        similarity_matrix = self._build_similarity_matrix(tokenized_sentences, stop_words)

        self.pr_vector = self._run_page_rank(similarity_matrix)


def get_sum_textrank(text, k_rank=5) :
    """
    Rank the sentences of *text* with a fresh TextRank4Sentences instance and
    return the result of ``get_top_sentences(k_rank)``.
    """
    ranker = TextRank4Sentences()
    ranker.analyze(text)
    return ranker.get_top_sentences(k_rank)

In [None]:
# get text rank
print("calculate_textrank")
text_rank = TextRank4Sentences()

df_train["text_rank"] = df_train['document'].map(lambda x: get_sum_textrank(x))
df_valid["text_rank"] = df_valid['document'].map(lambda x: get_sum_textrank(x))
df_test["text_rank"] = df_test['document'].map(lambda x: get_sum_textrank(x))
df_train.head()

## Concat text_rank and first_output

In [None]:
# Markers wrapped around the TextRank sentences in the second model's input.
special_token = "<txtRank>"
end_special_token = "</txtRank>"

def concate_tr_fo(data):
    """
    Build the second-stage input for one row: the "first_output" text followed
    by the "text_rank" text enclosed in the marker tokens, space separated.
    """
    pieces = (
        data["first_output"],
        special_token,
        data['text_rank'],
        end_special_token,
    )
    return " ".join(pieces)

print('concat text_rank')
df_train['second_document'] = df_train.apply(concate_tr_fo, axis=1)
df_valid['second_document'] = df_valid.apply(concate_tr_fo, axis=1)
df_test['second_document'] = df_test.apply(concate_tr_fo, axis=1)
df_train.head()

# Second model

In [None]:
model_name = "facebook/bart-base"

second_model = AutoModelForSeq2SeqLM.from_pretrained(model_name)
tokenizer = AutoTokenizer.from_pretrained(model_name)

# tokenization
encoder_max_length = 512  # demo
decoder_max_length = 64

In [None]:
extra_token = "<extra>"
end_extra_token = "</extra>"
list_tokens = [special_token, end_special_token, end_extra_token, extra_token]

tokenizer.add_tokens(list_tokens, special_tokens=True) ##This line is updated
tokenizer.additional_special_tokens= list_tokens
tokenizer.additional_special_tokens
second_model.resize_token_embeddings(len(tokenizer))

In [None]:
# Pack data
def flatten_2(example):
    """
    Return a new dict whose "document" is the example's "second_document"
    and whose "summary" is the example's "summary".
    """
    document = example["second_document"]
    summary = example["summary"]
    return dict(document=document, summary=summary)

df_custom_train_2 = Dataset.from_pandas(df_train)
df_custom_valid_2 = Dataset.from_pandas(df_valid)
df_custom_test_2 = Dataset.from_pandas(df_test)

df_custom_train_2 = df_custom_train_2.map(flatten_2)
df_custom_valid_2 = df_custom_valid_2.map(flatten_2)
df_custom_test_2 = df_custom_test_2.map(flatten_2)

In [None]:
def batch_tokenize_preprocess(batch, tokenizer, max_source_length, max_target_length):
    """
    Turn a batch of "document"/"summary" texts into model features.

    :param batch: mapping with "document" and "summary" entries
    :param tokenizer: callable tokenizer exposing ``pad_token_id``
    :param max_source_length: padding/truncation length for the documents
    :param max_target_length: padding/truncation length for the summaries
    :return: the tokenized document fields plus "labels", i.e. the tokenized
        summaries with each padding id replaced by -100
    """
    source, target = batch["document"], batch["summary"]

    def encode(texts, length):
        # Both sides are padded and truncated to a fixed length.
        return tokenizer(texts, padding="max_length", truncation=True, max_length=length)

    source_encoding = encode(source, max_source_length)
    target_encoding = encode(target, max_target_length)

    def mask_padding(token_ids):
        # Ignore padding in the loss
        return [-100 if tid == tokenizer.pad_token_id else tid for tid in token_ids]

    model_inputs = dict(source_encoding.items())
    model_inputs["labels"] = [mask_padding(ids) for ids in target_encoding["input_ids"]]
    return model_inputs


train_data_2 = df_custom_train_2.map( lambda batch: batch_tokenize_preprocess(
        batch, tokenizer, encoder_max_length, decoder_max_length
    ),
    batched=True,
    remove_columns=df_custom_train_2.column_names,
)

validation_data_2 = df_custom_valid_2.map(lambda batch: batch_tokenize_preprocess(
        batch, tokenizer, encoder_max_length, decoder_max_length
    ),
    batched=True,
    remove_columns=df_custom_valid_2.column_names,
)


test_data_2 = df_custom_test_2.map(lambda batch: batch_tokenize_preprocess(
        batch, tokenizer, encoder_max_length, decoder_max_length
    ),
    batched=True,
    remove_columns=df_custom_test_2.column_names,
)

In [None]:
training_args = Seq2SeqTrainingArguments(
    output_dir="/NLP_project/bart_2",
    seed = 42,
    data_seed = 42,
    num_train_epochs=4,  # demo
    do_train=True,
    do_eval=True,
    per_device_train_batch_size=4,  # demo
    per_device_eval_batch_size=4,
    warmup_steps=500,
    weight_decay=0.1,
    label_smoothing_factor=0.1,
    predict_with_generate=True,
    logging_dir="logs",
    logging_steps=6000,
    evaluation_strategy="steps",
    save_total_limit = 5,
    save_strategy = "steps",
    save_steps = 6000,
    load_best_model_at_end=True,
)

data_collator = DataCollatorForSeq2Seq(tokenizer, model=second_model)

trainer = Seq2SeqTrainer(
    model=second_model,
    args=training_args,
    data_collator=data_collator,
    train_dataset=train_data_2,
    eval_dataset=validation_data_2,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
)
second_model.device

## Training

In [None]:
print(trainer.evaluate())

In [None]:
print("start train 2")
trainer.train()

## Save model

In [None]:
print("Finish and get output")
trainer.save_model("/kaggle/working/NLP_project/best_2_model")
print(trainer.evaluate())

## Evaluate Test

In [None]:
tester = Seq2SeqTrainer(
    model=second_model,
    args=training_args,
    data_collator=data_collator,
    train_dataset=train_data_2,
    eval_dataset=test_data_2,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
)

tester.evaluate()

## Output second model

In [None]:
from torch.utils.data import DataLoader

# add new cols
df_train = gen_output(second_model, df_custom_train_2, df_train, 'second_output')
df_valid = gen_output(second_model, df_custom_valid_2, df_valid, 'second_output')
df_test = gen_output(second_model, df_custom_test_2, df_test, 'second_output')
df_train.head()

# Save file csv

In [None]:
# Save results
print("Save results")
df_train.to_csv("df_train_result_textrank.csv")
df_valid.to_csv("df_valid_result_textrank.csv")
df_test.to_csv("df_test_result_textrank.csv")