# Summarizer

In [None]:
! pip install transformers
! pip install datasets
! pip install sentencepiece
! pip install rouge_score
! pip install wandb

In [84]:
from transformers import (
    AutoModelForSequenceClassification,
    TrainingArguments,
    Trainer,
    AutoTokenizer,
    DataCollatorWithPadding,
)
import time
import evaluate
import datasets
import numpy as np
import torch
import nltk
from nltk.tokenize import word_tokenize
import json
import re
import collections



DETAILS_JSON = "data/email_thread_details.json"
SUMMARIES_JSON = "data/email_thread_summaries.json"

kThreadId = "thread_id"
kSubject = "subject"
kTimestamp = "timestamp"
kFrom = "from"
kTo = "to"
kBody = "body"

kThread = "thread"

kSummary = "summary"
DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [None]:
class Utils():
    @staticmethod
    def load_dataset(DETAILS_FILE, SUMMARIES_FILE):
        '''
            This function loads the dataset from the file
            ARGS:
                filename: the name of the file
            RETURN:
                dataset: the dataset
        '''
        with open(DETAILS_FILE, 'r') as f:
            details = json.load(f)
        
        with open(SUMMARIES_FILE, 'r') as f:
            summaries = json.load(f)   
        
        dataset = {}
        for i in range(len(details)):
            item = details[i]
            thread_id = item[kThreadId]
            item = Utils.tokenize_body(item)
            dataset[thread_id] = dataset.get(thread_id, []) + [item]
        
        for i in range(len(summaries)):
            item = summaries[i]
            thread_id = item[kThreadId]
            item = Utils.tokenize_summary(item)
            dataset[thread_id] = (dataset.get(thread_id), item)


        return dataset

    @staticmethod
    def tokenize_body(item):
        item[kBody] = word_tokenize(item[kBody])
        item[kBody] = [re.sub(r'[^\w\s.]', '', word) for word in item[kBody]]
        item[kBody] = [word.strip() for word in item[kBody] if word.strip() and word.strip() not in ['--', '=']]
        # Lowercase the email body
        item[kBody] = [word.lower() for word in item[kBody]]
        item[kBody] = ["<BOS>"] + item[kBody] + ["<EOS>"]
        item[kBody] = " ".join(item[kBody])
        return item

    @staticmethod
    def tokenize_summary(item):
        item[kSummary] = word_tokenize(item[kSummary])
        item[kSummary] = [re.sub(r'[^\w\s.]', '', word) for word in item[kSummary]]
        item[kSummary] = [word.strip() for word in item[kSummary] if word.strip() and word.strip() not in ['--', '=']]
        # Lowercase the summary
        item[kSummary] = [word.lower() for word in item[kSummary]]
        item[kSummary] = "<BOS> " + " ".join(item[kSummary]) + "<EOS>"
        return item
    
    @staticmethod
    def build_vocab(data):
        '''
            This function builds the vocabulary from the data
            ARGS:
                data: the data to build the vocabulary from ([Email], EmailSummaries)
            RETURN:
                vocab: the vocabulary
        '''
        vocab = Vocab()
        for _, (email_list, summary) in data.items():
            for email in email_list:

                for word in email:
                    vocab.add(word)
            for word in summary[kSummary].split():
                vocab.add(word)
        
        return vocab

class Vocab(collections.abc.MutableSet):
    """
        Set-like data structure that can change words into numbers and back.
        From Prof. David Chiang Code
    """
    def __init__(self):
        words = {'<BOS>', '<EOS>', '<UNK>'}
        self.num_to_word = list(words)
        self.word_to_num = {word:num for num, word in enumerate(self.num_to_word)}
    def add(self, word):
        if word in self: return
        num = len(self.num_to_word)
        self.num_to_word.append(word)
        self.word_to_num[word] = num
    def discard(self, word):
        raise NotImplementedError()
    def update(self, words):
        self |= words
    def __contains__(self, word):
        return word in self.word_to_num
    def __len__(self):
        return len(self.num_to_word)
    def __iter__(self):
        return iter(self.num_to_word)

    def numberize(self, word):
        """Convert a word into a number."""
        if word in self.word_to_num:
            return self.word_to_num[word]
        else:
            return self.word_to_num['<UNK>']

    def denumberize(self, num):
        """Convert a number into a word."""
        return self.num_to_word[num]

In [None]:
raw_dataset = Utils.load_dataset(DETAILS_JSON, SUMMARIES_JSON)

In [None]:
# d = datasets.Dataset.from_dict({
#     kThread: [" ".join([email[kBody] for email in thread])for thread, _ in raw_dataset.values()],
#     kSummary: [summary[kSummary] for _, summary in raw_dataset.values()]
# })

In [None]:
# # Split the dataset into train, validation, and test
# test_size = int(len(d) * 0.25)
# train_size = len(d) - test_size
# d = datasets.Dataset.train_test_split(d, test_size=test_size, train_size=train_size, shuffle=True)

In [64]:
d = datasets.load_dataset("knkarthick/dialogsum")

In [93]:
model_name = "google/flan-t5-base"

og_model = AutoModelForSequenceClassification.from_pretrained(model_name, torch_dtype=torch.float32)
tokenizer = AutoTokenizer.from_pretrained(model_name)

config.json:   0%|          | 0.00/1.40k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/990M [00:00<?, ?B/s]

Some weights of T5ForSequenceClassification were not initialized from the model checkpoint at google/flan-t5-base and are newly initialized: ['classification_head.dense.bias', 'classification_head.out_proj.weight', 'classification_head.out_proj.bias', 'classification_head.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


tokenizer_config.json:   0%|          | 0.00/2.54k [00:00<?, ?B/s]

spiece.model:   0%|          | 0.00/792k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/2.42M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/2.20k [00:00<?, ?B/s]

In [95]:
def tokenize_function(example):
    start_prompt = 'Summarize the following conversation.\n\n'
    end_prompt = '\n\nSummary: '
    prompt = [start_prompt + dialogue + end_prompt for dialogue in example["dialogue"]]
    example['input_ids'] = tokenizer(prompt, padding="max_length", truncation=True, return_tensors="pt").input_ids
    example['labels'] = tokenizer(example["summary"], padding="max_length", truncation=True, return_tensors="pt").input_ids
    
    return example

In [96]:
tokenized_datasets = d.map(
    tokenize_function, 
    batched=True,
    remove_columns= ['id', 'topic', 'dialogue', 'summary',]
)
data_collator = DataCollatorWithPadding(tokenizer)

Map:   0%|          | 0/12460 [00:00<?, ? examples/s]

Map:   0%|          | 0/500 [00:00<?, ? examples/s]

Map:   0%|          | 0/1500 [00:00<?, ? examples/s]

In [77]:
# Evaluation
rouge = evaluate.load("rouge")
metric = evaluate.load("accuracy")

def compute_metrics(eval_pred):
    predictions, labels = eval_pred
    predictions = np.argmax(predictions, axis=1)
    return rouge.compute(predictions=predictions, references=labels)

In [88]:
class CustomTrainer(Trainer):
    def compute_loss(self, model, inputs, return_outputs=False):
        outputs = model(**inputs)
        logits = outputs.logits
        loss = self.compute_loss(logits, labels)
        return (loss, outputs) if return_outputs else loss
    

In [89]:
curr_time = time.strftime("%Y-%m-%d_%H:%M:%S")
MODEL_PATH = f"models/model.pt-{curr_time}.pt"

assert len(d["train"]) > 0, "Training dataset is empty"
assert len(d["test"]) > 0, "Test dataset is empty"


training_args = TrainingArguments(
    output_dir=MODEL_PATH,
    learning_rate=1e-5,
    num_train_epochs=1,
    weight_decay=0.01,
    logging_steps=1,
    max_steps=1
)

trainer = Trainer(
    model=og_model,
    args=training_args,
    train_dataset=tokenized_datasets['train'],
    eval_dataset=tokenized_datasets['validation']
)

trainer.train()

  0%|          | 0/1 [00:00<?, ?it/s]

ValueError: Expected input batch_size (8) to match target batch_size (4096).

## Model

## Utils

## Data

In [None]:
def batch_tokenize_preprocess(batch, tokenizer, max_source_length, max_target_length):
    source = " ".join([email[kBody] for email in batch['email_list']])
    target = batch['summary'][kSummary]  
    
    source_tokenized = tokenizer(source, padding="max_length", truncation=True, max_length=max_source_length)
    target_tokenized = tokenizer(target, padding="max_length", truncation=True, max_length=max_target_length)



    source_tokenized = tokenizer(
        source, padding="max_length", truncation=True, max_length=max_source_length
    )
    
    target_tokenized = tokenizer(
        target, padding="max_length", truncation=True, max_length=max_target_length
    )

    batch = {k: v for k, v in source_tokenized.items()}

    batch["labels"] = target_tokenized["input_ids"]    
    
    # Ignore padding in the loss

    # batch["labels"] = [
    #     [-100 if token == tokenizer.pad_token_id else token for token in l]
    #     for l in target_tokenized["input_ids"]
    # ]
    return batch


In [None]:
dataset = Utils.load_dataset(DETAILS_JSON, SUMMARIES_JSON) # load the dataset

data = list(dataset.items())
random.shuffle(data)
train, test = train_test_split(data, test_size=0.2, random_state=42)

train = [(email_list, summary) for _, (email_list, summary) in train]
test = [(email_list, summary) for _, (email_list, summary) in test]

train_dataset = datasets.Dataset.from_dict({
    'email_list': [email_list for email_list, _ in train],
    'summary': [summary for _, summary in train]
})

test_dataset = datasets.Dataset.from_dict({
    'email_list': [email_list for email_list, _ in test],
    'summary': [summary for _, summary in test]
})


## Training

### Metrics

In [None]:
# Borrowed from https://github.com/huggingface/transformers/blob/master/examples/seq2seq/run_summarization.py

nltk.download("punkt", quiet=True)

metric = datasets.load_metric("rouge")


def postprocess_text(preds, labels):
    preds = [pred.strip() for pred in preds]
    labels = [label.strip() for label in labels]

    # rougeLSum expects newline after each sentence
    preds = ["\n".join(nltk.sent_tokenize(pred)) for pred in preds]
    labels = ["\n".join(nltk.sent_tokenize(label)) for label in labels]

    return preds, labels


def compute_metrics(eval_preds):
    preds, labels = eval_preds
    if isinstance(preds, tuple):
        preds = preds[0]
    decoded_preds = tokenizer.batch_decode(preds, skip_special_tokens=True)
    # Replace -100 in the labels as we can't decode them.
    labels = np.where(labels != -100, labels, tokenizer.pad_token_id)
    decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)

    # Some simple post-processing
    decoded_preds, decoded_labels = postprocess_text(decoded_preds, decoded_labels)

    result = metric.compute(
        predictions=decoded_preds, references=decoded_labels, use_stemmer=True
    )
    # Extract a few results from ROUGE
    result = {key: value.mid.fmeasure * 100 for key, value in result.items()}

    prediction_lens = [
        np.count_nonzero(pred != tokenizer.pad_token_id) for pred in preds
    ]
    result["gen_len"] = np.mean(prediction_lens)
    result = {k: round(v, 4) for k, v in result.items()}
    return result

In [None]:
# training_args = Seq2SeqTrainingArguments(
#     output_dir="results",
#     num_train_epochs=1,  # demo
#     do_train=True,
#     do_eval=True,
#     per_device_train_batch_size=4,  # demo
#     per_device_eval_batch_size=4,
#     learning_rate=3e-05,
#     warmup_steps=500,
#     weight_decay=0.1,
#     label_smoothing_factor=0.1,
#     predict_with_generate=True,
#     logging_dir="logs",
#     logging_steps=50,
#     save_total_limit=3,
# )

data_collator = DataCollatorForSeq2Seq(tokenizer, model=model)

# trainer = Seq2SeqTrainer(
#     model=model,
#     args=training_args,
#     data_collator=data_collator,
#     train_dataset=train,
#     eval_dataset=test,
#     tokenizer=tokenizer,
#     compute_metrics=compute_metrics,
# )

In [None]:
training_args = Seq2SeqTrainingArguments(
    output_dir="results",
    evaluation_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    weight_decay=0.01,
    save_total_limit=3,
    num_train_epochs=4,
    predict_with_generate=True,
)

trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    train_dataset=train_data,
    eval_dataset=test_dataset,
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
)

In [None]:
trainer.train()