# Summarizer

In [1]:
! pip install transformers
! pip install datasets
! pip install sentencepiece
! pip install rouge_score
! pip install wandb



In [2]:
import torch
import numpy as np
import datasets
import json
import collections.abc
import random
from sklearn.model_selection import train_test_split

from transformers import (
    AutoModelForSeq2SeqLM,
    AutoTokenizer,
    Seq2SeqTrainingArguments,
    Seq2SeqTrainer,
    DataCollatorForSeq2Seq,
)

import nltk
from datetime import datetime

## Model

In [3]:
# Checkpoint shared by the tokenizer and the seq2seq model.
model_name = "google/pegasus-large"

tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSeq2SeqLM.from_pretrained(model_name)

# Token budgets used when tokenizing sources (emails) and targets (summaries);
# kept small for the demo.
encoder_max_length, decoder_max_length = 256, 64

Some weights of PegasusForConditionalGeneration were not initialized from the model checkpoint at google/pegasus-large and are newly initialized: ['model.encoder.embed_positions.weight', 'model.decoder.embed_positions.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


## Utils

In [4]:
DETAILS_JSON = "data/email_thread_details.json"
SUMMARIES_JSON = "data/email_thread_summaries.json"

# Keys of an email record in the details file.
kThreadId = "thread_id"
kSubject = "subject"
kTimestamp = "timestamp"
kFrom = "from"
kTo = "to"
kBody = "body"

# Key of the summary text in a summary record.
kSummary = "summary"


class Utils():
    '''
        Static helpers for loading the email-thread data and building a vocabulary.
    '''

    @staticmethod
    def load_dataset(DETAILS_FILE, SUMMARIES_FILE):
        '''
            Load the email records and the thread summaries and join them on thread id.
            ARGS:
                DETAILS_FILE: path of a JSON file holding a list of email records,
                              each with a `thread_id` key
                SUMMARIES_FILE: path of a JSON file holding a list of summary records,
                                each with a `thread_id` key
            RETURN:
                dataset: dict mapping thread_id -> (email_list, summary), in order of
                         first appearance in the details file. Only threads that have
                         at least one email AND a summary are returned, so every value
                         can safely be unpacked as a pair. If a thread has several
                         summary records, the last one is used.
        '''
        with open(DETAILS_FILE, 'r', encoding='utf-8') as f:
            details = json.load(f)

        with open(SUMMARIES_FILE, 'r', encoding='utf-8') as f:
            summaries = json.load(f)

        # Group emails per thread; append in place instead of rebuilding the list
        # for every email.
        emails_by_thread = {}
        for item in details:
            emails_by_thread.setdefault(item[kThreadId], []).append(item)

        summary_by_thread = {item[kThreadId]: item for item in summaries}

        # Threads missing either side are skipped: a bare email list or a
        # (None, summary) pair would break downstream `(email_list, summary)` unpacking.
        dataset = {
            thread_id: (email_list, summary_by_thread[thread_id])
            for thread_id, email_list in emails_by_thread.items()
            if thread_id in summary_by_thread
        }

        return dataset

    @staticmethod
    def build_vocab(data):
        '''
            Build a vocabulary from whitespace-separated words of all email bodies
            and summaries.
            ARGS:
                data: dict mapping thread_id -> (email_list, summary), as returned
                      by load_dataset
            RETURN:
                vocab: the Vocab containing every word seen
        '''
        vocab = Vocab()
        for _, (email_list, summary) in data.items():
            for email in email_list:
                for word in email[kBody].split():
                    vocab.add(word)
            for word in summary[kSummary].split():
                vocab.add(word)

        return vocab

class Vocab(collections.abc.MutableSet):
    """
        Set-like data structure that can change words into numbers and back.
        From Prof. David Chiang Code

        Words are numbered in insertion order. The special tokens always get
        the same numbers: <BOS>=0, <EOS>=1, <UNK>=2.
    """
    def __init__(self):
        # An ordered list, not a set: iterating a set of strings can yield a
        # different order on every interpreter run, which would make the
        # special-token numbers non-reproducible.
        self.num_to_word = ['<BOS>', '<EOS>', '<UNK>']
        self.word_to_num = {word: num for num, word in enumerate(self.num_to_word)}

    def add(self, word):
        """Add a word if it is not present yet; it gets the next free number."""
        if word in self:
            return
        num = len(self.num_to_word)
        self.num_to_word.append(word)
        self.word_to_num[word] = num

    def discard(self, word):
        """Removal is not supported (it would invalidate assigned numbers)."""
        raise NotImplementedError()

    def update(self, words):
        """Add every word from an iterable."""
        for word in words:
            self.add(word)

    def __contains__(self, word):
        return word in self.word_to_num

    def __len__(self):
        return len(self.num_to_word)

    def __iter__(self):
        return iter(self.num_to_word)

    def numberize(self, word):
        """Convert a word into a number; unknown words map to the number of <UNK>."""
        if word in self.word_to_num:
            return self.word_to_num[word]
        else:
            return self.word_to_num['<UNK>']

    def denumberize(self, num):
        """Convert a number into a word."""
        return self.num_to_word[num]

## Data

In [5]:
def batch_tokenize_preprocess(batch, tokenizer, max_source_length, max_target_length):
    """
    Tokenize one example (the function is mapped with batched=False).

    ARGS:
        batch: a single example with an 'email_list' entry (list of email records)
               and a 'summary' entry (summary record)
        tokenizer: callable tokenizer returning a mapping with 'input_ids';
                   must expose `pad_token_id`
        max_source_length: length the joined email bodies are padded/truncated to
        max_target_length: length the summary is padded/truncated to
    RETURN:
        dict with the tokenizer outputs for the source plus a 'labels' list in
        which padding positions are replaced by -100
    """
    source = " ".join(email[kBody] for email in batch['email_list'])
    target = batch['summary'][kSummary]

    # Tokenize each side exactly once (the original did it twice).
    source_tokenized = tokenizer(
        source, padding="max_length", truncation=True, max_length=max_source_length
    )
    target_tokenized = tokenizer(
        target, padding="max_length", truncation=True, max_length=max_target_length
    )

    features = {k: v for k, v in source_tokenized.items()}

    # Ignore padding in the loss: labels are padded to max_target_length above,
    # so pad positions are replaced by -100, the label value the loss skips.
    # compute_metrics maps -100 back to the pad id before decoding.
    pad_token_id = tokenizer.pad_token_id
    features["labels"] = [
        -100 if token == pad_token_id else token
        for token in target_tokenized["input_ids"]
    ]
    return features


In [6]:
# Load the threads: thread_id -> (email_list, summary)
dataset = Utils.load_dataset(DETAILS_JSON, SUMMARIES_JSON)

# train_test_split shuffles by default and is seeded through random_state, so the
# split is reproducible. The previous unseeded random.shuffle() changed the input
# order on every run and defeated that seed.
data = list(dataset.items())
train, test = train_test_split(data, test_size=0.2, random_state=42)

# Drop the thread ids; keep only the (email_list, summary) pairs.
train = [(email_list, summary) for _, (email_list, summary) in train]
test = [(email_list, summary) for _, (email_list, summary) in test]

train_dataset = datasets.Dataset.from_dict({
    'email_list': [email_list for email_list, _ in train],
    'summary': [summary for _, summary in train]
})

test_dataset = datasets.Dataset.from_dict({
    'email_list': [email_list for email_list, _ in test],
    'summary': [summary for _, summary in test]
})


In [7]:
# Show every field of the first training example: field name, then its value.
first_example = train_dataset[0]
for field_name in first_example:
    print(field_name, first_example[field_name], sep="\n")

email_list
[{'body': "Mark - \n\nGood suggestion.  I spoke to Scott Rebltiz who told me that important \ndecisions are to made this week at OPIC concering the Gaza project.  I will \nwork with Scott and John Hardy to make sure that our message is consistent \nand helpful.  I will also work with Mac for a comprehensive update and \nstrategy.\n\n-Chris \n\n\n\n\n\n\nMark Schroeder@ECT\n02/09/2000 06:35 AM\nTo: Joe Hillings/Corp/Enron@ENRON, Chris Long/Corp/Enron@ENRON\ncc: Steven J Kean/HOU/EES@EES, Mac \nMcClelland/ENRON_DEVELOPMENT@ENRON_DEVELOPMENT \n\nSubject: Department of Energy Pre-departure Briefing w/ Asst. Secy. Goldwyn\n\nJoe?Chris - this covers some of my new turf.  We have, I am told, a power \nproject in Gaza that we are working on.  It is probably early days, but \nperhpas Mac mcClelland in Dubai can shed some light on whether we would want \nto get that project on the Secretary of Energy's radar screen.   mcs \n---------------------- Forwarded by Mark Schroeder/LON/ECT on

In [8]:
def _tokenize_example(example):
    """Tokenize one raw example using the notebook-level tokenizer and length limits."""
    return batch_tokenize_preprocess(example, tokenizer, encoder_max_length, decoder_max_length)


# Replace the raw columns with the tokenized features, one example at a time.
train_data = train_dataset.map(
    _tokenize_example,
    batched=False,
    batch_size=8,
    remove_columns=["email_list", "summary"],
)

Map:   0%|          | 0/3333 [00:00<?, ? examples/s]

## Training

### Metrics

In [9]:
# Metric code below is borrowed from the Hugging Face summarization example:
# https://github.com/huggingface/transformers/blob/master/examples/seq2seq/run_summarization.py

# ROUGE scorer used by compute_metrics.
metric = datasets.load_metric("rouge")

# Sentence-tokenizer data used by nltk.sent_tokenize in postprocess_text.
nltk.download("punkt", quiet=True)


def postprocess_text(preds, labels):
    """
    Prepare decoded predictions and references for ROUGE scoring.

    Each text is stripped and re-joined with one sentence per line, because
    rougeLSum expects a newline after each sentence.

    Returns the pair (preds, labels) as two new lists.
    """
    def _one_sentence_per_line(text):
        return "\n".join(nltk.sent_tokenize(text.strip()))

    return (
        [_one_sentence_per_line(pred) for pred in preds],
        [_one_sentence_per_line(label) for label in labels],
    )


def compute_metrics(eval_preds):
    """
    Compute ROUGE F-measures (in percent) and the mean generated length.

    ARGS:
        eval_preds: pair (preds, labels) of token-id arrays; preds may be a
                    tuple, in which case its first element is used
    RETURN:
        dict of metric name -> value rounded to 4 decimals, including "gen_len"
    """
    preds, labels = eval_preds
    if isinstance(preds, tuple):
        preds = preds[0]
    # Replace -100 (used as padding / ignore index) as we can't decode it.
    # Done for predictions as well as labels, mirroring the upstream example script.
    preds = np.where(preds != -100, preds, tokenizer.pad_token_id)
    labels = np.where(labels != -100, labels, tokenizer.pad_token_id)
    decoded_preds = tokenizer.batch_decode(preds, skip_special_tokens=True)
    decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)

    # Some simple post-processing
    decoded_preds, decoded_labels = postprocess_text(decoded_preds, decoded_labels)

    result = metric.compute(
        predictions=decoded_preds, references=decoded_labels, use_stemmer=True
    )
    # Extract a few results from ROUGE
    result = {key: value.mid.fmeasure * 100 for key, value in result.items()}

    # Generated length = number of non-padding tokens per prediction.
    prediction_lens = [
        np.count_nonzero(pred != tokenizer.pad_token_id) for pred in preds
    ]
    result["gen_len"] = np.mean(prediction_lens)
    result = {k: round(v, 4) for k, v in result.items()}
    return result

  metric = datasets.load_metric("rouge")


In [10]:
# NOTE: the commented-out configuration below is an earlier experiment kept for
# reference only; the live training arguments and trainer are created in the
# next cell.
# training_args = Seq2SeqTrainingArguments(
#     output_dir="results",
#     num_train_epochs=1,  # demo
#     do_train=True,
#     do_eval=True,
#     per_device_train_batch_size=4,  # demo
#     per_device_eval_batch_size=4,
#     learning_rate=3e-05,
#     warmup_steps=500,
#     weight_decay=0.1,
#     label_smoothing_factor=0.1,
#     predict_with_generate=True,
#     logging_dir="logs",
#     logging_steps=50,
#     save_total_limit=3,
# )

# Collator handed to the Seq2SeqTrainer created in the next cell.
data_collator = DataCollatorForSeq2Seq(tokenizer=tokenizer, model=model)

# Earlier trainer setup (superseded); it was wired to the raw `train` / `test`
# lists rather than to tokenized datasets.
# trainer = Seq2SeqTrainer(
#     model=model,
#     args=training_args,
#     data_collator=data_collator,
#     train_dataset=train,
#     eval_dataset=test,
#     tokenizer=tokenizer,
#     compute_metrics=compute_metrics,
# )

In [11]:
# Evaluate once per epoch; predict_with_generate makes evaluation produce generated
# sequences, which compute_metrics decodes for ROUGE.
training_args = Seq2SeqTrainingArguments(
    output_dir="results",
    evaluation_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    weight_decay=0.01,
    save_total_limit=3,
    num_train_epochs=4,
    predict_with_generate=True,
)

# The raw test_dataset only has the 'email_list' / 'summary' columns, so it cannot
# be fed to the model. Tokenize it exactly like the training split before
# handing it to the trainer.
test_data = test_dataset.map(
    lambda batch: batch_tokenize_preprocess(batch, tokenizer, encoder_max_length, decoder_max_length),
    batched=False,
    remove_columns=["email_list", "summary"],
)

trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    train_dataset=train_data,
    eval_dataset=test_data,
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
)

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


In [12]:
# Run fine-tuning with the trainer configured above; kept as the last expression so
# the value returned by train() is displayed as the cell output.
trainer.train()

Failed to detect the name of this notebook, you can set it manually with the WANDB_NOTEBOOK_NAME environment variable to enable code saving.
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
[34m[1mwandb[0m: Currently logged in as: [33msvntii[0m. Use [1m`wandb login --relogin`[0m to force relogin
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` befor

  0%|          | 0/836 [00:00<?, ?it/s]

You're using a PegasusTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


: 