In [None]:
! pip install transformers
! pip install datasets
! pip install sentencepiece
! pip install rouge_score

Collecting transformers
  Downloading transformers-4.11.3-py3-none-any.whl (2.9 MB)
[K     |████████████████████████████████| 2.9 MB 6.3 MB/s 
[?25hCollecting huggingface-hub>=0.0.17
  Downloading huggingface_hub-0.0.19-py3-none-any.whl (56 kB)
[K     |████████████████████████████████| 56 kB 4.0 MB/s 
Collecting sacremoses
  Downloading sacremoses-0.0.46-py3-none-any.whl (895 kB)
[K     |████████████████████████████████| 895 kB 29.4 MB/s 
Collecting pyyaml>=5.1
  Downloading PyYAML-6.0-cp37-cp37m-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl (596 kB)
[K     |████████████████████████████████| 596 kB 36.7 MB/s 
[?25hCollecting tokenizers<0.11,>=0.10.1
  Downloading tokenizers-0.10.3-cp37-cp37m-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl (3.3 MB)
[K     |████████████████████████████████| 3.3 MB 33.0 MB/s 
Installing collected packages: pyyaml, tokenizers, sacremoses, huggingface-hub, transformers
  At

In [None]:
import torch
import numpy as np
import datasets

from transformers import (
    AutoModelForSeq2SeqLM,
    AutoTokenizer,
    Seq2SeqTrainingArguments,
    Seq2SeqTrainer,
    DataCollatorForSeq2Seq,
)

from tabulate import tabulate
import nltk
from datetime import datetime

# Model and Tokenizer

## Downloading model and tokenizer.

In [None]:
# Hub checkpoint to fine-tune; the config printed below this cell reports a
# BART model with 12 encoder layers and 3 decoder layers.
model_name = "sshleifer/distilbart-xsum-12-3"

# Model and tokenizer are both loaded from the same checkpoint name.
model = AutoModelForSeq2SeqLM.from_pretrained(model_name)
tokenizer = AutoTokenizer.from_pretrained(model_name)

# Token-length limits used when tokenizing: source documents are truncated/padded
# to `encoder_max_length`, target summaries to `decoder_max_length`.
encoder_max_length = 256  # demo
decoder_max_length = 64

loading configuration file https://huggingface.co/sshleifer/distilbart-xsum-12-3/resolve/main/config.json from cache at /root/.cache/huggingface/transformers/4a0f7fb35f1504b6e865136124e3781fb488792aa105a84a991a3145a027791f.10ebe969457e130b9da526e7994b6191d3765d1d01ac6abc2eb20bb8adcbd4e0
Model config BartConfig {
  "_num_labels": 3,
  "activation_dropout": 0.0,
  "activation_function": "gelu",
  "add_bias_logits": false,
  "add_final_layer_norm": false,
  "architectures": [
    "BartForConditionalGeneration"
  ],
  "attention_dropout": 0.1,
  "bos_token_id": 0,
  "classif_dropout": 0.0,
  "classifier_dropout": 0.0,
  "d_model": 1024,
  "decoder_attention_heads": 16,
  "decoder_ffn_dim": 4096,
  "decoder_layerdrop": 0.0,
  "decoder_layers": 3,
  "decoder_start_token_id": 2,
  "dropout": 0.1,
  "early_stopping": true,
  "encoder_attention_heads": 16,
  "encoder_ffn_dim": 4096,
  "encoder_layerdrop": 0.0,
  "encoder_layers": 12,
  "eos_token_id": 2,
  "eos_token_ids": [
    2
  ],
  "extra

## Data

In [None]:
# Load only the first 2000 rows of the English WikiLingua training split.
data = datasets.load_dataset("wiki_lingua", name="english", split="train[:2000]")

# Peek at the first article: print each field name followed by its value.
for field_name, field_value in data["article"][0].items():
    print(field_name, field_value, sep="\n")

Reusing dataset wiki_lingua (/root/.cache/huggingface/datasets/wiki_lingua/english/1.1.0/5a847811c9eb16984e4de85c918d7a41c193c313e0393c7ee3efb2c4693a6547)


section_name
['Finding Other Transportation', 'Designating a Driver', 'Staying Safe']
document
['make sure that the area is a safe place, especially if you plan on walking home at night.  It’s always a good idea to practice the buddy system.  Have a friend meet up and walk with you. Research the bus, train, or streetcar routes available in your area to find safe and affordable travel to your destination.  Make sure you check the schedule for your outgoing and return travel.  Some public transportation will cease to run late at night.  Be sure if you take public transportation to the venue that you will also be able to get home late at night. Check the routes.  Even if some public transit is still running late at night, the routing may change.  Some may run express past many of the stops, or not travel all the way to the ends.  Be sure that your stop will still be available when you need it for your return trip. If you are taking public transit in a vulnerable state after drinking, it i

## **Format and split into train and validation sets**

In [None]:
def flatten(example):
    """Lift the nested ``article`` fields into top-level columns.

    Args:
        example: A single dataset row containing an ``"article"`` mapping with
            ``"document"`` and ``"summary"`` entries.

    Returns:
        A dict with exactly two keys, ``"document"`` and ``"summary"``, holding
        the corresponding values from ``example["article"]`` unchanged.
    """
    article = example["article"]
    return dict(document=article["document"], summary=article["summary"])


def list2samples(example):
    """Explode per-article lists into one flat list of samples.

    Called with ``batched=True``, so ``example["document"]`` and
    ``example["summary"]`` are each a list (one entry per article) of lists
    (one entry per section of that article).

    Args:
        example: Batch mapping with ``"document"`` and ``"summary"`` columns.

    Returns:
        A dict with flat ``"document"`` and ``"summary"`` lists. Articles whose
        document list is empty are skipped entirely (their summaries too).
    """
    flat_documents = []
    flat_summaries = []
    for article_documents, article_summaries in zip(
        example["document"], example["summary"]
    ):
        # Skip articles that carry no documents at all.
        if len(article_documents) == 0:
            continue
        flat_documents.extend(article_documents)
        flat_summaries.extend(article_summaries)
    return {"document": flat_documents, "summary": flat_summaries}


# Keep only the document/summary columns, then turn each article's list of
# sections into individual (document, summary) samples.
dataset = data.map(flatten, remove_columns=["article", "url"])
dataset = dataset.map(list2samples, batched=True)

# Hold out 10% for validation. The seed makes the shuffle-and-split reproducible
# across kernel restarts; splits are read by name rather than by dict order.
dataset_splits = dataset.train_test_split(test_size=0.1, seed=42)
train_data_txt = dataset_splits["train"]
validation_data_txt = dataset_splits["test"]

  0%|          | 0/2000 [00:00<?, ?ex/s]

  0%|          | 0/2 [00:00<?, ?ba/s]

## **Preprocess and tokenize**

In [None]:
def batch_tokenize_preprocess(batch, tokenizer, max_source_length, max_target_length):
    """Tokenize a batch of documents and summaries into model features.

    Args:
        batch: Mapping with ``"document"`` and ``"summary"`` columns.
        tokenizer: Callable tokenizer exposing ``pad_token_id``; it is invoked
            with ``padding="max_length"`` and ``truncation=True``.
        max_source_length: ``max_length`` passed when tokenizing documents.
        max_target_length: ``max_length`` passed when tokenizing summaries.

    Returns:
        A dict holding every field the tokenizer produced for the documents,
        plus ``"labels"``: the summary token ids with each pad token id
        replaced by ``-100``.
    """
    encoded_source = tokenizer(
        batch["document"],
        padding="max_length",
        truncation=True,
        max_length=max_source_length,
    )
    encoded_target = tokenizer(
        batch["summary"],
        padding="max_length",
        truncation=True,
        max_length=max_target_length,
    )

    features = dict(encoded_source.items())

    # Ignore padding in the loss: pad positions in the labels become -100.
    pad_id = tokenizer.pad_token_id
    features["labels"] = [
        [-100 if token_id == pad_id else token_id for token_id in label_row]
        for label_row in encoded_target["input_ids"]
    ]
    return features


def _tokenize_with_notebook_settings(batch):
    """Tokenize one batch using the notebook-level tokenizer and length limits."""
    return batch_tokenize_preprocess(
        batch, tokenizer, encoder_max_length, decoder_max_length
    )


# Replace the raw text columns with tokenized features for each split.
train_data = train_data_txt.map(
    _tokenize_with_notebook_settings,
    batched=True,
    remove_columns=train_data_txt.column_names,
)

validation_data = validation_data_txt.map(
    _tokenize_with_notebook_settings,
    batched=True,
    remove_columns=validation_data_txt.column_names,
)

  0%|          | 0/5 [00:00<?, ?ba/s]

  0%|          | 0/1 [00:00<?, ?ba/s]

## Training

### Training arguments

In [None]:
# --- Metrics -----------------------------------------------------------------
# `compute_metrics` is handed to the trainer below, so it must be defined in the
# notebook itself for Restart & Run All to work.
# NOTE(review): `datasets.load_metric` is the metric API of the `datasets`
# releases contemporary with the transformers version in the install log above;
# confirm it is available in the installed release.
nltk.download("punkt", quiet=True)  # sentence tokenizer data used by sent_tokenize
rouge_metric = datasets.load_metric("rouge")


def _postprocess_text(preds, labels):
    """Strip whitespace and put each sentence on its own line.

    The per-sentence newlines are what the ``rougeLsum`` score expects.
    """
    preds = ["\n".join(nltk.sent_tokenize(pred.strip())) for pred in preds]
    labels = ["\n".join(nltk.sent_tokenize(label.strip())) for label in labels]
    return preds, labels


def compute_metrics(eval_preds):
    """Compute ROUGE scores and mean generated length for an evaluation run.

    Args:
        eval_preds: ``(predictions, labels)`` as supplied by the trainer;
            ``predictions`` may itself be a tuple whose first item holds the
            generated token ids.

    Returns:
        Dict mapping each ROUGE variant to its mid F-measure (scaled to 0-100)
        plus ``"gen_len"``, all rounded to 4 decimals.
    """
    preds, labels = eval_preds
    if isinstance(preds, tuple):
        preds = preds[0]
    decoded_preds = tokenizer.batch_decode(preds, skip_special_tokens=True)

    # Labels use -100 for ignored positions, which cannot be decoded;
    # swap them back to the pad token id first.
    labels = np.where(labels != -100, labels, tokenizer.pad_token_id)
    decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)

    decoded_preds, decoded_labels = _postprocess_text(decoded_preds, decoded_labels)

    scores = rouge_metric.compute(
        predictions=decoded_preds, references=decoded_labels, use_stemmer=True
    )
    result = {name: score.mid.fmeasure * 100 for name, score in scores.items()}

    # Generated length = number of non-pad tokens per prediction.
    prediction_lens = [
        np.count_nonzero(pred != tokenizer.pad_token_id) for pred in preds
    ]
    result["gen_len"] = np.mean(prediction_lens)
    return {name: round(value, 4) for name, value in result.items()}


# --- Trainer -----------------------------------------------------------------
training_args = Seq2SeqTrainingArguments(
    output_dir="results",
    num_train_epochs=1,  # demo
    do_train=True,
    do_eval=True,
    per_device_train_batch_size=4,  # demo
    per_device_eval_batch_size=4,
    # learning_rate=3e-05,
    warmup_steps=500,
    weight_decay=0.1,
    label_smoothing_factor=0.1,
    predict_with_generate=True,  # evaluate on generated text, needed for ROUGE
    logging_dir="logs",
    logging_steps=50,
    save_total_limit=3,
)

data_collator = DataCollatorForSeq2Seq(tokenizer, model=model)

trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    data_collator=data_collator,
    train_dataset=train_data,
    eval_dataset=validation_data,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
)

### Train

Evaluate before fine-tuning

In [None]:
# Baseline: score the checkpoint on `validation_data` before `trainer.train()` runs.
trainer.evaluate()

***** Running Evaluation *****
  Num examples = 484
  Batch size = 4


{'eval_gen_len': 23.2727,
 'eval_loss': 6.779993057250977,
 'eval_rouge1': 20.1368,
 'eval_rouge2': 4.8604,
 'eval_rougeL': 15.0447,
 'eval_rougeLsum': 17.9399,
 'eval_runtime': 155.5974,
 'eval_samples_per_second': 3.111,
 'eval_steps_per_second': 0.778}

Train the model

In [None]:
# Fine-tune `model` on `train_data` using the settings in `training_args`.
trainer.train()

***** Running training *****
  Num examples = 4351
  Num Epochs = 1
  Instantaneous batch size per device = 4
  Total train batch size (w. parallel, distributed & accumulation) = 4
  Gradient Accumulation steps = 1
  Total optimization steps = 1088


Step,Training Loss
50,6.5396
100,5.5779
150,5.1813
200,4.8674
250,4.8134
300,4.7719
350,4.7913
400,4.7022
450,4.5294
500,4.6425


Saving model checkpoint to results/checkpoint-500
Configuration saved in results/checkpoint-500/config.json
Model weights saved in results/checkpoint-500/pytorch_model.bin
tokenizer config file saved in results/checkpoint-500/tokenizer_config.json
Special tokens file saved in results/checkpoint-500/special_tokens_map.json
Saving model checkpoint to results/checkpoint-1000
Configuration saved in results/checkpoint-1000/config.json
Model weights saved in results/checkpoint-1000/pytorch_model.bin
tokenizer config file saved in results/checkpoint-1000/tokenizer_config.json
Special tokens file saved in results/checkpoint-1000/special_tokens_map.json


Training completed. Do not forget to share your model on huggingface.co/models =)




TrainOutput(global_step=1088, training_loss=4.712542996687048, metrics={'train_runtime': 1025.1275, 'train_samples_per_second': 4.244, 'train_steps_per_second': 1.061, 'total_flos': 1683722551296000.0, 'train_loss': 4.712542996687048, 'epoch': 1.0})

Evaluate after fine-tuning

In [None]:
# Re-score on the same `validation_data`, now that `trainer.train()` has run.
trainer.evaluate()

***** Running Evaluation *****
  Num examples = 484
  Batch size = 4


{'epoch': 1.0,
 'eval_gen_len': 23.3202,
 'eval_loss': 4.246743679046631,
 'eval_rouge1': 31.6478,
 'eval_rouge2': 12.2928,
 'eval_rougeL': 25.306,
 'eval_rougeLsum': 30.6054,
 'eval_runtime': 172.2873,
 'eval_samples_per_second': 2.809,
 'eval_steps_per_second': 0.702}

## Evaluation

**Generate summaries from the fine-tuned model and compare them with those generated from the original, pre-trained one.**

In [None]:
def generate_summary(test_samples, model):
    """Generate summaries for ``test_samples`` with the given model.

    Uses the notebook-level ``tokenizer`` and ``encoder_max_length``.

    Args:
        test_samples: Text (or list of texts) to summarize.
        model: Model whose ``generate`` method produces the summary token ids;
            inputs are moved to ``model.device`` before generation.

    Returns:
        A ``(token_ids, texts)`` tuple: the raw output of ``model.generate``
        and its decoding with special tokens removed.
    """
    encoded = tokenizer(
        test_samples,
        padding="max_length",
        truncation=True,
        max_length=encoder_max_length,
        return_tensors="pt",
    )
    # Inputs must live on the same device as the model.
    source_ids = encoded.input_ids.to(model.device)
    source_mask = encoded.attention_mask.to(model.device)
    generated_ids = model.generate(source_ids, attention_mask=source_mask)
    decoded_texts = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)
    return generated_ids, decoded_texts


# Fresh copy of the original checkpoint, loaded separately from the `model`
# object that was passed to the trainer, for before/after comparison.
model_before_tuning = AutoModelForSeq2SeqLM.from_pretrained(model_name)

loading configuration file https://huggingface.co/sshleifer/distilbart-xsum-12-3/resolve/main/config.json from cache at /root/.cache/huggingface/transformers/4a0f7fb35f1504b6e865136124e3781fb488792aa105a84a991a3145a027791f.10ebe969457e130b9da526e7994b6191d3765d1d01ac6abc2eb20bb8adcbd4e0
Model config BartConfig {
  "_num_labels": 3,
  "activation_dropout": 0.0,
  "activation_function": "gelu",
  "add_bias_logits": false,
  "add_final_layer_norm": false,
  "architectures": [
    "BartForConditionalGeneration"
  ],
  "attention_dropout": 0.1,
  "bos_token_id": 0,
  "classif_dropout": 0.0,
  "classifier_dropout": 0.0,
  "d_model": 1024,
  "decoder_attention_heads": 16,
  "decoder_ffn_dim": 4096,
  "decoder_layerdrop": 0.0,
  "decoder_layers": 3,
  "decoder_start_token_id": 2,
  "dropout": 0.1,
  "early_stopping": true,
  "encoder_attention_heads": 16,
  "encoder_ffn_dim": 4096,
  "encoder_layerdrop": 0.0,
  "encoder_layers": 12,
  "eos_token_id": 2,
  "eos_token_ids": [
    2
  ],
  "extra

In [None]:
# Short encyclopedic passage used as the input for the before/after summary comparison below.
text_sample= """The tower is 324 metres (1,063 ft) tall, about the same height as an 81-storey building, and the tallest structure in Paris. Its base is square, measuring 125 metres (410 ft) on each side. During its construction, the Eiffel Tower surpassed the Washington Monument to become the tallest man-made structure in the world, a title it held for 41 years until the Chrysler Building in New York City was finished in 1930. It was the first structure to reach a height of 300 metres. Due to the addition of a broadcasting aerial at the top of the tower in 1957, it is now taller than the Chrysler Building by 5.2 metres (17 ft). Excluding transmitters, the Eiffel Tower is the second tallest free-standing structure in France after the Millau Viaduct."""

### Sample output from pretrained model

In [None]:
# Baseline: summarize the sample with the freshly loaded, un-tuned checkpoint.
generate_summary(text_sample, model_before_tuning)

(tensor([[   2,    0,   20,  381, 4822,  523, 7186,   11, 2201,   34,   57, 4142,
          1357,    7,    5,  285,    4,    2]]),
 [' The Eiffel Tower in Paris has been officially opened to the public.'])

### Sample output from the fine-tuned model

In [None]:
# Use the fine-tuned `model` here; passing `model_before_tuning` again would
# just reproduce the baseline summary from the previous cell.
generate_summary(text_sample, model)

(tensor([[   2,    0,   20,  381, 4822,  523, 7186,   11, 2201,   34,   57, 4142,
          1357,    7,    5,  285,    4,    2]]),
 [' The Eiffel Tower in Paris has been officially opened to the public.'])

# Gradio App Integration

In [None]:
!pip install gradio --quiet 

In [None]:
def generate_summary(test_samples):
    """Summarize text with the notebook-level ``model`` and return one string.

    This one-argument version replaces the earlier
    ``generate_summary(test_samples, model)`` defined above in this notebook,
    so that it can be used directly as the Gradio callback.

    Args:
        test_samples: Text to summarize.

    Returns:
        The first decoded summary (special tokens removed) as a plain string.
    """
    encoded = tokenizer(
        test_samples,
        padding="max_length",
        truncation=True,
        max_length=encoder_max_length,
        return_tensors="pt",
    )
    # Inputs must live on the same device as the model.
    source_ids = encoded.input_ids.to(model.device)
    source_mask = encoded.attention_mask.to(model.device)
    generated_ids = model.generate(source_ids, attention_mask=source_mask)
    decoded_texts = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)
    return decoded_texts[0]

# Longer how-to style passage used to try out the one-argument generate_summary before wiring it into Gradio.
test = """This step will only work if he can see you during your communications. Having closed off body language or looking like you are in a rush may allow him to realize that his communication attempt with you is unwanted.  When he approaches you again, try looking away, slouching, fidgeting, or yawning, to signal that you are not interested.  Be careful not to accidentally signal interest with body language such as leaning in or laughing. Sometimes telling someone you are not interested is not enough, or there may not have been an opportunity to pull him aside and break the news. Keeping any communications short and to the point will help him get the hint that you are not interested. It will also make it more difficult for him to continue communicating as there will be less to discuss. For example, if he texts you and asks you how you your day is going and whether you want to go for dinner, you might ignore answering the question about your day and just say "thanks for the offer but no thanks!" If he has not yet gotten the hint and speaking with him directly has not helped, it is time to stop all communication. Do not give into feelings of guilt about backing off from the situation. If you believe that removing this individual from your life is a good idea, then keep that in mind if you start to feel guilty. Guilt motivates us to repair relationships, but sometimes it tries to motivate us when it is actually not in our best long term interest to do so.  If, after you say no to his date requests, he tries to make you feel guilty by saying things such as "I'm going through a hard time right now so you saying no really stings" keep in mind that guilt can misfire and lead you to make poor decisions. 
Just because you are stopping your communications does not mean you should delete the communications he sends you, particularly if you feel that you are or may be stalked by this individual, in which case it is a good idea to have a record of all communications should you need them for legal purposes. In severe cases, such as if you feel you are being stalked, changing your email address, phone number, or in the most severe cases, your home address and/or work location will greatly influence your chances of getting any unwanted individual to leave you alone."""

In [None]:
# Sanity check: the one-argument version returns a single summary string.
generate_summary(test)

'Close off body language. Tell him that you are not interested. Stop all communication.'

In [None]:
import gradio as gr

# NOTE(review): gradio is installed unpinned in this notebook; confirm the
# installed release still provides the `gr.inputs` / `gr.outputs` namespaces.
summary_input = gr.inputs.Textbox(lines=7, label="Text to summarize")
summary_output = gr.outputs.Textbox(label="Summary")

# Wrap the one-argument generate_summary in a simple text-in / text-out web UI.
demo = gr.Interface(
    generate_summary,
    [summary_input],
    summary_output,
    title="Text Summarizer",
)
demo.launch()

Colab notebook detected. To show errors in colab notebook, set `debug=True` in `launch()`
This share link will expire in 72 hours. If you need a permanent link, visit: https://gradio.app/introducing-hosted
Running on External URL: https://25949.gradio.app


(<Flask 'gradio.networking'>,
 'http://127.0.0.1:7867/',
 'https://25949.gradio.app')