In [5]:
import pandas as pd
import numpy as np

from datasets import Dataset

import torch
import torch.nn as nn

from transformers import AutoModelForSeq2SeqLM, AutoTokenizer, pipeline, set_seed
from transformers import DataCollatorForSeq2Seq
from transformers import Seq2SeqTrainer
from transformers import Seq2SeqTrainingArguments

import evaluate


In [2]:
!pip3 install datasets

Collecting datasets
  Downloading datasets-2.18.0-py3-none-any.whl (510 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m510.5/510.5 kB[0m [31m6.5 MB/s[0m eta [36m0:00:00[0m
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl (116 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m15.3 MB/s[0m eta [36m0:00:00[0m
Collecting multiprocess (from datasets)
  Downloading multiprocess-0.70.16-py310-none-any.whl (134 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m134.8/134.8 kB[0m [31m18.9 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: dill, multiprocess, datasets
Successfully installed datasets-2.18.0 dill-0.3.8 multiprocess-0.70.16


In [3]:
!pip3 install "accelerate>=0.21.0"

Collecting accelerate>=0.21.0
  Downloading accelerate-0.27.2-py3-none-any.whl (279 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/280.0 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [91m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m[90m╺[0m [32m276.5/280.0 kB[0m [31m9.2 MB/s[0m eta [36m0:00:01[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m280.0/280.0 kB[0m [31m7.2 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: accelerate
Successfully installed accelerate-0.27.2


In [4]:
!pip3 install evaluate

Collecting evaluate
  Downloading evaluate-0.4.1-py3-none-any.whl (84 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/84.1 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m84.1/84.1 kB[0m [31m2.7 MB/s[0m eta [36m0:00:00[0m
Collecting responses<0.19 (from evaluate)
  Downloading responses-0.18.0-py3-none-any.whl (38 kB)
Installing collected packages: responses, evaluate
Successfully installed evaluate-0.4.1 responses-0.18.0


In [6]:
import accelerate
print(accelerate.__version__)


0.27.2


In [7]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [8]:
# Set device
device = "cuda" if torch.cuda.is_available() else "cpu"

In [9]:
print(device)

cuda


# Data Loading

### Load pandas Dataframe

In [10]:
# Load the dataset
file_path="/content/drive/MyDrive/Reviews.csv"
df = pd.read_csv(file_path).head(100)
df_eval=pd.read_csv(file_path)[100:110]

In [11]:
df=df[["Summary","Text"]]
df_eval=df_eval[["Summary","Text"]]

### Use datasets to transform the dataframe

In [12]:
dataset=Dataset.from_pandas(df)

In [13]:
dataset

Dataset({
    features: ['Summary', 'Text'],
    num_rows: 100
})

In [14]:
dataset = dataset.train_test_split(test_size=0.1)

In [15]:
dataset["train"]

Dataset({
    features: ['Summary', 'Text'],
    num_rows: 90
})

In [16]:
dataset["train"][0] #dataset is a dictionary like data structure

{'Summary': 'Great allergy sensitive dog food, dogs love it',
 'Text': "Our pup has experienced allergies in forms of hotspots and itching from other dog foods. The cheap 'you can buy it anywhere' food not only have crazy preservatives in them but can cause health problems for your pets.  This food works wonders on reducing allergies and our dog loves the food.<br />This message is RAMSEY FrAnkenSteiN approved."}

In [17]:
dataset["test"]

Dataset({
    features: ['Summary', 'Text'],
    num_rows: 10
})

# Data Preprocessing

### Tokenize the data

In [18]:
model_name = "facebook/bart-large-cnn"
tokenizer = AutoTokenizer.from_pretrained(model_name)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


config.json:   0%|          | 0.00/1.58k [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

In [19]:
# Tokenize the dataset
def preprocess_function(examples):
    '''
    args: The input is dataset (contain train and test)
    returns : add input_ids ( index of the tokenized text from the word vocabulary ) and labels (index of the tokenized summary from the word vocabulary ),
    attention_mask (1 for real tokens and 0 for padding tokens) of the input text
    '''
    model_inputs = tokenizer(examples["Text"], max_length=1024, truncation=True)
    with tokenizer.as_target_tokenizer():
        labels = tokenizer(examples["Summary"], max_length=512, truncation=True)
    model_inputs["labels"] = labels["input_ids"]
    return model_inputs


In [20]:
tokenized_dataset = dataset.map(preprocess_function, batched=True)

Map:   0%|          | 0/90 [00:00<?, ? examples/s]



Map:   0%|          | 0/10 [00:00<?, ? examples/s]

In [21]:
tokenized_dataset

DatasetDict({
    train: Dataset({
        features: ['Summary', 'Text', 'input_ids', 'attention_mask', 'labels'],
        num_rows: 90
    })
    test: Dataset({
        features: ['Summary', 'Text', 'input_ids', 'attention_mask', 'labels'],
        num_rows: 10
    })
})

In [22]:
print(len(tokenized_dataset["train"]["input_ids"][0]))
print(type(tokenized_dataset["train"]["input_ids"][0]))

77
<class 'list'>


In [23]:
print(len(tokenized_dataset["train"]["input_ids"][1]))
print(type(tokenized_dataset["train"]["input_ids"][1]))

73
<class 'list'>


In [24]:
tokenized_dataset = tokenized_dataset.remove_columns(['Summary', 'Text'])

In [25]:
tokenized_dataset

DatasetDict({
    train: Dataset({
        features: ['input_ids', 'attention_mask', 'labels'],
        num_rows: 90
    })
    test: Dataset({
        features: ['input_ids', 'attention_mask', 'labels'],
        num_rows: 10
    })
})

### Define the data collector

In [26]:
data_collator = DataCollatorForSeq2Seq(tokenizer=tokenizer)

# Training

### Define the model

#### Model name and tokenizer

In [27]:
# Set model, tokenizer, and metric
model_name = "facebook/bart-large-cnn"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSeq2SeqLM.from_pretrained(model_name).to(device)

model.safetensors:   0%|          | 0.00/1.63G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/363 [00:00<?, ?B/s]

In [28]:
sent_tokenize=tokenizer

#### Model *parameters*

In [29]:
batch_size = 8
num_train_epochs = 1

# Show training loss at every epoch
logging_steps = len(tokenized_dataset["train"]) // batch_size

# Set up training argument
args = Seq2SeqTrainingArguments(
    output_dir=f"{model_name}-finetuned-claudius",
    evaluation_strategy="epoch",
    learning_rate=5.6e-5,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    weight_decay=0.01,
    save_total_limit=3,
    num_train_epochs=num_train_epochs,
    predict_with_generate=True,
    logging_steps=logging_steps,)

#### Evaluation mertices

In [30]:
!pip3 install rouge_score

Collecting rouge_score
  Downloading rouge_score-0.1.2.tar.gz (17 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: rouge_score
  Building wheel for rouge_score (setup.py) ... [?25l[?25hdone
  Created wheel for rouge_score: filename=rouge_score-0.1.2-py3-none-any.whl size=24933 sha256=17aa3fc9ad8944cf9fde5e525caaee72d9a9accad62cd4b4b602755144cb6ac6
  Stored in directory: /root/.cache/pip/wheels/5f/dd/89/461065a73be61a532ff8599a28e9beef17985c9e9c31e541b4
Successfully built rouge_score
Installing collected packages: rouge_score
Successfully installed rouge_score-0.1.2


In [31]:
rouge_score = evaluate.load("rouge")

Downloading builder script:   0%|          | 0.00/6.27k [00:00<?, ?B/s]

In [32]:
def compute_rouge_score(pred_data):
  """Compute ROUGE scores"""
  # Retrieve predictions
  predictions, labels = pred_data

  # Decode predicted summaries
  decoded_predictions = tokenizer.batch_decode(predictions, skip_special_tokens=True)

  # Replace -100 in the labels as we can't decode them
  labels = np.where(labels != -100, labels, tokenizer.pad_token_id)

  # Decode reference summaries into text
  decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)

  # ROUGE expects a newline after each sentence
  decoded_predictions = ["\n".join(sent_tokenize(pred.strip())) for pred in decoded_predictions]
  decoded_labels = ["\n".join(sent_tokenize(label.strip())) for label in decoded_labels]

  # Compute ROUGE scores
  result = rouge_score.compute(predictions=decoded_predictions, references=decoded_labels, use_stemmer=True)

  return {k: round(v, 4) for k, v in result.items()}

In [33]:
# Start training
trainer = Seq2SeqTrainer(
    model,
    args,
    train_dataset=tokenized_dataset["train"],
    eval_dataset=tokenized_dataset["test"],
    data_collator=data_collator,
    tokenizer=tokenizer,
    compute_metrics=compute_rouge_score,)

In [34]:
trainer.train()

Epoch,Training Loss,Validation Loss,Rouge1,Rouge2,Rougel,Rougelsum
1,3.0435,3.917325,1.0,1.0,1.0,1.0


TrainOutput(global_step=12, training_loss=3.067586521307627, metrics={'train_runtime': 14.9432, 'train_samples_per_second': 6.023, 'train_steps_per_second': 0.803, 'total_flos': 40751724822528.0, 'train_loss': 3.067586521307627, 'epoch': 1.0})

In [35]:
#save the model
# Save the model
model_path = f"/content/drive/My Drive/{model_name}-finetuned"
trainer.save_model(model_path)

# Save the tokenizer in the same directory
tokenizer.save_pretrained(model_path)

Non-default generation parameters: {'max_length': 142, 'min_length': 56, 'early_stopping': True, 'num_beams': 4, 'length_penalty': 2.0, 'no_repeat_ngram_size': 3, 'forced_bos_token_id': 0, 'forced_eos_token_id': 2}


('/content/drive/My Drive/facebook/bart-large-cnn-finetuned/tokenizer_config.json',
 '/content/drive/My Drive/facebook/bart-large-cnn-finetuned/special_tokens_map.json',
 '/content/drive/My Drive/facebook/bart-large-cnn-finetuned/vocab.json',
 '/content/drive/My Drive/facebook/bart-large-cnn-finetuned/merges.txt',
 '/content/drive/My Drive/facebook/bart-large-cnn-finetuned/added_tokens.json',
 '/content/drive/My Drive/facebook/bart-large-cnn-finetuned/tokenizer.json')

# Inference

In [36]:
# Call the model
tokenizer = AutoTokenizer.from_pretrained(model_path)
model = AutoModelForSeq2SeqLM.from_pretrained(model_path)

In [39]:
df_eval.reset_index(inplace=True)

In [43]:
df_eval.drop(["index"],axis=1,inplace=True)

In [44]:
df_eval

Unnamed: 0,Summary,Text
0,Taste wise it is a 6 star item,"The mouth says, ""How do I love thee, let me co..."
1,Great Support,Arrived slightly thawed. My parents wouldn't a...
2,TART!,The crust on these tarts are perfect. My husb...
3,Omaha Apple Tartlets,These are absolutely scrumptuous! My husband ...
4,Loved these Tartlets,What a nice alternative to an apple pie. Love ...
5,The best,I like Creme Brulee. I loved that these were s...
6,disappointing,not what I was expecting in terms of the compa...
7,Wasting Vinegar on a Cucumber is a Shame!,I first bought pickled asparagus at an Amish m...
8,Asparagus Bliss,"I love asparagus. Up until very recently, I h..."
9,My Idea of a Good Diet Food.,I'm presently on a diet and I was at my Fresh ...


In [37]:
def generate_prediction(input_text, tokenizer, model):
  """Generate summary prediction for inference set"""
  # Tokenize input text
  inputs = tokenizer(input_text, return_tensors="pt", padding=True, truncation=True, max_length=1024)

  # Generate summaries
  output_sequences = model.generate(
      input_ids=inputs["input_ids"],
      attention_mask=inputs["attention_mask"],
      max_length=1024,
      no_repeat_ngram_size=2, # Parameters to improve output quality
      early_stopping=True
  )

  # Decode and return prediction
  return tokenizer.decode(output_sequences[0], skip_special_tokens=True)

In [45]:
summary_lis=[]
for i in range(len(df_eval)):
  summary_lis.append(generate_prediction(df_eval["Text"][i],tokenizer,model))

In [47]:
df_eval["generated summary"]=summary_lis

In [48]:
df_eval.to_csv("generated_summary.csv")

In [53]:
#simply put a sentences
sentence="I love China since China is my homeland. Moreover, China has many fantastic places and a grand and long history. The most important is that I have many friends and family memebers in China. "

In [55]:
generate_prediction(sentence,tokenizer,model)

"Great friends and family in China. Great history and great places to visit. I love you, China! I'm in love with my homeland, my home country, and my best friend in all the world, you are in my heart!   I'll never forget you!"