# Training

In [None]:
!pip install transformers[torch]
!pip install SentencePiece


Collecting accelerate>=0.21.0 (from transformers[torch])
  Downloading accelerate-0.29.3-py3-none-any.whl (297 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m297.6/297.6 kB[0m [31m4.2 MB/s[0m eta [36m0:00:00[0m
Collecting nvidia-cuda-nvrtc-cu12==12.1.105 (from torch->transformers[torch])
  Using cached nvidia_cuda_nvrtc_cu12-12.1.105-py3-none-manylinux1_x86_64.whl (23.7 MB)
Collecting nvidia-cuda-runtime-cu12==12.1.105 (from torch->transformers[torch])
  Using cached nvidia_cuda_runtime_cu12-12.1.105-py3-none-manylinux1_x86_64.whl (823 kB)
Collecting nvidia-cuda-cupti-cu12==12.1.105 (from torch->transformers[torch])
  Using cached nvidia_cuda_cupti_cu12-12.1.105-py3-none-manylinux1_x86_64.whl (14.1 MB)
Collecting nvidia-cudnn-cu12==8.9.2.26 (from torch->transformers[torch])
  Using cached nvidia_cudnn_cu12-8.9.2.26-py3-none-manylinux1_x86_64.whl (731.7 MB)
Collecting nvidia-cublas-cu12==12.1.3.1 (from torch->transformers[torch])
  Using cached nvidia_cublas_cu

In [None]:
# Importing required libraries
import numpy as np
import pandas as pd
import torch
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader, RandomSampler, SequentialSampler

# Importing the T5 modules from huggingface/transformers
from transformers import T5Tokenizer, T5ForConditionalGeneration
from torch.cuda.amp import GradScaler, autocast


In [None]:
scaler = GradScaler()


In [None]:
# # Setting up the device for GPU usage
from torch import cuda
device = 'cuda' if cuda.is_available() else 'cpu'


In [None]:
# Creating a custom dataset for reading the dataframe and loading it into the dataloader to pass it to the neural network at a later stage for finetuning the model and to prepare it for predictions
class CustomDataset(Dataset):
    def __init__(self, dataframe, tokenizer, source_len, summ_len):
        self.tokenizer = tokenizer
        self.data = dataframe
        self.source_len = source_len
        self.summ_len = summ_len
        self.text = self.data.text
        self.ctext = self.data.ctext

    def __len__(self):
        return len(self.text)

    def __getitem__(self, index):
        ctext = str(self.ctext[index])
        ctext = ' '.join(ctext.split())

        text = str(self.text[index])
        text = ' '.join(text.split())

        source = self.tokenizer.batch_encode_plus([ctext], max_length= self.source_len, pad_to_max_length=True,return_tensors='pt')
        target = self.tokenizer.batch_encode_plus([text], max_length= self.summ_len, pad_to_max_length=True,return_tensors='pt')

        source_ids = source['input_ids'].squeeze()
        source_mask = source['attention_mask'].squeeze()
        target_ids = target['input_ids'].squeeze()
        target_mask = target['attention_mask'].squeeze()

        return {
            'source_ids': source_ids.to(dtype=torch.long),
            'source_mask': source_mask.to(dtype=torch.long),
            'target_ids': target_ids.to(dtype=torch.long),
            'target_ids_y': target_ids.to(dtype=torch.long)
        }

In [None]:
def train(epoch, tokenizer, model, device, loader, optimizer):
    model.train()
    for _, data in enumerate(loader, 0):
        y = data['target_ids'].to(device, dtype=torch.long)
        y_ids = y[:, :-1].contiguous()
        lm_labels = y[:, 1:].clone().detach()
        lm_labels[y[:, 1:] == tokenizer.pad_token_id] = -100
        ids = data['source_ids'].to(device, dtype=torch.long)
        mask = data['source_mask'].to(device, dtype=torch.long)

        # Automatic Mixed Precision context
        with autocast():
            outputs = model(input_ids=ids, attention_mask=mask, decoder_input_ids=y_ids, labels=lm_labels)
            loss = outputs.loss

        if _ % 500 == 0:
            print(f'Epoch: {epoch}, Loss:  {loss.item()}')

        optimizer.zero_grad()
        # Use scaler to scale the loss for backward pass
        scaler.scale(loss).backward()
        scaler.step(optimizer)
        scaler.update()


In [None]:
def validate(epoch, tokenizer, model, device, loader):
    model.eval()
    predictions = []
    actuals = []
    with torch.no_grad():
        for _, data in enumerate(loader, 0):
            y = data['target_ids'].to(device, dtype = torch.long)
            ids = data['source_ids'].to(device, dtype = torch.long)
            mask = data['source_mask'].to(device, dtype = torch.long)

            generated_ids = model.generate(
                input_ids = ids,
                attention_mask = mask,
                max_length=150,
                num_beams=2,
                repetition_penalty=2.5,
                length_penalty=1.0,
                early_stopping=True
                )
            preds = [tokenizer.decode(g, skip_special_tokens=True, clean_up_tokenization_spaces=True) for g in generated_ids]
            target = [tokenizer.decode(t, skip_special_tokens=True, clean_up_tokenization_spaces=True)for t in y]
            if _%100==0:
                print(f'Completed {_}')

            predictions.extend(preds)
            actuals.extend(target)
    return predictions, actuals

In [None]:
# Defining some key variables that will be used later on in the training
TRAIN_BATCH_SIZE = 4    # input batch size for training (default: 64)
VALID_BATCH_SIZE = 4    # input batch size for testing (default: 1000)
TRAIN_EPOCHS = 10        # number of epochs to train (default: 10)
VAL_EPOCHS = 1
LEARNING_RATE = 1e-4    # learning rate (default: 0.01)
SEED = 42               # random seed (default: 42)
MAX_LEN = 256
SUMMARY_LEN = 256

In [None]:
# Set random seeds and deterministic pytorch for reproducibility
torch.manual_seed(SEED) # pytorch random seed
np.random.seed(SEED) # numpy random seed
torch.backends.cudnn.deterministic = True

# tokenzier for encoding the text
tokenizer = T5Tokenizer.from_pretrained("t5-base")

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


spiece.model:   0%|          | 0.00/792k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.39M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/1.21k [00:00<?, ?B/s]

You are using the default legacy behaviour of the <class 'transformers.models.t5.tokenization_t5.T5Tokenizer'>. This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. This should only be set if you understand what it means, and thoroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [None]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
df = pd.read_excel('/content/drive/MyDrive/T5 Summary Data_1.xlsx')
df.head()

Unnamed: 0,Text,Summary
0,A*STAR continued to contribute as a strategic\...,"A*STAR, Singapore's innovation hub, continued ..."
1,"A*STAR SIMTech Innovation Factory (SIF), in pa...",A*STAR SIMTech Innovation Factory has collabor...
2,The dedicated efforts of our researchers and s...,A*STAR's research team earned awards for their...
3,Name of subsidiary company\nAccelerate Technol...,"A*ccelerate, a wholly-owned subsidiary of A*ST..."
4,The Intelligent Vision Joint Lab is a R&D coll...,"The Intelligent Vision Joint Lab, a collaborat..."


In [None]:
df = df.rename(columns={'Text': 'ctext', 'Summary':'text'})
df = df[['text','ctext']]
df.ctext = 'summarize: ' + df.ctext # add prefix "summarize: " to input indicating the task
print(df.head())

                                                text  \
0  A*STAR, Singapore's innovation hub, continued ...   
1  A*STAR SIMTech Innovation Factory has collabor...   
2  A*STAR's research team earned awards for their...   
3  A*ccelerate, a wholly-owned subsidiary of A*ST...   
4  The Intelligent Vision Joint Lab, a collaborat...   

                                               ctext  
0  summarize: A*STAR continued to contribute as a...  
1  summarize: A*STAR SIMTech Innovation Factory (...  
2  summarize: The dedicated efforts of our resear...  
3  summarize: Name of subsidiary company\nAcceler...  
4  summarize: The Intelligent Vision Joint Lab is...  


In [None]:
train_size = 0.8

train_dataset=df.sample(frac=train_size,random_state = SEED)
val_dataset=df.drop(train_dataset.index).reset_index(drop=True)
train_dataset = train_dataset.reset_index(drop=True)

print("FULL Dataset: {}".format(df.shape))
print("TRAIN Dataset: {}".format(train_dataset.shape))
print("TEST Dataset: {}".format(val_dataset.shape))

FULL Dataset: (32, 2)
TRAIN Dataset: (0, 2)
TEST Dataset: (32, 2)


In [None]:
# Creating the Training and Validation dataset for further creation of Dataloader
training_set = CustomDataset(train_dataset, tokenizer, MAX_LEN, SUMMARY_LEN)
val_set = CustomDataset(val_dataset, tokenizer, MAX_LEN, SUMMARY_LEN)

In [None]:
# Creation of Dataloaders for testing and validation. This will be used down for training and validation stage for the model.
training_loader = DataLoader(training_set, batch_size=TRAIN_BATCH_SIZE, shuffle=True)
val_loader = DataLoader(val_set, batch_size=VALID_BATCH_SIZE, shuffle=False)

In [None]:
# Defining the model. We are using t5-base model and added a Language model layer on top for generation of Summary.
# Further this model is sent to device (GPU/TPU) for using the hardware.
model = T5ForConditionalGeneration.from_pretrained("t5-base")
model = model.to(device)

# Defining the optimizer that will be used to tune the weights of the network in the training session.
optimizer = torch.optim.Adam(params=model.parameters(), lr=LEARNING_RATE)


model.safetensors:   0%|          | 0.00/892M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/147 [00:00<?, ?B/s]

In [None]:
# Training loop (taking around 22 mins) (8:00 until ...  8:10)
print('Initiating Fine-Tuning for the model on our dataset')

for epoch in range(TRAIN_EPOCHS):
    train(epoch, tokenizer, model, device, training_loader, optimizer)


Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.


Initiating Fine-Tuning for the model on our dataset




Epoch: 0, Loss:  8.142342567443848
Epoch: 1, Loss:  1.7627187967300415
Epoch: 2, Loss:  1.4514206647872925
Epoch: 3, Loss:  1.6115971803665161
Epoch: 4, Loss:  1.1380404233932495
Epoch: 5, Loss:  1.175763487815857
Epoch: 6, Loss:  0.9170572757720947
Epoch: 7, Loss:  0.8899930715560913
Epoch: 8, Loss:  0.8801516890525818
Epoch: 9, Loss:  0.6297731399536133


In [None]:
print('Now generating summaries on our fine tuned model for the validation dataset and saving it in a dataframe')
for epoch in range(VAL_EPOCHS):
    predictions, actuals = validate(epoch, tokenizer, model, device, val_loader)
    final_df = pd.DataFrame({'Generated Text':predictions,'Actual Text':actuals})

Now generating summaries on our fine tuned model for the validation dataset and saving it in a dataframe
Completed 0


In [None]:
print(final_df.head())

                                      Generated Text  \
0  A&W Revenue Royalties Income Fund Management D...   
1  Royalty Pool Same Store Sales Growth increased...   
2  the fair value of interest rate swaps and inde...   
3  the CEO and CFO have designed or caused to des...   
4  general and administrative expenses increased ...   

                                         Actual Text  
0  **A&W Income Fund MD&A: December 31, 2021** * ...  
1  In Q4 2021, A&W's same-store sales grew 13.8% ...  
2  The fair value of interest rate swaps is not c...  
3  The Fund's CEO and CFO have established intern...  
4  The Fund's expenses witnessed an increase in g...  


In [None]:
#to evaluate the generated text using metrics like "bleu" and "rouge"
!pip install evaluate
import evaluate
import nltk
nltk.download('punkt')
from nltk.tokenize import word_tokenize

Collecting evaluate
  Downloading evaluate-0.4.1-py3-none-any.whl (84 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m84.1/84.1 kB[0m [31m3.3 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting datasets>=2.0.0 (from evaluate)
  Downloading datasets-2.19.0-py3-none-any.whl (542 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m542.0/542.0 kB[0m [31m12.1 MB/s[0m eta [36m0:00:00[0m
Collecting dill (from evaluate)
  Downloading dill-0.3.8-py3-none-any.whl (116 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m9.5 MB/s[0m eta [36m0:00:00[0m
Collecting xxhash (from evaluate)
  Downloading xxhash-3.4.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (194 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m194.1/194.1 kB[0m [31m13.8 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting multiprocess (from evaluate)
  Downloading multiprocess-0.70.16-py310-none-any.whl (134 kB)
[2K     [9

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


In [None]:
metric = evaluate.load("bleu")
references = [ [a] for a in actuals ]
results = metric.compute(predictions=predictions, references=references, tokenizer=word_tokenize)
results

In [None]:
!pip install rouge_score
metric = evaluate.load('rouge')
results = metric.compute(predictions=predictions, references=references, tokenizer=word_tokenize)
results

Collecting rouge_score
  Downloading rouge_score-0.1.2.tar.gz (17 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: rouge_score
  Building wheel for rouge_score (setup.py) ... [?25l[?25hdone
  Created wheel for rouge_score: filename=rouge_score-0.1.2-py3-none-any.whl size=24933 sha256=71a838cbca9df18ea3108ce03d204c5a033caed65c72c883da038829cdbdc150
  Stored in directory: /root/.cache/pip/wheels/5f/dd/89/461065a73be61a532ff8599a28e9beef17985c9e9c31e541b4
Successfully built rouge_score
Installing collected packages: rouge_score
Successfully installed rouge_score-0.1.2


Downloading builder script:   0%|          | 0.00/6.27k [00:00<?, ?B/s]

{'rouge1': 0.5216326931146267,
 'rouge2': 0.25792032395231035,
 'rougeL': 0.3994198999958677,
 'rougeLsum': 0.39958374376625816}

In [None]:
model.save_pretrained('/content/drive/MyDrive/t5_model1_epochs_10')
tokenizer.save_pretrained('/content/drive/MyDrive/t5_tokenizer1_epochs_10')

('/content/drive/MyDrive/t5_tokenizer1_epochs_10/tokenizer_config.json',
 '/content/drive/MyDrive/t5_tokenizer1_epochs_10/special_tokens_map.json',
 '/content/drive/MyDrive/t5_tokenizer1_epochs_10/spiece.model',
 '/content/drive/MyDrive/t5_tokenizer1_epochs_10/added_tokens.json')

#Inference

In [3]:
# Importing required libraries
from google.colab import drive
import numpy as np
import pandas as pd
import torch
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader, RandomSampler, SequentialSampler

# Importing the T5 modules from huggingface/transformers
from transformers import T5Tokenizer, T5ForConditionalGeneration
from torch.cuda.amp import GradScaler, autocast

# # Setting up the device for GPU usage
from torch import cuda
device = 'cuda' if cuda.is_available() else 'cpu'


drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [10]:
PDF_section = "OVERVIEW OF OUR BUSINESS FERRARI IS AMONG THE WORLD’S LEADING LUXURY BRANDS, FOCUSED ON THE DESIGN, ENGINEERING, PRODUCTION AND SALE OF THE WORLD’S MOST RECOGNIZABLE LUXURY PERFORMANCE SPORTS CARS. €3,460 million, EBIT of €716 million, net profit of €609 million and earnings before interest, taxes, depreciation, and amortization (EBITDA) of €1,143 million. For additional information regarding EBITDA, including a reconciliation of EBITDA to net profit, as well as other non-GAAP measures we present, see “Operating Results – Non-GAAP Financial Measures”. WHILST BROADENING OUR PRODUCT PORTFOLIO TO TARGET A LARGER CUSTOMER BASE, WE CONTINUE TO PURSUE A LOW VOLUME PRODUCTION STRATEGY IN ORDER TO MAINTAIN A REPUTATION FOR EXCLUSIVITY AND SCARCITY AMONG PURCHASERS OF OUR CARS AND WE CAREFULLY MANAGE OUR PRODUCTION VOLUMES AND DELIVERY WAITING LISTS TO PROMOTE THIS REPUTATION. We divide our regional markets into (i) EMEA, (ii) Americas, (iii) Mainland China, Hong Kong and Taiwan, and (iv) Rest of APAC, which represented respectively 52.8 percent, 25.5 percent, 5.0 percent and 16.7 percent of units shipped in 2020. The geographical distribution of shipments in 2020 reflects deliberate allocations driven by the phase-in pace of individual models. Shipments in 2020 decreased as a result of the seven-week production suspension in the first half of 2020 and the temporary closure of certain dealerships caused by the COVID-19 pandemic, with a partial recovery of production and shipments in the second half of the year. WE FOCUS OUR MARKETING AND PROMOTION EFFORTS IN THE INVESTMENTS WE MAKE IN OUR RACING ACTIVITIES AND IN PARTICULAR, SCUDERIA FERRARI’S PARTICIPATION IN THE FORMULA 1 WORLD CHAMPIONSHIP Our brand symbolizes exclusivity, innovation, state-of-the-art sporting performance and Italian design and engineering heritage. Our name and history and the image enjoyed by our cars are closely associated with our Formula 1 racing team, Scuderia Ferrari, the most successful team in Formula 1 history. From the inaugural year of Formula 1 in 1950 through the present, Scuderia Ferrari has won 238 Grand Prix races, 16 Constructor World titles and 15 Drivers’ World titles. We are the only team which has taken part in more than 1,000 Formula 1 races. We believe our history of excellence, technological innovation and defining style transcends the automotive industry, and is the foundation of the Ferrari brand and image. We design, engineer and produce our cars in Maranello, Italy, and sell them in over 60 markets worldwide through a network of 168 authorized dealers operating 188 points of sale as of the end of 2020. WE BELIEVE OUR CARS ARE THE EPITOME OF PERFORMANCE, LUXURY AND STYLING. Our product offering comprises four main pillars: the sports range, the GT range, special series and Icona, a line of modern cars inspired by our iconic cars of the past. Our current product range (including cars presented in 2020, for which shipments will commence in 2021) is comprised of six sports cars (SF90 Stradale, SF90 Spider, Ferrari F8 Tributo, Ferrari F8 Spider, 812 Superfast and 812 GTS), two GT cars (Ferrari Roma and Ferrari Portofino M) as well as two versions of our first Icona car, the Ferrari Monza SP1 and the Ferrari Monza SP2. In 2020 we completed shipments of the GTC4Lusso and the GTC4Lusso T, as well as our most recent special series models, the Ferrari 488 Pista and the Ferrari 488 Pista Spider, which completed their respective lifecycles in 2020. We also produce limited edition hypercars and one-off cars. Our most recent hypercar, the LaFerrari Aperta, was launched in 2016 to celebrate our 70th Anniversary and finished its limited series run in 2018. We followed up our record of 5 model launches in 2019 with the unveiling in 2020 of the Ferrari Portofino M and the SF90 Spider, with shipments of both models expected to commence in 2021. In 2020, we shipped 9,119 cars and recorded net revenues of"

In [12]:
def generate_summary(model, tokenizer, device, text_to_summarize, max_length=1500):
    model.eval()
    input_ids = tokenizer.encode("summarize: " + text_to_summarize, return_tensors="pt", add_special_tokens=True)
    input_ids = input_ids.to(device)

    with torch.no_grad():
      summary_ids = model.generate(input_ids, max_length=max_length, num_beams=5, early_stopping=True)

    summary = tokenizer.decode(summary_ids[0], skip_special_tokens=True)
    return summary

# Load your model and tokenizer
model_path = '/content/drive/MyDrive/t5_model1_epochs_10'
tokenizer_path = '/content/drive/MyDrive/t5_tokenizer1_epochs_10'

model = T5ForConditionalGeneration.from_pretrained(model_path)
tokenizer = T5Tokenizer.from_pretrained(tokenizer_path)
model.to(device)

article = PDF_section

summary = generate_summary(model, tokenizer, device, article)
print(summary)


Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


FERRARI's LUXURY BUSINESS focuses on the design, engineering, production, and sale of the world's most recognisable sports cars: €3,460 million, EBITDA: €716 million, net profit: €609 million, and Icona: €1,143 million. Regional markets: EMEA, Americas, Mainland China, Hong Kong, Taiwan, and Rest of APAC account for 52.8% of units shipped in 2020, primarily due to production suspensions and the Formula 1 racing team's winning cars. The current product range includes six sports cars, two hypercar launches, with shipments expected to commence in 2021.


#Validation set

In [None]:
!pip install rouge_score

Collecting rouge_score
  Downloading rouge_score-0.1.2.tar.gz (17 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: rouge_score
  Building wheel for rouge_score (setup.py) ... [?25l[?25hdone
  Created wheel for rouge_score: filename=rouge_score-0.1.2-py3-none-any.whl size=24933 sha256=5e7e7b8d1a1f9808e7de291b5c01869d950784815f390ab31000954250781fed
  Stored in directory: /root/.cache/pip/wheels/5f/dd/89/461065a73be61a532ff8599a28e9beef17985c9e9c31e541b4
Successfully built rouge_score
Installing collected packages: rouge_score
Successfully installed rouge_score-0.1.2


In [None]:
# Assuming 'new_df' is your new DataFrame with the text and summaries
df = pd.read_excel('/content/drive/MyDrive/T5 Summary Data_1.xlsx')

df = df.rename(columns={'Text': 'ctext', 'Summary':'text'})
df = df[['text','ctext']]
df.ctext = 'summarize: ' + df.ctext # add prefix "summarize: " to input indicating the task
print(df.head())

# Tokenize and encode the new dataset
new_dataset = CustomDataset(df, tokenizer, MAX_LEN, SUMMARY_LEN)

# Create a DataLoader for the new dataset
new_loader = DataLoader(new_dataset, batch_size=VALID_BATCH_SIZE, shuffle=False)

# Generate predictions and actuals
new_predictions, new_actuals = validate(0, tokenizer, model, device, new_loader)

# Calculate BLEU and ROUGE metrics
import evaluate
from nltk.tokenize import word_tokenize
nltk.download('punkt')

bleu_metric = evaluate.load("bleu")
rouge_metric = evaluate.load('rouge')

# BLEU
bleu_references = [ [a] for a in new_actuals ]
bleu_results = bleu_metric.compute(predictions=new_predictions, references=bleu_references, tokenizer=word_tokenize)

# ROUGE
rouge_results = rouge_metric.compute(predictions=new_predictions, references=new_actuals, tokenizer=word_tokenize)

print("BLEU score:", bleu_results)
print("ROUGE score:", rouge_results)


                                                text  \
0  A*STAR, Singapore's innovation hub, continued ...   
1  A*STAR SIMTech Innovation Factory has collabor...   
2  A*STAR's research team earned awards for their...   
3  A*ccelerate, a wholly-owned subsidiary of A*ST...   
4  The Intelligent Vision Joint Lab, a collaborat...   

                                               ctext  
0  summarize: A*STAR continued to contribute as a...  
1  summarize: A*STAR SIMTech Innovation Factory (...  
2  summarize: The dedicated efforts of our resear...  
3  summarize: Name of subsidiary company\nAcceler...  
4  summarize: The Intelligent Vision Joint Lab is...  




Completed 0


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


BLEU score: {'bleu': 0.2313721072798704, 'precisions': [0.5227906976744187, 0.2785646836638338, 0.17497603068072867, 0.1124634858812074], 'brevity_penalty': 1.0, 'length_ratio': 1.1174636174636174, 'translation_length': 2150, 'reference_length': 1924}
ROUGE score: {'rouge1': 0.552375254044064, 'rouge2': 0.29854327774295086, 'rougeL': 0.4348465250493224, 'rougeLsum': 0.43440305264465195}
