# **1. Installation and importing libraries**

In [22]:
# Installation of basic Natural Language Processing tools
!pip install nltk contractions

# Installation of machine learning and transformers libraries
!pip install transformers datasets accelerate -U
!pip install transformers[torch]

# Audio processing libraries
!pip install ffmpeg-python
!pip install whisper
!pip install git+https://github.com/openai/whisper.git
!pip install noisereduce

# Speech recognition and evaluation metrics
!pip install jiwer
!pip install rouge_score

Collecting git+https://github.com/openai/whisper.git
  Cloning https://github.com/openai/whisper.git to /tmp/pip-req-build-4t6tl5dy
  Running command git clone --filter=blob:none --quiet https://github.com/openai/whisper.git /tmp/pip-req-build-4t6tl5dy
  Resolved https://github.com/openai/whisper.git to commit ba3f3cd54b0e5b8ce1ab3de13e32122d0d5f98ab
  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone


In [58]:
# Natural Language Processing tools
import nltk
nltk.download('wordnet')
from nltk.tokenize import word_tokenize
from nltk.translate.bleu_score import corpus_bleu, sentence_bleu
from nltk.translate.meteor_score import meteor_score

# Regex and string operations for text manipulation
import re
import string

# Audio processing and handling libraries
import ffmpeg
import librosa
import noisereduce as nr
import soundfile as sf
import whisper
import torch

# Machine Learning and Transformer models from Hugging Face
from transformers import T5Tokenizer, T5ForConditionalGeneration, Trainer, TrainingArguments
from transformers import AutoModelForSeq2SeqLM, AutoTokenizer
from torch.utils.data import Dataset

# Datasets and metrics for model training and evaluation
from datasets import load_metric, load_dataset

# Calculation of edit distance for evaluation metrics
from editdistance import eval as editdistance_eval

# Speech recognition evaluation
import jiwer

# System and subprocess handling
import subprocess

# Integration with Google Colab environment
from google.colab import drive

[nltk_data] Downloading package wordnet to /root/nltk_data...


# **2. Extract video transcript using whisper model**

In [24]:
# 1-Extracts the audio from a video
def video_to_audio(video_path, audio_path="temp_audio.wav"):
    """
    Extracts the audio from a video file and saves it as a WAV file.
    """
    (
        ffmpeg
        .input(video_path)
        .output(audio_path, ac=1, ar="16k")  # Convert to mono and 16kHz sample rate
        .run(overwrite_output=True)
    )
    print(f"Audio extracted to {audio_path}")
    return audio_path

In [25]:
# 2-Preprocessing the audio before giving it to Whisper
def reduce_noise(audio_path):
    """
    Reduces noise from the extracted audio file using the noisereduce library.
    """
    audio, sr = librosa.load(audio_path, sr=None)  # Load the audio file
    noise_clip = audio[:int(0.5 * sr)]  # Assuming first 0.5 seconds is noise
    audio_cleaned = nr.reduce_noise(y=audio, sr=sr, y_noise=noise_clip)

    sf.write(audio_path, audio_cleaned, sr)  # Using soundfile to write the WAV file
    print("Noise reduced audio saved back to the same path.")

In [66]:
# 3-Transcribes the audio file using Whisper
def transcribe_audio(audio_path):
    """
    Transcribes the audio file using Whisper and returns the transcription text.
    """
    # Load the model
    model = whisper.load_model("base")

    # Perform the transcription. This automatically loads and processes the audio file.
    result = model.transcribe(audio_path)

    # Extract the transcription text
    transcription = result["text"]
    print("Transcription:\n", transcription)
    return transcription

# Example usage
video_path = "/content/Show younger children why eating their fruit and veg is good for them.mp4"
audio_path = video_to_audio(video_path)
reduce_noise(audio_path)  # Apply noise reduction
transcription = transcribe_audio(audio_path)

Audio extracted to temp_audio.wav
Noise reduced audio saved back to the same path.
Transcription:
  You probably know that it's important to eat fruit and vegetables. But do you know why? Fruit and vegetables contain some very important nutrients that our bodies need. These include vitamins, minerals and fibre. Do you know what vitamins and minerals are? Vitamin and minerals help our bodies to work properly. We only need them in smaller amounts, but we do need lots of different ones. The very best way to make sure we get all the nutrients we need is to eat a wide variety of different foods. All of the healthy food that we eat contains vitamins and minerals. To get enough of all the vitamins and minerals our bodies need, it's very important to eat a variety of fruit and vegetables. And vitamins do some very important jobs indeed. Vitamin A found in orange fruit and vegetables such as apricots and carrots and dark leafy vegetables such as the boy cabbage helps to keep our vision healthy.

In [27]:
# 4-Evalouate the Whisper model preformance
def compute_wer(reference_text, transcribed_text):
    wer = jiwer.wer(reference_text, transcribed_text)
    return wer

def compute_cer(reference_text, transcribed_text):
    cer = editdistance.eval(reference_text, transcribed_text) / len(reference_text)
    return cer

# Example usage
reference_text = "You probably know that it's important to eat fruit and vegetables, but do you know why? Fruit and vegetables contain some very important nutrients that our bodies need. These include vitamins, minerals and fibre. Do you know what vitamins and minerals are? We need vitamins and minerals help our bodies to work properly. We only need them in small amounts, but we do need lots of different ones. The very best way to make sure we get all the nutrients we need is to eat a wide variety of different foods. All of the healthy food that we eat contains vitamins and minerals. To get enough of all the vitamins and minerals our bodies need, it's very important to eat a variety of fruit and vegetables. And vitamins do some very important jobs indeed. Vitamin A found in orange fruit and vegetables such as apricots and carrots and dark leafy vegetables such as Savoy cabbage helps to keep our vision healthy. Vitamin B1 helps our bodies release energy from food so that we don't feel tired. Lots of fruit and vegetables contain vitamin B1 including peas, spinach and mushrooms. Vitamin C, which is found in citrus fruits such as oranges and lemons as well as strawberries, tomatoes and broccoli, among others, is important for helping our bodies heal. For example, if we graze a knee. You can see that vitamins are really amazing. They protect us and make us strong. Fruits and vegetables are also a really important source of fiber. Fiber helps to keep our digestive system healthy. Do you know what is meant by your digestive system? Your digestive system allows your body to take in the nutrients from the food you eat, starting from your mouth, moving through your stomach, small intestine and large intestine. Fiber helps this process work properly in order to keep our bodies working as they should. We should make sure to eat at least five portions of fruit and vegetables a day to get all of the nutrients our bodies need from them. A portion is the amount you can hold in your cupped hand . And a great tip is to try and eat as many different colors as possible so that we get lots of different vitamins and minerals. What is some of your favourite fruits and vegetables? See if you can think of fruit and vegetables to match every color of the rainbow."
wer = compute_wer(reference_text, transcription)
cer = compute_cer(reference_text, transcription)

print("Word Error Rate (WER):", wer)
print("Character Error Rate (CER):", cer)

Word Error Rate (WER): 0.04455445544554455
Character Error Rate (CER): 0.016821602478972998


# **3. Dataset loading and  preprocessing**

In [28]:
# Load the dataset
dataset = load_dataset("cnn_dailymail", "3.0.0")

# Split the dataset
train_dataset = dataset["train"].select(range(50000))
val_dataset = dataset["validation"].select(range(10000))

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


Downloading readme:   0%|          | 0.00/15.6k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/257M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/257M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/259M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/34.7M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/30.0M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/287113 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/13368 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/11490 [00:00<?, ? examples/s]

In [30]:
# Function to clean text
def clean_text(text):
    # Remove URLs
    text = re.sub(r'http\S+', '', text)
    # Replace multiple whitespaces with a single space
    text = re.sub(r'\s+', ' ', text).strip()
    return text

In [31]:
# Tokenization function for preprocessing the data
def tokenize(batch):
    # Apply the clean_text function to each article and highlight in the batch
    cleaned_articles = [clean_text(article) for article in batch['article']]
    cleaned_highlights = [clean_text(highlight) for highlight in batch['highlights']]

    # Tokenize the cleaned articles and highlights
    tokenized_input = tokenizer(cleaned_articles, padding='max_length', truncation=True, max_length=512)
    tokenized_label = tokenizer(cleaned_highlights, padding='max_length', truncation=True, max_length=128)

    # Assign the input_ids from tokenized highlights to 'labels' in tokenized_input
    tokenized_input['labels'] = tokenized_label['input_ids']

    return tokenized_input

# Applying the function
train_dataset = train_dataset.map(tokenize, batched=True)
val_dataset = val_dataset.map(tokenize, batched=True)

# Set the datasets to return PyTorch tensors
train_dataset.set_format('torch', columns=['input_ids', 'attention_mask', 'labels'])
val_dataset .set_format('torch', columns=['input_ids', 'attention_mask', 'labels'])

Map:   0%|          | 0/50000 [00:00<?, ? examples/s]

Map:   0%|          | 0/10000 [00:00<?, ? examples/s]

# **4. Fine-tuning and training the T5-small model**

In [33]:
# Initialize the T5 tokenizer
tokenizer = T5Tokenizer.from_pretrained("t5-small")

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [34]:
# Load the pre-trained T5 model
model = T5ForConditionalGeneration.from_pretrained("t5-small")

In [35]:
# Training Arguments
training_args = TrainingArguments(
    output_dir='./results',           # Output directory
    num_train_epochs=3,               # Total number of training epochs
    per_device_train_batch_size=16,   # Batch size per device during training
    per_device_eval_batch_size=64,    # Batch size for evaluation
    warmup_steps=500,                 # Number of warmup steps for learning rate scheduler
    weight_decay=0.01,                # Weight decay if we apply some.
    logging_dir='./logs',             # Directory for storing logs
    learning_rate=5e-5,               # Learning rate
    adam_epsilon=1e-8,                # Epsilon for Adam optimizer
    max_grad_norm=1.0,
    #fp16=True ,
    gradient_accumulation_steps=4,  # Accumulate gradients to effectively have batch_size of 32
    # `no` to disable saving checkpoints
    save_strategy="no",
    # Evaluation strategy to 'no' , do not want to evaluate during training to speed it up
    evaluation_strategy="no"
)

In [None]:
# We basically saved the model, so we don't need to run this cell again

# Initialize the Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    tokenizer=tokenizer,
)

# Train the model
trainer.train()

# Save the fine-tuned model
trainer.save_model("./fine_tuned_t5_50k")

Step,Training Loss
500,3.6764
1000,1.1123
1500,1.0969
2000,1.0935


# **5. Inference the fine-tuned T5-small model**

In [47]:
# For using the saved model
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [48]:
# Unzip the 'fine_tuned_t5_50k' file from Google Drive to the '/content/model' directory
!unzip "/content/drive/MyDrive/fine_tuned_t5_50k" -d /content/model

unzip:  cannot find or open /content/drive/MyDrive/fine_tuned_t5_50k, /content/drive/MyDrive/fine_tuned_t5_50k.zip or /content/drive/MyDrive/fine_tuned_t5_50k.ZIP.


In [49]:
# Set up the device for model training or inference. Use GPU if available, otherwise use CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Define the path to the pre-trained model.
model_name = "/content/drive/MyDrive/fine_tuned_t5_50k"

# Load the pre-trained Seq2Seq language model from the specified directory and move it to the appropriate device (GPU or CPU).
modelsum = AutoModelForSeq2SeqLM.from_pretrained(model_name).to(device)

# Load the corresponding tokenizer for the pre-trained model from the same directory as the model.
tokenizer = AutoTokenizer.from_pretrained(model_name)

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [67]:
def generate_summary(article):
    modelsum.eval()  # put model in evaluation mode
    inputs = tokenizer.encode("summarize: " + article, return_tensors="pt", max_length=512, truncation=True)
    inputs = inputs.to(device)  # Move input tensor to the correct device
    summary_ids = modelsum.generate(inputs, max_length=150, length_penalty=2.0, num_beams=4, early_stopping=True)
    summary = tokenizer.decode(summary_ids[0], skip_special_tokens=True)
    return summary

cleaned_transcription = clean_text(transcription)
# Generate the summary for the cleaned text
summary = generate_summary(cleaned_transcription)
print(summary)

Fruit and vegetables contain vitamins, minerals and fibre. Vitamin B1 helps our bodies release energy from food so that we don't feel tired. Fruit and vegetables contain vitamin B1 including peas, spinach and mushrooms.


# **6. Evaluation the output from the inference**

In [68]:
# 1-Evaluate generated summaries against a manually written reference using (ROUGE)
rouge = load_metric('rouge')
r="Fruit and vegetables are essential because they contain important nutrients our bodies need, including vitamins, minerals, and fibre. Vitamins like Vitamin A, found in orange fruits and vegetables such as apricots and carrots, help keep our vision healthy. Vitamin B1, present in peas, spinach, and mushrooms, helps release energy from food, preventing tiredness. Vitamin C, found in citrus fruits like oranges and lemons, as well as strawberries, tomatoes, and broccoli, aids in healing wounds. Consuming a variety of these foods ensures we get the necessary nutrients, and it's recommended to eat at least five portions daily. Fiber from these foods also supports a healthy digestive system, helping our body absorb these nutrients effectively."

cleaned_r = clean_text(r)
# Use the metric
scores = rouge.compute(predictions=[summary], references=[cleaned_r])

# Print out the scores with precision, recall, and F-measure
for key, score in scores.items():
    print(f"{key}:")
    print(f"  Precision: {score.mid.precision:.4f}")
    print(f"  Recall: {score.mid.recall:.4f}")
    print(f"  F-measure: {score.mid.fmeasure:.4f}\n")

You can avoid this message in future by passing the argument `trust_remote_code=True`.
Passing `trust_remote_code=True` will be mandatory to load this metric from the next major release of `datasets`.


rouge1:
  Precision: 0.7429
  Recall: 0.2281
  F-measure: 0.3490

rouge2:
  Precision: 0.4118
  Recall: 0.1239
  F-measure: 0.1905

rougeL:
  Precision: 0.4857
  Recall: 0.1491
  F-measure: 0.2282

rougeLsum:
  Precision: 0.4857
  Recall: 0.1491
  F-measure: 0.2282



In [69]:
# 2-Evaluate using (BLEU score)

def compute_bleu(reference, candidate):
    # Tokenizing the reference and the candidate texts
    reference_tokens = word_tokenize(reference)
    candidate_tokens = word_tokenize(candidate)

    # BLEU expects a list of references as a list of tokens and candidate as a list of tokens
    score = sentence_bleu([reference_tokens], candidate_tokens)
    return score

# Calculate BLEU score
bleu_score = compute_bleu(summary,cleaned_r)

# Print the BLEU score
print(f"BLEU score: {bleu_score:.4f}")

BLEU score: 0.0562


In [70]:
# 3-Evaluate using (METEOR score)
def compute_meteor_multiref(references, candidate):
    # Tokenizing references and candidate
    reference_tokens_list = [word_tokenize(ref) for ref in references]
    candidate_tokens = word_tokenize(candidate)

    # Calculate the METEOR score
    scores = [meteor_score([ref_tokens], candidate_tokens) for ref_tokens in reference_tokens_list]
    avg_score = sum(scores) / len(scores)
    return avg_score

# Calculate METEOR score
meteor = compute_meteor_multiref(summary, cleaned_r)

# Print the METEOR score
print(f"Average METEOR score: {meteor:.4f}")

Average METEOR score: 0.0047


# **7. Evaluation the fine-tuned T5-small model on the test data**

In [60]:
# Generates a clean summary of the test data
def generate_clean_summary_test(model, tokenizer, text, device):

    # Clean the input text to remove unwanted characters or spaces
    cleaned_text = clean_text(text)

    # Put the model in evaluation mode to disable dropout and batch normalization
    model.eval()

    # Encode the cleaned text using the tokenizer. Prepend it with 'summarize: ' to guide the model's generation.
    inputs = tokenizer.encode("summarize: " + cleaned_text, return_tensors="pt", max_length=512, truncation=True).to(device)

    # Generate summary IDs using the model
    summary_ids = model.generate(inputs, max_length=128, min_length=40, length_penalty=2.0, num_beams=4, early_stopping=True)

    # Decode the generated IDs back to text string while skipping special tokens like padding or EOS tokens.
    summary = tokenizer.decode(summary_ids[0], skip_special_tokens=True)

    # Return the processed summary text
    return summary

In [61]:
# Select a subset of the test dataset for evaluation purposes
subset = dataset["test"].select(range(50))

In [62]:
# 1-Evaluate generated summaries against dataset highlights using (ROUGE).
rouge_metric = load_metric('rouge')

def evaluate_summaries_rouge(dataset, model, tokenizer, device):
    # Generate and clean summaries
    generated_summaries = [generate_clean_summary_test(model, tokenizer, article, device) for article in subset["article"]]
    cleaned_references = [clean_text(ref) for ref in subset["highlights"]]

    # Compute ROUGE scores
    rouge_scores = rouge_metric.compute(predictions=generated_summaries, references=cleaned_references)

    # Print ROUGE scores
    for key in rouge_scores:
        score = rouge_scores[key]
        precision = score.mid.precision
        recall = score.mid.recall
        fmeasure = score.mid.fmeasure
        print(f"{key}:")
        print(f"  Precision: {precision:.4f}")
        print(f"  Recall: {recall:.4f}")
        print(f"  F-measure: {fmeasure:.4f}")
        print("-" * 40)  # Separator line

# Call the evaluation function
evaluate_summaries_rouge(dataset, modelsum, tokenizer, device)

You can avoid this message in future by passing the argument `trust_remote_code=True`.
Passing `trust_remote_code=True` will be mandatory to load this metric from the next major release of `datasets`.


rouge1:
  Precision: 0.2795
  Recall: 0.3657
  F-measure: 0.3082
----------------------------------------
rouge2:
  Precision: 0.1175
  Recall: 0.1576
  F-measure: 0.1307
----------------------------------------
rougeL:
  Precision: 0.2206
  Recall: 0.2860
  F-measure: 0.2421
----------------------------------------
rougeLsum:
  Precision: 0.2202
  Recall: 0.2871
  F-measure: 0.2427
----------------------------------------


In [63]:
# 2-Evaluate using (BLEU SCORE).

def evaluate_bleu_score(articles, highlights):

    # Clean articles before generating summaries
    clean_articles = [clean_text(article) for article in articles]

    # Generate summaries from the cleaned articles
    generated_summaries = [generate_summary(clean_article) for clean_article in clean_articles]

    # Clean and tokenize generated summaries
    clean_generated_summaries = [clean_text(summary) for summary in generated_summaries]
    tokenized_generated_summaries = [word_tokenize(summary) for summary in clean_generated_summaries]

    # Clean and tokenize reference summaries
    clean_reference_summaries = [clean_text(ref) for ref in highlights]
    tokenized_reference_summaries = [word_tokenize(ref) for ref in clean_reference_summaries]

    # Calculate BLEU scores
    bleu_score = corpus_bleu([[ref] for ref in tokenized_reference_summaries], tokenized_generated_summaries, weights=(1, 0, 0, 0))

    return bleu_score


#  'subset' has 'article' and 'highlights' fields
articles = [article['article'] for article in subset]  # Collect articles from the subset
highlights = [highlight['highlights'] for highlight in subset]  # Collect corresponding highlights

# Calculate the BLEU score using the function
bleu_score = evaluate_bleu_score(articles, highlights)
print(f"BLEU-1 score (corpus): {bleu_score:.4f}")

BLEU-1 score (corpus): 0.3075


In [64]:
# 3-Evaluate using (METEOR SCORE).
def evaluate_meteor_score(articles, highlights):

    # Clean articles, generate summaries, and clean summaries
    generated_summaries = [generate_summary(clean_text(article)) for article in articles]

    # Clean and tokenize generated summaries
    clean_generated_summaries = [clean_text(summary) for summary in generated_summaries]
    tokenized_generated_summaries = [word_tokenize(summary) for summary in clean_generated_summaries]

    # Clean and tokenize reference summaries
    clean_reference_summaries = [clean_text(ref) for ref in highlights]
    tokenized_reference_summaries = [[word_tokenize(ref)] for ref in clean_reference_summaries]

    # Compute METEOR scores
    meteor_scores = [meteor_score(refs, gen) for refs, gen in zip(tokenized_reference_summaries, tokenized_generated_summaries)]

    # Calculate and print the average METEOR score
    avg_meteor_score = sum(meteor_scores) / len(meteor_scores)
    print(f"Average METEOR Score: {avg_meteor_score:.4f}")

    # Print individual METEOR scores
    for i, score in enumerate(meteor_scores):
        print(f"Summary {i+1}: METEOR score = {score:.4f}")

# Call the function
evaluate_meteor_score(articles, highlights)

Average METEOR Score: 0.2996
Summary 1: METEOR score = 0.5256
Summary 2: METEOR score = 0.1805
Summary 3: METEOR score = 0.3350
Summary 4: METEOR score = 0.1681
Summary 5: METEOR score = 0.2341
Summary 6: METEOR score = 0.2715
Summary 7: METEOR score = 0.1849
Summary 8: METEOR score = 0.1426
Summary 9: METEOR score = 0.3466
Summary 10: METEOR score = 0.4959
Summary 11: METEOR score = 0.3827
Summary 12: METEOR score = 0.2774
Summary 13: METEOR score = 0.5491
Summary 14: METEOR score = 0.1233
Summary 15: METEOR score = 0.2717
Summary 16: METEOR score = 0.0896
Summary 17: METEOR score = 0.4317
Summary 18: METEOR score = 0.2071
Summary 19: METEOR score = 0.4226
Summary 20: METEOR score = 0.1108
Summary 21: METEOR score = 0.2477
Summary 22: METEOR score = 0.2598
Summary 23: METEOR score = 0.2452
Summary 24: METEOR score = 0.3049
Summary 25: METEOR score = 0.4097
Summary 26: METEOR score = 0.1737
Summary 27: METEOR score = 0.4430
Summary 28: METEOR score = 0.2463
Summary 29: METEOR score = 0