# Install required libs

In [2]:
!pip install transformers datasets pandas torch






In [1]:
# Import libraries
import pandas as pd
from transformers import BartTokenizer, BartForConditionalGeneration, Trainer, TrainingArguments
from datasets import Dataset
import torch

2025-01-07 21:50:52.142069: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1736283052.287816   32255 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1736283052.320066   32255 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2025-01-07 21:50:52.545552: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


# Load and prepare the Datset

In [2]:
# Import Dataset from Hugging Face library
# This allows us to apply tokenization and other operations more effectively
#  within the Hugging Face framework
from datasets import Dataset  


try:
    # 1. Load your dataset
    df = pd.read_csv("/home/yassine/Textra-edu/outputs/extractive_summarized_dataframe_final.csv")  
    print(f"Dataset loaded successfully with {len(df)} rows.")

    # Filter and drop NaN values
    df = df[["full_text", "extractive_summarized_text"]].dropna()  # Remove NaN values to avoid errors.
    print(f"Filtered dataset has {len(df)} rows after removing NaNs.")

    # Filter out overly long summaries
    tokenizer = BartTokenizer.from_pretrained("facebook/bart-large-cnn")  # Initialize tokenizer
    df['summary_token_length'] = df['extractive_summarized_text'].apply(
        lambda x: len(tokenizer(x)["input_ids"])
    )
    df_filtered = df[df['summary_token_length'] <= 1024].drop(columns=['summary_token_length'])
    print(f"Filtered dataset has {len(df_filtered)} rows after removing long summaries.")

    if len(df_filtered) == 0:
        raise ValueError("No valid summaries left after filtering. Check your data.")

    # Convert the filtered DataFrame to Hugging Face Dataset
    dataset = Dataset.from_pandas(df_filtered)
except FileNotFoundError:
    print("Error: File 'extractive_summarized_dataframe_final.csv' not found. Check the file path.")
except Exception as e:
    print(f"An error occurred: {e}")


Dataset loaded successfully with 1009 rows.
Filtered dataset has 944 rows after removing NaNs.
Filtered dataset has 944 rows after removing long summaries.


The tokenize_function processes the dataset by converting text into numerical representations (tokens) that the model can understand. Here's what it does step by step:

1. Tokenize Input Text (full_text):
Converts the input text (examples["full_text"]) into tokens using the model's tokenizer. It truncates or pads the text to a fixed length (max_length=1024) to ensure all inputs are uniform in size.

2. Tokenize Target Text (extractive_summarized_text):
Similarly, converts the summary (examples["extractive_summarized_text"]) into tokens. It truncates or pads to a shorter fixed length (max_length=128), since summaries are typically much shorter than input text.

3. Create Model Inputs:

The tokenized full_text is saved as model_inputs.
The tokenized extractive_summarized_text is added as labels to model_inputs. These labels will guide the model during fine-tuning by showing the expected output.

In [4]:
from datasets import Dataset

def tokenize_function(examples):
    model_inputs = tokenizer(
        examples["full_text"], max_length=1024, truncation=True, padding="max_length"
    )
    labels = tokenizer(
        text_target=examples["extractive_summarized_text"],
        max_length=128,  # Keep summaries concise
        truncation=True,
        padding="max_length"
    )
    model_inputs["labels"] = labels["input_ids"]
    return model_inputs

tokenized_datasets = dataset.map(tokenize_function, batched=True)


Map:   0%|          | 0/944 [00:00<?, ? examples/s]

Final Output:
The function returns a dictionary for each data point containing:

    . Input tokens (model_inputs): The tokenized full text.
    . Labels: The tokenized summary that the model should generate.

# Define the Model and Training Args 

In [5]:
from transformers import BartForConditionalGeneration, TrainingArguments, Trainer
import torch

model = BartForConditionalGeneration.from_pretrained("facebook/bart-large-cnn")

# Move model to GPU if available
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

# Define training arguments
training_args = TrainingArguments(
    output_dir="outputs/bart-distilled-from-scibert",  # Directory to save the model
    per_device_train_batch_size=2,  # Adjust based on memory availability
    gradient_accumulation_steps=8,  # Accumulate gradients to simulate larger batch size
    num_train_epochs=2,  # Number of epochs to train
    learning_rate=5e-5,
    weight_decay=0.01,
    save_strategy="epoch",  # Save model after each epoch
    fp16=torch.cuda.is_available(),  # Use mixed precision (FP16) if GPU is available
    push_to_hub=False,
    logging_steps=50,  # Log progress every 50 steps
    evaluation_strategy="no"  # No evaluation during training
)



In [6]:
trainer = Trainer(
    model=model,  # Your fine-tuned BART model
    args=training_args,  # TrainingArguments object
    train_dataset=tokenized_datasets,  # Tokenized dataset to train on
)

In [7]:
trainer.train()
trainer.save_model("outputs/bart-fine-tuned")  # Save the model after training


Step,Training Loss
50,19.7979
100,15.7179




In [8]:
model.save_pretrained("outputs/bart-fine-tuned")


In [13]:
!ls TunedModels/

bart-distilled-from-scibert  bart-fine-tuned


In [11]:
!ls -a outputs/

.  ..  bart-distilled-from-scibert  bart-fine-tuned


In [1]:
import torch
print(torch.cuda.is_available()) 

True


# Testing the fine-tuned model 

In [4]:
!ls TunedModels/bart-fine-tuned/

config.json  generation_config.json  model.safetensors	training_args.bin


## Loading the model and the tokenizer 

In [8]:
from transformers import BartTokenizer

# Re-initialize the tokenizer from the original pre-trained model
tokenizer = BartTokenizer.from_pretrained("facebook/bart-large-cnn")

# Save the tokenizer to your fine-tuned model directory
tokenizer.save_pretrained("TunedModels/bart-fine-tuned")

print("Tokenizer saved successfully!")

Tokenizer saved successfully!


In [10]:
from transformers import BartForConditionalGeneration, BartTokenizer

model_dir = "TunedModels/bart-fine-tuned"

# Load the fine-tuned model 

model = BartForConditionalGeneration.from_pretrained(model_dir)

# Load the tokenizer 

tokenizer = BartTokenizer.from_pretrained(model_dir)

print("Model and tokenizer loaded successfully!")

Model and tokenizer loaded successfully!


In [28]:
def generate_summary(model, tokenizer, text):
    """
    Generate a summary for the given input text using the fine-tuned BART model.
    
    Args:
    - model: The fine-tuned BART model.
    - tokenizer: The tokenizer corresponding to the BART model.
    - text: The input text to summarize.
    
    Returns:
    - str: The generated summary.
    """
    # Tokenize the input text
    inputs = tokenizer(
        text, 
        return_tensors="pt", 
        max_length=1024,  # Ensure it matches the model's input size
        truncation=True, 
        padding="max_length"
    )

    # Move inputs to the same device as the model
    input_ids = inputs["input_ids"].to(model.device)
    attention_mask = inputs["attention_mask"].to(model.device)

    # Generate the summary
    generated_ids = model.generate(
        input_ids=input_ids,
        attention_mask=attention_mask,
        max_length=300,  # Maximum length of the generated summary
        num_beams=5,     # Use beam search for diversity
        early_stopping=True,  # Stop generation upon reaching end token
        repetition_penalty=5.0,  # Penalize repetitions
        no_repeat_ngram_size=3,  # Prevent repeating n-grams
        temperature=2.0,  # Sampling temperature for randomness
        top_k=50,  # Limit to top-k tokens
        top_p=0.9  # Nucleus sampling for diversity
    )

    # Decode and return the generated summary
    summary = tokenizer.decode(generated_ids[0], skip_special_tokens=True)
    return summary

In [29]:
# Example usage
if __name__ == "__main__":
    # Input text for testing
    input_text = """
   Encoder: The encoder is composed of a stack of N = 6 identical layers. Each layer has two
sub-layers. The first is a multi-head self-attention mechanism, and the second is a simple, positionwise fully connected feed-forward network. We employ a residual connection [11] around each of
the two sub-layers, followed by layer normalization [1]. That is, the output of each sub-layer is
LayerNorm(x + Sublayer(x)), where Sublayer(x) is the function implemented by the sub-layer
itself. To facilitate these residual connections, all sub-layers in the model, as well as the embedding
layers, produce outputs of dimension dmodel = 512.
Decoder: The decoder is also composed of a stack of N = 6 identical layers. In addition to the two
sub-layers in each encoder layer, the decoder inserts a third sub-layer, which performs multi-head
attention over the output of the encoder stack. Similar to the encoder, we employ residual connections
around each of the sub-layers, followed by layer normalization. We also modify the self-attention
sub-layer in the decoder stack to prevent positions from attending to subsequent positions. This
masking, combined with fact that the output embeddings are offset by one position, ensures that the
predictions for position i can depend only on the known outputs at positions less than i.
    """

    # Generate and print the summary
    summary = generate_summary(model, tokenizer, input_text)
    print("Input Text:\n", input_text)
    print("\nGenerated Summary:\n", summary)



Input Text:
 
   Encoder: The encoder is composed of a stack of N = 6 identical layers. Each layer has two
sub-layers. The first is a multi-head self-attention mechanism, and the second is a simple, positionwise fully connected feed-forward network. We employ a residual connection [11] around each of
the two sub-layers, followed by layer normalization [1]. That is, the output of each sub-layer is
LayerNorm(x + Sublayer(x)), where Sublayer(x) is the function implemented by the sub-layer
itself. To facilitate these residual connections, all sub-layers in the model, as well as the embedding
layers, produce outputs of dimension dmodel = 512.
Decoder: The decoder is also composed of a stack of N = 6 identical layers. In addition to the two
sub-layers in each encoder layer, the decoder inserts a third sub-layer, which performs multi-head
attention over the output of the encoder stack. Similar to the encoder, we employ residual connections
around each of the sub-layers, followed by layer norm