In [1]:
import pandas as pd

In [2]:
df = pd.read_csv('/home/yassine/Textra-edu/good_summaries_filtered.csv')


In [None]:
fine_tune_data = df[['Original Text Chunk', 'Summary']]
fine_tune_data.columns = ['input', 'output']

# Save as JSONL
fine_tune_data.to_json('/home/yassine/Textra-edu/fine_tune/fine_tune_data.jsonl', orient='records', lines=True)

# Save as CSV
fine_tune_data.to_csv('/home/yassine/Textra-edu/fine_tune/fine_tune_data.csv', index=False )


# Fine-Tuning Script 

In [3]:
from datasets import load_dataset 
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM, Seq2SeqTrainingArguments, Seq2SeqTrainer
from transformers import DataCollatorForSeq2Seq

dataset = load_dataset("json", data_files="/home/yassine/Textra-edu/fine_tune/fine_tune_data.jsonl")

dataset = dataset["train"].train_test_split(test_size=0.1)


In [4]:
model_name = "google-t5/t5-small" 
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSeq2SeqLM.from_pretrained(model_name)


In [5]:
# Tokenize the dataset
def preprocess_data(examples):
    inputs = tokenizer(examples["input"], max_length=512, truncation=True )
    labels = tokenizer(examples["output"], max_length=128, truncation=True)
    inputs["labels"] = labels["input_ids"]
    return inputs

tokenized_dataset = dataset.map(preprocess_data, batched=True)
data_collator = DataCollatorForSeq2Seq(tokenizer, model=model )

# Set up training arguments
training_args = Seq2SeqTrainingArguments(
    output_dir="/home/yassine/Textra-edu/fine_tuned_model",
    evaluation_strategy="epoch",  # Ensure evaluation is performed at each epoch
    save_strategy="epoch",
    load_best_model_at_end=True,  # Required for EarlyStoppingCallback
    metric_for_best_model="eval_loss",
    greater_is_better=False,
    learning_rate=2e-5,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    num_train_epochs=10,
    save_total_limit=2,
    predict_with_generate=True,
    logging_dir="/home/yassine/Textra-edu/logs",
)



Map:   0%|          | 0/171 [00:00<?, ? examples/s]

Map:   0%|          | 0/19 [00:00<?, ? examples/s]



In [6]:
from transformers import EarlyStoppingCallback

# Initialise Trainer 

trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset["train"],
    eval_dataset=tokenized_dataset["test"],
    tokenizer=tokenizer,
    data_collator=data_collator,
    callbacks=[EarlyStoppingCallback(early_stopping_patience=5)]
)

  trainer = Seq2SeqTrainer(


In [11]:
from accelerate import Accelerator 

In [13]:
# Train the model 
accelerator = Accelerator()
trainer.train()

Passing a tuple of `past_key_values` is deprecated and will be removed in Transformers v4.48.0. You should pass an instance of `EncoderDecoderCache` instead, e.g. `past_key_values=EncoderDecoderCache.from_legacy_cache(past_key_values)`.


Epoch,Training Loss,Validation Loss
1,No log,1.816722
2,No log,1.667318
3,No log,1.601467
4,No log,1.555593
5,No log,1.527846
6,No log,1.508461
7,No log,1.494589
8,No log,1.492947
9,No log,1.486762
10,No log,1.485001


There were missing keys in the checkpoint model loaded: ['encoder.embed_tokens.weight', 'decoder.embed_tokens.weight', 'lm_head.weight'].


TrainOutput(global_step=220, training_loss=1.4513499866832387, metrics={'train_runtime': 3036.4423, 'train_samples_per_second': 0.563, 'train_steps_per_second': 0.072, 'total_flos': 62314325999616.0, 'train_loss': 1.4513499866832387, 'epoch': 10.0})

In [8]:
import os

model_path = "/home/yassine/Textra-edu/fine_tuned_model/checkpoint-220"
print("Files in model directory:", os.listdir(model_path))

Files in model directory: ['scheduler.pt', 'tokenizer_config.json', 'generation_config.json', 'tokenizer.json', 'spiece.model', 'optimizer.pt', 'rng_state.pth', 'config.json', 'model.safetensors', 'training_args.bin', 'trainer_state.json', 'special_tokens_map.json']


In [2]:
from transformers import T5Tokenizer, T5ForConditionalGeneration
import os

# Path to the fine-tuned model
model_path = "/home/yassine/Textra-edu/fine_tuned_model/checkpoint-220"

# Load the model and tokenizer
try:
    tokenizer = T5Tokenizer.from_pretrained(model_path)
    model = T5ForConditionalGeneration.from_pretrained(model_path)
    print("Model and tokenizer loaded successfully!")
except Exception as e:
    print(f"Error loading model or tokenizer: {e}")
    exit()

# Summarization function
def summarize_text(input_text, max_length=100, min_length=30, length_penalty=2.0, num_beams=4):
    """
    Summarize input text using the fine-tuned T5 model.
    """
    # Tokenize and encode input text
    input_ids = tokenizer.encode("summarize: " + input_text, return_tensors="pt", truncation=True)

    # Generate summary
    summary_ids = model.generate(
        input_ids,
        max_length=max_length,
        min_length=min_length,
        length_penalty=length_penalty,
        num_beams=num_beams,
        early_stopping=True
    )

    # Decode the output
    return tokenizer.decode(summary_ids[0], skip_special_tokens=True)



You are using the default legacy behaviour of the <class 'transformers.models.t5.tokenization_t5.T5Tokenizer'>. This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. This should only be set if you understand what it means, and thoroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565


Model and tokenizer loaded successfully!


In [3]:
text_to_summarize = """
1/ Collecter des données en entreprises, puis passer ensuite à la mise
en place de l’IA dans l’entreprise.
Au contraire, dès que l’entreprise a commencé à collecter des données,
elle doit commencer à les transmettre à une équipe IA, qui pourra
donner des informations/feedback à l’équipe IT de l’entreprise du type
de données à collecter et les types d’infrastructures informatiques à
continuer à développer.
Ex : Peut-être qu’une équipe d’IA peut examiner les données de votre usine et
dire, au lieu de collecter les données de cette grosse machine juste toutes les
10mn, mais de le faire toutes les minutes. Cela permettrait de faire un meilleur
travail en créant un système de maintenance préventive pour vous. IL y a
beaucoup de va et vient entre les équipes de l’UT et de l’IA.
Le conseil est de recueillir les commentaires de l’IA plutôt, car elle peut
aider à orienter le développement de l’infrastructure informatique."""

summary = summarize_text(text_to_summarize, max_length=2056)
print(summary)

l’entreprise a commencé à collecter des données, puis passer ensuite à la mise en place de l’IA. l’équipe IT de l’entreprise du type de données à collecter et les types d’infrastructures informatiques à continuer à développer.


# Testing metrics 

In [4]:
from transformers import T5Tokenizer, T5ForConditionalGeneration
from sentence_transformers import SentenceTransformer, util
from rouge_score import rouge_scorer
import torch

embedding_model = SentenceTransformer('all-MiniLM-L6-v2')  # Small and fast pre-trained model


In [6]:
def calculate_semantic_similarity(original_text, summary_text):
    """
    Calculate semantic similarity between the original text and the generated summary.
    """
    # Compute embeddings
    embeddings = embedding_model.encode([original_text, summary_text], convert_to_tensor=True)
    similarity = util.pytorch_cos_sim(embeddings[0], embeddings[1])
    return similarity.item()

def calculate_coverage(original_text, summary_text):
    """
    Calculate coverage using ROUGE-L (longest common subsequence) as a proxy.
    """
    scorer = rouge_scorer.RougeScorer(['rouge1', 'rouge2', 'rougeL'], use_stemmer=True)
    scores = scorer.score(original_text, summary_text)
    return scores['rougeL'].fmeasure

In [7]:
semantic_similarity = calculate_semantic_similarity(text_to_summarize, summary)
coverage = calculate_coverage(text_to_summarize, summary)

print(f"\nMetrics:\nSemantic Similarity: {semantic_similarity:.4f}\nCoverage (ROUGE-L): {coverage:.4f}")




Metrics:
Semantic Similarity: 0.7009
Coverage (ROUGE-L): 0.3365
