In [None]:
!pip install datasets evaluate transformers[sentencepiece]
!pip install accelerate
!pip install huggingface_hub
!pip install nltk
!pip3 install absl-py rouge_score

In [None]:
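# NOTE: never store a Hugging Face access token in the notebook; a token that was committed here is exposed and must be revoked and regenerated.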

In [None]:
# from huggingface_hub import notebook_login

# notebook_login()


In [None]:
!huggingface-cli login --token  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

In [None]:
from datasets import load_dataset
from datasets import Dataset
from datasets import concatenate_datasets

import evaluate
import nltk
import pandas as pd

import torch
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM, DataCollatorForSeq2Seq, Seq2SeqTrainingArguments , pipeline , Seq2SeqTrainer, AutoConfig

In [None]:
# Load the two news corpora from the Hugging Face Hub; the cells below use the
# "train" split of each.
eng_data = load_dataset("YoussefAnwar/English-news")
ar_data = load_dataset("YoussefAnwar/Arabic-news")

## Data Split and Mix Data

In [None]:
eng_data["train"]

In [None]:
ar_data["train"]

In [None]:
# Number of rows taken from the head of each "train" split for the first stage.
# NOTE(review): the origin of these exact values is not documented here — confirm.
eng_threshold = 2_000_000
# NOTE(review): the name is misspelled ("treshold"); it is referenced with this spelling below.
ar_treshold = 88881

In [None]:
# First stage: the first `eng_threshold` English rows plus the first `ar_treshold` Arabic rows.
first_stage_data  = concatenate_datasets([eng_data["train"].select(range(eng_threshold))
                                          , ar_data["train"].select(range(ar_treshold))])
# Second stage: the remaining English rows plus *every* Arabic row.
# NOTE(review): the Arabic range starts at 0, so the Arabic rows already used in the
# first stage are included again here — confirm this overlap is intended.
second_stage_data = concatenate_datasets([eng_data["train"].select(range(eng_threshold, eng_data["train"].num_rows))
                                          , ar_data["train"].select(range(ar_data["train"].num_rows))])


In [None]:
# Shuffle the concatenated English/Arabic rows with a fixed seed.
first_stage_data = first_stage_data.shuffle(seed=42)

In [None]:
# Shuffle the concatenated English/Arabic rows with a fixed seed.
second_stage_data = second_stage_data.shuffle(seed=42)

In [None]:
# Hold out 10% of each stage with a fixed seed; each name is rebound to the
# resulting train/test split container.
first_stage_data, second_stage_data = (
    stage.train_test_split(test_size=0.1, seed=42)
    for stage in (first_stage_data, second_stage_data)
)

In [None]:
# Rename the held-out "test" split to "validation" in both stages.
# The membership check keeps the cell safe to re-run: a bare `pop("test")` raises
# KeyError on a second execution because the split has already been renamed.
for _stage_splits in (first_stage_data, second_stage_data):
    if "test" in _stage_splits:
        _stage_splits["validation"] = _stage_splits.pop("test")

In [None]:
first_stage_data

In [None]:
second_stage_data

## Tokenization and Preprocessing

In [None]:
from transformers import AutoTokenizer
# Tokenizer of the google/mt5-small checkpoint; the tokenization and metric
# helpers defined later read it as a module-level global.
t5_tokenizer = AutoTokenizer.from_pretrained("google/mt5-small")

In [None]:
from datasets import DatasetDict

# Take a small subset for quick experiments: the first 10,000 training rows and
# the first 1,000 validation rows of the first-stage data.
train_sample = first_stage_data["train"].select(range(10000))
validation_sample = first_stage_data["validation"].select(range(1000))

# Bundle the two subsets into a new DatasetDict named `ds`.
ds = DatasetDict({
    'train': train_sample,
    'validation': validation_sample
})

# Print a summary of the new dataset.
print(ds)

In [None]:
def tokenize_sample_data(data):
    """Tokenize article bodies and titles for seq2seq training.

    Uses the module-level ``t5_tokenizer``. ``data["Body"]`` becomes the model
    input (truncated to 1024 tokens) and ``data["Title"]`` becomes the target
    (truncated to 128 tokens).

    Returns:
        dict with the keys "input_ids", "attention_mask" and "labels".
    """
    encoded_body = t5_tokenizer(data["Body"], max_length=1024, truncation=True)
    encoded_title = t5_tokenizer(data["Title"], max_length=128, truncation=True)

    model_inputs = {key: encoded_body[key] for key in ("input_ids", "attention_mask")}
    # The title token ids are what the model is trained to produce.
    model_inputs["labels"] = encoded_title["input_ids"]
    return model_inputs

# Tokenize the small experiment subset in batches of 128 rows and drop the raw
# text columns, leaving only the fields produced by tokenize_sample_data.
tokenized_ds = ds.map(
    tokenize_sample_data,
    batched=True,
    batch_size=128,
    remove_columns=["Body", "Title"],
)

tokenized_ds

In [None]:
ds["train"][0]

In [None]:
# Both stages are tokenized with identical settings: batches of 128 rows, with
# the raw text columns removed afterwards.
_tokenize_map_kwargs = dict(
    remove_columns=["Body", "Title"],
    batched=True,
    batch_size=128,
)

tokenized_ds_first_stage = first_stage_data.map(tokenize_sample_data, **_tokenize_map_kwargs)

tokenized_ds_second_stage = second_stage_data.map(tokenize_sample_data, **_tokenize_map_kwargs)

In [None]:
# Upload the tokenized second-stage splits to the Hugging Face Hub.
tokenized_ds_second_stage.push_to_hub('YoussefAnwar/tokenized_ds_second_stage')

In [None]:
# Reload the tokenized second-stage data from the Hub; this rebinds the variable
# to the downloaded copy.
tokenized_ds_second_stage = load_dataset('YoussefAnwar/tokenized_ds_second_stage')

In [None]:
# Prefer the GPU when PyTorch can see one, otherwise fall back to the CPU.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")

# Checkpoint configuration with the generation settings attached to it.
mt5_config = AutoConfig.from_pretrained(
    "google/mt5-small",
    num_beams=15,
    no_repeat_ngram_size=2,
    length_penalty=0.6,
    max_length=128,
)

# Load the pretrained seq2seq weights with that configuration, then move the
# model to the selected device.
model = AutoModelForSeq2SeqLM.from_pretrained("google/mt5-small", config=mt5_config)
model = model.to(device)

In [None]:
# Batch collator shared by the ad-hoc DataLoader and the trainer below; it is
# configured to return PyTorch tensors.
data_collator = DataCollatorForSeq2Seq(
  t5_tokenizer,
  model=model,
  return_tensors="pt")

In [None]:
import evaluate
import numpy as np
from nltk.tokenize import RegexpTokenizer

# ROUGE metric object; metrics_func below calls its compute() method.
rouge_metric = evaluate.load("rouge")

# define function for custom tokenization
def tokenize_sentence(arg):
    """Split ``arg`` into sub-word tokens of the global ``t5_tokenizer`` for ROUGE scoring.

    Special tokens are deliberately not added: with the default call the tokenizer
    appends its end-of-sequence token to every string, and that token would then
    count as a match between every prediction and reference, inflating the scores.

    Args:
        arg: the text to tokenize.

    Returns:
        The list of token strings for ``arg``.
    """
    encoded_arg = t5_tokenizer(arg, add_special_tokens=False)
    return t5_tokenizer.convert_ids_to_tokens(encoded_arg.input_ids)


# Sentence-ending punctuation for English and Arabic.
_SENTENCE_ENDINGS = (".", "!", "?", "؟")


def _ids_to_numpy(ids):
    """Return token ids as a NumPy array (accepts tuples, lists, arrays and torch tensors)."""
    if isinstance(ids, tuple):
        # Predictions can arrive as (generated_ids, ...); only the ids are decoded.
        ids = ids[0]
    if hasattr(ids, "detach"):
        # torch tensor, possibly living on the GPU
        ids = ids.detach().cpu().numpy()
    return np.asarray(ids)


def _one_sentence_per_line(text):
    """Terminate ``text`` with a period if needed and put each sentence on its own line."""
    if not text.endswith(_SENTENCE_ENDINGS):
        text = text + "."
    sentences = RegexpTokenizer(r'[^.!؟?]*[.!؟?]').tokenize(text)
    return "\n".join(sentence.strip() for sentence in sentences)


def metrics_func(eval_arg):
    """Compute ROUGE between generated and reference titles.

    Args:
        eval_arg: a ``(predictions, labels)`` pair of token-id batches. Each may be a
            NumPy array or a torch tensor; predictions may also be a tuple whose
            first element holds the ids.

    Returns:
        The result of ``rouge_metric.compute`` using ``tokenize_sentence`` as tokenizer.
    """
    preds, labels = eval_arg
    pad_id = t5_tokenizer.pad_token_id

    # -100 is a padding sentinel that is not a valid token id. It can appear in the
    # predictions as well as in the labels, and must be replaced before decoding.
    preds = _ids_to_numpy(preds)
    labels = _ids_to_numpy(labels)
    preds = np.where(preds != -100, preds, pad_id)
    labels = np.where(labels != -100, labels, pad_id)

    # Convert token ids to text
    text_preds = t5_tokenizer.batch_decode(preds, skip_special_tokens=True)
    text_labels = t5_tokenizer.batch_decode(labels, skip_special_tokens=True)

    # One sentence per line in both predictions and references.
    text_preds = [_one_sentence_per_line(pred) for pred in text_preds]
    text_labels = [_one_sentence_per_line(label) for label in text_labels]

    # Compute ROUGE score with custom tokenization
    return rouge_metric.compute(
        predictions=text_preds,
        references=text_labels,
        tokenizer=tokenize_sentence,
    )

In [None]:
from torch.utils.data import DataLoader

# Smoke test for metrics_func: generate for a single batch of five rows taken
# from the first 100 validation rows, then score that batch.
sample_dataloader = DataLoader(
    tokenized_ds_second_stage["validation"].select(range(100)).with_format("torch"),
    batch_size=5,
    collate_fn=data_collator,
)

# Only the first batch is needed.
batch = next(iter(sample_dataloader))
labels = batch["labels"]

# NOTE(review): no_repeat_ngram_size is 1 here but 2 in the model config and in
# gen_config — confirm the difference is intended.
with torch.no_grad():
    preds = model.generate(
        batch["input_ids"].to(device),
        max_length=128,
        num_beams=15,
        num_return_sequences=1,
        no_repeat_ngram_size=1,
        remove_invalid_values=True,
    )

metrics_func([preds, labels])

In [None]:
from transformers import Seq2SeqTrainingArguments

training_args = Seq2SeqTrainingArguments(
    # --- run identity and output ---
    output_dir="mt5-summarize-ar-en",
    run_name="mt5-ar-en-summarization",
    push_to_hub=True,
    log_level="error",
    logging_steps=10,
    save_steps=500,
    # --- optimisation ---
    num_train_epochs=1,
    optim="adafactor",
    learning_rate=5e-4,
    weight_decay=0.01,
    lr_scheduler_type="linear",
    warmup_steps=90,
    # --- batching: 4 samples x 8 accumulation steps = 32 samples per optimizer step per device ---
    per_device_train_batch_size=4,
    gradient_accumulation_steps=8,
    per_device_eval_batch_size=2,
    # --- evaluation: every 100 steps, scoring generated text ---
    eval_strategy="steps",
    eval_steps=100,
    predict_with_generate=True,
    generation_max_length=128,
)

In [None]:
# Generation settings; the values match those attached to the model config above.
# NOTE(review): this dict is not passed to the trainer below, and it is assigned
# again with the same values in the inference example before its first use.
gen_config =  {'max_length': 128, 'num_beams': 15, 'length_penalty': 0.6, 'no_repeat_ngram_size': 2}

In [None]:
# Create the trainer.
# It fine-tunes on the first 50,000 tokenized second-stage training rows and
# evaluates on the first 150 validation rows, scoring with metrics_func.
# NOTE(review): recent transformers releases rename the `tokenizer` argument to
# `processing_class` — confirm against the installed version.
trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    data_collator=data_collator,  
    compute_metrics=metrics_func,  
    train_dataset=tokenized_ds_second_stage["train"].select(range(50000)),  
    eval_dataset=tokenized_ds_second_stage['validation'].select(range(150)), 
    tokenizer=t5_tokenizer,
)

# Run the fine-tuning loop.
trainer.train()

In [None]:
# Upload the trainer's model to the Hugging Face Hub.
trainer.push_to_hub()

In [None]:
def summarize_text(input_sentence, model, tokenizer, gen_config):
    """Generate a summary for ``input_sentence`` and return it as text.

    Args:
        input_sentence: the text to summarize.
        model: a seq2seq model exposing ``device`` and ``generate``.
        tokenizer: the tokenizer matching ``model``.
        gen_config: dict of keyword arguments forwarded to ``model.generate``.

    Returns:
        The first generated sequence decoded to a string, special tokens removed.
    """
    # Run on whatever device the model already lives on, instead of assuming CUDA,
    # so the function also works on CPU-only machines.
    target_device = model.device

    encoded = tokenizer(input_sentence, return_tensors="pt", padding=True, truncation=True)
    encoded = {name: tensor.to(target_device) for name, tensor in encoded.items()}

    summary_ids = model.generate(**encoded, **gen_config)
    return tokenizer.decode(summary_ids[0], skip_special_tokens=True)

In [None]:

# Example usage: summarize one Arabic and one English paragraph with the fine-tuned model.
arabic_input = (
    "تعتبر التكنولوجيا الحديثة واحدة من أهم العوامل التي تؤثر على حياتنا اليومية. "
    "في السنوات الأخيرة، شهدنا تطوراً هائلاً في مجالات مثل الذكاء الاصطناعي والبيانات الضخمة، "
    "مما ساهم في تحسين كفاءة العمل وزيادة الإنتاجية. ومع ذلك، يجب أن نكون حذرين من الآثار السلبية المحتملة، "
    "مثل فقدان الوظائف وتهديد الخصوصية."
)

english_input = (
    "Modern technology is one of the most significant factors impacting our daily lives. "
    "In recent years, we have witnessed tremendous advancements in fields such as artificial intelligence and big data, "
    "which have contributed to increased efficiency and productivity in the workplace. "
    "However, we must be cautious of potential negative effects, such as job displacement and threats to privacy."
)

# Keyword arguments forwarded to model.generate by summarize_text.
gen_config = {'max_length': 128, 'num_beams': 15, 'length_penalty': 0.6, 'no_repeat_ngram_size': 2}

# Call the function for Arabic input
arabic_summary = summarize_text(arabic_input, model, t5_tokenizer, gen_config)
print("Generated Arabic Summary:", arabic_summary)

# Call the function for English input (assuming the model supports it)
english_summary = summarize_text(english_input, model, t5_tokenizer, gen_config)
print("Generated English Summary:", english_summary)