In [None]:
!pip install transformers
!pip install transformers[torch]
!pip install evaluate
!pip install datasets



In [None]:
from datasets import load_dataset
from transformers import (AutoTokenizer,
                          AutoModelForSeq2SeqLM ,
                          DataCollatorForSeq2Seq ,
                          Seq2SeqTrainingArguments,
                          Seq2SeqTrainer,
                          AutoConfig
                         )
import evaluate
import numpy as np
import torch
import re
import nltk
import pandas as pd


In [None]:
# Load the training and validation splits from the CSV files under /content.
dataset = load_dataset(
    "csv",
    data_files={
        "train": ["/content/Train.csv"],
        "validation": "/content/Validation.csv",
    },
)


In [None]:
!pip install sentencepiece

Collecting sentencepiece
  Downloading sentencepiece-0.1.99-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.3 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.3/1.3 MB[0m [31m15.3 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: sentencepiece
Successfully installed sentencepiece-0.1.99


In [None]:
# Tokenizer for the same "moussaKam/AraBART" checkpoint that the model is loaded from later in this notebook.
tokenizer = AutoTokenizer.from_pretrained("moussaKam/AraBART")

In [None]:
def preprocess_function(examples):
    """Tokenize a batch of articles together with their reference summaries.

    Args:
        examples: A batch exposing a ``'text'`` column (model input) and a
            ``"summary"`` column (target).

    Returns:
        The tokenized inputs (truncated to 1024 tokens) with an extra
        ``"labels"`` entry holding the token ids of the summaries
        (truncated to 215 tokens).
    """
    encoded_batch = tokenizer(examples['text'], truncation=True, max_length=1024)
    encoded_targets = tokenizer(
        text_target=examples["summary"],
        truncation=True,
        max_length=215,
    )
    encoded_batch["labels"] = encoded_targets["input_ids"]
    return encoded_batch

In [None]:
# Tokenize every split, feeding preprocess_function batches of 215 examples.
tokenized_dataset = dataset.map(
    preprocess_function,
    batched=True,
    batch_size=215,
)

In [None]:

# Run on the GPU when one is available, otherwise fall back to the CPU.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")

# Checkpoint configuration with the generation settings overridden:
# beam search with 15 beams, summaries capped at 215 tokens, no repeated
# bigrams, and a length penalty of 0.6.
arabart_config = AutoConfig.from_pretrained(
    "moussaKam/AraBART",
    num_beams=15,
    max_length=215,
    no_repeat_ngram_size=2,
    length_penalty=0.6,
)

# Load the pretrained weights with that configuration and move them to `device`.
model = AutoModelForSeq2SeqLM.from_pretrained(
    "moussaKam/AraBART",
    config=arabart_config,
)
model = model.to(device)

Downloading pytorch_model.bin:   0%|          | 0.00/557M [00:00<?, ?B/s]

In [None]:

# Seq2seq batch collator built from the tokenizer and model; returns PyTorch tensors.
data_collator = DataCollatorForSeq2Seq(tokenizer, model=model, return_tensors="pt")

In [None]:
!pip install rouge_score
# ROUGE metric object from the `evaluate` library; compute_metrics calls rouge.compute on it.
rouge = evaluate.load("rouge")

def strip_arabic_text(text):
    """Return ``text`` without Arabic marks, tatweel, or redundant whitespace.

    Three passes are applied in order:
      1. drop characters in U+0617-U+061A and U+064B-U+0652,
      2. drop the tatweel/kashida character (U+0640),
      3. trim both ends and squeeze each whitespace run into a single space.
    """
    without_marks = re.sub(r'[\u0617-\u061A\u064B-\u0652]', '', text)
    without_tatweel = without_marks.replace('\u0640', '')
    trimmed = without_tatweel.strip()
    return re.sub(r'\s+', ' ', trimmed)

def tokenize_sentence(sentence):
    """Split ``sentence`` into the tokenizer's subword tokens for ROUGE scoring.

    Special tokens are deliberately not added: begin/end-of-sequence markers
    would appear in every prediction and every reference, count as matching
    tokens, and inflate the ROUGE scores.

    Args:
        sentence: Text to tokenize.

    Returns:
        The list of token strings for ``sentence``.
    """
    encoded_sentence = tokenizer(sentence, add_special_tokens=False)
    return tokenizer.convert_ids_to_tokens(encoded_sentence.input_ids)

def _ensure_sentence_tokenizer():
    """Fetch NLTK's sentence-splitting data if ``nltk.sent_tokenize`` cannot run yet."""
    try:
        nltk.sent_tokenize("probe")
    except LookupError:
        # The resource is named differently across NLTK releases; request both.
        for resource in ("punkt", "punkt_tab"):
            nltk.download(resource, quiet=True)


def _normalize_summary(text):
    """Map the Arabic question mark to '?' and apply ``strip_arabic_text``."""
    return strip_arabic_text(re.sub("؟", "?", text))


def compute_metrics(eval_pred):
    """Compute ROUGE scores for a batch of evaluation predictions.

    Args:
        eval_pred: A ``(predictions, labels)`` pair. ``predictions`` may be
            token ids, raw logits, or a tuple whose first element is one of
            those; ``labels`` are token ids where ``-100`` marks ignored
            positions.

    Returns:
        Whatever ``rouge.compute`` returns for the decoded, normalized,
        sentence-split predictions and references.
    """
    predictions, labels = eval_pred

    # A model that returns several outputs yields a tuple; the first entry is
    # the one to score.
    if isinstance(predictions, tuple):
        predictions = predictions[0]
    predictions = np.asarray(predictions)
    # A trailing vocabulary axis means these are logits, not ids: take the
    # most likely token at each position.
    if predictions.ndim == 3:
        predictions = predictions.argmax(axis=-1)

    # -100 is a sentinel, not a vocabulary id, so it cannot be decoded.
    pad_id = tokenizer.pad_token_id
    predictions = np.where(predictions != -100, predictions, pad_id)
    labels = np.where(labels != -100, labels, pad_id)

    decoded_preds = tokenizer.batch_decode(predictions, skip_special_tokens=True)
    decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)

    # Build new lists: reassigning a loop variable would leave the decoded
    # strings untouched and silently skip the normalization.
    decoded_preds = [_normalize_summary(pred) for pred in decoded_preds]
    decoded_labels = [_normalize_summary(label) for label in decoded_labels]

    # One sentence per line, so sentence-level ROUGE variants can split on "\n".
    _ensure_sentence_tokenizer()
    decoded_preds = ["\n".join(nltk.sent_tokenize(pred)) for pred in decoded_preds]
    decoded_labels = ["\n".join(nltk.sent_tokenize(label)) for label in decoded_labels]

    return rouge.compute(
        predictions=decoded_preds,
        references=decoded_labels,
        tokenizer=tokenize_sentence,
    )

Collecting rouge_score
  Downloading rouge_score-0.1.2.tar.gz (17 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: rouge_score
  Building wheel for rouge_score (setup.py) ... [?25l[?25hdone
  Created wheel for rouge_score: filename=rouge_score-0.1.2-py3-none-any.whl size=24934 sha256=8e4b5b4c5fc6828b2c46736d0a716d7c8f3184118a37e3836f94d4ad1f267788
  Stored in directory: /root/.cache/pip/wheels/5f/dd/89/461065a73be61a532ff8599a28e9beef17985c9e9c31e541b4
Successfully built rouge_score
Installing collected packages: rouge_score
Successfully installed rouge_score-0.1.2


Downloading builder script:   0%|          | 0.00/6.27k [00:00<?, ?B/s]

In [None]:
# Rough schedule bookkeeping, printed for inspection.
# NOTE(review): `training_step` is (number of training examples) x 5, not a
# count of optimizer steps, and none of these values are consumed by the
# cells visible in this notebook - confirm whether they are still needed.
warmup_proportion = 0.1
max_grad_norm = 1.0
training_step = 5 * len(tokenized_dataset["train"])
warmup_step = warmup_proportion * training_step
eval_step = warmup_step / 5
print(warmup_step)
print(eval_step)

69.0
13.8


In [None]:
from transformers import AdamW, get_linear_schedule_with_warmup, Trainer, TrainingArguments

# Define the optimizer and learning rate scheduler
num_epochs = 5
optimizer = AdamW(model.parameters(), lr=5e-5)
len_DataLoader =len(tokenized_dataset["train"])/32
num_train_steps = len_DataLoader * num_epochs
scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=0, num_training_steps=num_train_steps)


training_args = TrainingArguments(
    output_dir='./results',
    save_strategy = 'epoch',
    evaluation_strategy='epoch',
    num_train_epochs=num_epochs,
    per_device_train_batch_size=32,
    warmup_steps=0,
    weight_decay=0.01,
    load_best_model_at_end=True,
    metric_for_best_model='Rougel',
    logging_dir='./logs',
    greater_is_better=True
)



In [None]:
trainer = Trainer(
    model=model,
    args=training_args,
    data_collator = data_collator,
    train_dataset=tokenized_dataset["train"],
    eval_dataset=tokenized_dataset["validation"],
    compute_metrics=compute_metrics,
    tokenizer = tokenizer
)
trainer.optimizer = optimizer    # learning rate
trainer.lr_scheduler = scheduler # learning rate scheduler


In [None]:
trainer.train()