<a href="https://colab.research.google.com/github/yuliayuda/auto-tag-from-web/blob/main/shamela.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
#!pip install transformers pandas torch scikit-learn evaluate wandb datasets tqdm
!pip install rouge_score

In [None]:
import os
import pandas as pd
import torch
from torch.utils.data import Dataset
from transformers import AutoModelForSeq2SeqLM, AutoTokenizer, Seq2SeqTrainer, Seq2SeqTrainingArguments
from evaluate import load
from sklearn.model_selection import train_test_split

# Configure PyTorch's CUDA allocator through its environment variable.
os.environ.update(PYTORCH_CUDA_ALLOC_CONF="expandable_segments:True")

# Initialize the tokenizer and the model
model_name = "MIIB-NLP/Arabic-question-generation"
try:
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    model = AutoModelForSeq2SeqLM.from_pretrained(model_name)
except Exception as load_error:
    # Report the failure in the notebook output, then let it propagate.
    print(f"Error loading model or tokenizer: {load_error}")
    raise

# Load the metrics used for evaluation
try:
    rouge = load("rouge")
    bleu = load("bleu")
except Exception as metric_error:
    print(f"Error loading evaluation metrics: {metric_error}")
    raise

class ArabicQuestionAnswerDataset(Dataset):
    """Torch dataset that turns the ``text`` column of a DataFrame into seq2seq samples.

    Every sample uses the same text as encoder input and as target labels,
    truncated and padded to ``max_length`` tokens.

    Args:
        data: pandas DataFrame containing a ``text`` column.
        tokenizer: callable tokenizer; called with ``return_tensors="pt"`` and
            expected to return ``input_ids`` and ``attention_mask`` tensors
            with a leading batch dimension of 1.
        max_length: fixed token length of every returned tensor.
        label_pad_token_id: optional replacement for padding positions in
            ``labels``. ``None`` (default) keeps the tokenizer's pad id, as
            before. Passing ``-100`` excludes padding from the loss; metric
            code must then map ``-100`` back to the pad id before decoding.
    """

    def __init__(self, data, tokenizer, max_length=256, label_pad_token_id=None):
        self.tokenizer = tokenizer
        self.data = data
        self.max_length = max_length
        self.label_pad_token_id = label_pad_token_id

    def __len__(self):
        """Return the number of rows in the underlying DataFrame."""
        return len(self.data)

    def __getitem__(self, idx):
        """Return the tokenized sample at position ``idx``.

        Returns:
            dict with 1-D ``input_ids``, ``attention_mask`` and ``labels``.

        Raises:
            IndexError: if ``idx`` is out of range.
            Exception: tokenizer errors propagate instead of yielding ``None``,
                which would only fail later, and less clearly, when batching.
        """
        # Positional lookup: works even when the frame's index is not 0..n-1.
        context = self.data["text"].iloc[idx]

        encoded = self.tokenizer(
            context,
            return_tensors="pt",
            truncation=True,
            max_length=self.max_length,
            padding="max_length",
        )
        # Drop only the batch dimension added by return_tensors="pt".
        input_ids = encoded["input_ids"].squeeze(0)
        attention_mask = encoded["attention_mask"].squeeze(0)

        # The target is the same text as the input, so reuse the encoding
        # instead of tokenizing the identical string a second time.
        labels = input_ids.clone()
        if self.label_pad_token_id is not None:
            pad_id = self.tokenizer.pad_token_id
            if pad_id is not None:
                labels[labels == pad_id] = self.label_pad_token_id

        return {
            "input_ids": input_ids,
            "attention_mask": attention_mask,
            "labels": labels,
        }

# Data preprocessing
def preprocess_data(df):
    """Return a cleaned copy of ``df`` with usable ``text`` values only.

    Rows whose ``text`` is missing are dropped, the remaining values are
    converted to ``str`` and stripped of surrounding whitespace, and rows that
    end up empty are dropped as well. The input frame is left untouched.

    Args:
        df: pandas DataFrame with a ``text`` column.

    Returns:
        A new DataFrame; the original index labels of the kept rows are
        preserved.

    Raises:
        KeyError: if ``df`` has no ``text`` column.
    """
    try:
        # Work on a copy so the caller's frame is not mutated as a side effect.
        cleaned = df.dropna(subset=['text']).copy()
        # astype(str) keeps non-string cells (e.g. numbers) instead of letting
        # .str.strip() turn them into NaN.
        cleaned['text'] = cleaned['text'].astype(str).str.strip()
        # Whitespace-only rows are empty after stripping and carry no signal.
        return cleaned.loc[cleaned['text'] != ''].copy()
    except Exception as e:
        print(f"Error preprocessing data: {e}")
        raise

# Load the dataset from CSV and divide it into a train/validation split
def load_and_split_data(csv_file):
    """Read ``csv_file``, preprocess it and return ``(train, validation)`` frames.

    The split keeps 20% of the rows for validation with a fixed random seed
    (42); both returned DataFrames get a fresh 0-based index.

    Raises:
        FileNotFoundError: if ``csv_file`` does not exist.
        pandas.errors.EmptyDataError: if the file has no data.
        Exception: any other failure is reported and re-raised unchanged.
    """
    try:
        frame = preprocess_data(pd.read_csv(csv_file))
        parts = train_test_split(frame, test_size=0.2, random_state=42)
        return tuple(part.reset_index(drop=True) for part in parts)
    except FileNotFoundError:
        print(f"CSV file not found: {csv_file}")
        raise
    except pd.errors.EmptyDataError:
        print(f"CSV file is empty: {csv_file}")
        raise
    except Exception as err:
        print(f"Error loading and splitting data: {err}")
        raise

# Generate a question with the fine-tuned model
def generate_questions(model, tokenizer, context, max_length=50):
    """Generate one question for ``context`` using beam search.

    The context is tokenized (truncated to 256 tokens), moved to the model's
    device and decoded from the first generated sequence.

    Args:
        model: seq2seq model exposing ``device`` and ``generate``.
        tokenizer: tokenizer used for both encoding and decoding.
        context: source text.
        max_length: maximum length passed to ``model.generate``.

    Returns:
        The decoded text, or ``""`` if anything fails (the error is printed).
    """
    try:
        encoded = tokenizer(context, return_tensors="pt", truncation=True, max_length=256)
        input_ids = encoded.input_ids.to(model.device)
        generated = model.generate(
            input_ids,
            max_length=max_length,
            num_beams=5,
            early_stopping=True,
        )
        return tokenizer.decode(generated[0], skip_special_tokens=True)
    except Exception as err:
        # Best effort: one bad row must not abort a whole generation run.
        print(f"Error generating questions: {err}")
        return ""

# Metric evaluation function
def compute_metrics(eval_pred):
    """Compute ROUGE-L and BLEU for a batch of generated predictions.

    Args:
        eval_pred: pair ``(predictions, labels)`` of token-id arrays.
            ``predictions`` may also be a tuple whose first element holds the
            token ids.

    Returns:
        ``{"rouge": <rougeL>, "bleu": <bleu>}``; zeros when there is nothing
        to score, and ``{}`` if scoring fails (the error is printed).
    """
    try:
        # Local import: numpy is not imported at file level.
        import numpy as np

        predictions, labels = eval_pred
        if isinstance(predictions, tuple):
            predictions = predictions[0]

        # Skip the computation when there is nothing valid to score.
        if predictions is None or labels is None or len(predictions) == 0 or len(labels) == 0:
            return {"rouge": 0, "bleu": 0}  # Default values when there is no data

        # -100 marks ignored/padded positions and cannot be decoded; map it
        # back to the pad token so skip_special_tokens removes it.
        pad_id = tokenizer.pad_token_id if tokenizer.pad_token_id is not None else 0
        predictions = np.asarray(predictions)
        labels = np.asarray(labels)
        predictions = np.where(predictions != -100, predictions, pad_id)
        labels = np.where(labels != -100, labels, pad_id)

        decoded_preds = tokenizer.batch_decode(predictions, skip_special_tokens=True)
        decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)

        # The default ROUGE tokenizer keeps only Latin alphanumerics, so Arabic
        # text scores 0; split on whitespace instead.
        rouge_score = rouge.compute(
            predictions=decoded_preds,
            references=decoded_labels,
            tokenizer=lambda text: text.split(),
        )
        bleu_score = bleu.compute(predictions=decoded_preds, references=decoded_labels)

        return {
            "rouge": rouge_score['rougeL'],
            "bleu": bleu_score['bleu']
        }
    except Exception as e:
        # Best effort: a metric failure must not abort training.
        print(f"Error computing metrics: {e}")
        return {}

# Fine-tuning with the Hugging Face Trainer
def _find_latest_checkpoint(output_dir):
    """Return the highest-step ``checkpoint-<step>`` dir holding trainer_state.json, or None."""
    if not os.path.isdir(output_dir):
        # First run: the output directory has not been created yet.
        return None

    prefix = "checkpoint-"
    candidates = []
    for name in os.listdir(output_dir):
        step = name[len(prefix):]
        if not (name.startswith(prefix) and step.isdecimal()):
            continue
        path = os.path.join(output_dir, name)
        # A checkpoint is only resumable when its trainer state file exists.
        if os.path.exists(os.path.join(path, "trainer_state.json")):
            candidates.append((int(step), path))

    if not candidates:
        return None
    # Compare steps numerically so checkpoint-1000 beats checkpoint-200.
    return max(candidates)[1]


def fine_tune_model(train_dataset, val_dataset, output_dir="/content/drive/Shareddrives/Gpldome_2/output_model", epochs=5, batch_size=4, learning_rate=5e-5):
    """Fine-tune the module-level ``model`` with ``Seq2SeqTrainer``.

    Training resumes from the most recent complete checkpoint found in
    ``output_dir``; otherwise it starts from scratch.

    Args:
        train_dataset: dataset used for training.
        val_dataset: dataset used for evaluation every 100 steps.
        output_dir: directory for checkpoints (searched for resumable ones).
        epochs: number of training epochs.
        batch_size: per-device batch size for training and evaluation.
        learning_rate: optimizer learning rate.

    Raises:
        Exception: any failure is printed and re-raised unchanged.
    """
    try:
        checkpoint_path = _find_latest_checkpoint(output_dir)
        if checkpoint_path:
            print(f"Melanjutkan training dari checkpoint: {checkpoint_path}")
        else:
            print("Memulai training dari awal.")

        # NOTE(review): newer transformers releases rename
        # `evaluation_strategy` to `eval_strategy` — confirm against the
        # installed version.
        training_args = Seq2SeqTrainingArguments(
            output_dir=output_dir,
            fp16=False,
            gradient_checkpointing=True,
            per_device_train_batch_size=batch_size,
            per_device_eval_batch_size=batch_size,
            predict_with_generate=True,
            evaluation_strategy="steps",
            save_steps=100,
            eval_steps=100,
            logging_dir='./logs',
            logging_steps=100,
            num_train_epochs=epochs,
            save_total_limit=2,
            load_best_model_at_end=True,
            metric_for_best_model="eval_loss",  # Select the best model by 'eval_loss'
            greater_is_better=False,  # Lower is better for 'eval_loss'
            learning_rate=learning_rate,
            gradient_accumulation_steps=2,
            warmup_steps=500,
            max_grad_norm=1.0,  # Gradient clipping
        )

        trainer = Seq2SeqTrainer(
            model=model,
            args=training_args,
            train_dataset=train_dataset,
            eval_dataset=val_dataset,
            compute_metrics=compute_metrics
        )

        trainer.train(resume_from_checkpoint=checkpoint_path)
    except Exception as e:
        print(f"Error during fine-tuning: {e}")
        raise

# Save the fine-tuned model
def save_fine_tuned_model(model, output_dir):
    """Write ``model`` and the module-level ``tokenizer`` to ``output_dir``.

    Saving is best effort: a failure is printed and the function returns
    normally without raising.
    """
    try:
        # The model is written first, then the tokenizer, into the same folder.
        model.save_pretrained(output_dir)
        tokenizer.save_pretrained(output_dir)
    except Exception as save_error:
        print(f"Error saving fine-tuned model: {save_error}")
    return None

# Main entry point that runs the fine-tuning pipeline
def main(csv_file, output_dir="/content/drive/Shareddrives/Gpldome_2/output_model"):
    """Fine-tune on ``csv_file``, save the model and export generated questions.

    Steps: load and split the CSV, fine-tune, save the model under
    ``<output_dir>/fine_tuned_arabic_question_model``, then generate one
    question per row of the original CSV and write the result to
    ``<output_dir>/generated_questions_answers.csv``.

    Args:
        csv_file: path to a CSV file with a ``text`` column.
        output_dir: directory for checkpoints, the saved model and the
            generated CSV; defaults to the previously hard-coded location.

    Raises:
        Exception: any failure is printed and re-raised unchanged.
    """
    try:
        train_data, val_data = load_and_split_data(csv_file)
        train_dataset = ArabicQuestionAnswerDataset(train_data, tokenizer)
        val_dataset = ArabicQuestionAnswerDataset(val_data, tokenizer)

        fine_tune_model(train_dataset, val_dataset, output_dir=output_dir)

        save_fine_tuned_model(model, f"{output_dir}/fine_tuned_arabic_question_model")

        # Training may leave the model in train mode; disable dropout so
        # generation is deterministic.
        model.eval()

        df = pd.read_csv(csv_file)
        # The raw CSV is not preprocessed here: keep every row, but do not
        # send missing or blank cells through the tokenizer.
        df['Generated_Question'] = df['text'].apply(
            lambda x: generate_questions(model, tokenizer, x)
            if isinstance(x, str) and x.strip()
            else ""
        )
        df['Generated_Answer'] = df['text']

        output_csv = f"{output_dir}/generated_questions_answers.csv"
        df.to_csv(output_csv, index=False)
        print(f"Hasil pertanyaan dan jawaban disimpan di: {output_csv}")
    except Exception as e:
        print(f"Error in main function: {e}")
        raise

if __name__ == "__main__":
    csv_file = "/content/gabungan_kitab.csv"  # Change this to the name of the CSV file being used
    main(csv_file)


Memulai training dari awal.


  self.scaler = torch.cuda.amp.GradScaler(**kwargs)
  return fn(*args, **kwargs)


Step,Training Loss,Validation Loss,Rouge,Bleu
100,15.4233,,0.0,4e-06


  return fn(*args, **kwargs)
