<a href="https://colab.research.google.com/github/yuliayuda/auto-tag-from-web/blob/main/shamela.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip install transformers pandas torch scikit-learn evaluate wandb datasets tqdm


In [None]:
import os
import pandas as pd
import torch
from torch.utils.data import Dataset, DataLoader
from transformers import AutoModelForSeq2SeqLM, AutoTokenizer, Seq2SeqTrainer, Seq2SeqTrainingArguments
from evaluate import load
from sklearn.model_selection import train_test_split
import wandb

# Log in to Weights & Biases. The API key is read from the WANDB_API_KEY
# environment variable rather than being hard-coded, so the secret never lands
# in version control. When the variable is unset, None is passed and
# wandb.login() falls back to its own credential lookup / interactive prompt.
# NOTE: any key that was ever committed in this file should be treated as
# leaked and rotated in the W&B account settings.
wandb.login(key=os.environ.get("WANDB_API_KEY"))

# Initialize the tokenizer and the seq2seq model from the same checkpoint
model_name = "MIIB-NLP/Arabic-question-generation"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSeq2SeqLM.from_pretrained(model_name)

# Load the metrics used during evaluation
rouge = load("rouge")
bleu = load("bleu")

class ArabicQuestionAnswerDataset(Dataset):
    """Torch dataset that tokenizes the ``text`` column of a DataFrame.

    Every item is a dict with ``input_ids``, ``attention_mask`` and ``labels``;
    each sequence is truncated / padded to ``max_length`` tokens.

    NOTE(review): the label sequence is built from the same text as the input,
    so the training target is a copy of the input — confirm this is intended.

    Args:
        data: DataFrame with a ``text`` column. Rows are addressed by
            position, so the index labels do not need to be 0..n-1.
        tokenizer: tokenizer called on each text to produce the tensors.
        max_length: length every sequence is truncated / padded to.
        ignore_pad_token_for_loss: when true, label positions equal to
            ``tokenizer.pad_token_id`` are replaced with -100 (the index the
            PyTorch cross-entropy loss ignores by default) so padding does not
            contribute to the loss. Off by default to keep the original
            behaviour; code that decodes the labels afterwards must map -100
            back to the pad id first.
    """

    def __init__(self, data, tokenizer, max_length=512, ignore_pad_token_for_loss=False):
        self.tokenizer = tokenizer
        self.data = data
        self.max_length = max_length
        self.ignore_pad_token_for_loss = ignore_pad_token_for_loss

    def __len__(self):
        """Return the number of rows in the underlying DataFrame."""
        return len(self.data)

    def _encode(self, text):
        """Tokenize *text* into fixed-length tensors with a leading batch axis of 1."""
        return self.tokenizer(
            text,
            return_tensors="pt",
            truncation=True,
            max_length=self.max_length,
            padding="max_length",
        )

    def __getitem__(self, idx):
        """Return the tokenized sample at position *idx*."""
        # Positional lookup: label-based .loc would raise KeyError for any
        # DataFrame whose index was not reset to 0..n-1.
        context = self.data['text'].iloc[idx]
        answer = context  # The same text is used as the target sequence

        inputs = self._encode(context)
        labels = self._encode(answer)['input_ids']

        if self.ignore_pad_token_for_loss:
            labels = labels.masked_fill(labels == self.tokenizer.pad_token_id, -100)

        # squeeze(0) drops only the batch axis; a bare squeeze() would also
        # collapse the sequence axis when max_length == 1.
        return {
            'input_ids': inputs['input_ids'].squeeze(0),
            'attention_mask': inputs['attention_mask'].squeeze(0),
            'labels': labels.squeeze(0),
        }

# Data preprocessing
def preprocess_data(df):
    """Return a cleaned copy of *df* with usable ``text`` values only.

    Rows whose ``text`` is missing are dropped, the remaining values are
    converted to ``str`` and stripped of surrounding whitespace, and rows that
    end up empty are dropped as well. The input DataFrame is left untouched.

    Args:
        df: DataFrame containing a ``text`` column.

    Returns:
        A new DataFrame; the surviving rows keep their original index labels.

    Raises:
        KeyError: if *df* has no ``text`` column.
    """
    # Work on a copy so the caller's DataFrame is never mutated
    cleaned = df.dropna(subset=['text']).copy()

    # astype(str) keeps non-string cells (e.g. numbers) from crashing strip()
    cleaned['text'] = cleaned['text'].astype(str).str.strip()

    # Whitespace-only rows carry no text to train on or generate from
    return cleaned[cleaned['text'] != ""]

# Load the dataset from a CSV file and split it into train / validation parts
def load_and_split_data(csv_file):
    """Read *csv_file*, preprocess it and return ``(train, validation)`` frames.

    The split keeps 20% of the rows for validation with a fixed random seed
    (42); both returned DataFrames have their index reset to 0..n-1.
    """
    cleaned = preprocess_data(pd.read_csv(csv_file))
    parts = train_test_split(cleaned, test_size=0.2, random_state=42)
    train_part, val_part = (part.reset_index(drop=True) for part in parts)
    return train_part, val_part

# Generate a question with the fine-tuned model
def generate_questions(model, tokenizer, context, max_length=50):
    """Generate one question string for *context*.

    The context is tokenized (truncated to 512 tokens), moved to the model's
    device and decoded from the first sequence returned by a 5-beam search
    with early stopping.

    Args:
        model: model exposing ``device`` and ``generate``.
        tokenizer: tokenizer used both to encode the context and to decode.
        context: source text the question is generated from.
        max_length: ``max_length`` passed to ``model.generate``.

    Returns:
        The decoded text with special tokens skipped.
    """
    encoded = tokenizer(context, return_tensors="pt", truncation=True, max_length=512)
    input_ids = encoded.input_ids.to(model.device)
    generated = model.generate(
        input_ids,
        max_length=max_length,
        num_beams=5,
        early_stopping=True,
    )
    return tokenizer.decode(generated[0], skip_special_tokens=True)

# Metric evaluation function
def compute_metrics(eval_pred):
    """Decode predicted and reference token ids and score them.

    Args:
        eval_pred: pair ``(predictions, labels)`` of token-id arrays.

    Returns:
        dict with ``"rouge"`` (the ``rougeL`` value of the ROUGE metric) and
        ``"bleu"`` (the ``bleu`` value of the BLEU metric, or 0.0 when BLEU
        cannot be computed because the decoded texts are empty).
    """
    import numpy as np  # local import: numpy is not imported at file level

    predictions, labels = eval_pred
    # NOTE(review): predictions may arrive as a tuple with the generated ids
    # first (the Hugging Face example scripts unwrap it the same way).
    if isinstance(predictions, tuple):
        predictions = predictions[0]

    # -100 is the ignore index used for padding in labels / gathered batches;
    # it is not a valid token id and makes batch_decode fail, so map it back
    # to the pad token before decoding.
    pad_id = tokenizer.pad_token_id
    predictions = np.asarray(predictions)
    labels = np.asarray(labels)
    predictions = np.where(predictions != -100, predictions, pad_id)
    labels = np.where(labels != -100, labels, pad_id)

    decoded_preds = tokenizer.batch_decode(predictions, skip_special_tokens=True)
    decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)

    # Compute the metrics
    rouge_score = rouge.compute(predictions=decoded_preds, references=decoded_labels)
    try:
        bleu_value = bleu.compute(predictions=decoded_preds, references=decoded_labels)['bleu']
    except ZeroDivisionError:
        # BLEU divides by the text lengths; all-empty decodes score zero
        bleu_value = 0.0

    return {"rouge": rouge_score['rougeL'], "bleu": bleu_value}

# Fine-tuning with the Hugging Face Trainer
def fine_tune_model(
    train_dataset,
    val_dataset,
    output_dir="/content/drive/Shareddrives/Gpldome_2/output_model",
    epochs=5,
    batch_size=8,
    learning_rate=5e-5,
):
    """Fine-tune the module-level ``model`` on *train_dataset*.

    Evaluation, checkpointing (every 500 steps, at most 2 checkpoints kept)
    and logging (every 100 steps, reported to W&B) are configured here; the
    checkpoint with the best ``rouge`` value is reloaded when training ends.

    Args:
        train_dataset: dataset used for training.
        val_dataset: dataset used for the periodic evaluation.
        output_dir: directory where checkpoints are written.
        epochs: number of training epochs.
        batch_size: per-device batch size for both training and evaluation.
        learning_rate: learning rate passed to the training arguments.

    Returns:
        The ``Seq2SeqTrainer`` after ``train()`` has completed.
    """
    import inspect  # local import: only needed for the keyword check below

    # The evaluation-schedule keyword was renamed from ``evaluation_strategy``
    # to ``eval_strategy`` in newer transformers releases; pick whichever the
    # installed version accepts so an unpinned install keeps working.
    accepted = inspect.signature(Seq2SeqTrainingArguments.__init__).parameters
    strategy_key = "eval_strategy" if "eval_strategy" in accepted else "evaluation_strategy"

    training_args = Seq2SeqTrainingArguments(
        output_dir=output_dir,
        per_device_train_batch_size=batch_size,
        per_device_eval_batch_size=batch_size,
        predict_with_generate=True,
        save_steps=500,
        eval_steps=500,
        logging_dir='./logs',
        logging_steps=100,
        num_train_epochs=epochs,
        save_total_limit=2,
        load_best_model_at_end=True,
        metric_for_best_model="rouge",
        greater_is_better=True,
        learning_rate=learning_rate,
        gradient_accumulation_steps=2,  # Accumulate gradients over 2 steps
        warmup_steps=500,  # Linear warmup
        report_to="wandb",  # Log to W&B
        **{strategy_key: "steps"},
    )

    trainer = Seq2SeqTrainer(
        model=model,
        args=training_args,
        train_dataset=train_dataset,
        eval_dataset=val_dataset,
        compute_metrics=compute_metrics,
    )

    trainer.train()
    return trainer

# Save the fine-tuned model
def save_fine_tuned_model(model, output_dir):
    """Write *model* and then the module-level ``tokenizer`` to *output_dir*."""
    for artifact in (model, tokenizer):
        artifact.save_pretrained(output_dir)

# Main function that runs the fine-tuning pipeline
def main(csv_file):
    """Fine-tune the model on *csv_file*, save it and export generated questions.

    Steps: split the CSV into train / validation sets, fine-tune, save the
    model and tokenizer, then generate one question per cleaned ``text`` row
    and write the result to a new CSV file.

    Args:
        csv_file: path to a CSV file with a ``text`` column.
    """
    # Load and split the dataset
    train_data, val_data = load_and_split_data(csv_file)

    # Prepare the datasets for fine-tuning
    train_dataset = ArabicQuestionAnswerDataset(train_data, tokenizer)
    val_dataset = ArabicQuestionAnswerDataset(val_data, tokenizer)

    # Fine-tune the model
    fine_tune_model(train_dataset, val_dataset)

    # Save the fine-tuned model
    save_fine_tuned_model(model, "/content/drive/Shareddrives/Gpldome_2/output_model/fine_tuned_arabic_question_model")

    # Re-read the CSV and apply the same cleaning used for training: a raw
    # re-read still contains rows with missing text, which the tokenizer
    # cannot encode.
    df = preprocess_data(pd.read_csv(csv_file))

    # Training can leave the model in train mode; switch to eval mode so
    # dropout is disabled while generating.
    model.eval()

    # Generate the questions; the answer is the same ``text`` column
    df['Generated_Question'] = df['text'].apply(lambda x: generate_questions(model, tokenizer, x))
    df['Generated_Answer'] = df['text']

    # Save to a new CSV file
    output_csv = "/content/drive/Shareddrives/Gpldome_2/output_model/generated_questions_answers.csv"
    df.to_csv(output_csv, index=False)
    print(f"Hasil pertanyaan dan jawaban disimpan di: {output_csv}")

# Entry point: run the whole pipeline on the configured CSV file.
if __name__ == "__main__":
    # Replace with the path to your CSV dataset
    csv_file = "/content/gabungan_kitab.csv"
    main(csv_file)
