In [1]:
# !pip install transformers datasets evaluate torch numpy pandas SentencePiece transformers[torch] accelerate==0.34.2 absl-py rouge_score

In [2]:
from transformers import BertTokenizer, EncoderDecoderModel
from transformers import DataCollatorForSeq2Seq, Seq2SeqTrainingArguments, Seq2SeqTrainer, Trainer, EarlyStoppingCallback
from transformers import ProgressCallback
from transformers import TrainerCallback
from datasets import Dataset, DatasetDict, load_dataset
from evaluate import load
import torch
import os
from os import listdir
from os.path import isfile, join
import json
import re
import numpy as np
import pandas as pd
import requests

In [3]:
path = os.getcwd()

In [4]:
# Load dataset from parquet files using load_dataset
dataset = load_dataset('parquet', data_files={'train': 'dataset/used/train.parquet', 
                                               'test': 'dataset/used/test.parquet', 
                                               'dev': 'dataset/used/dev.parquet'})

# Drop unused columns
columns_to_drop = ['id', 'url','clean_article','clean_summary','extractive_summary','ext_clean_article','ext_clean_summary','extractive_summary_sentences','ext_extractive_summary','prep_clean_article_no_sw']

dataset['train'] = dataset['train'].remove_columns(columns_to_drop)
dataset['test'] = dataset['test'].remove_columns(columns_to_drop)
dataset['dev'] = dataset['dev'].remove_columns(columns_to_drop)

In [5]:
dataset['train']

Dataset({
    features: ['prep_clean_article', 'prep_clean_summary', 'prep_extractive_summary'],
    num_rows: 193883
})

In [6]:
train_sample = dataset["train"].shuffle(seed=42).select(range(int(3000)))
dev_sample = dataset["dev"].shuffle(seed=42).select(range(int(1000)))
test_sample = dataset["test"].shuffle(seed=42).select(range(int(1000)))

In [7]:
print(f"Train size: {len(train_sample)}")
print(f"Val size: {len(dev_sample)}")
print(f"Test size: {len(test_sample)}")

Train size: 50
Val size: 10
Test size: 10


In [8]:
dataset_dict = DatasetDict({
    'train': train_sample,
    'val': dev_sample,
    'test': test_sample
})

In [9]:
tokenizer = BertTokenizer.from_pretrained("cahya/bert2gpt-indonesian-summarization")
tokenizer.bos_token = tokenizer.cls_token
tokenizer.eos_token = tokenizer.sep_token
model = EncoderDecoderModel.from_pretrained("cahya/bert2gpt-indonesian-summarization")

EncoderDecoderModel has generative capabilities, as `prepare_inputs_for_generation` is explicitly overwritten. However, it doesn't directly inherit from `GenerationMixin`. From 👉v4.50👈 onwards, `PreTrainedModel` will NOT inherit from `GenerationMixin`, and this model will lose the ability to call `generate` and other related functions.
  - If you are the owner of the model architecture code, please modify your model class such that it inherits from `GenerationMixin` (after `PreTrainedModel`, otherwise you'll get an exception).
  - If you are not the owner of the model architecture class, please contact the model code owner to update it.


In [10]:
data_collator = DataCollatorForSeq2Seq(tokenizer=tokenizer, model=model)

In [11]:
prefix = ""
def preprocess_function(examples):
    inputs = [prefix + doc for doc in examples["prep_clean_article"]]
    model_inputs = tokenizer(inputs, max_length=512, truncation=True)

    labels = tokenizer(text_target=examples["prep_clean_summary"], max_length=128, truncation=True)

    model_inputs["labels"] = labels["input_ids"]
    return model_inputs

In [12]:
tokenized_dataset = dataset_dict.map(preprocess_function, batched=True)

In [13]:
from_number = '6285741880658@c.us'

hasil = f'Memulai Pelatihan Model BERT2GPT \nTrain size: {len(train_sample)} \nVal size: {len(dev_sample)} \nTest size: {len(test_sample)}'

response = requests.post(
        'https://whatsapp.inspektorat.pekalongankab.go.id/api/sendText/', #rahasia ya ;D
        headers={
            'Content-Type': 'application/json; charset=utf-8',
            'Accept': 'application/json',
            'X-Api-Key': 'KMZWAY87AA' # rahasia
        },
        json={  # Menggunakan parameter `json` untuk mengirim raw JSON
            'chatId': from_number,
            'text': hasil,
            'session': 'NoamChomsky'
        },
        verify=False  # Ini sesuai dengan withoutVerifying() di PHP
    )



In [14]:
for param in model.get_encoder().parameters():
    param.requires_grad = False  # Membekukan semua parameter encoder

training_args = Seq2SeqTrainingArguments(
    output_dir="./results",
    eval_strategy="epoch",   
    save_strategy="epoch",         
    logging_dir='./logs',          
    logging_steps=10,              
    learning_rate=3e-5,
    per_device_train_batch_size=4, 
    per_device_eval_batch_size=2,
    gradient_accumulation_steps=2,
    weight_decay=0.05,
    save_total_limit=3,
    num_train_epochs=2,
    predict_with_generate=True,
    load_best_model_at_end=True,
    metric_for_best_model="rouge2",
    greater_is_better=True,
)

In [15]:
rouge = load('rouge')
def compute_metrics(eval_pred):
    predictions, labels = eval_pred

    labels = np.where(labels != -100, labels, tokenizer.pad_token_id)

    # Directly decode predictions
    decoded_preds = tokenizer.batch_decode(predictions, skip_special_tokens=False)
    decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=False)

    # Rouge expects newline-separated text
    decoded_preds = ["\n".join(decoded_pred.split()) for decoded_pred in decoded_preds]
    decoded_labels = ["\n".join(decoded_label.split()) for decoded_label in decoded_labels]

    result = rouge.compute(predictions=decoded_preds, references=decoded_labels, use_stemmer=True)

    return {k: round(v, 4) for k, v in result.items()}

In [16]:
torch.set_num_threads(24)  # Mengatur jumlah threads sesuai dengan jumlah core CPU

trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset["train"],
    eval_dataset=tokenized_dataset["val"],
    compute_metrics=compute_metrics,
    data_collator=data_collator,  # Pastikan ini sudah teratur
    callbacks=[
        EarlyStoppingCallback(early_stopping_patience=3),
        ProgressCallback()
    ]
)

In [17]:
trainer.train()

  0%|          | 0/18 [00:00<?, ?it/s]



Epoch,Training Loss,Validation Loss,Rouge1,Rouge2,Rougel,Rougelsum
0,No log,3.392625,0.4282,0.2349,0.3668,0.4261
2,2.690100,3.156455,0.5026,0.3342,0.4569,0.5011


  0%|          | 0/3 [00:00<?, ?it/s]

{'eval_loss': 3.392624616622925, 'eval_rouge1': 0.4282, 'eval_rouge2': 0.2349, 'eval_rougeL': 0.3668, 'eval_rougeLsum': 0.4261, 'eval_runtime': 283.5415, 'eval_samples_per_second': 0.035, 'eval_steps_per_second': 0.011, 'epoch': 0.92}




{'loss': 2.6901, 'grad_norm': 5.679691314697266, 'learning_rate': 1.3333333333333333e-05, 'epoch': 1.54}


  0%|          | 0/3 [00:00<?, ?it/s]

{'eval_loss': 3.1692073345184326, 'eval_rouge1': 0.4715, 'eval_rouge2': 0.3029, 'eval_rougeL': 0.4291, 'eval_rougeLsum': 0.4718, 'eval_runtime': 240.3992, 'eval_samples_per_second': 0.042, 'eval_steps_per_second': 0.012, 'epoch': 2.0}




  0%|          | 0/3 [00:00<?, ?it/s]

{'eval_loss': 3.1564552783966064, 'eval_rouge1': 0.5026, 'eval_rouge2': 0.3342, 'eval_rougeL': 0.4569, 'eval_rougeLsum': 0.5011, 'eval_runtime': 238.4112, 'eval_samples_per_second': 0.042, 'eval_steps_per_second': 0.013, 'epoch': 2.77}


There were missing keys in the checkpoint model loaded: ['decoder.lm_head.weight'].


{'train_runtime': 929.2966, 'train_samples_per_second': 0.161, 'train_steps_per_second': 0.019, 'train_loss': 2.323039107852512, 'epoch': 2.77}


TrainOutput(global_step=18, training_loss=2.323039107852512, metrics={'train_runtime': 929.2966, 'train_samples_per_second': 0.161, 'train_steps_per_second': 0.019, 'total_flos': 47322787276800.0, 'train_loss': 2.323039107852512, 'epoch': 2.769230769230769})

In [18]:
# Menyimpan model
model.save_pretrained("model/bert2gpt-cahya")

# Menyimpan tokenizer
tokenizer.save_pretrained("model/bert2gpt-cahya")

('model/bert2gpt-cahya/tokenizer_config.json',
 'model/bert2gpt-cahya/special_tokens_map.json',
 'model/bert2gpt-cahya/vocab.txt',
 'model/bert2gpt-cahya/added_tokens.json')

In [19]:
# Mendapatkan log history dari trainer
log_history = trainer.state.log_history

# Membuat DataFrame dari log_history
df_log_history = pd.DataFrame(log_history)

# Simpan ke file CSV
df_log_history.to_csv('log_history.csv', index=False)

train_runtime_minutes = ''

# Mencari log yang berisi train_runtime
for log in log_history:
    if 'train_runtime' in log:
        train_runtime = log['train_runtime']
        train_runtime_minutes = train_runtime / 60  # Konversi ke menit
        break

In [20]:
from_number = '6285741880658@c.us'
response = requests.post(
        'https://whatsapp.inspektorat.pekalongankab.go.id/api/sendText/', #rahasia ya ;D
        headers={
            'Content-Type': 'application/json; charset=utf-8',
            'Accept': 'application/json',
            'X-Api-Key': 'KMZWAY87AA' # rahasia
        },
        json={  # Menggunakan parameter `json` untuk mengirim raw JSON
            'chatId': from_number,
            'text': f"Selesai Pelatihan Model BERT2GPT : {train_runtime_minutes:.2f} menit",
            'session': 'NoamChomsky'
        },
        verify=False  # Ini sesuai dengan withoutVerifying() di PHP
    )

