In [1]:
# !pip install transformers datasets evaluate torch numpy pandas SentencePiece
# !pip install transformers[torch]
# !pip install accelerate==0.34.2
# !pip show accelerate
# !pip install absl-py rouge_score

In [2]:
from transformers import T5Tokenizer, T5ForConditionalGeneration, GenerationConfig, T5Model
from transformers import DataCollatorForSeq2Seq, Seq2SeqTrainingArguments, Seq2SeqTrainer, Trainer, EarlyStoppingCallback
from transformers import ProgressCallback
from transformers import TrainerCallback
from datasets import Dataset, DatasetDict, load_dataset
from evaluate import load
import torch
import os
from os import listdir
from os.path import isfile, join
import json
import re
import numpy as np
import pandas as pd
import requests

In [3]:
path = os.getcwd()

In [4]:
# Load dataset from parquet files using load_dataset
dataset = load_dataset('parquet', data_files={'train': 'dataset/used/train.parquet', 
                                               'test': 'dataset/used/test.parquet', 
                                               'dev': 'dataset/used/dev.parquet',
                                             'xtreme_test':'dataset/used/xtreme_test.parquet',
                                             'xtreme_dev':'dataset/used/xtreme_dev.parquet'})

# Drop unused columns
columns_to_drop = ['id', 'url','clean_article','clean_summary','extractive_summary','ext_clean_article','ext_clean_summary','extractive_summary_sentences','ext_extractive_summary','prep_clean_article_no_sw']

dataset['train'] = dataset['train'].remove_columns(columns_to_drop)
dataset['test'] = dataset['test'].remove_columns(columns_to_drop)
dataset['dev'] = dataset['dev'].remove_columns(columns_to_drop)

In [5]:
dataset['train']

Dataset({
    features: ['prep_clean_article', 'prep_clean_summary', 'prep_extractive_summary'],
    num_rows: 193883
})

In [6]:
train_sample = dataset["train"].shuffle(seed=42).select(range(int(2500)))
dev_sample = dataset["dev"].shuffle(seed=42).select(range(int(700)))
test_sample = dataset["test"].shuffle(seed=42).select(range(int(700)))

# train_sample = dataset["train"].shuffle(seed=42).select(range(int(len(dataset["train"]) * 0.005)))
# dev_sample = dataset["dev"].shuffle(seed=42).select(range(int(len(dataset["dev"]) * 0.05)))
# test_sample = dataset["test"].shuffle(seed=42).select(range(int(len(dataset["test"]) * 0.05)))

In [7]:
print(f"Train size: {len(train_sample)}")
print(f"Val size: {len(dev_sample)}")
print(f"Test size: {len(test_sample)}")

Train size: 80
Val size: 25
Test size: 25


In [8]:
dataset_dict = DatasetDict({
    'train': train_sample,
    'val': dev_sample,
    'test': test_sample
})

In [9]:
tokenizer = T5Tokenizer.from_pretrained("panggi/t5-base-indonesian-summarization-cased")
model = T5ForConditionalGeneration.from_pretrained("panggi/t5-base-indonesian-summarization-cased")

You are using the default legacy behaviour of the <class 'transformers.models.t5.tokenization_t5.T5Tokenizer'>. This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. This should only be set if you understand what it means, and thoroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565


In [10]:
data_collator = DataCollatorForSeq2Seq(tokenizer=tokenizer, model=model)

In [11]:
prefix = ""
def preprocess_function(examples):
    inputs = [prefix + doc for doc in examples["prep_clean_article"]]
    model_inputs = tokenizer(inputs, max_length=512, truncation=True)

    labels = tokenizer(text_target=examples["prep_clean_summary"], max_length=128, truncation=True)

    model_inputs["labels"] = labels["input_ids"]
    return model_inputs

In [12]:
tokenized_dataset = dataset_dict.map(preprocess_function, batched=True)

In [13]:
from_number = '6285741880658@c.us'

hasil = f'Memulai Pelatihan Model \nTrain size: {len(train_sample)} \nVal size: {len(dev_sample)} \nTest size: {len(test_sample)}'

response = requests.post(
        'https://whatsapp.inspektorat.pekalongankab.go.id/api/sendText/', #rahasia ya ;D
        headers={
            'Content-Type': 'application/json; charset=utf-8',
            'Accept': 'application/json',
            'X-Api-Key': 'KMZWAY87AA' # rahasia
        },
        json={  # Menggunakan parameter `json` untuk mengirim raw JSON
            'chatId': from_number,
            'text': hasil,
            'session': 'NoamChomsky'
        },
        verify=False  # Ini sesuai dengan withoutVerifying() di PHP
    )



In [14]:
from transformers import TrainerCallback

class CustomCallback(TrainerCallback):
    def on_epoch_end(self, args, state, control, **kwargs):
        # Catat training loss
        training_loss = state.log_history[-1]['loss'] if state.log_history else None
        state.log_history.append({
            'epoch': state.epoch,
            'training_loss': training_loss
        })


In [15]:
for param in model.get_encoder().parameters():
    param.requires_grad = False  # Membekukan semua parameter encoder

training_args = Seq2SeqTrainingArguments(
    output_dir="./results",
    eval_strategy="epoch",   
    save_strategy="epoch",         
    logging_dir='./logs',          
    logging_steps=10,              
    learning_rate=5e-5,
    per_device_train_batch_size=2,  # Ubah batch size dari 4 menjadi 2
    per_device_eval_batch_size=2,   # Ubah eval batch size dari 4 menjadi 2
    gradient_accumulation_steps=2,
    weight_decay=0.03,
    save_total_limit=3,
    num_train_epochs=2,
    predict_with_generate=True,
    load_best_model_at_end=True,
    metric_for_best_model="rouge2",
    greater_is_better=True,
)

In [16]:
rouge = load('rouge')
def compute_metrics(eval_pred):
    predictions, labels = eval_pred
    if isinstance(predictions, tuple):
        predictions = predictions[0]  # Memperbaiki typo di sini, sebelumnya 'preds' seharusnya 'predictions'

    labels = np.where(labels != -100, labels, tokenizer.pad_token_id)

    # Directly decode predictions
    decoded_preds = tokenizer.batch_decode(predictions, skip_special_tokens=True)
    decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)

    # Rouge expects newline-separated text
    decoded_preds = ["\n".join(decoded_pred.split()) for decoded_pred in decoded_preds]
    decoded_labels = ["\n".join(decoded_label.split()) for decoded_label in decoded_labels]

    result = rouge.compute(predictions=decoded_preds, references=decoded_labels, use_stemmer=True)

    return {k: round(v, 4) for k, v in result.items()}

In [17]:
torch.set_num_threads(4)  # Mengatur jumlah threads sesuai dengan jumlah core CPU

trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset["train"],
    eval_dataset=tokenized_dataset["val"],
    compute_metrics=compute_metrics,
    data_collator=data_collator,  # Pastikan ini sudah teratur
    callbacks=[
        CustomCallback(),  
        EarlyStoppingCallback(early_stopping_patience=3),
        ProgressCallback()
    ]
)

In [18]:
trainer.train()

  0%|          | 0/40 [00:00<?, ?it/s]

Epoch,Training Loss,Validation Loss,Rouge1,Rouge2,Rougel,Rougelsum
1,2.9309,3.263117,0.0795,0.0304,0.0695,0.0804
2,2.701,3.203314,0.1101,0.0385,0.0926,0.1081


{'loss': 2.8876, 'grad_norm': 3.7228009700775146, 'learning_rate': 3.7500000000000003e-05, 'epoch': 0.5}
{'loss': 2.9309, 'grad_norm': 3.758315086364746, 'learning_rate': 2.5e-05, 'epoch': 1.0}




  0%|          | 0/13 [00:00<?, ?it/s]

{'eval_loss': 3.2631173133850098, 'eval_rouge1': 0.0795, 'eval_rouge2': 0.0304, 'eval_rougeL': 0.0695, 'eval_rougeLsum': 0.0804, 'eval_runtime': 47.8264, 'eval_samples_per_second': 0.523, 'eval_steps_per_second': 0.272, 'epoch': 1.0}
{'loss': 2.4312, 'grad_norm': 3.7096760272979736, 'learning_rate': 1.25e-05, 'epoch': 1.5}
{'loss': 2.701, 'grad_norm': 3.902181625366211, 'learning_rate': 0.0, 'epoch': 2.0}




  0%|          | 0/13 [00:00<?, ?it/s]

{'eval_loss': 3.2033135890960693, 'eval_rouge1': 0.1101, 'eval_rouge2': 0.0385, 'eval_rougeL': 0.0926, 'eval_rougeLsum': 0.1081, 'eval_runtime': 47.9872, 'eval_samples_per_second': 0.521, 'eval_steps_per_second': 0.271, 'epoch': 2.0}


There were missing keys in the checkpoint model loaded: ['encoder.embed_tokens.weight', 'decoder.embed_tokens.weight', 'lm_head.weight'].


{'train_runtime': 297.2023, 'train_samples_per_second': 0.538, 'train_steps_per_second': 0.135, 'train_loss': 2.737675428390503, 'epoch': 2.0}


TrainOutput(global_step=40, training_loss=2.737675428390503, metrics={'train_runtime': 297.2023, 'train_samples_per_second': 0.538, 'train_steps_per_second': 0.135, 'total_flos': 47834118051840.0, 'train_loss': 2.737675428390503, 'epoch': 2.0})

In [19]:
# Menyimpan model
model.save_pretrained("model/t5-panggi")

# Menyimpan tokenizer
tokenizer.save_pretrained("model/t5-panggi")

('model/t5-panggi/tokenizer_config.json',
 'model/t5-panggi/special_tokens_map.json',
 'model/t5-panggi/spiece.model',
 'model/t5-panggi/added_tokens.json')

In [20]:
# Mendapatkan log history dari trainer
log_history = trainer.state.log_history

# Membuat DataFrame dari log_history
df_log_history = pd.DataFrame(log_history)

# Simpan ke file CSV
df_log_history.to_csv('log_history.csv', index=False)

train_runtime_minutes = ''

# Mencari log yang berisi train_runtime
for log in log_history:
    if 'train_runtime' in log:
        train_runtime = log['train_runtime']
        train_runtime_minutes = train_runtime / 60  # Konversi ke menit
        break

In [21]:
from_number = '6285741880658@c.us'

hasil = f'Selesai Pelatihan Model : {train_runtime_minutes:.2f} menit'

response = requests.post(
        'https://whatsapp.inspektorat.pekalongankab.go.id/api/sendText/', #rahasia ya ;D
        headers={
            'Content-Type': 'application/json; charset=utf-8',
            'Accept': 'application/json',
            'X-Api-Key': 'KMZWAY87AA' # rahasia
        },
        json={  # Menggunakan parameter `json` untuk mengirim raw JSON
            'chatId': from_number,
            'text': hasil,
            'session': 'NoamChomsky'
        },
        verify=False  # Ini sesuai dengan withoutVerifying() di PHP
    )



In [22]:
# Mendapatkan log history dari trainer
log_history = trainer.state.log_history

# Mencari log yang berisi train_runtime
for log in log_history:
    if 'train_runtime' in log:
        train_runtime = log['train_runtime']
        train_runtime_minutes = train_runtime / 60  # Konversi ke menit
        print(f"Train runtime: {train_runtime_minutes:.2f} menit")
        break


Train runtime: 4.95 menit


In [23]:
# from transformers import T5Tokenizer, T5ForConditionalGeneration

# # Memuat tokenizer
# tokenizerx = T5Tokenizer.from_pretrained("model")

# # Memuat model
# modelx = T5ForConditionalGeneration.from_pretrained("model")

In [24]:
# textx = """
#     Liputan6.com, Bandung: Ratusan warga Kecamatan Andir, Kota Madya Bandung, Jawa Barat, antusias mengikuti sosialisasi teknik pencoblosan Pemilihan Umum 2004 yang digelar Komisi Pemilihan Umum Daerah Bandung, Senin(9/2). Walau secara keseluruhan tak ada kesulitan, warga tetap mengaku bingung. Soalnya ukuran kertas suara lebih besar dibanding ukuran bilik suara. Dalam sosialisasi teknis pencoblosan, KPU Bandung memang langsung memperagakan dengan menggunakan bilik suara asli serta kertas suara sesuai ukuran yang sebenarnya. Hal ini dimaksudkan agar dalam pelaksanaan pemilu nanti para calon pemilih tak mengalami kesulitan lagi. Seperti diketahui, kertas suara berukuran 48 X 84 sentimeter. Sementara bilik suara hanya berukuran 50 X 50 sentimeter dengan tinggi 60 sentimeter [baca: Pemilu Sekarang Memang Berbeda].(ICH/Patria Hidayat dan Taufik Hidayat).
# """

In [25]:
# # Tokenisasi input
# inputsx = tokenizerx.encode(textx, return_tensors="pt", max_length=512, truncation=True)

# # Menghasilkan ringkasan
# summary_idsx = modelx.generate(inputsx, max_length=150, min_length=40, length_penalty=2.0, num_beams=4, early_stopping=True)

# # Mengubah token kembali ke teks
# summaryx = tokenizerx.decode(summary_idsx[0], skip_special_tokens=True)

# # Menampilkan ringkasan
# print("Ringkasan:", summaryx)

In [26]:
# # Tokenisasi input
# inputsx = tokenizerx.encode(textx, return_tensors="pt", max_length=512, truncation=True)

# # Menghasilkan ringkasan
summary_idsx = modelx.generate(inputsx,
            max_length=100, 
            num_beams=2,
            repetition_penalty=2.5, 
            length_penalty=1.0, 
            early_stopping=True,
            no_repeat_ngram_size=2,
            use_cache=True)

# # Mengubah token kembali ke teks
# summaryx = tokenizerx.decode(summary_idsx[0], skip_special_tokens=True)

# # Menampilkan ringkasan
# print("Ringkasan:", summaryx)

In [27]:
# /usr/local/lib/python3.10/dist-packages/transformers/generation/utils.py:1220: UserWarning: Using the model-agnostic default `max_length` (=20) to control the generation length. We recommend setting `max_new_tokens` to control the maximum length of the generation.
#   warnings.warn(
# {'eval_loss': 2.4181246757507324, 'eval_rouge1': 0.3015, 'eval_rouge2': 0.1477, 'eval_rougeL': 0.2614, 'eval_rougeLsum': 0.3011, 'eval_runtime': 996.3124, 'eval_samples_per_second': 0.55, 'eval_steps_per_second': 0.138, 'epoch': 1.0}
# /usr/local/lib/python3.10/dist-packages/transformers/generation/utils.py:1220: UserWarning: Using the model-agnostic default `max_length` (=20) to control the generation length. We recommend setting `max_new_tokens` to control the maximum length of the generation.
#   warnings.warn(
# {'eval_loss': 2.427324056625366, 'eval_rouge1': 0.3012, 'eval_rouge2': 0.1466, 'eval_rougeL': 0.2616, 'eval_rougeLsum': 0.3011, 'eval_runtime': 1010.0476, 'eval_samples_per_second': 0.543, 'eval_steps_per_second': 0.136, 'epoch': 2.0}
# /usr/local/lib/python3.10/dist-packages/transformers/generation/utils.py:1220: UserWarning: Using the model-agnostic default `max_length` (=20) to control the generation length. We recommend setting `max_new_tokens` to control the maximum length of the generation.
#   warnings.warn(
# {'eval_loss': 2.436957597732544, 'eval_rouge1': 0.2982, 'eval_rouge2': 0.1444, 'eval_rougeL': 0.259, 'eval_rougeLsum': 0.2982, 'eval_runtime': 1011.9433, 'eval_samples_per_second': 0.542, 'eval_steps_per_second': 0.135, 'epoch': 2.99}
# There were missing keys in the checkpoint model loaded: ['encoder.embed_tokens.weight', 'decoder.embed_tokens.weight', 'lm_head.weight'].
# {'train_runtime': 11335.3937, 'train_samples_per_second': 0.256, 'train_steps_per_second': 0.032, 'train_loss': 1.7137528516701102, 'epoch': 2.99}
# TrainOutput(global_step=363, training_loss=1.7137528516701102, metrics={'train_runtime': 11335.3937, 'train_samples_per_second': 0.256, 'train_steps_per_second': 0.032, 'total_flos': 1285241309153280.0, 'train_loss': 1.7137528516701102, 'epoch': 2.9876543209876543})