In [None]:
# 1. Install necessary libraries
!pip install -q transformers datasets sentencepiece

# 3. Imports
import pandas as pd
from datasets import Dataset
from transformers import MT5Tokenizer, MT5ForConditionalGeneration, AutoTokenizer
import torch
from torch.utils.data import DataLoader
from tqdm.notebook import tqdm


# 4. Read the review/summary pairs from the CSV file into a DataFrame
csv_path = 'dataset_summerize.csv'
df = pd.read_csv(csv_path)

# 5. Wrap the DataFrame as a Hugging Face Dataset so it can be mapped in batches
dataset = Dataset.from_pandas(df)

# 6. Pick the compute device, then load the pretrained tokenizer and model
model_name = 'csebuetnlp/mT5_multilingual_XLSum'
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# AutoTokenizer is used here (instead of MT5Tokenizer) to resolve a tokenizer class mismatch
tokenizer = AutoTokenizer.from_pretrained(model_name)

model = MT5ForConditionalGeneration.from_pretrained(model_name)
model.to(device)  # moves the model's parameters to the selected device

# 7. Preprocessing function
def preprocess_function(examples):
    """Tokenize a batch of reviews and reference summaries for seq2seq training.

    Args:
        examples: Batched mapping with a 'Customer Review' column (source
            texts) and an 'Abstractive Summary' column (target texts), each
            a list of strings.

    Returns:
        The tokenizer output for the "summarize: "-prefixed reviews, padded
        and truncated to 512 tokens, with an added 'labels' entry holding the
        summary token ids padded and truncated to 128 tokens. Padding
        positions in 'labels' are set to -100.
    """
    inputs = ["summarize: " + doc for doc in examples['Customer Review']]
    model_inputs = tokenizer(inputs, max_length=512, truncation=True, padding='max_length')
    labels = tokenizer(examples['Abstractive Summary'], max_length=128, truncation=True, padding='max_length')

    # Labels are padded to a fixed length; without masking, the loss would also
    # be computed on the padding positions. -100 is the index the model's
    # cross-entropy loss ignores, so only real summary tokens contribute.
    pad_id = tokenizer.pad_token_id
    model_inputs['labels'] = [
        [token_id if token_id != pad_id else -100 for token_id in label_ids]
        for label_ids in labels['input_ids']
    ]
    return model_inputs


# 8. Run the preprocessing over the dataset in batches, dropping the original columns
tokenized_dataset = dataset.map(
    preprocess_function,
    batched=True,
    remove_columns=dataset.column_names,
)

# 9. Hold out 10% of the examples; the 'test' part of the split serves as validation
split = tokenized_dataset.train_test_split(test_size=0.1)
train_ds, val_ds = split['train'], split['test']

# 10. Convert to PyTorch DataLoader
def collate_fn(batch):
    """Stack a list of tokenized examples into a dict of long tensors.

    Args:
        batch: Sequence of examples, each a mapping with 'input_ids',
            'attention_mask' and 'labels' entries holding integer lists.

    Returns:
        Dict with the keys 'input_ids', 'attention_mask' and 'labels', each
        a ``torch.long`` tensor built from the per-example lists.
    """
    fields = ("input_ids", "attention_mask", "labels")
    return {
        field: torch.tensor([item[field] for item in batch], dtype=torch.long)
        for field in fields
    }


# Batch the tokenized splits; only the training data is shuffled.
train_loader = DataLoader(train_ds, batch_size=2, shuffle=True, collate_fn=collate_fn)
val_loader = DataLoader(val_ds, batch_size=2, collate_fn=collate_fn)

# 11. Training loop
optimizer = torch.optim.AdamW(model.parameters(), lr=5e-5)
epochs = 3

for epoch in range(epochs):
    print(f"Epoch {epoch+1}/{epochs}")

    # Re-enter training mode every epoch because the validation pass below
    # switches the model to eval mode.
    model.train()
    train_loss_total = 0.0
    for batch in tqdm(train_loader):
        batch = {k: v.to(device) for k, v in batch.items()}
        outputs = model(**batch)
        loss = outputs.loss
        loss.backward()
        optimizer.step()
        optimizer.zero_grad()
        train_loss_total += loss.item()

    # Evaluate on the held-out split so each epoch reports a signal of how
    # training is going; no gradients are needed for this pass.
    model.eval()
    val_loss_total = 0.0
    with torch.no_grad():
        for batch in val_loader:
            batch = {k: v.to(device) for k, v in batch.items()}
            val_loss_total += model(**batch).loss.item()

    # max(..., 1) guards against division by zero when a loader is empty.
    mean_train_loss = train_loss_total / max(len(train_loader), 1)
    mean_val_loss = val_loss_total / max(len(val_loader), 1)
    print(f"  train loss: {mean_train_loss:.4f} | val loss: {mean_val_loss:.4f}")

# 12. Write the fine-tuned model and its tokenizer into the same output directory
save_path = 'mt5_xlsum_summarization'
for artifact in (model, tokenizer):
    artifact.save_pretrained(save_path)

# 13. Example inference
model.eval()
test_text = df['Customer Review'][0]
# Truncate at 512 tokens, the same input limit used when tokenizing the training
# data, so long reviews are not cut shorter at inference than during fine-tuning.
inputs = tokenizer("summarize: " + test_text, return_tensors="pt", truncation=True, max_length=512).to(device)
# Beam search with 4 beams; the generated summary is capped at 64 tokens.
summary_ids = model.generate(**inputs, max_length=64, num_beams=4, early_stopping=True)
print("Generated summary:\n", tokenizer.decode(summary_ids[0], skip_special_tokens=True))




Map:   0%|          | 0/1260 [00:00<?, ? examples/s]

Epoch 1/3


  0%|          | 0/567 [00:00<?, ?it/s]

Epoch 2/3


  0%|          | 0/567 [00:00<?, ?it/s]

Epoch 3/3


  0%|          | 0/567 [00:00<?, ?it/s]



Generated summary:
 Ketel ini bocor setelah seminggu dan kualitasnya buruk, sangat mengecewakan.


In [None]:
import random

# Index of a randomly chosen dataset row, used only when no custom text is given.
random_index = random.randint(0, len(df) - 1)

# Hand-written review to summarize. Set this to None to summarize the randomly
# chosen dataset row instead.
custom_text = "Produk ini benar-benar memuaskan. Kualitasnya sangat baik, sesuai dengan deskripsi, dan terasa awet saat digunakan Selain itu, proses pengirimannya juga sangat cepat, bahkan lebih cepat dari perkiraan. Saya sangat merekomendasikan produk ini bagi siapa pun yang mencari kualitas dan layanan yang baik sekaligus."

# Label the printed text with where it really came from, so a random row index
# is never shown next to a text that was not taken from that row.
if custom_text is not None:
    test_text = custom_text
    source_label = "custom text"
else:
    test_text = df['Customer Review'][random_index]
    source_label = f"dataset row {random_index}"

# Truncate at 512 tokens, the same input limit used when tokenizing the training data.
inputs = tokenizer("summarize: " + test_text, return_tensors="pt", truncation=True, max_length=512).to(device)
summary_ids = model.generate(**inputs, max_length=256, num_beams=4, early_stopping=True)
print(f"[{source_label}] {test_text}")
print("Generated summary:\n", tokenizer.decode(summary_ids[0], skip_special_tokens=True))

1244. Produk ini benar-benar memuaskan. Kualitasnya sangat baik, sesuai dengan deskripsi, dan terasa awet saat digunakan Selain itu, proses pengirimannya juga sangat cepat, bahkan lebih cepat dari perkiraan. Saya sangat merekomendasikan produk ini bagi siapa pun yang mencari kualitas dan layanan yang baik sekaligus.
Generated summary:
 Produk ini sangat memuaskan dengan kualitas yang sangat baik dan terasa awet saat digunakan.
