# Install & Imports

## Install

In [None]:
# Install
!pip install -r https://raw.githubusercontent.com/IndoNLP/indonlu/master/requirements.txt

## Imports

In [None]:
# Imports
import random
import numpy as np
import pandas as pd
import torch
import torch.nn.functional as F
from tqdm import tqdm
from torch import optim
from torch.utils.data import Dataset, DataLoader
from sklearn.metrics import accuracy_score, f1_score, recall_score, precision_score, confusion_matrix, classification_report

from transformers import BertForSequenceClassification, BertConfig, BertTokenizer

import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
import os
import pandas as pd

from google.colab import drive

drive.mount('/content/drive')

# Define classes & functions

## Classes

In [None]:
# Classes
class DocumentSentimentDataLoader(DataLoader):
    """DataLoader that pads each batch of tokenised documents into NumPy arrays.

    Every batch item is unpacked as a ``(subwords, sentiment, raw_seq)``
    triple. Sequences are zero-padded to the longest sequence in the batch,
    capped at ``max_seq_len``.
    """

    def __init__(self, max_seq_len=512, *args, **kwargs):
        """Forward ``*args``/``**kwargs`` to ``DataLoader`` and install the padding collate."""
        super().__init__(*args, **kwargs)
        self.max_seq_len = max_seq_len
        # Replace whatever collate function DataLoader set up with our own.
        self.collate_fn = self._collate_fn

    def _collate_fn(self, batch):
        """Pad and stack one batch.

        Returns:
            ``(subword_batch, mask_batch, sentiment_batch, seq_list)`` where
            the first three are arrays of shape ``(batch, width)``,
            ``(batch, width)`` and ``(batch, 1)``, and ``seq_list`` holds the
            third element of every item in order.
        """
        n_items = len(batch)
        longest = max(len(item[0]) for item in batch)
        width = min(self.max_seq_len, longest)

        subword_batch = np.zeros((n_items, width), dtype=np.int64)
        mask_batch = np.zeros((n_items, width), dtype=np.float32)
        sentiment_batch = np.zeros((n_items, 1), dtype=np.int64)
        seq_list = []

        for row, (tokens, label, raw_seq) in enumerate(batch):
            # Truncate over-long sequences; shorter ones keep zero padding.
            clipped = tokens[:width]
            n_tokens = len(clipped)

            subword_batch[row, :n_tokens] = clipped
            mask_batch[row, :n_tokens] = 1
            sentiment_batch[row, 0] = label
            seq_list.append(raw_seq)

        return subword_batch, mask_batch, sentiment_batch, seq_list

class DocumentSentimentDataset(Dataset):
    """Sentiment dataset backed by a four-column CSV file.

    Indexing yields ``(subword_ids, label_index, raw_text)`` where the ids
    come from ``tokenizer.encode`` applied to the ``text`` column.
    """

    # Label vocabulary shared by every instance.
    LABEL2INDEX = {'positif': 0, 'netral': 1, 'negatif': 2}
    INDEX2LABEL = {0: 'positif', 1: 'netral', 2: 'negatif'}
    NUM_LABELS = 3

    def __init__(self, dataset_path, tokenizer, no_special_token=False, *args, **kwargs):
        """Load the CSV at ``dataset_path`` and remember the tokenizer settings.

        Extra positional and keyword arguments are accepted and ignored.
        """
        # Tokenizer used to encode each text into subword ids.
        self.tokenizer = tokenizer
        # When True, special tokens are NOT added while encoding.
        self.no_special_token = no_special_token
        self.data = self.load_dataset(dataset_path)

    def load_dataset(self, path):
        """Read the CSV, rename its columns and turn labels into indices.

        The file must have exactly four columns; they are renamed, by
        position, to ``judul``, ``text``, ``tanggal`` and ``sentimen``.
        A label missing from ``LABEL2INDEX`` raises ``KeyError``.
        """
        frame = pd.read_csv(path)
        frame.columns = ['judul', 'text', 'tanggal', 'sentimen']

        def to_index(label):
            return self.LABEL2INDEX[label]

        frame['sentimen'] = frame['sentimen'].apply(to_index)
        return frame

    def __len__(self):
        """Return the number of rows in the loaded dataset."""
        return len(self.data)

    def __getitem__(self, index):
        """Return ``(subword_ids, label_index, raw_text)`` for row ``index``."""
        row = self.data.loc[index, :]
        raw_text = row['text']

        # Encode the text (not the title) into subword ids.
        add_special = not self.no_special_token
        token_ids = self.tokenizer.encode(raw_text, add_special_tokens=add_special)

        return np.array(token_ids), np.array(row['sentimen']), raw_text

class EarlyStopping:
    """Stop training once the validation loss stops improving.

    Call the instance once per epoch with the validation loss and the model.
    The model's ``state_dict`` is written to ``checkpoint.pt`` whenever the
    loss improves; ``early_stop`` becomes True after ``patience``
    consecutive non-improving calls.
    """

    def __init__(self, patience=3, verbose=True, delta=0):
        """
        Args:
            patience (int): Number of consecutive epochs without improvement
                tolerated before stopping.
            verbose (bool): Print a message on every checkpoint and on every
                non-improving epoch.
            delta (float): Minimum change that counts as an improvement.
        """
        self.patience = patience
        self.verbose = verbose
        self.delta = delta
        self.best_score = None
        self.early_stop = False
        self.counter = 0

    def __call__(self, val_loss, model):
        """Record one epoch's validation loss and update the stopping state."""
        score = -val_loss

        # NaN is the only value that differs from itself. A NaN loss must
        # never count as an improvement: it would overwrite the checkpoint
        # and make every later comparison against best_score False.
        if score != score:
            self._register_no_improvement()
            return

        if self.best_score is None:
            self.best_score = score
            self.save_checkpoint(val_loss, model, prev_best=None)
        elif score < self.best_score + self.delta:
            self._register_no_improvement()
        else:
            prev_best = -self.best_score  # keep the old best for the log message
            self.best_score = score
            self.save_checkpoint(val_loss, model, prev_best)
            self.counter = 0

    def _register_no_improvement(self):
        """Count one non-improving epoch and flag the stop when patience runs out."""
        self.counter += 1
        if self.verbose:
            print(f"EarlyStopping counter: {self.counter} out of {self.patience}")
        if self.counter >= self.patience:
            self.early_stop = True

    def save_checkpoint(self, val_loss, model, prev_best=None):
        """Save the model's ``state_dict`` to ``checkpoint.pt``.

        ``prev_best`` is only used for the log message; None means this is
        the first checkpoint.
        """
        if self.verbose:
            if prev_best is None:
                print(f'Validation loss initialized: {val_loss:.6f}. Saving model...')
            else:
                print(f'Validation loss decreased ({prev_best:.6f} --> {val_loss:.6f}). Saving model...')
        torch.save(model.state_dict(), 'checkpoint.pt')



## Functions

In [None]:
# Forward function for sequence classification
def forward_sequence_classification(model, batch_data, i2w, is_test=False, device='gpu', **kwargs):
    """
    Run one forward pass of a sequence classifier and decode its predictions.

    Args:
        model: Callable invoked as ``model(subwords, attention_mask=...,
            token_type_ids=..., labels=...)``; the first two items of its
            result are taken as the loss and the logits.
        batch_data: Either ``(subwords, mask, labels)`` or
            ``(subwords, mask, token_types, labels)``.
        i2w: Mapping from class index to label, used to decode predictions
            and gold labels.
        is_test: Unused; kept for interface compatibility.
        device: Tensors are moved to CUDA only when this is exactly
            ``"cuda"``; any other value keeps them on the CPU.
            NOTE(review): the default ``'gpu'`` therefore selects the CPU
            path - confirm whether that is intended.
        **kwargs: Ignored.

    Returns:
        ``(loss, list_hyp, list_label, list_confidence)``: the model loss,
        the decoded predictions, the decoded gold labels, and for each row
        the highest softmax probability.

    Raises:
        ValueError: If ``batch_data`` has neither 3 nor 4 elements.
    """
    # Unpack batch data
    if len(batch_data) == 3:
        subword_batch, mask_batch, label_batch = batch_data
        token_type_batch = None
    elif len(batch_data) == 4:
        subword_batch, mask_batch, token_type_batch, label_batch = batch_data
    else:
        raise ValueError(
            f"batch_data must have 3 or 4 elements, got {len(batch_data)}"
        )

    # Prepare input & label
    subword_batch = torch.LongTensor(subword_batch)
    mask_batch = torch.FloatTensor(mask_batch)
    token_type_batch = torch.LongTensor(token_type_batch) if token_type_batch is not None else None
    label_batch = torch.LongTensor(label_batch)

    # Move every tensor to the requested device
    use_cuda = device == "cuda"

    def place(tensor):
        if tensor is None:
            return None
        return tensor.cuda() if use_cuda else tensor.cpu()

    subword_batch = place(subword_batch)
    mask_batch = place(mask_batch)
    token_type_batch = place(token_type_batch)
    label_batch = place(label_batch)

    # Forward model
    outputs = model(subword_batch, attention_mask=mask_batch, token_type_ids=token_type_batch, labels=label_batch)
    loss, logits = outputs[:2]

    # Top-1 class per row, and its softmax probability. The softmax is
    # computed once for the whole batch rather than once per row.
    hyp = torch.topk(logits, 1)[1]
    confidence = F.softmax(logits, dim=1).max(dim=1).values

    list_hyp = [i2w[index] for index in hyp.reshape(-1).tolist()]
    list_label = [i2w[index] for index in label_batch.reshape(-1).tolist()]
    list_confidence = confidence.tolist()

    return loss, list_hyp, list_label, list_confidence


def document_sentiment_metrics_fn(list_hyp, list_label):
    """Score predictions against gold labels.

    Returns a dict with accuracy under ``ACC`` and macro-averaged F1, recall
    and precision under ``F1``, ``REC`` and ``PRE``.
    """
    macro_scorers = (
        ("F1", f1_score),
        ("REC", recall_score),
        ("PRE", precision_score),
    )

    metrics = {"ACC": accuracy_score(list_label, list_hyp)}
    for key, scorer in macro_scorers:
        metrics[key] = scorer(list_label, list_hyp, average='macro')
    return metrics

def set_seed(seed):
    """Seed Python, NumPy and PyTorch (CPU and CUDA) and make cuDNN deterministic."""
    seeders = (
        random.seed,
        np.random.seed,
        torch.manual_seed,
        torch.cuda.manual_seed,
        torch.cuda.manual_seed_all,
    )
    for apply_seed in seeders:
        apply_seed(seed)

    # Force deterministic cuDNN kernels and turn off its autotuner.
    cudnn = torch.backends.cudnn
    cudnn.deterministic = True
    cudnn.benchmark = False

def seed_worker(worker_id):
    """Seed NumPy and Python's ``random`` from the current torch initial seed.

    ``worker_id`` is accepted but not used.
    """
    derived_seed = torch.initial_seed() % (1 << 32)  # reduce to a 32-bit value
    for reseed in (np.random.seed, random.seed):
        reseed(derived_seed)

def count_param(module, trainable=False):
    """Return the total element count of ``module``'s parameters.

    When ``trainable`` is truthy, parameters with ``requires_grad`` unset
    are left out.
    """
    total = 0
    for param in module.parameters():
        if trainable and not param.requires_grad:
            continue
        total += param.numel()
    return total

def get_lr(optimizer):
    """Return the ``lr`` of the first parameter group, or None if there are none."""
    groups = iter(optimizer.param_groups)
    try:
        first_group = next(groups)
    except StopIteration:
        return None
    return first_group['lr']

def metrics_to_string(metric_dict):
    """Render metrics as space-separated ``name:value`` pairs with two decimals."""
    return ' '.join(
        f'{name}:{score:.2f}' for name, score in metric_dict.items()
    )

# Define the training function
def train_model(model, train_loader, optimizer, epoch, i2w, device='cuda'):
    """Train ``model`` for one pass over ``train_loader``.

    Args:
        model: Model to optimise; switched to training mode.
        train_loader: Iterable of batches. The last element of every batch
            is dropped before the forward pass.
        optimizer: Optimizer stepped once per batch.
        epoch: Zero-based epoch index, used only in progress messages.
        i2w: Mapping from class index to label string.
        device: Device name handed to ``forward_sequence_classification``.
            Defaults to ``'cuda'``, the value that used to be hard-coded.

    Returns:
        ``(list_hyp, list_label, list_confidence, total_train_loss)`` where
        the loss is the SUM of the per-batch losses, not their mean.
    """
    model.train()
    torch.set_grad_enabled(True)

    total_train_loss = 0
    num_batches = 0
    list_hyp, list_label, list_confidence = [], [], []

    train_pbar = tqdm(train_loader, leave=True, total=len(train_loader))
    for num_batches, batch_data in enumerate(train_pbar, start=1):
        # Forward model
        loss, batch_hyp, batch_label, confidence = forward_sequence_classification(
            model, batch_data[:-1], i2w=i2w, device=device
        )

        # Update model
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        total_train_loss += loss.item()

        # Accumulate predictions for the epoch-level metrics
        list_hyp += batch_hyp
        list_label += batch_label
        list_confidence += confidence

        train_pbar.set_description(f"(Epoch {epoch+1}) TRAIN LOSS:{total_train_loss/num_batches:.4f} LR:{get_lr(optimizer):.8f}")

    # Calculate train metric
    metrics = document_sentiment_metrics_fn(list_hyp, list_label)
    print(f"(Epoch {epoch+1}) TRAIN LOSS:{total_train_loss/max(num_batches, 1):.4f} {metrics_to_string(metrics)} LR:{get_lr(optimizer):.8f}")
    return list_hyp, list_label, list_confidence, total_train_loss

# Define the evaluation function
def evaluate_model(model, valid_loader, i2w, device='cuda'):
    """Evaluate ``model`` on ``valid_loader`` without updating weights.

    Args:
        model: Model to evaluate; switched to eval mode.
        valid_loader: Iterable of batches. The last element of every batch
            is dropped before the forward pass.
        i2w: Mapping from class index to label string.
        device: Device name handed to ``forward_sequence_classification``.
            Defaults to ``'cuda'``, the value that used to be hard-coded.

    Returns:
        ``(list_hyp, list_label, list_confidence, total_loss)`` where the
        loss is the SUM of the per-batch losses, not their mean.
    """
    model.eval()

    total_loss = 0
    num_batches = 0
    list_hyp, list_label, list_confidence = [], [], []

    # Disable autograd only for the duration of the evaluation, so the
    # caller's gradient mode is restored afterwards instead of being left
    # switched off for the whole process.
    with torch.no_grad():
        pbar = tqdm(valid_loader, leave=True, total=len(valid_loader))
        for num_batches, batch_data in enumerate(pbar, start=1):
            loss, batch_hyp, batch_label, confidence = forward_sequence_classification(
                model, batch_data[:-1], i2w=i2w, device=device
            )

            total_loss += loss.item()

            # Running metrics over everything seen so far, for the progress bar
            list_hyp += batch_hyp
            list_label += batch_label
            list_confidence += confidence
            metrics = document_sentiment_metrics_fn(list_hyp, list_label)

            pbar.set_description(f"VALID LOSS:{total_loss/num_batches:.4f} {metrics_to_string(metrics)}")

    metrics = document_sentiment_metrics_fn(list_hyp, list_label)
    print(f"VALID LOSS:{total_loss/max(num_batches, 1):.4f} {metrics_to_string(metrics)}")

    return list_hyp, list_label, list_confidence, total_loss

# Define the test evaluation function
def test_model(model, valid_loader, i2w, device='cuda'):
    """Run the final test pass of ``model`` over ``valid_loader``.

    Args:
        model: Model to test; switched to eval mode.
        valid_loader: Iterable of batches. The last element of every batch
            is dropped before the forward pass.
        i2w: Mapping from class index to label string.
        device: Device name handed to ``forward_sequence_classification``.
            Defaults to ``'cuda'``, the value that used to be hard-coded.

    Returns:
        ``(list_hyp, list_label, list_confidence, total_loss)`` where the
        loss is the SUM of the per-batch losses, not their mean.
    """
    model.eval()

    total_loss = 0
    num_batches = 0
    list_hyp, list_label, list_confidence = [], [], []

    # Disable autograd only while testing, so the caller's gradient mode is
    # restored afterwards instead of being left switched off.
    with torch.no_grad():
        test_pbar = tqdm(valid_loader, leave=True, total=len(valid_loader))
        for num_batches, batch_data in enumerate(test_pbar, start=1):
            loss, batch_hyp, batch_label, confidence = forward_sequence_classification(
                model, batch_data[:-1], i2w=i2w, device=device
            )

            total_loss += loss.item()

            # Running metrics over everything seen so far, for the progress bar
            list_hyp += batch_hyp
            list_label += batch_label
            list_confidence += confidence
            metrics = document_sentiment_metrics_fn(list_hyp, list_label)

            test_pbar.set_description(f"TEST LOSS:{total_loss/num_batches:.4f} {metrics_to_string(metrics)}")

    metrics = document_sentiment_metrics_fn(list_hyp, list_label)
    print(f"TEST LOSS:{total_loss/max(num_batches, 1):.4f} {metrics_to_string(metrics)}")

    return list_hyp, list_label, list_confidence, total_loss

# Define the function to generate the confusion matrix and other reports
def generate_reports(dataset_path, list_hyp, list_label, list_confidence, dataset_ratio, lr, current_epoch, accuracy, loss, path_name, rename_folder=False, confidence_threshold=0.90):
    """Write the confusion matrix, classification report and predictions to disk.

    Output goes to
    ``{base_directory}/Test-Results/{model_name}/{dataset_ratio}-lr_{lr}/{path_name}/epoch_{current_epoch}-acr_{accuracy}-loss_{loss}``.
    ``base_directory`` and ``model_name`` are read from module-level globals.

    Args:
        dataset_path: CSV the predictions belong to. It is re-read and must
            contain a ``sentimen`` column and one row per prediction, in the
            same order as ``list_hyp``.
        list_hyp: Predicted labels.
        list_label: Gold labels.
        list_confidence: Confidence of each prediction.
        dataset_ratio, lr, current_epoch, accuracy, loss, path_name: Used
            only to build the output directory name.
        rename_folder: When True, the ``{dataset_ratio}-lr_{lr}`` directory
            is renamed to ``acr_{accuracy}-{dataset_ratio}-lr_{lr}`` after
            all files are written.
        confidence_threshold: Confidence below which a row is flagged
            ``needs_review`` and above which a wrong prediction is flagged
            ``overconfident_errors``. Defaults to the former constant 0.90.
    """
    # Fix the class order explicitly. Otherwise a class that is absent from
    # both lists shrinks the matrix and the three display names no longer
    # line up with the rows.
    class_labels = ['negatif', 'netral', 'positif']
    display_names = ['Negatif', 'Netral', 'Positif']

    cm = confusion_matrix(list_label, list_hyp, labels=class_labels)
    # Computed once and reused for both text outputs below.
    report = classification_report(list_label, list_hyp, labels=class_labels, target_names=display_names)

    # Visualize confusion matrix
    fig = plt.figure(figsize=(10, 7))
    sns.heatmap(cm, annot=True, fmt='d', cmap='Blues', xticklabels=display_names, yticklabels=display_names)
    plt.xlabel('Predicted')
    plt.ylabel('True')
    plt.title('Confusion Matrix')

    # Define result directory
    main_dir = f'{base_directory}/Test-Results/{model_name}'
    param_name = f'{dataset_ratio}-lr_{lr}'
    test_result_dir = f'{main_dir}/{param_name}/{path_name}/epoch_{current_epoch}-acr_{accuracy}-loss_{loss}'
    os.makedirs(test_result_dir, exist_ok=True)

    # Save confusion matrix image
    confusion_matrix_path = f'{test_result_dir}/confusion_matrix.png'
    plt.savefig(confusion_matrix_path)
    plt.show()
    # Release the figure; this function is called many times per run.
    plt.close(fig)

    # Save classification report
    classification_report_path = f'{test_result_dir}/classification_report.txt'
    with open(classification_report_path, 'w') as f:
        f.write(report)

    # Append confusion matrix and classification report to one .txt file
    evaluation_results_path = f'{test_result_dir}/evaluation_results.txt'
    with open(evaluation_results_path, 'a') as f:
        f.write("=== Confusion Matrix ===\n")
        f.write(np.array2string(cm))
        f.write("\n\n")

        f.write("=== Classification Report ===\n")
        f.write(report)
        f.write("\n\n")

    # Save actual labels and predictions to a CSV file
    df = pd.read_csv(dataset_path)
    df['prediksi'] = list_hyp
    df['confidence'] = list_confidence

    is_wrong = df['sentimen'] != df['prediksi']
    # Flag rows whose prediction disagrees with the manual label or whose
    # confidence is below the threshold.
    df['needs_review'] = is_wrong | (df['confidence'] < confidence_threshold)
    # Flag wrong predictions made with high confidence.
    df['overconfident_errors'] = is_wrong & (df['confidence'] > confidence_threshold)

    predictions_file_path = f'{test_result_dir}/predictions.csv'
    df.to_csv(predictions_file_path, index=False, encoding='utf-8')

    if rename_folder:
        os.rename(f'{main_dir}/{param_name}', f'{main_dir}/acr_{accuracy}-{param_name}')

    # NOTE(review): when rename_folder is True the paths printed below still
    # point at the pre-rename directory.
    print(f"Confusion matrix saved to: {confusion_matrix_path}")
    print(f"Classification report saved to: {classification_report_path}")
    print(f"Evaluation results saved to: {evaluation_results_path}")
    print(f"Predictions saved to: {predictions_file_path}")


# Variables

In [None]:
# Root folder on the mounted Drive; datasets are read from it and results written to it.
base_directory = '/content/drive/MyDrive/Analisis-Sentimen-CNBC'

# Checkpoint name under the `indobenchmark/` namespace.
model_name = 'indobert-large-p2'

# Hyper-parameter grid: every learning rate is combined with every split.
learning_rate = [2e-5, 2e-6, 3e-5, 3e-6]
dataset_ratios = ['90-10', '80-20', '70-30']

# Tokenizer and config are loaded once and reused for every configuration.
tokenizer = BertTokenizer.from_pretrained('indobenchmark/' + model_name)
config = BertConfig.from_pretrained('indobenchmark/' + model_name)
config.num_labels = DocumentSentimentDataset.NUM_LABELS

# Run

In [None]:
!rm -rf ~/.cache/pip


In [None]:
# Grid search: one fine-tuning run per (learning rate, dataset split) pair.
set_seed(42)
g = torch.Generator()
g.manual_seed(42)

for lr in learning_rate:
    for dataset_ratio in dataset_ratios:
        print(f"Training with Learning Rate: {lr}, Dataset Ratio: {dataset_ratio}")

        # Dataset preparation
        dataset_directory = f'{base_directory}/Dataset/{dataset_ratio}'
        train_dataset_path = f'{dataset_directory}/train_data.csv'
        valid_dataset_path = f'{dataset_directory}/validation_data.csv'

        train_dataset = DocumentSentimentDataset(train_dataset_path, tokenizer, lowercase=True)
        valid_dataset = DocumentSentimentDataset(valid_dataset_path, tokenizer, lowercase=True)

        # shuffle=False keeps predictions in CSV row order; generate_reports
        # assigns them back to the re-read CSV by position.
        train_loader = DocumentSentimentDataLoader(
            dataset=train_dataset, max_seq_len=128, batch_size=32,
            num_workers=16, shuffle=False, worker_init_fn=seed_worker, generator=g
        )

        valid_loader = DocumentSentimentDataLoader(
            dataset=valid_dataset, max_seq_len=128, batch_size=32,
            num_workers=16, shuffle=False, worker_init_fn=seed_worker, generator=g
        )

        w2i, i2w = DocumentSentimentDataset.LABEL2INDEX, DocumentSentimentDataset.INDEX2LABEL
        print(w2i)
        print(i2w)

        # Model initialization
        set_seed(42)  # re-seed before from_pretrained so every configuration starts the same
        model = BertForSequenceClassification.from_pretrained(f'indobenchmark/{model_name}', config=config)
        # Move the model to the GPU BEFORE building the optimizer, as the
        # torch.optim documentation requires.
        model = model.cuda()
        optimizer = optim.AdamW(model.parameters(), lr=lr)

        max_epoch = 20
        patience = 3
        early_stopping = EarlyStopping(patience=patience, verbose=True)

        for epoch in range(max_epoch):
            list_hyp, list_label, list_confidence_train, total_train_loss = train_model(model, train_loader, optimizer, epoch, i2w)
            train_accuracy = accuracy_score(list_label, list_hyp)
            generate_reports(
                train_dataset_path, list_hyp, list_label, list_confidence_train,
                dataset_ratio, lr, epoch + 1, train_accuracy, total_train_loss, 'train'
            )

            list_hyp_val, list_label_val, list_confidence_val, total_loss_val = evaluate_model(model, valid_loader, i2w)
            val_accuracy = accuracy_score(list_label_val, list_hyp_val)
            generate_reports(
                valid_dataset_path, list_hyp_val, list_label_val, list_confidence_val,
                dataset_ratio, lr, epoch + 1, val_accuracy, total_loss_val, 'evaluation'
            )

            early_stopping(total_loss_val, model)
            if early_stopping.early_stop:
                print("Early stopping triggered")
                break

        # Test the best checkpoint, not the last-epoch weights. EarlyStopping
        # writes 'checkpoint.pt' on every improvement, and its counter is the
        # number of epochs run since then.
        best_epoch = epoch + 1
        if early_stopping.best_score is not None:
            model.load_state_dict(torch.load('checkpoint.pt', map_location='cuda'))
            best_epoch = epoch + 1 - early_stopping.counter

        # NOTE(review): the "test" pass reuses the validation split; no
        # separate test set is loaded here.
        list_hyp_test, list_label_test, list_confidence, total_loss_test = test_model(model, valid_loader, i2w)
        accuracy = accuracy_score(list_label_test, list_hyp_test)

        generate_reports(
            valid_dataset_path, list_hyp_test, list_label_test, list_confidence,
            dataset_ratio, lr, best_epoch, accuracy, total_loss_test, 'test',  rename_folder=True
        )
