<a href="https://colab.research.google.com/github/zlkhyr/NllbNusaX/blob/main/NllbClassifier.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# !pip install datasets
!pip install torchinfo

#Library

In [None]:
import torch
import pandas as pd
import torch.nn as nn
from tqdm import tqdm
from torch.optim import AdamW
from google.colab import files
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
from torch.utils.data import TensorDataset, DataLoader, random_split
# from datasets import load_dataset, concatenate_datasets
from transformers import (
    AutoModelForSeq2SeqLM,
    AutoTokenizer
)

#Load Nllb

In [None]:
# Hugging Face Hub id of the checkpoint to load.
model_name = "facebook/nllb-200-distilled-600M"
# Full seq2seq model; only its encoder is reused for classification further below.
model = AutoModelForSeq2SeqLM.from_pretrained(model_name)
# Tokenizer loaded from the same checkpoint so ids match the model's vocabulary.
tokenizer = AutoTokenizer.from_pretrained(model_name)

In [None]:
# Display the module tree of the loaded seq2seq model.
model

#test tokenizer

In [None]:
# Sanity check: encode one short sample sentence and show the resulting token ids.
token = tokenizer.encode('ini adalah contoh teks' )
token

In [None]:
# Map the ids back to token strings to see which pieces / special tokens the tokenizer produced.
tokenizer.convert_ids_to_tokens(token)

# Data Preprocessing

In [None]:
# Folder that holds the three CSV splits, kept in one place so it only has to be changed once.
# NOTE(review): assumes Google Drive is already mounted at /content/drive — no mount call
# is visible in this notebook; confirm before running on a fresh runtime.
DATA_DIR = '/content/drive/MyDrive/Final_Year'

# Train / validation / test splits as pandas DataFrames.
train = pd.read_csv(f'{DATA_DIR}/data_train.csv')
val = pd.read_csv(f'{DATA_DIR}/data_val.csv')
test = pd.read_csv(f'{DATA_DIR}/data_test.csv')

In [None]:
# DataFrame.info() prints its report and returns None, so call it once per split
# instead of building a tuple that would be echoed as (None, None, None).
for split_name, split_df in (("train", train), ("test", test), ("val", val)):
    print(f"--- {split_name} ---")
    split_df.info()

In [None]:
# DISABLED CODE (kept for reference only, nothing below uses it):
# a mapping from the dataset's language names to NLLB language codes.
# The last three entries are mapped to 'ind_Latn' rather than to a code of their own.
# map_bahasa ={
#   'aceh' : 'ace_Latn',
#   'bali' : 'ban_Latn',
#   'banjar' : 'bjn_Latn',
#   'bugis' : 'bug_Latn',
#   'indonesia' : 'ind_Latn',
#   'inggris' : 'eng_Latn',
#   'jawa' : 'jav_Latn',
#   'minang' : 'min_Latn',
#   'sunda' : 'sun_Latn',

#   'batak toba' : 'ind_Latn',
#   'madura' : 'ind_Latn',
#   'ngaju' : 'ind_Latn'
# }

In [None]:
def preprocess(dataset, shuffle, batch_size=32, max_length=None):
  """Tokenize a sentiment DataFrame and wrap it in a DataLoader.

  The input frame is NOT modified, so the function can safely be called
  several times on the same frame or on a filtered slice of it.

  Args:
    dataset: pandas DataFrame with a ``text`` column (input sentences) and a
      ``label`` column containing "negative", "neutral" or "positive".
    shuffle: Passed to the DataLoader; True reshuffles the data every epoch.
    batch_size: Number of examples per batch (default 32).
    max_length: Truncation length in tokens. ``None`` (default) falls back to
      ``model.config.max_length`` of the globally loaded model.

  Returns:
    A DataLoader yielding ``(input_ids, attention_mask, labels)`` batches.

  Raises:
    ValueError: If the ``label`` column contains a value that is not one of
      the three known label strings (e.g. labels that were already encoded).
  """
  label_map = {"negative": 0, "neutral": 1, "positive": 2}

  # Map into a new Series instead of overwriting dataset["label"]:
  # Series.map turns every unknown value into NaN, which is checked explicitly
  # so bad labels fail loudly here rather than as garbage class ids later.
  label_ids = dataset["label"].map(label_map)
  unmapped = label_ids.isna()
  if unmapped.any():
    unknown = sorted({str(value) for value in dataset["label"][unmapped].unique()})
    raise ValueError(
        f"Unknown label value(s) {unknown}; expected one of {sorted(label_map)}. "
        "Was this frame already label-encoded?"
    )

  if max_length is None:
    max_length = model.config.max_length

  # tokenizer.src_lang = 'ind_Latn'

  # Pad to the longest sentence in this frame and truncate at max_length.
  encoding = tokenizer(
      dataset['text'].tolist(),
      padding=True,
      truncation=True,
      max_length=max_length,
      return_tensors="pt"
  )

  labels = torch.tensor(label_ids.astype("int64").tolist(), dtype=torch.long)
  tensor_dataset = TensorDataset(encoding['input_ids'], encoding['attention_mask'], labels)
  return DataLoader(tensor_dataset, batch_size=batch_size, shuffle=shuffle)

In [None]:
# Training batches are shuffled; validation batches keep their original order.
train_dataloader = preprocess(train, True)
val_dataloader = preprocess(val, False)

#Pengembangan Model

##Model NllbClassifier

In [None]:
class NllbClassifier(nn.Module):
    """Sequence classifier: encoder -> masked mean pooling -> dropout -> linear head.

    Args:
        encoder: Encoder module (taken from the NLLB model). It must expose
            ``config.hidden_size`` and return an object with ``last_hidden_state``.
        num_labels: Number of output classes (default 3).
    """

    def __init__(self, encoder, num_labels=3):
        super().__init__()
        self.encoder = encoder  # encoder of the NLLB model
        self.dropout = nn.Dropout(0.1)
        self.classifier = nn.Linear(encoder.config.hidden_size, num_labels)

    def mean_pooling(self, model_output, attention_mask):
        """Average the token embeddings over dim 1, counting only unmasked positions."""
        hidden_states = model_output.last_hidden_state
        # Broadcast the mask over the hidden dimension so padded positions contribute zero.
        weights = attention_mask.unsqueeze(-1).expand(hidden_states.size()).float()
        summed = (hidden_states * weights).sum(dim=1)
        # Clamp so a fully masked row divides by a tiny number instead of zero.
        token_counts = weights.sum(dim=1).clamp(min=1e-9)
        return summed / token_counts

    def forward(self, input_ids, attention_mask):
        """Return the unnormalised class logits for a batch of token ids."""
        # Drop the first position of every sequence
        # (per the original author's note: the special language token).
        trimmed_ids = input_ids[:, 1:]
        trimmed_mask = attention_mask[:, 1:]

        # Encoder forward pass, then pool using the same trimmed mask.
        encoded = self.encoder(input_ids=trimmed_ids, attention_mask=trimmed_mask)
        sentence_vector = self.mean_pooling(encoded, trimmed_mask)

        # Dropout followed by the linear classification layer.
        return self.classifier(self.dropout(sentence_vector))

#Model

In [None]:
# Take the encoder of the seq2seq model; it is the only part passed to the classifier below.
encoder = model.get_encoder()

In [None]:
# Display the encoder's module tree.
encoder

In [None]:
# Fine-tuning model: the NLLB encoder plus a 3-way classification head.
modelFT = NllbClassifier(encoder, num_labels=3)

In [None]:
# Total number of parameters in the classifier (trainable or not).
sum(p.numel() for p in modelFT.parameters())

In [None]:
# Display the classifier's module tree (encoder, dropout, linear head).
modelFT

#Freezing



In [None]:
# Freeze the encoder's token-embedding table so the optimizer never updates it.
# Module.requires_grad_ sets the flag on every parameter of the submodule;
# the trailing ';' keeps the cell from echoing the returned module.
modelFT.encoder.embed_tokens.requires_grad_(False);

#Training

In [None]:
# Prefer the GPU when PyTorch can see one; otherwise fall back to the CPU.
has_gpu = torch.cuda.is_available()
device = torch.device("cuda") if has_gpu else torch.device("cpu")
print(f"Code running di: {device}")

In [None]:
# Move all parameters and buffers of the classifier to the selected device.
# nn.Module.to() works in place and returns the module, which the cell then echoes.
modelFT.to(device)

In [None]:
# torchinfo is installed by the setup cell at the top of the notebook.
# NOTE(review): this import would normally live in the shared imports cell.
from torchinfo import summary
# Per-layer summary of the classifier, including trainable vs. frozen parameter counts.
summary(modelFT)

In [None]:
# NOTE(review): repeats the summary call of the previous cell; one of the two can be dropped.
summary(modelFT)

In [None]:
# AdamW over all classifier parameters with learning rate 2e-5.
# The frozen embedding parameters are passed in as well; they never receive gradients.
optimizer = AdamW(modelFT.parameters(), lr=2e-5)
# Cross-entropy over the 3 class logits; expects integer class ids as targets.
loss_fn = nn.CrossEntropyLoss()

In [None]:
# Per-epoch metric log filled by the training loop: one fresh list per tracked metric.
history = {
    metric_name: []
    for metric_name in ('train_loss', 'val_loss', 'train_acc', 'val_acc')
}

In [None]:
# Fine-tuning loop: each epoch runs one optimisation pass over the training set and
# one gradient-free pass over the validation set, then logs both into `history`.
epochs = 5
for epoch in range(epochs):
    # ---------------- training pass ----------------
    modelFT.train()
    total_train_loss, correct_train, total_train = 0, 0, 0

    train_bar = tqdm(train_dataloader, desc=f"Epoch {epoch + 1}/{epochs} [Train]", leave=False)
    for batch in train_bar:
        # Each batch is (input_ids, attention_mask, labels); move all three to the device.
        input_ids, attention_mask, labels = (tensor.to(device) for tensor in batch)

        outputs = modelFT(input_ids, attention_mask)
        loss = loss_fn(outputs, labels)

        # Standard step: clear stale gradients, backpropagate, update the weights.
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        total_train_loss += loss.item()
        # Predicted class = index of the largest logit in each row.
        predicted = outputs.detach().max(dim=1).indices
        total_train += labels.size(0)
        correct_train += (predicted == labels).sum().item()

    # Loss is averaged over batches, accuracy over individual examples.
    avg_train_loss = total_train_loss / len(train_dataloader)
    train_accuracy = correct_train / total_train
    history['train_loss'].append(avg_train_loss)
    history['train_acc'].append(train_accuracy)

    # ---------------- validation pass ----------------
    modelFT.eval()
    total_val_loss, correct_val, total_val = 0, 0, 0

    with torch.no_grad():
        val_bar = tqdm(val_dataloader, desc=f"Epoch {epoch + 1}/{epochs} [Validation]", leave=False)
        for batch in val_bar:
            input_ids, attention_mask, labels = (tensor.to(device) for tensor in batch)

            outputs = modelFT(input_ids, attention_mask)
            loss = loss_fn(outputs, labels)

            total_val_loss += loss.item()
            predicted = outputs.detach().max(dim=1).indices
            total_val += labels.size(0)
            correct_val += (predicted == labels).sum().item()

    avg_val_loss = total_val_loss / len(val_dataloader)
    val_accuracy = correct_val / total_val
    history['val_loss'].append(avg_val_loss)
    history['val_acc'].append(val_accuracy)

    print(f"Epoch [{epoch + 1}/{epochs}], Loss:{avg_train_loss:.4f}, Validation Loss:{avg_val_loss:.4f}, Accuracy:{train_accuracy:.2f}, Validation Accuracy:{val_accuracy:.2f}")

#Evaluation

In [None]:
def learning_curve(history, mode='loss'):
  """Plot the training and validation curve of one metric across epochs.

  Args:
    history: Dict of per-epoch lists with the keys 'train_loss', 'val_loss',
      'train_acc' and 'val_acc' (only the two keys for ``mode`` are read).
    mode: 'loss' to plot the losses, 'acc' to plot the accuracies.

  Raises:
    ValueError: If ``mode`` is neither 'loss' nor 'acc' (previously this drew
      an empty figure).
  """
  # mode -> (training key, validation key, display name of the metric)
  curves = {
      'loss': ('train_loss', 'val_loss', 'Loss'),
      'acc': ('train_acc', 'val_acc', 'Accuracy'),
  }
  if mode not in curves:
    raise ValueError(f"mode must be one of {sorted(curves)}, got {mode!r}")
  train_key, val_key, metric = curves[mode]

  plt.figure()
  plt.plot(history[train_key], label=f'Training {metric}')
  plt.plot(history[val_key], label=f'Validation {metric}')
  plt.xlabel('Epoch')
  # The y-axis label follows the plotted metric (it used to read 'Loss' for accuracy too).
  plt.ylabel(metric)
  plt.title(f'Learning Curve: {mode}')
  plt.legend()
  plt.grid(True)
  plt.show()

In [None]:
# Draw both curves as separate statements: learning_curve returns None, so the
# former tuple expression only added a "(None, None)" echo below the figures.
learning_curve(history, 'loss')
learning_curve(history, mode='acc')

In [None]:
# Test batches in original order (no shuffling) for the final evaluation.
test_dataloader = preprocess(test, False)

In [None]:
# Switch the fine-tuned classifier itself to inference mode. The previous
# `model.eval()` targeted the seq2seq model, which never touches modelFT's
# own dropout layer.
modelFT.eval()
all_preds = []
all_labels = []

# Collect predictions and gold labels over the whole test set without tracking gradients.
with torch.no_grad():
  for batch in test_dataloader:
    input_ids, attention_mask, labels = batch
    input_ids = input_ids.to(device)
    attention_mask = attention_mask.to(device)
    labels = labels.to(device)

    outputs = modelFT(input_ids, attention_mask)
    # Predicted class = index of the largest logit in each row.
    predicted = outputs.argmax(dim=1)

    all_preds.extend(predicted.cpu().numpy())
    all_labels.extend(labels.cpu().numpy())

In [None]:
# Per-class precision / recall / F1 on the full test set.
# Class ids follow preprocess(): 0 = negative, 1 = neutral, 2 = positive.
print(classification_report(all_labels, all_preds))

#test untuk tiap bahasa

In [None]:
# Reload the raw test split so the `label` column holds the original strings again
# before it is filtered per language and passed through preprocess() once more.
test = pd.read_csv('/content/drive/MyDrive/Final_Year/data_test.csv')

In [None]:
# Rows of one language only. NOTE(review): despite the `_ing` suffix, the filter
# currently selects 'aceh'; the name is kept because later cells reference it.
# .copy() makes an independent frame, so a later column assignment (preprocess()
# rewrites "label") does not hit a slice of `test` and raise SettingWithCopyWarning.
test_ing = test[test.bahasa == 'aceh'].copy()

In [None]:
# Display the filtered subset to confirm the language filter worked.
test_ing

In [None]:
# Unshuffled batches for the single-language subset.
test_ing_dataloader = preprocess(test_ing, False)

In [None]:
# Switch the fine-tuned classifier itself to inference mode. The previous
# `model.eval()` targeted the seq2seq model, which never touches modelFT's
# own dropout layer.
modelFT.eval()
all_preds = []
all_labels = []

# Collect predictions and gold labels for the single-language subset without tracking gradients.
with torch.no_grad():
  for batch in test_ing_dataloader:
    input_ids, attention_mask, labels = batch
    input_ids = input_ids.to(device)
    attention_mask = attention_mask.to(device)
    labels = labels.to(device)

    outputs = modelFT(input_ids, attention_mask)
    # Predicted class = index of the largest logit in each row.
    predicted = outputs.argmax(dim=1)

    all_preds.extend(predicted.cpu().numpy())
    all_labels.extend(labels.cpu().numpy())

In [None]:
# Per-class precision / recall / F1 for the single-language subset evaluated above.
# Class ids follow preprocess(): 0 = negative, 1 = neutral, 2 = positive.
print(classification_report(all_labels, all_preds))

In [None]:
# NOTE(review): prints the same report as the previous cell; one of the two can be dropped.
print(classification_report(all_labels, all_preds))