In [1]:
import numpy as np
import pandas as pd

import torch
from torch import nn
from torch.optim import AdamW
from torch.utils.data import Dataset, DataLoader

from transformers import BertModel, BertTokenizer, BertConfig, get_linear_schedule_with_warmup

from sklearn.metrics import accuracy_score, classification_report

from tqdm import tqdm

import os
# Print the full path of every file found under the Kaggle input directory,
# one per line, in the order os.walk yields them.
input_paths = (
    os.path.join(folder, name)
    for folder, _subdirs, names in os.walk('/kaggle/input')
    for name in names
)
for path in input_paths:
    print(path)

/kaggle/input/valid.txt
/kaggle/input/test.txt
/kaggle/input/train.txt


In [2]:
# Header names for the two tab-separated fields of every split file
columns = ['CATEGORY', 'TITLE']

# Both splits are header-less, tab-separated UTF-8 text files
read_options = dict(delimiter='\t', header=None, encoding='utf-8')
train_df = pd.read_csv('/kaggle/input/train.txt', **read_options)
valid_df = pd.read_csv('/kaggle/input/valid.txt', **read_options)

# Attach the headers to each frame
for frame in (train_df, valid_df):
    frame.columns = columns

# Preview the first training rows
train_df.head()

Unnamed: 0,CATEGORY,TITLE
0,b,update 1 yellen prepares wall st for more whol...
1,e,kanye raps about how awesome kim is on future ...
2,t,update 1 facebook to use satellites drones to ...
3,e,garth ancier counter sues michael egan over se...
4,t,update 1 mercedes recalls 284000 cars in us ca...


In [3]:
# Number of training titles per category
train_df['CATEGORY'].value_counts()

CATEGORY
b    4500
e    4223
t    1229
m     732
Name: count, dtype: int64

In [4]:
# Number of validation titles per category
valid_df['CATEGORY'].value_counts()

CATEGORY
b    556
e    543
t    153
m     84
Name: count, dtype: int64

In [5]:
def create_balanced_dataset(df):
    """
    Down-sample a DataFrame so that the categories "e", "b", "t" and "m"
    each contribute the same number of rows.

    The per-category quota is the size of the rarest label found in the
    'CATEGORY' column. Rows are shuffled with a fixed seed before the quota is
    taken, and the combined result is shuffled again with the same seed.

    Args:
    df (DataFrame): Input frame with a 'CATEGORY' column.

    Returns:
    DataFrame: New frame with a fresh 0..n-1 index; `df` itself is not modified.
    """
    # The rarest label decides how many rows every category may keep
    per_class_quota = min(df.CATEGORY.value_counts())

    # Randomise row order (deterministically) before taking the first rows
    shuffled = df.sample(frac=1, random_state=42).reset_index(drop=True)

    # First `per_class_quota` rows of each category, in the fixed order e, b, t, m
    per_class_frames = [
        shuffled.loc[shuffled['CATEGORY'] == category].head(per_class_quota)
        for category in ("e", "b", "t", "m")
    ]

    # Stack the per-category slices and shuffle once more so classes are interleaved
    return (
        pd.concat(per_class_frames, ignore_index=True)
        .sample(frac=1, random_state=42)
        .reset_index(drop=True)
    )

In [6]:
class SentenceDataset(Dataset):
    """
    Dataset class for sentence classification tasks.

    Each item is tokenized on access and returned as fixed-length tensors
    together with its integer class label.
    """
    def __init__(self, sentences, labels, tokenizer, max_length):
        """
        Initializes the SentenceDataset.

        Args:
        sentences (sequence): Sentences to classify.
        labels (sequence): Integer class label for each sentence.
        tokenizer: Tokenizer object exposing `encode_plus` (Hugging Face transformers API).
        max_length (int): Sequence length every sample is padded/truncated to.

        Raises:
        ValueError: If `sentences` and `labels` have different lengths.
        """
        self.sentences = np.asarray(sentences)
        self.labels = np.asarray(labels)
        # Fail fast: a mismatch would otherwise raise IndexError mid-epoch
        # (too few labels) or silently ignore the extra labels (too many).
        if len(self.sentences) != len(self.labels):
            raise ValueError(
                f"sentences and labels must have the same length, "
                f"got {len(self.sentences)} and {len(self.labels)}"
            )
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        """
        Returns the total number of samples in the dataset.

        Returns:
        int: Length of the dataset.
        """
        return len(self.sentences)

    def __getitem__(self, idx):
        """
        Retrieves a sample from the dataset at the specified index.

        Args:
        idx (int): Index of the sample to retrieve.

        Returns:
        dict: 'input_ids' and 'attention_mask' as 1-D tensors (the batch
        dimension added by return_tensors='pt' is flattened away) and 'label'
        as a 0-D int64 tensor.
        """
        sentence = self.sentences[idx]
        label = self.labels[idx]

        # Tokenize, pad to max_length and truncate anything longer
        encoding = self.tokenizer.encode_plus(
            text=sentence,
            return_tensors='pt',
            max_length=self.max_length,
            padding='max_length',
            truncation=True,
        )
        return {
            'input_ids': encoding['input_ids'].flatten(),
            'attention_mask': encoding['attention_mask'].flatten(),
            # Force int64: torch.tensor() would otherwise inherit the NumPy dtype
            # (e.g. int32), which nn.CrossEntropyLoss rejects for class-index targets.
            'label': torch.tensor(label, dtype=torch.long)
        }

In [7]:
class BertClassifier(nn.Module):
    """
    Sentence classifier: a pre-trained BERT encoder followed by dropout and a
    single linear layer applied to the hidden state of the first token.
    """
    def __init__(self, model_name, num_classes):
        """
        Builds the encoder and the classification head.

        Args:
        model_name (str): Name of the pre-trained BERT model to load.
        num_classes (int): Number of output classes.
        """
        # Initialise the nn.Module machinery before registering sub-modules
        super().__init__()
        bert_config = BertConfig.from_pretrained(model_name)
        self.bert = BertModel.from_pretrained(model_name, config=bert_config)
        self.dropout = nn.Dropout(0.1)
        # Head maps the encoder's hidden size to one logit per class
        self.fc = nn.Linear(bert_config.hidden_size, num_classes)

    def forward(self, input_ids, attention_mask):
        """
        Computes class logits for a batch of tokenized sentences.

        Args:
        input_ids (torch.Tensor): Input token IDs.
        attention_mask (torch.Tensor): Attention mask for input tokens.

        Returns:
        torch.Tensor: Logits for each class.
        """
        encoder_outputs = self.bert(input_ids=input_ids, attention_mask=attention_mask)
        # Hidden state at sequence position 0 (taken from last_hidden_state,
        # not the encoder's pooler output)
        first_token_state = encoder_outputs.last_hidden_state[:, 0]
        return self.fc(self.dropout(first_token_state))


In [8]:
def preprocess_dataset(df):
    """
    Pre-processes a DataFrame for a sentence classification task.

    Category codes are converted to integer labels
    ('e' -> 0, 'b' -> 1, 't' -> 2, 'm' -> 3). The input DataFrame is left
    untouched.

    Args:
    df (pandas.DataFrame): DataFrame containing 'TITLE' and 'CATEGORY' columns.

    Returns:
    tuple: (sentences, labels) as NumPy arrays; labels are integers.

    Raises:
    KeyError: If 'CATEGORY' or 'TITLE' is missing from `df`.
    ValueError: If 'CATEGORY' holds a value outside the four known codes.
    """
    category_to_label = {'e': 0, 'b': 1, 't': 2, 'm': 3}

    # Series.map builds a new Series, so the caller's frame gains no LABEL column
    mapped = df['CATEGORY'].map(category_to_label)

    # Unmapped categories come back as NaN; report them explicitly instead of
    # letting the integer cast fail with an opaque message.
    unknown_rows = mapped.isna()
    if unknown_rows.any():
        unknown_values = sorted(set(df.loc[unknown_rows, 'CATEGORY'].astype(str)))
        raise ValueError(f"Unknown CATEGORY values: {unknown_values}")

    sentences = np.asarray(df['TITLE'])
    labels = np.asarray(mapped.astype(int))
    return sentences, labels

In [9]:
def prepare_model_and_dataloaders(model_name, train_sentences, train_labels, val_sentences, val_labels, num_classes, learning_rate, num_epochs, max_length, batch_size):
    """
    Prepares the BERT model, tokenizer, dataloaders, optimizer and scheduler
    for training and evaluation.

    Args:
    model_name (str): Name of the pre-trained BERT model.
    train_sentences (sequence): Sentences of the training set.
    train_labels (sequence): Label of each training sentence.
    val_sentences (sequence): Sentences of the validation set.
    val_labels (sequence): Label of each validation sentence.
    num_classes (int): Number of classes for classification.
    learning_rate (float): Learning rate for optimizer.
    num_epochs (int): Number of training epochs (used to size the LR schedule).
    max_length (int): Maximum sequence length for tokenization.
    batch_size (int): Batch size for training and evaluation.

    Returns:
    tuple: (model, train_dataloader, val_dataloader, optimizer, scheduler, device).
    """

    # Set device based on GPU availability
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

    # Instantiate the BERT model, wrapped in DataParallel
    model = nn.DataParallel(BertClassifier(model_name, num_classes)).to(device)

    # Define the optimizer
    optimizer = AdamW(model.parameters(), lr=learning_rate)

    # Instantiate the tokenizer matching the pre-trained checkpoint
    tokenizer = BertTokenizer.from_pretrained(model_name)

    # Training batches are reshuffled every epoch
    train_dataset = SentenceDataset(train_sentences, train_labels, tokenizer, max_length)
    train_dataloader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)

    # Validation metrics do not depend on sample order, so keep a fixed order;
    # this also avoids drawing from the global RNG on every evaluation pass.
    val_dataset = SentenceDataset(val_sentences, val_labels, tokenizer, max_length)
    val_dataloader = DataLoader(val_dataset, batch_size=batch_size, shuffle=False)

    # The scheduler is stepped once per batch, so it spans batches-per-epoch * epochs
    total_steps = len(train_dataloader) * num_epochs

    # Linear decay of the learning rate with no warm-up phase
    scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=0, num_training_steps=total_steps)

    return model, train_dataloader, val_dataloader, optimizer, scheduler, device


In [10]:
import time
import datetime
from tqdm import tqdm

def train(model, data_loader, optimizer, scheduler, device):
    """
    Trains the model for one pass over data_loader.

    For every batch: gradients are cleared, the cross-entropy loss between the
    model's logits and the batch labels is back-propagated, and both the
    optimizer and the learning-rate scheduler are stepped.

    Args:
    model (torch.nn.Module): The model to be trained.
    data_loader (torch.utils.data.DataLoader): DataLoader yielding dict batches
        with 'input_ids', 'attention_mask' and 'label' tensors.
    optimizer: Optimizer object for updating the model parameters.
    scheduler: Scheduler for adjusting the learning rate during training.
    device: Device to perform training on (e.g., "cuda" for GPU or "cpu").

    Returns:
    None
    """

    # Set the model to train mode (enables dropout)
    model.train()

    # The loss module is stateless, so build it once instead of once per batch
    criterion = nn.CrossEntropyLoss()

    # Iterate over batches in the data loader, with a progress bar
    for batch in tqdm(data_loader):

        # Clear gradients left over from the previous step
        optimizer.zero_grad()

        # Move input data to the appropriate device
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['label'].to(device)

        # Forward pass
        outputs = model(input_ids=input_ids, attention_mask=attention_mask)

        # Calculate loss
        loss = criterion(outputs, labels)

        # Backward pass
        loss.backward()

        # Update model parameters
        optimizer.step()

        # Update the learning rate (the schedule advances once per batch)
        scheduler.step()

In [11]:
def evaluate(model, data_loader, device):
    """
    Scores the model on every batch of the given data loader.

    Args:
    model (torch.nn.Module): Trained model to evaluate.
    data_loader (torch.utils.data.DataLoader): DataLoader yielding dict batches
        with 'input_ids', 'attention_mask' and 'label' tensors.
    device (torch.device): Device to perform evaluation on (cpu or cuda).

    Returns:
    tuple: (accuracy score, classification report).
    """
    # Evaluation mode: disables dropout
    model.eval()

    predictions, actual_labels = [], []

    # No gradients are needed for scoring
    with torch.no_grad():
        for batch in data_loader:
            labels = batch['label'].to(device)

            logits = model(
                input_ids=batch['input_ids'].to(device),
                attention_mask=batch['attention_mask'].to(device),
            )

            # Predicted class = index of the largest logit in each row
            preds = torch.max(logits, dim=1).indices

            # Collect plain Python ints for scikit-learn
            predictions += preds.cpu().tolist()
            actual_labels += labels.cpu().tolist()

    return (
        accuracy_score(actual_labels, predictions),
        classification_report(actual_labels, predictions, zero_division=1),
    )

In [12]:
# Hyperparameters
# NOTE(review): the titles previewed by train_df.head() are lowercased; an uncased
# checkpoint may suit this data better than a cased one -- confirm before changing.
model_name = 'bert-base-cased'
num_classes = 4
max_length = 128
batch_size = 16
num_epochs = 4
learning_rate = 2e-5

# Seed PyTorch's global RNG so classifier-head initialisation, dropout and the
# training shuffle start from the same state on every run.
seed = 42
torch.manual_seed(seed)

best_accuracy = 0

# Preprocess the data: balance the classes, then extract sentences and integer labels
train_sentences, train_labels = preprocess_dataset(create_balanced_dataset(train_df))
val_sentences, val_labels = preprocess_dataset(create_balanced_dataset(valid_df))

# Prepare the model, the training/validation dataloaders, the optimizer and the scheduler
model, train_dataloader, val_dataloader, optimizer, scheduler, device = prepare_model_and_dataloaders(
    model_name,
    train_sentences,
    train_labels,
    val_sentences,
    val_labels,
    num_classes,
    learning_rate,
    num_epochs,
    max_length,
    batch_size
)

# Train the model, evaluating on the validation set after every epoch
for epoch in range(num_epochs):
    print("")
    print(f"======== Training - Epoch {epoch + 1} / {num_epochs} =======")
    train(model, train_dataloader, optimizer, scheduler, device)
    accuracy, report = evaluate(model, val_dataloader, device)

    # Keep only the checkpoint with the best validation accuracy so far.
    # `model` is the nn.DataParallel wrapper, so the saved keys carry the
    # `module.` prefix; load the file into a wrapped model or strip the prefix.
    if accuracy > best_accuracy:
        torch.save(model.state_dict(), "model_best.pth")
        best_accuracy = accuracy

    print(f"Validation Accuracy: {accuracy:.4f}")
    print(f"Best Accuracy: {best_accuracy:.4f}")
    print(report)

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/436M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/49.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/213k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/436k [00:00<?, ?B/s]




100%|██████████| 183/183 [00:48<00:00,  3.74it/s]


Validation Accuracy: 0.9137
Best Accuracy: 0.9137
              precision    recall  f1-score   support

           0       0.96      0.94      0.95        84
           1       0.89      0.81      0.85        84
           2       0.86      0.95      0.90        84
           3       0.94      0.95      0.95        84

    accuracy                           0.91       336
   macro avg       0.91      0.91      0.91       336
weighted avg       0.91      0.91      0.91       336




100%|██████████| 183/183 [00:46<00:00,  3.90it/s]


Validation Accuracy: 0.9196
Best Accuracy: 0.9196
              precision    recall  f1-score   support

           0       0.99      0.92      0.95        84
           1       0.90      0.85      0.87        84
           2       0.86      0.95      0.90        84
           3       0.94      0.96      0.95        84

    accuracy                           0.92       336
   macro avg       0.92      0.92      0.92       336
weighted avg       0.92      0.92      0.92       336




100%|██████████| 183/183 [00:48<00:00,  3.79it/s]


Validation Accuracy: 0.9137
Best Accuracy: 0.9196
              precision    recall  f1-score   support

           0       0.96      0.96      0.96        84
           1       0.89      0.80      0.84        84
           2       0.86      0.94      0.90        84
           3       0.94      0.95      0.95        84

    accuracy                           0.91       336
   macro avg       0.91      0.91      0.91       336
weighted avg       0.91      0.91      0.91       336




100%|██████████| 183/183 [00:48<00:00,  3.78it/s]


Validation Accuracy: 0.9167
Best Accuracy: 0.9196
              precision    recall  f1-score   support

           0       0.94      0.96      0.95        84
           1       0.90      0.82      0.86        84
           2       0.88      0.94      0.91        84
           3       0.95      0.94      0.95        84

    accuracy                           0.92       336
   macro avg       0.92      0.92      0.92       336
weighted avg       0.92      0.92      0.92       336

