# Eksperimen 2 (E2): BERT + Preprocessing + Class Weight

In [1]:
# Import every library the notebook needs (standard library first, then third-party)
import os
import re
from collections import Counter

import numpy as np
import pandas as pd
import torch
import torch.nn as nn
from sklearn.metrics import f1_score, classification_report
from sklearn.model_selection import train_test_split
from sklearn.utils.class_weight import compute_class_weight
from torch.utils.data import Dataset
from transformers import (
    BertTokenizer,
    BertForSequenceClassification,
    Trainer,
    TrainingArguments
)

# Report the PyTorch build and whether a CUDA device is visible to this kernel
print(f"PyTorch version: {torch.__version__}")
print(f"CUDA available: {torch.cuda.is_available()}")

2025-12-07 12:43:57.291956: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1765111437.508786      19 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1765111437.570982      19 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


AttributeError: 'MessageFactory' object has no attribute 'GetPrototype'

AttributeError: 'MessageFactory' object has no attribute 'GetPrototype'

AttributeError: 'MessageFactory' object has no attribute 'GetPrototype'

AttributeError: 'MessageFactory' object has no attribute 'GetPrototype'

AttributeError: 'MessageFactory' object has no attribute 'GetPrototype'

PyTorch version: 2.6.0+cu124
CUDA available: True


## 1. Text Preprocessing

Fungsi preprocessing untuk membersihkan teks:
- Remove URLs (http, www)
- Remove mentions (@username)
- Normalize whitespace

In [2]:
def preprocess_text(text):
    """
    Clean raw text: strip URLs and @mentions, then collapse whitespace.

    Missing values (``None`` or a float NaN, which is what pandas yields for
    an empty CSV cell) are returned as an empty string, and any other
    non-string value is converted with ``str()`` before cleaning, so the
    function can be applied to a whole DataFrame column without raising.

    Args:
        text: Raw input; normally a ``str``.

    Returns:
        str: Cleaned text with no leading/trailing whitespace and single
        spaces between tokens.
    """
    if not isinstance(text, str):
        # NaN is the only float value that is not equal to itself.
        if text is None or (isinstance(text, float) and text != text):
            return ''
        text = str(text)

    text = re.sub(r'http\S+|www\.\S+', '', text)  # Remove URLs
    text = re.sub(r'@\w+', '', text)  # Remove mentions
    text = re.sub(r'\s+', ' ', text).strip()  # Normalize whitespace
    return text

# Try the function on a sample containing a URL, a mention and repeated spaces
sample_text = "Check out http://example.com @user this is   a   test"
cleaned_sample = preprocess_text(sample_text)
print(f"Before: {sample_text}")
print(f"After:  {cleaned_sample}")

Before: Check out http://example.com @user this is   a   test
After:  Check out this is a test


## 2. Load Data

Load training dan dev dataset dari Kaggle input

In [3]:
# Load the `eng` train and dev CSVs for subtask 1 from the Kaggle input directory
train_df = pd.read_csv('/kaggle/input/sharedtask-polar/subtask1/train/eng.csv')
dev_df = pd.read_csv('/kaggle/input/sharedtask-polar/subtask1/dev/eng.csv')

# Quick look at the split sizes, the column names and the first training row
print(f"Training samples: {len(train_df)}")
print(f"Dev samples: {len(dev_df)}")
print(f"\nTrain columns: {train_df.columns.tolist()}")
print(f"\nFirst row:")
print(train_df.head(1))

Training samples: 3222
Dev samples: 160

Train columns: ['id', 'text', 'polarization']

First row:
                                     id  \
0  eng_973938b90b0ff5d87d35a582f83f5c89   

                                        text  polarization  
0   is defending imperialism in the dnd chat             0  


## 3. Apply Preprocessing

In [4]:
# Add a cleaned-text column to both the training and the dev frame
for frame in (train_df, dev_df):
    frame['text_clean'] = frame['text'].apply(preprocess_text)

# Show the first training example before and after cleaning (first 100 characters)
raw_example = train_df['text'].iloc[0]
clean_example = train_df['text_clean'].iloc[0]
print("Sample preprocessed text:")
print(f"Before: {raw_example[:100]}...")
print(f"After:  {clean_example[:100]}...")

# Label balance of the training data, as absolute counts and as percentages
label_counts = train_df['polarization'].value_counts()
label_percentages = train_df['polarization'].value_counts(normalize=True) * 100
print(f"\nClass distribution in training data:")
print(label_counts)
print(f"\nPercentage:")
print(label_percentages)

Sample preprocessed text:
Before:  is defending imperialism in the dnd chat...
After:  is defending imperialism in the dnd chat...

Class distribution in training data:
polarization
0    2047
1    1175
Name: count, dtype: int64

Percentage:
polarization
0    63.531968
1    36.468032
Name: proportion, dtype: float64


## 4. Train-Validation Split

Split training data menjadi train (85%) dan validation (15%) dengan stratified sampling

In [5]:
# Stratified 85/15 train/validation split with a fixed seed so the split is reproducible
train_texts, val_texts, train_labels, val_labels = train_test_split(
    train_df['text_clean'].tolist(),
    train_df['polarization'].tolist(),
    test_size=0.15,
    random_state=42,
    stratify=train_df['polarization']
)

# Every training row must land in exactly one of the two splits
assert len(train_texts) + len(val_texts) == len(train_df), "split sizes do not add up"
assert len(train_texts) == len(train_labels) and len(val_texts) == len(val_labels)

print(f"Train: {len(train_texts)} samples")
print(f"Val: {len(val_texts)} samples")

# Check that stratification kept the label ratio in both splits
print(f"\nTrain labels distribution: {Counter(train_labels)}")
print(f"Val labels distribution: {Counter(val_labels)}")

Train: 2738 samples
Val: 484 samples

Train labels distribution: Counter({0: 1740, 1: 998})
Val labels distribution: Counter({0: 307, 1: 177})


## 5. Compute Class Weights

Mengatasi imbalanced data dengan class weighting

In [6]:
# 'balanced' weights for the two classes, computed from the training labels only
balanced_weights = compute_class_weight(
    class_weight='balanced',
    classes=np.array([0, 1]),
    y=train_labels
)

# Keep them as a float32 tensor so they can be handed to the loss function later
class_weights = torch.tensor(balanced_weights, dtype=torch.float32)

print(f"Class weights: {class_weights}")
print(f"  - Class 0 (Non-Polarized): {class_weights[0]:.4f}")
print(f"  - Class 1 (Polarized): {class_weights[1]:.4f}")

Class weights: tensor([0.7868, 1.3717])
  - Class 0 (Non-Polarized): 0.7868
  - Class 1 (Polarized): 1.3717


## 6. Tokenizer & Dataset Class

Setup BERT tokenizer dan custom dataset class

In [7]:
# Load the pretrained tokenizer that matches the 'bert-base-uncased' checkpoint
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

# Smoke-test tokenization on the first training text.
# Truncation is capped at 256 tokens; no padding argument is passed here,
# so the encoded length reflects the actual text.
sample = train_texts[0]
encoded = tokenizer(sample, truncation=True, max_length=256)
print(f"Sample text: {sample[:100]}...")
print(f"Encoded length: {len(encoded['input_ids'])}")
print(f"First 10 tokens: {encoded['input_ids'][:10]}")

tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

Sample text: Trudeau should take the opportunity to reduce Russian aggression in the arctic...
Encoded length: 15
First 10 tokens: [101, 19817, 27627, 2323, 2202, 1996, 4495, 2000, 5547, 2845]


In [8]:
class PolarizationDataset(Dataset):
    """
    Map-style dataset that tokenizes one text per item for polarization
    classification.

    Args:
        texts: Indexable sequence of input strings.
        labels: Indexable sequence of integer class ids, aligned with ``texts``.
        tokenizer: Callable invoked as
            ``tokenizer(text, truncation=True, padding='max_length',
            max_length=..., return_tensors='pt')`` that returns a mapping with
            ``'input_ids'`` and ``'attention_mask'`` tensors.
        max_length: Length every sequence is padded/truncated to.

    Raises:
        ValueError: If ``texts`` and ``labels`` differ in length.
    """
    def __init__(self, texts, labels, tokenizer, max_length=256):
        # Fail early instead of hitting an IndexError (or silently ignoring
        # surplus labels) in the middle of training.
        if len(texts) != len(labels):
            raise ValueError(
                f"texts and labels must have the same length, "
                f"got {len(texts)} and {len(labels)}"
            )
        self.texts = texts
        self.labels = labels
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        """Return the number of examples."""
        return len(self.texts)

    def __getitem__(self, idx):
        """
        Tokenize example ``idx`` and return the dict of tensors
        (``input_ids``, ``attention_mask``, ``labels``) for that example.
        """
        encoding = self.tokenizer(
            self.texts[idx],
            truncation=True,
            padding='max_length',
            max_length=self.max_length,
            return_tensors='pt'
        )
        # The tokenizer output for a single text is expected to carry a leading
        # batch dimension of size 1. Drop only that dimension: a bare squeeze()
        # would also remove the sequence dimension when max_length == 1.
        return {
            'input_ids': encoding['input_ids'].squeeze(0),
            'attention_mask': encoding['attention_mask'].squeeze(0),
            'labels': torch.tensor(self.labels[idx], dtype=torch.long)
        }


In [9]:
# Create train and validation datasets
train_dataset = PolarizationDataset(train_texts, train_labels, tokenizer)
val_dataset = PolarizationDataset(val_texts, val_labels, tokenizer)

print(f"Train dataset size: {len(train_dataset)}")
print(f"Val dataset size: {len(val_dataset)}")

# Test dataset
sample_item = train_dataset[0]
print(f"\nSample item keys: {sample_item.keys()}")
print(f"Input IDs shape: {sample_item['input_ids'].shape}")
print(f"Attention mask shape: {sample_item['attention_mask'].shape}")
print(f"Label: {sample_item['labels']}")

Train dataset size: 2738
Val dataset size: 484

Sample item keys: dict_keys(['input_ids', 'attention_mask', 'labels'])
Input IDs shape: torch.Size([256])
Attention mask shape: torch.Size([256])
Label: 0


## 7. Model Setup

Load BERT model dan setup weighted trainer

In [10]:
# Load BERT model
model = BertForSequenceClassification.from_pretrained(
    'bert-base-uncased',
    num_labels=2
)

# Move to GPU if available
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(f"Using device: {device}")

# Move class weights to device
class_weights = class_weights.to(device)
print(f"Class weights moved to {device}")

model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Using device: cuda
Class weights moved to cuda


In [11]:
class WeightedTrainer(Trainer):
    """
    Trainer whose loss is class-weighted cross entropy.

    The per-class weights are read from the notebook-level ``class_weights``
    tensor each time the loss is computed.
    """
    def compute_loss(self, model, inputs, return_outputs=False, **kwargs):
        """
        Compute weighted cross entropy between the model logits and the labels.

        Args:
            model: Model called as ``model(**inputs)``; its output must expose
                ``.logits``.
            inputs: Batch dict; the ``"labels"`` entry is popped off before
                the forward pass so the model does not compute its own loss.
            return_outputs: When true, return ``(loss, outputs)`` instead of
                only the loss.
            **kwargs: Accepted and ignored. NOTE(review): newer transformers
                releases appear to pass extra keywords such as
                ``num_items_in_batch`` - confirm against the installed version.

        Returns:
            The scalar loss, or ``(loss, outputs)`` when ``return_outputs``
            is true.
        """
        labels = inputs.pop("labels")
        outputs = model(**inputs)
        logits = outputs.logits

        # Align the weights with the logits' device at the point of use, so the
        # loss does not depend on another cell having moved the global tensor.
        weights = class_weights.to(logits.device)

        # Functional form: same weighted mean-reduced loss, without building a
        # new loss module on every step.
        loss = nn.functional.cross_entropy(logits, labels, weight=weights)

        return (loss, outputs) if return_outputs else loss


## 8. Training Configuration

Setup metrics dan training arguments

In [12]:
def compute_metrics(eval_pred):
    """
    Score predictions with macro F1 and the F1 of the polarized class (label 1).

    Args:
        eval_pred: Pair ``(logits, labels)``; the predicted class is the
            argmax of the logits over the last axis.

    Returns:
        dict: ``{'f1_macro': ..., 'f1_polarized': ...}``.
    """
    logits, labels = eval_pred
    predicted_classes = np.argmax(logits, axis=-1)

    scores = {}
    scores['f1_macro'] = f1_score(labels, predicted_classes, average='macro')
    scores['f1_polarized'] = f1_score(
        labels, predicted_classes, average='binary', pos_label=1
    )
    return scores


In [13]:
# Fine-tuning configuration. Evaluation and checkpoint saving both run once per
# epoch, and the checkpoint with the highest validation 'f1_macro' is reloaded
# when training ends.
training_args = TrainingArguments(
    output_dir='./bert_weighted_results',
    num_train_epochs=5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    learning_rate=2e-5,
    warmup_ratio=0.1,
    weight_decay=0.01,
    eval_strategy='epoch',
    save_strategy='epoch',
    load_best_model_at_end=True,
    metric_for_best_model='f1_macro',
    greater_is_better=True,
    logging_dir='./logs',
    logging_steps=50,
    fp16=True,  # Mixed precision training
    report_to='none'
)

# Echo the main hyperparameters so they are recorded next to the results
print("Training configuration:")
print(f"  - Epochs: {training_args.num_train_epochs}")
print(f"  - Batch size: {training_args.per_device_train_batch_size}")
print(f"  - Learning rate: {training_args.learning_rate}")
print(f"  - Warmup ratio: {training_args.warmup_ratio}")
print(f"  - Weight decay: {training_args.weight_decay}")

Training configuration:
  - Epochs: 5
  - Batch size: 16
  - Learning rate: 2e-05
  - Warmup ratio: 0.1
  - Weight decay: 0.01


## 9. Training

Train model dengan weighted loss

In [14]:
# Build the trainer with the weighted loss; the validation split is used for
# the per-epoch evaluation and compute_metrics supplies the F1 scores
trainer = WeightedTrainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    compute_metrics=compute_metrics,
)

print("Ready to train!")

Ready to train!


In [15]:
# Banner for the training log
print("="*60)
print("TRAINING: BERT + Preprocessing + Class Weight")
print("="*60)

# Run fine-tuning with the configuration held by the trainer
trainer.train()

print("\n✓ Training completed!")

TRAINING: BERT + Preprocessing + Class Weight


Epoch,Training Loss,Validation Loss,F1 Macro,F1 Polarized
1,0.6458,0.539534,0.717538,0.642254
2,0.4101,0.56817,0.724193,0.643275
3,0.2831,0.507855,0.754463,0.710327
4,0.1989,0.652668,0.761612,0.693642
5,0.1266,0.675964,0.769237,0.704871



✓ Training completed!


## 10. Evaluation

Evaluate model pada validation set

In [16]:
banner = "="*60
print(banner)
print("VALIDATION RESULTS")
print(banner)

# Evaluate on the dataset the trainer was given as eval_dataset and list every returned metric
eval_results = trainer.evaluate()
for metric_name in eval_results:
    print(f"{metric_name}: {eval_results[metric_name]:.4f}")

VALIDATION RESULTS


eval_loss: 0.6760
eval_f1_macro: 0.7692
eval_f1_polarized: 0.7049
eval_runtime: 4.3142
eval_samples_per_second: 112.1880
eval_steps_per_second: 3.7090
epoch: 5.0000


In [17]:
# Get predictions
val_predictions = trainer.predict(val_dataset)
val_preds = np.argmax(val_predictions.predictions, axis=-1)

# Print classification report
print("\nDetailed Classification Report:")
print("="*60)
print(classification_report(
    val_labels, 
    val_preds, 
    target_names=['Non-Polarized', 'Polarized'],
    digits=4
))


Detailed Classification Report:
               precision    recall  f1-score   support

Non-Polarized     0.8269    0.8404    0.8336       307
    Polarized     0.7151    0.6949    0.7049       177

     accuracy                         0.7872       484
    macro avg     0.7710    0.7677    0.7692       484
 weighted avg     0.7860    0.7872    0.7865       484



## 11. Generate Submission

Generate predictions untuk dev set dan create submission file

In [18]:
banner = "="*60
print(banner)
print("GENERATING SUBMISSION")
print(banner)

# Build the dev dataset; placeholder zero labels are used because there is no ground truth
dev_texts = dev_df['text_clean'].tolist()
placeholder_labels = [0] * len(dev_texts)
dev_dataset = PolarizationDataset(dev_texts, placeholder_labels, tokenizer)

# Predict and take the argmax over the logits as the class
print("Predicting on dev set...")
dev_predictions = trainer.predict(dev_dataset)
dev_logits = dev_predictions.predictions
dev_preds = np.argmax(dev_logits, axis=-1)

print(f"✓ Predictions completed for {len(dev_preds)} samples")

GENERATING SUBMISSION
Predicting on dev set...


✓ Predictions completed for 160 samples


In [19]:
# Build the submission table: one predicted label per dev id
submission = pd.DataFrame({
    'id': dev_df['id'],
    'polarization': dev_preds
})

# Write the predictions into the folder that is zipped for upload.
# The folder name is defined once so the directory and the file path cannot drift apart.
submission_dir = 'subtask_1_bert_weighted'
os.makedirs(submission_dir, exist_ok=True)
submission.to_csv(os.path.join(submission_dir, 'pred_eng.csv'), index=False)

print("✓ Submission file created")

# Sanity-check the predicted label balance
print(f"\nLabel distribution in submission:")
print(submission['polarization'].value_counts())
print(f"\nPercentage:")
print(submission['polarization'].value_counts(normalize=True) * 100)

✓ Submission file created

Label distribution in submission:
polarization
0    116
1     44
Name: count, dtype: int64

Percentage:
polarization
0    72.5
1    27.5
Name: proportion, dtype: float64


In [20]:
# Create zip file for submission
!zip -r submission_bert_weighted.zip subtask_1_bert_weighted/

print("\n" + "="*60)
print("SUBMISSION READY!")
print("="*60)

  adding: subtask_1_bert_weighted/ (stored 0%)
  adding: subtask_1_bert_weighted/pred_eng.csv (deflated 48%)

SUBMISSION READY!
