In [9]:
# Declare a list of tasks whose products you want to use as inputs.
# NOTE(review): this looks like a pipeline-runner placeholder (e.g. Ploomber's
# `upstream` variable); None would then mean "no upstream inputs" — confirm.
upstream = None


In [8]:
# Imports and Setup
import pandas as pd
import numpy as np
from pathlib import Path
import sys
import logging
import json
import torch
from torch.utils.data import DataLoader
from tqdm import tqdm

# Make the parent of the current working directory importable so that the
# `src` package imported below can be resolved. This must run before those
# imports. (Assumes the notebook is executed from a direct subdirectory of
# the project root — TODO confirm.)
sys.path.append(str(Path.cwd().parent))

# Import custom project modules: the model, its trainer, and the dataset
# class together with the batch collation function used by the DataLoaders.
from src.models.bert_model import TrollDetector
from src.models.trainer import TrollDetectorTrainer
from src.data_tools.dataset import TrollDataset, collate_batch

# Set up logging at INFO level so messages logged by the project modules
# are shown in the cell outputs, and create a logger for this notebook.
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

In [None]:
# Define paths (relative to the notebook's working directory)
DATA_DIR = Path('../data')
PROCESSED_DATA_DIR = DATA_DIR / 'processed'
CHECKPOINT_DIR = Path('../checkpoints')

# Create checkpoint directory
CHECKPOINT_DIR.mkdir(parents=True, exist_ok=True)

# Training configuration
config = {
    'model_name': 'distilbert-base-multilingual-cased',
    'max_length': 96,
    'batch_size': 32,
    'learning_rate': 2e-5,
    'weight_decay': 0.03,
    'num_epochs': 3,
    'dropout_rate': 0.2,
    'warmup_steps': 50,
    'max_grad_norm': 1.0,
    'comments_per_user': 10,
    'early_stopping_patience': 3,
    'use_wandb': False,
    'random_state': 17,  # Default if the preprocessing config or its key is missing
    'label_smoothing': 0.1        # Added label smoothing
}


def resolve_random_state(config_path, default):
    """Return the random state recorded by preprocessing, or ``default``.

    Args:
        config_path: Path to the JSON file written by the preprocessing step.
        default: Value to return when the file does not exist or does not
            contain a ``'random_state'`` key.

    Returns:
        The ``'random_state'`` value stored in the file, otherwise ``default``.
    """
    try:
        with open(config_path, 'r') as f:
            preproc_config = json.load(f)
    except FileNotFoundError:
        print("Warning: preprocessing_config.json not found, using default random_state")
        return default
    # Fall back to the caller's default rather than a second hard-coded value,
    # so a missing key and a missing file behave the same way.
    return preproc_config.get('random_state', default)


# Try to load the random state used during preprocessing
config['random_state'] = resolve_random_state(
    PROCESSED_DATA_DIR / 'preprocessing_config.json',
    config['random_state'],
)

# Seed the global RNGs so that weight initialisation, dropout and DataLoader
# shuffling are repeatable across runs.
np.random.seed(config['random_state'])
torch.manual_seed(config['random_state'])

print("Configuration loaded:")
for key, value in config.items():
    print(f"{key}: {value}")

Configuration loaded:
model_name: distilbert-base-multilingual-cased
max_length: 96
batch_size: 32
learning_rate: 2e-05
weight_decay: 0.03
num_epochs: 3
dropout_rate: 0.2
warmup_steps: 50
max_grad_norm: 1.0
comments_per_user: 10
early_stopping_patience: 3
use_wandb: False
random_state: 42
label_smoothing: 0.1


In [12]:
# Read the three preprocessed splits (train, validation, test) from parquet,
# in that order.
train_df, val_df, test_df = (
    pd.read_parquet(PROCESSED_DATA_DIR / filename)
    for filename in ('train.parquet', 'val.parquet', 'test.parquet')
)

# Report the row count and the number of distinct authors of each split.
print("Dataset sizes:")
for prefix, split_df in (("Train:", train_df), ("Val:  ", val_df), ("Test: ", test_df)):
    print(f"{prefix} {len(split_df)} samples, {split_df['author'].nunique()} authors")

Dataset sizes:
Train: 171242 samples, 4188 authors
Val:   39523 samples, 898 authors
Test:  35135 samples, 898 authors


In [13]:
# Create Datasets and DataLoaders

# Settings shared by the dataset of every split.
dataset_kwargs = dict(
    tokenizer_name=config['model_name'],
    max_length=config['max_length'],
    comments_per_user=config['comments_per_user'],
)

# Build the datasets in train -> val -> test order.
train_dataset = TrollDataset(train_df, **dataset_kwargs)
val_dataset = TrollDataset(val_df, **dataset_kwargs)
test_dataset = TrollDataset(test_df, **dataset_kwargs)

# Settings shared by every loader; only the training loader shuffles.
loader_kwargs = dict(
    batch_size=config['batch_size'],
    collate_fn=collate_batch,
)

train_loader = DataLoader(train_dataset, shuffle=True, **loader_kwargs)
val_loader = DataLoader(val_dataset, shuffle=False, **loader_kwargs)
test_loader = DataLoader(test_dataset, shuffle=False, **loader_kwargs)

INFO:src.data_tools.dataset:Using 'text' as text column
INFO:src.data_tools.dataset:Created 17356 samples from 4188 authors
INFO:src.data_tools.dataset:Using 'text' as text column
INFO:src.data_tools.dataset:Created 4019 samples from 898 authors
INFO:src.data_tools.dataset:Using 'text' as text column
INFO:src.data_tools.dataset:Created 3572 samples from 898 authors


In [14]:
# Initialize Model and Trainer

# Build the classifier from the configured pretrained model name and dropout.
model = TrollDetector(model_name=config['model_name'], dropout_rate=config['dropout_rate'])

# Optimisation settings are forwarded to the trainer straight from `config`.
# NOTE(review): `config` also defines 'early_stopping_patience' and
# 'label_smoothing', which are not passed here — confirm whether the trainer
# is meant to receive them.
trainer_settings = {
    name: config[name]
    for name in ('learning_rate', 'weight_decay', 'max_grad_norm', 'num_epochs', 'warmup_steps')
}

trainer = TrollDetectorTrainer(
    model=model,
    train_loader=train_loader,
    val_loader=val_loader,
    test_loader=test_loader,
    checkpoint_dir=CHECKPOINT_DIR,
    use_wandb=config['use_wandb'],
    **trainer_settings,
)

  self.scaler = GradScaler()


In [15]:
# Train the model; `train()` returns a mapping of metric name -> value.
final_metrics = trainer.train()

print("\nTraining completed!")
print("\nFinal metrics:")
for metric, value in final_metrics.items():
    # Only real-valued metrics get the 4-decimal format. Counts such as
    # 'num_authors' are printed as-is (not as "546.0000"), and a non-numeric
    # value can no longer raise here before the results file is written.
    if isinstance(value, (float, np.floating)):
        print(f"{metric}: {value:.4f}")
    else:
        print(f"{metric}: {value}")

# Save final configuration and results next to the checkpoints
results = {
    'config': config,
    'final_metrics': final_metrics
}

with open(CHECKPOINT_DIR / 'training_results.json', 'w') as f:
    json.dump(results, f, indent=2)

INFO:src.models.trainer:Starting training on device: cuda
INFO:src.models.trainer:Training samples: 17356
INFO:src.models.trainer:Validation samples: 4019
INFO:src.models.trainer:
Epoch 1/3
  with torch.cuda.amp.autocast():
Training: 100%|██████████| 543/543 [12:11<00:00,  1.35s/it, loss=0.0079]
INFO:src.models.trainer:Training metrics: {'accuracy': 0.9331295376385174, 'f1': 0.9296849498081569, 'precision': 0.9340245949837218, 'recall': 0.9331295376385174, 'loss': 0.10678839946464115, 'num_authors': 2617}
Evaluating: 100%|██████████| 126/126 [03:03<00:00,  1.46s/it]
INFO:src.models.trainer:Evaluated 586 unique authors, 413 with multiple batches
INFO:src.models.trainer:Average batches per author: 6.86
INFO:src.models.trainer:Validation metrics: {'accuracy': 0.9607508532423208, 'f1': 0.959823624364068, 'precision': 0.9608523198966884, 'recall': 0.9607508532423208, 'auc': 0.9917184265010353, 'loss': 0.05787931462114174, 'num_authors': 586}
INFO:src.models.trainer:New best model with valid


Training completed!

Final metrics:
accuracy: 0.9707
f1: 0.9701
precision: 0.9713
recall: 0.9707
auc: 0.9905
loss: 0.0618
num_authors: 546.0000


In [16]:
# Persist the model's current weights into the checkpoint directory.
# NOTE(review): the file is called 'best_model.pt' but it stores whatever
# weights `model` holds at this point — confirm that these are the best (not
# merely the last) weights, and that this does not overwrite a checkpoint the
# trainer itself wrote to CHECKPOINT_DIR.
final_model_path = CHECKPOINT_DIR / 'best_model.pt'
model_weights = model.state_dict()
torch.save(model_weights, final_model_path)
print(f"\nSaved final model to: {final_model_path}")



Saved final model to: checkpoints/best_model.pt
