In [9]:
# Declare the list of tasks whose products you want to use as inputs
# (None: this notebook declares no upstream tasks)
upstream = None


In [11]:
# Imports and Setup
import pandas as pd
import numpy as np
from pathlib import Path
import sys
import logging
import json
import torch
from torch.utils.data import DataLoader
from tqdm import tqdm

# Add project root (the parent of the current working directory) to the import
# path so the `src.*` imports below resolve. The membership check keeps the
# cell idempotent: re-running it does not append a duplicate entry.
project_root = str(Path.cwd().parent)
if project_root not in sys.path:
    sys.path.append(project_root)

# Import custom modules
from src.models.bert_model import TrollDetector
from src.models.trainer import TrollDetectorTrainer
from src.data_tools.dataset import TrollDataset, collate_batch

# Logging: module-level logger for this notebook, with the root logger
# configured to emit INFO and above.
logger = logging.getLogger(__name__)
logging.basicConfig(level=logging.INFO)

In [12]:
# Define paths (relative to the notebook's working directory)
DATA_DIR = Path('../data')
PROCESSED_DATA_DIR = DATA_DIR / 'processed'
CHECKPOINT_DIR = Path('../checkpoints')

# Create checkpoint directory (no error if it already exists)
CHECKPOINT_DIR.mkdir(parents=True, exist_ok=True)

# Training configuration
config = {
    'model_name': 'distilbert-base-multilingual-cased',
    'max_length': 64,
    'batch_size': 64,
    'learning_rate': 2e-5,
    'weight_decay': 0.01,
    'num_epochs': 3,
    'warmup_steps': 0,
    'max_grad_norm': 1.0,
    'comments_per_user': 5,
    'use_wandb': False,
    'random_state': 42  # Default if config not found
}

# Try to load preprocessing config so training reuses the random_state that
# the preprocessing step recorded. A missing file is not an error: the default
# above is kept and a warning is printed.
try:
    with open(PROCESSED_DATA_DIR / 'preprocessing_config.json', 'r') as f:
        preproc_config = json.load(f)
    # Fall back to the default already stored in `config` if the key is absent
    config['random_state'] = preproc_config.get('random_state', config['random_state'])
except FileNotFoundError:
    print("Warning: preprocessing_config.json not found, using default random_state")

# Apply the seed. Previously random_state was only stored and printed, so the
# NumPy and torch global generators were left unseeded and runs differed.
np.random.seed(config['random_state'])
torch.manual_seed(config['random_state'])

print("Configuration loaded:")
for key, value in config.items():
    print(f"{key}: {value}")

Configuration loaded:
model_name: distilbert-base-multilingual-cased
max_length: 64
batch_size: 64
learning_rate: 2e-05
weight_decay: 0.01
num_epochs: 3
warmup_steps: 0
max_grad_norm: 1.0
comments_per_user: 5
use_wandb: False
random_state: 42


In [3]:
# Read the three preprocessed splits from the processed-data directory
train_df, val_df, test_df = (
    pd.read_parquet(PROCESSED_DATA_DIR / split_file)
    for split_file in ('train.parquet', 'val.parquet', 'test.parquet')
)

# Report row count and number of distinct authors per split.
# Labels are padded so the printed columns line up.
print("Dataset sizes:")
for split_label, split_df in (("Train:", train_df), ("Val:  ", val_df), ("Test: ", test_df)):
    n_authors = split_df['author'].nunique()
    print(f"{split_label} {len(split_df)} samples, {n_authors} authors")

Dataset sizes:
Train: 1724555 samples, 16242 authors
Val:   558573 samples, 3481 authors
Test:  418812 samples, 3481 authors


In [4]:
# Create Datasets and DataLoaders
# Every split is built with the same constructor settings taken from `config`
dataset_kwargs = {
    'tokenizer_name': config['model_name'],
    'max_length': config['max_length'],
    'comments_per_user': config['comments_per_user'],
}
train_dataset = TrollDataset(train_df, **dataset_kwargs)
val_dataset = TrollDataset(val_df, **dataset_kwargs)
test_dataset = TrollDataset(test_df, **dataset_kwargs)

# Batch size and collate function are shared; only the training loader shuffles
loader_kwargs = {
    'batch_size': config['batch_size'],
    'collate_fn': collate_batch,
}
train_loader = DataLoader(train_dataset, shuffle=True, **loader_kwargs)
val_loader = DataLoader(val_dataset, shuffle=False, **loader_kwargs)
test_loader = DataLoader(test_dataset, shuffle=False, **loader_kwargs)

In [5]:
# Build the model and the trainer that drives it
model = TrollDetector(model_name=config['model_name'], dropout_rate=0.1)

# Hyperparameters forwarded unchanged from `config` to the trainer
trainer_hparams = {
    name: config[name]
    for name in (
        'learning_rate',
        'weight_decay',
        'max_grad_norm',
        'num_epochs',
        'warmup_steps',
        'use_wandb',
    )
}

trainer = TrollDetectorTrainer(
    model=model,
    train_loader=train_loader,
    val_loader=val_loader,
    test_loader=test_loader,
    checkpoint_dir=CHECKPOINT_DIR,
    **trainer_hparams
)

  self.scaler = GradScaler()


In [6]:
# Train the model; trainer.train() returns a mapping of metric name -> value
final_metrics = trainer.train()

print("\nTraining completed!")
print("\nFinal metrics:")
for metric, value in final_metrics.items():
    print(f"{metric}: {value:.4f}")

# Save final configuration and results
results = {
    'config': config,
    'final_metrics': final_metrics
}

# Serialise before opening the file, so an encoding error cannot leave a
# truncated training_results.json behind after a long training run.
# default=float converts numeric values the json module cannot encode itself
# (e.g. NumPy float32 scalars or 0-d tensors) to plain floats.
results_json = json.dumps(results, indent=2, default=float)

with open(CHECKPOINT_DIR / 'training_results.json', 'w') as f:
    f.write(results_json)

INFO:src.models.trainer:Starting training on device: cuda
INFO:src.models.trainer:Training samples: 16242
INFO:src.models.trainer:Validation samples: 3481
INFO:src.models.trainer:
Epoch 1/3
  with torch.cuda.amp.autocast():
Training: 100%|██████████████████████████████████████████████| 254/254 [01:13<00:00,  3.46it/s, loss=0.0089]
INFO:src.models.trainer:Training metrics: {'accuracy': 0.9745105282600665, 'f1': 0.9729995098041248, 'precision': 0.9730301896986112, 'recall': 0.9745105282600665, 'loss': 0.08144478514555871}
Evaluating: 100%|███████████████████████████████████████████████████████████| 55/55 [00:12<00:00,  4.39it/s]
INFO:src.models.trainer:Validation metrics: {'accuracy': 0.986210858948578, 'f1': 0.9856956694655379, 'precision': 0.985924896791376, 'recall': 0.986210858948578, 'auc': 0.9928983690250293, 'loss': 0.0547953194684603}
INFO:src.models.trainer:New best model with validation AUC: 0.9929
INFO:src.models.trainer:
Epoch 2/3
  with torch.cuda.amp.autocast():
Training: 1


Training completed!

Final metrics:
accuracy: 0.9931
f1: 0.9932
precision: 0.9932
recall: 0.9931
auc: 0.9953
loss: 0.0242
