In [9]:
# Declare the list of tasks whose products you want to use as inputs
upstream = None


In [2]:
# Imports and Setup
import pandas as pd
import numpy as np
from pathlib import Path
import sys
import logging
import json
import torch
from torch.utils.data import DataLoader
from tqdm import tqdm

# Add the project root (taken to be the parent of the working directory) to
# the import path so the `src` package can be imported. The membership check
# keeps repeated runs of this cell from stacking duplicate sys.path entries.
PROJECT_ROOT = str(Path.cwd().parent)
if PROJECT_ROOT not in sys.path:
    sys.path.append(PROJECT_ROOT)

# Import custom modules
from src.models.bert_model import TrollDetector
from src.models.trainer import TrollDetectorTrainer
from src.data_tools.dataset import TrollDataset, collate_batch

# Logging setup: a named logger for this notebook, with the root logger
# configured at INFO so INFO-level records from the project modules are shown.
logger = logging.getLogger(__name__)
logging.basicConfig(level=logging.INFO)

In [3]:
# Define paths (relative to the working directory the notebook runs in)
DATA_DIR = Path('data')
PROCESSED_DATA_DIR = DATA_DIR / 'processed'
CHECKPOINT_DIR = Path('./checkpoints')

# Create checkpoint directory
CHECKPOINT_DIR.mkdir(parents=True, exist_ok=True)

# Updated training configuration
config = {
    'model_name': 'distilbert-base-multilingual-cased',
    'max_length': 96,
    'batch_size': 32,
    'learning_rate': 2e-5,
    'weight_decay': 0.03,
    'num_epochs': 3,
    'dropout_rate': 0.2,
    'warmup_steps': 50,
    'max_grad_norm': 1.0,
    'comments_per_user': 10,
    'early_stopping_patience': 3,
    'use_wandb': False,
    'random_state': 17,
}

# Reuse the seed recorded by the preprocessing step when it is available.
# If the file exists but has no 'random_state' entry, keep the value configured
# above instead of silently switching to an unrelated hard-coded default.
try:
    with open(PROCESSED_DATA_DIR / 'preprocessing_config.json', 'r') as f:
        preproc_config = json.load(f)
except FileNotFoundError:
    print("Warning: preprocessing_config.json not found, using default random_state")
else:
    config['random_state'] = preproc_config.get('random_state', config['random_state'])

# Apply the seed so runs are repeatable: the global NumPy and torch RNGs are
# the ones used by default for shuffling and parameter initialisation.
np.random.seed(config['random_state'])
torch.manual_seed(config['random_state'])

print("Configuration loaded:")
for key, value in config.items():
    print(f"{key}: {value}")

Configuration loaded:
model_name: distilbert-base-multilingual-cased
max_length: 96
batch_size: 32
learning_rate: 2e-05
weight_decay: 0.03
num_epochs: 3
dropout_rate: 0.2
warmup_steps: 50
max_grad_norm: 1.0
comments_per_user: 10
early_stopping_patience: 3
use_wandb: False
random_state: 42


In [4]:
# Load the preprocessed train/val/test splits.
# USE_SMALL_SPLITS selects the '*_small.parquet' files; set it to False to load
# the full 'train.parquet' / 'val.parquet' / 'test.parquet' files instead.
USE_SMALL_SPLITS = True
split_suffix = '_small' if USE_SMALL_SPLITS else ''

train_df = pd.read_parquet(PROCESSED_DATA_DIR / f'train{split_suffix}.parquet')
val_df = pd.read_parquet(PROCESSED_DATA_DIR / f'val{split_suffix}.parquet')
test_df = pd.read_parquet(PROCESSED_DATA_DIR / f'test{split_suffix}.parquet')

print("Dataset sizes:")
for split_name, split_df in (('Train', train_df), ('Val', val_df), ('Test', test_df)):
    # Fail early with a clear message rather than deep inside dataset creation.
    assert len(split_df) > 0, f"{split_name} split is empty"
    # The label is padded to a fixed width so the three lines stay aligned.
    print(f"{split_name + ':':<6} {len(split_df)} samples, {split_df['author'].nunique()} authors")

Dataset sizes:
Train: 17838 samples, 1649 authors
Val:   5044 samples, 353 authors
Test:  3798 samples, 353 authors


In [None]:
# Create Datasets with regression settings.
# All three splits share the same tokenizer, sequence length, comments-per-user
# and label settings, so those arguments are defined once and reused.
dataset_kwargs = dict(
    tokenizer_name=config['model_name'],
    max_length=config['max_length'],
    comments_per_user=config['comments_per_user'],
    label_column='troll',  # or your trolliness score column
    normalize_labels=True,  # asks the dataset to normalize scores to [0, 1]
)

train_dataset = TrollDataset(train_df, **dataset_kwargs)
val_dataset = TrollDataset(val_df, **dataset_kwargs)
test_dataset = TrollDataset(test_df, **dataset_kwargs)

# Create dataloaders: same batch size and collate function everywhere;
# only the training loader shuffles its samples.
loader_kwargs = dict(
    batch_size=config['batch_size'],
    collate_fn=collate_batch,
)

train_loader = DataLoader(train_dataset, shuffle=True, **loader_kwargs)
val_loader = DataLoader(val_dataset, shuffle=False, **loader_kwargs)
test_loader = DataLoader(test_dataset, shuffle=False, **loader_kwargs)

INFO:src.data_tools.dataset:Using 'text' as text column
INFO:src.data_tools.dataset:Labels are already normalized between 0 and 1
INFO:src.data_tools.dataset:Created 2101 samples from 1649 authors
INFO:src.data_tools.dataset:Using 'text' as text column
INFO:src.data_tools.dataset:Labels are already normalized between 0 and 1
INFO:src.data_tools.dataset:Created 559 samples from 353 authors
INFO:src.data_tools.dataset:Using 'text' as text column
INFO:src.data_tools.dataset:Labels are already normalized between 0 and 1
INFO:src.data_tools.dataset:Created 439 samples from 353 authors


In [16]:
# Initialize Model and Trainer
# The detector is built from the configured model name and dropout rate.
model = TrollDetector(
    dropout_rate=config['dropout_rate'],
    model_name=config['model_name'],
)

# Hyperparameters that the trainer takes under the same name as in `config`.
# NOTE(review): config['early_stopping_patience'] is defined but not passed
# here — confirm whether the trainer is expected to receive it.
trainer_hparams = {
    name: config[name]
    for name in (
        'learning_rate',
        'weight_decay',
        'max_grad_norm',
        'num_epochs',
        'warmup_steps',
        'use_wandb',
    )
}

# Initialize trainer with the model, the three loaders and the checkpoint dir
trainer = TrollDetectorTrainer(
    model=model,
    train_loader=train_loader,
    val_loader=val_loader,
    test_loader=test_loader,
    checkpoint_dir=CHECKPOINT_DIR,
    **trainer_hparams,
)

  self.scaler = GradScaler()


In [14]:
# Development helper: reload src.models.trainer so edits made to the module
# are picked up without restarting the kernel, then rebind
# TrollDetectorTrainer to the refreshed class.
# A `trainer` object that already exists keeps the class it was built from,
# so re-run the trainer cell after this one to use the reloaded code (the
# execution counts show this cell was run before the trainer cell).
from importlib import import_module, reload

trainer_module = reload(import_module('src.models.trainer'))
TrollDetectorTrainer = trainer_module.TrollDetectorTrainer

In [17]:
def _json_default(obj):
    """Fallback for json.dump: convert values the encoder cannot handle.

    Only called for objects json cannot serialize natively. Paths become
    strings; objects exposing ``tolist()`` (NumPy scalars/arrays, torch
    tensors) are converted to plain Python values. Anything else raises
    TypeError, matching json's own behaviour.
    """
    if isinstance(obj, Path):
        return str(obj)
    if hasattr(obj, 'tolist'):
        return obj.tolist()
    raise TypeError(f"Object of type {type(obj).__name__} is not JSON serializable")


# Train the model; the returned mapping of metric name -> value is reported
# below and stored next to the checkpoints.
final_metrics = trainer.train()

print("\nTraining completed!")
print("\nFinal metrics:")
for metric, value in final_metrics.items():
    if isinstance(value, (int, np.integer)):
        # Integer metrics (e.g. counts) are shown without spurious decimals.
        print(f"{metric}: {value}")
    else:
        print(f"{metric}: {value:.4f}")

# Save final configuration and results
results = {
    'config': config,
    'final_metrics': final_metrics
}

with open(CHECKPOINT_DIR / 'training_results.json', 'w') as f:
    # `default` keeps the dump from failing on NumPy/torch scalar metrics.
    json.dump(results, f, indent=2, default=_json_default)

INFO:src.models.trainer:Starting training on device: cuda
INFO:src.models.trainer:Training samples: 2101
INFO:src.models.trainer:Validation samples: 559
INFO:src.models.trainer:
Epoch 1/3
Training: 100%|████████████████████████████████████████████████████████████| 66/66 [01:45<00:00,  1.60s/it, loss=0.0357]
INFO:src.models.trainer:Training metrics: {'mse': 0.15188526519692985, 'rmse': 0.38972460173426293, 'mae': 0.35933035012059567, 'r2': -1.051097793574841, 'binary_accuracy': 0.8389261744966443, 'loss': 0.07453820440974651, 'num_authors': 298}
Evaluating: 100%|███████████████████████████████████████████████████████████████████████| 18/18 [00:32<00:00,  1.79s/it]
INFO:src.models.trainer:Validation metrics: {'mse': 0.11774399761962906, 'rmse': 0.3431384525517784, 'mae': 0.3059161580167711, 'r2': -0.07651654966518007, 'binary_accuracy': 0.875, 'loss': 0.05636216477594442, 'num_authors': 80}
INFO:src.models.trainer:New best model with validation R²: -0.0765
INFO:src.models.trainer:
Epoch 


Training completed!

Final metrics:
mse: 0.1471
rmse: 0.3836
mae: 0.2528
r2: -0.2537
binary_accuracy: 0.7654
loss: 0.0422
num_authors: 81.0000


In [18]:
# Save the final model state: the state_dict currently held by `model` is
# written into the checkpoint directory.
# NOTE(review): the target file is named 'best_model.pt' although the weights
# saved are whatever `model` holds after training — confirm this does not
# overwrite a best-validation checkpoint the trainer may have written to the
# same directory.
final_model_path = CHECKPOINT_DIR.joinpath('best_model.pt')
model_weights = model.state_dict()
torch.save(model_weights, final_model_path)
print(f"\nSaved final model to: {final_model_path}")



Saved final model to: checkpoints/best_model.pt
