In [9]:
# declare a list tasks whose products you want to use as inputs
upstream = None


In [1]:
# Imports and Setup
import pandas as pd
import numpy as np
from pathlib import Path
import sys
import logging
import json
import torch
from torch.utils.data import DataLoader
from tqdm import tqdm

# Add project root to path
sys.path.append(str(Path.cwd().parent))

# Import custom modules
from src.models.bert_model import TrollDetector
from src.models.trainer import TrollDetectorTrainer
from src.data_tools.dataset import TrollDataset, collate_batch

# Set up logging
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

In [5]:
# Define paths
DATA_DIR = Path('data')
PROCESSED_DATA_DIR = DATA_DIR / 'processed'
CHECKPOINT_DIR = Path('./checkpoints')

# Create checkpoint directory
CHECKPOINT_DIR.mkdir(parents=True, exist_ok=True)

# Updated training configuration
config = {
    'model_name': 'distilbert-base-multilingual-cased',
    'adapter_path': None, #Dont use adapter for first training
    # 'model_name': 'ufal/robeczech-base',
    'max_length': 96,
    'batch_size': 8,
    'learning_rate': 2e-5,
    'weight_decay': 0.01,
    'num_epochs': 3,
    'dropout_rate': 0.2,
    'warmup_steps': 50,
    'max_grad_norm': 1.0,
    'comments_per_user': 10,
    'early_stopping_patience': 3,
    'random_state': 17,
}

# Try to load preprocessing config
try:
    with open(PROCESSED_DATA_DIR / 'preprocessing_config.json', 'r') as f:
        preproc_config = json.load(f)
        config['random_state'] = preproc_config.get('random_state', 42)
except FileNotFoundError:
    print("Warning: preprocessing_config.json not found, using default random_state")

print("Configuration loaded:")
for key, value in config.items():
    print(f"{key}: {value}")

Configuration loaded:
model_name: distilbert-base-multilingual-cased
adapter_path: None
max_length: 96
batch_size: 8
learning_rate: 2e-05
weight_decay: 0.03
num_epochs: 3
dropout_rate: 0.1
warmup_steps: 50
max_grad_norm: 1.0
comments_per_user: 10
early_stopping_patience: 3
random_state: 42


In [None]:
import liwc
parse, category_names = liwc.load_token_parser(
        "/absolute/path/to/LIWC2007_English100131.dic")  # or LIWC-2015, etc.


FileNotFoundError: [Errno 2] No such file or directory: '/absolute/path/to/LIWC2007_English100131.dic'

In [11]:
# # Load preprocessed data splits
# train_df = pd.read_parquet(PROCESSED_DATA_DIR / 'train.parquet')
# val_df = pd.read_parquet(PROCESSED_DATA_DIR / 'val.parquet')
# test_df = pd.read_parquet(PROCESSED_DATA_DIR / 'test.parquet')

# # Load preprocessed small data splits
train_df = pd.read_parquet(PROCESSED_DATA_DIR / 'train.parquet')
val_df = pd.read_parquet(PROCESSED_DATA_DIR / 'val.parquet')
test_df = pd.read_parquet(PROCESSED_DATA_DIR / 'test.parquet')

print("Dataset sizes:")
print(f"Train: {len(train_df)} samples, {train_df['author'].nunique()} authors")
print(f"Val:   {len(val_df)} samples, {val_df['author'].nunique()} authors")
print(f"Test:  {len(test_df)} samples, {test_df['author'].nunique()} authors")

Dataset sizes:
Train: 625987 samples, 8953 authors
Val:   169654 samples, 1919 authors
Test:  102276 samples, 1919 authors


In [12]:
# Filter datasets to only include English content
train_df = train_df[train_df['language'].isin(['en', 'English'])]
val_df = val_df[val_df['language'].isin(['en', 'English'])]
test_df = test_df[test_df['language'].isin(['en', 'English'])]

print("\nDataset sizes after filtering for English:")
print(f"Train: {len(train_df)} samples, {train_df['author'].nunique()} authors")
print(f"Val:   {len(val_df)} samples, {val_df['author'].nunique()} authors") 
print(f"Test:  {len(test_df)} samples, {test_df['author'].nunique()} authors")



Dataset sizes after filtering for English:
Train: 346079 samples, 6007 authors
Val:   117871 samples, 1314 authors
Test:  48815 samples, 1285 authors


In [13]:
# Create Datasets with regression settings
train_dataset = TrollDataset(
    train_df,
    tokenizer_name=config['model_name'],
    max_length=config['max_length'],
    comments_per_user=config['comments_per_user'],
    label_column='troll',  # or your trolliness score column
    normalize_labels=True  # This will automatically normalize scores to [0,1]
)

val_dataset = TrollDataset(
    val_df,
    tokenizer_name=config['model_name'],
    max_length=config['max_length'],
    comments_per_user=config['comments_per_user'],
    label_column='troll',
    normalize_labels=True
)

test_dataset = TrollDataset(
    test_df,
    tokenizer_name=config['model_name'],
    max_length=config['max_length'],
    comments_per_user=config['comments_per_user'],
    label_column='troll',
    normalize_labels=True
)

# Create dataloaders (unchanged)
train_loader = DataLoader(
    train_dataset,
    batch_size=config['batch_size'],
    shuffle=True,
    collate_fn=collate_batch
)

val_loader = DataLoader(
    val_dataset,
    batch_size=config['batch_size'],
    shuffle=False,
    collate_fn=collate_batch
)

test_loader = DataLoader(
    test_dataset,
    batch_size=config['batch_size'],
    shuffle=False,
    collate_fn=collate_batch
)

INFO:src.data_tools.dataset:Using 'text' as text column
INFO:src.data_tools.dataset:Labels are already normalized between 0 and 1
INFO:src.data_tools.dataset:Created 19684 samples from 6007 authors
INFO:src.data_tools.dataset:Using 'text' as text column
INFO:src.data_tools.dataset:Labels are already normalized between 0 and 1
INFO:src.data_tools.dataset:Created 4752 samples from 1314 authors
INFO:src.data_tools.dataset:Using 'text' as text column
INFO:src.data_tools.dataset:Labels are already normalized between 0 and 1
INFO:src.data_tools.dataset:Created 4276 samples from 1285 authors


In [9]:
# Initialize Model and Trainer
model = TrollDetector(
    model_name=config['model_name'],
    adapter_path=config['adapter_path'],
    dropout_rate=config['dropout_rate']
)

# Initialize trainer
trainer = TrollDetectorTrainer(
    model=model,
    train_loader=train_loader,
    val_loader=val_loader,
    test_loader=test_loader,
    learning_rate=config['learning_rate'],
    weight_decay=config['weight_decay'],
    max_grad_norm=config['max_grad_norm'],
    num_epochs=config['num_epochs'],
    warmup_steps=config['warmup_steps'],
    checkpoint_dir=CHECKPOINT_DIR,
)

INFO:adapters.heads.model_mixin:Adding head 'default' with config {'head_type': 'masked_lm', 'vocab_size': 119547, 'embedding_size': 768, 'layers': 2, 'activation_function': 'gelu', 'layer_norm': True, 'bias': True, 'shift_labels': False, 'label2id': None}.
  self.scaler = GradScaler()


In [17]:
# 1 – right after you construct TrollDetector
trainable = [n for n, p in model.named_parameters() if p.requires_grad]
print("Trainable parameter tensors:", len(trainable))
assert len(trainable) > 0, "Nothing to train!"

# # 2 – right after you build the optimiser
# print("Optimizer param groups:",
#       sum(p.numel() for p in optimizer.param_groups[0]['params']))

Trainable parameter tensors: 111


In [15]:
# Train the model
final_metrics = trainer.train()

print("\nTraining completed!")
print("\nFinal metrics:")
for metric, value in final_metrics.items():
    print(f"{metric}: {value:.4f}")

# Save final configuration and results
results = {
    'config': config,
    'final_metrics': final_metrics
}

with open(CHECKPOINT_DIR / 'training_results.json', 'w') as f:
    json.dump(results, f, indent=2)

INFO:src.models.trainer:Starting training on device: cuda
INFO:src.models.trainer:Training samples: 10790
INFO:src.models.trainer:Validation samples: 2383
INFO:src.models.trainer:
Epoch 1/3
  with autocast():
Training: 100%|████████████████████████████████████████████████████████| 1349/1349 [07:01<00:00,  3.20it/s, loss=0.0879]
INFO:src.models.trainer:Training metrics: {'mse': 0.038329243539302865, 'rmse': 0.19577855740428488, 'mae': 0.09667715059744345, 'r2': 0.8239332464107989, 'binary_accuracy': 0.9508445945945946, 'loss': 0.017497718319414157, 'num_authors': 5920}
Evaluating: 100%|█████████████████████████████████████████████████████████████████████| 298/298 [02:07<00:00,  2.34it/s]
INFO:src.models.trainer:Validation metrics: {'mse': 0.08737124002524761, 'rmse': 0.2955862649468808, 'mae': 0.15593152526600362, 'r2': 0.5936431075659179, 'binary_accuracy': 0.8872120730738682, 'loss': 0.04368227651320922, 'num_authors': 1259}
INFO:src.models.trainer:
Epoch 2/3
  with autocast():
Traini


Training completed!

Final metrics:
mse: 0.0950
rmse: 0.3082
mae: 0.1629
r2: 0.5521
binary_accuracy: 0.8805
loss: 0.0413
num_authors: 1238.0000


In [14]:
# Save the final model state
final_model_path = CHECKPOINT_DIR / 'best_model_english_small.pt'
torch.save(model.state_dict(), final_model_path)
print(f"\nSaved final model to: {final_model_path}")



Saved final model to: checkpoints/best_model_english_small.pt
