In [1]:
# Imports and Setup
import pandas as pd
import numpy as np
from pathlib import Path
import sys
import logging
import json
import torch
from torch.utils.data import DataLoader
from tqdm import tqdm

# Add the parent of the current working directory to the import search path.
# This has to run before the `src.*` imports below.
# NOTE(review): presumably the notebook is launched from a direct child
# directory of the project root, so that `src` becomes importable — confirm.
sys.path.append(str(Path.cwd().parent))

# Import custom modules (project-local; they live under the `src` package)
from src.models.bert_model import TrollDetector
from src.models.trainer import TrollDetectorTrainer
from src.data_tools.dataset import TrollDataset, collate_batch

# Set up logging: configure the root logger at INFO level and create a logger
# named after this module for use in later cells.
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

In [2]:
# Define paths (all relative to the kernel's current working directory)
DATA_DIR = Path('data')
PROCESSED_DATA_DIR = DATA_DIR / 'processed'
CHECKPOINT_DIR = Path('./checkpoints')

# Create checkpoint directory (no error if it already exists)
CHECKPOINT_DIR.mkdir(parents=True, exist_ok=True)

# # Updated training configuration
# config = {
#     'model_name': 'distilbert-base-multilingual-cased',
#     'adapter_path': None, # Don't use adapter for first training
#     # 'model_name': 'ufal/robeczech-base',
#     'max_length': 96,
#     'batch_size': 8,
#     'learning_rate': 2e-5,
#     'weight_decay': 0.03,
#     'num_epochs': 3,
#     'dropout_rate': 0.1,
#     'warmup_steps': 50,
#     'max_grad_norm': 1.0,
#     'comments_per_user': 10,
#     'early_stopping_patience': 3,
#     'random_state': 17,
# }

# Try to load the preprocessing config. `preproc_config` is bound up front so
# it exists even when the file is missing; previously the except branch left
# the name undefined and any later use raised NameError.
preproc_config = {}
try:
    with open(PROCESSED_DATA_DIR / 'preprocessing_config.json', 'r') as f:
        preproc_config = json.load(f)
except FileNotFoundError:
    print("Warning: preprocessing_config.json not found, using default random_state")

# Seed recorded by preprocessing, falling back to 42 when the file or the key
# is absent. This makes the warning above accurate.
RANDOM_STATE = preproc_config.get('random_state', 42)

# print("Configuration loaded:")
# for key, value in config.items():
#     print(f"{key}: {value}")

In [3]:
# Read the preprocessed train / validation / test splits from parquet, in that order
train_df, val_df, test_df = (
    pd.read_parquet(PROCESSED_DATA_DIR / filename)
    for filename in ('train.parquet', 'val.parquet', 'test.parquet')
)

# Report the number of rows and of distinct authors in each split
print("Dataset sizes:")
for prefix, split_df in (
    ("Train: ", train_df),
    ("Val:   ", val_df),
    ("Test:  ", test_df),
):
    print(f"{prefix}{len(split_df)} samples, {split_df['author'].nunique()} authors")

Dataset sizes:
Train: 625987 samples, 8953 authors
Val:   169654 samples, 1919 authors
Test:  102276 samples, 1919 authors


In [4]:
def extract_stylometric_features(df):
    """
    Extract stylometric features from text data.

    Adds five columns computed from 'text': 'char_count' (string length),
    'word_count' (whitespace-separated tokens), 'avg_word_length' (mean token
    length), 'capital_letters' (characters for which str.isupper() is true)
    and 'number_count' (characters for which str.isdigit() is true).

    Empty, whitespace-only and missing texts are scored as zero-length text:
    every feature is 0 for them. Previously empty text produced NaN in
    'avg_word_length' (mean of an empty list) and missing text raised.

    Args:
        df: DataFrame containing 'text' column

    Returns:
        Copy of ``df`` with the stylometric feature columns added; the input
        frame is not modified.
    """
    # Create copy to avoid modifying original
    features_df = df.copy()

    # Features are computed from a null-free view; the 'text' column that is
    # returned keeps its original values.
    text = features_df['text'].fillna('')

    def _mean_word_length(value):
        # np.mean([]) is NaN and emits a RuntimeWarning, so guard the no-word case.
        words = value.split()
        if not words:
            return 0.0
        return float(np.mean([len(word) for word in words]))

    # Calculate features
    features_df['char_count'] = text.str.len()
    features_df['word_count'] = text.str.split().str.len()
    features_df['avg_word_length'] = text.apply(_mean_word_length)
    features_df['capital_letters'] = text.apply(lambda x: sum(1 for c in x if c.isupper()))
    features_df['number_count'] = text.apply(lambda x: sum(c.isdigit() for c in x))

    return features_df

# Run the stylometric extractor over the train, validation and test frames
print("Extracting stylometric features...")
train_features, val_features, test_features = (
    extract_stylometric_features(split_frame)
    for split_frame in (train_df, val_df, test_df)
)

# Preview the raw text next to the derived columns for the first training rows
print("\nSample of extracted features from training set:")
print(
    train_features[
        ['text', 'char_count', 'word_count', 'avg_word_length', 'capital_letters', 'number_count']
    ].head()
)


Extracting stylometric features...

Sample of extracted features from training set:
                                                text  char_count  word_count  \
0  Hedge fund managers expect return investment D...          57           8   
1                                А я в Мурманске бдю          19           5   
2  Привет Собянину! Снос ларьков добрался и до Пе...          70          10   
3       Hurrikan trifft Insel Puerto Rico ganz hart!          44           7   
4                              Смотри, коты vs. дети          21           4   

   avg_word_length  capital_letters  number_count  
0         6.250000                3             0  
1         3.000000                2             0  
2         6.100000                5             0  
3         5.428571                4             0  
4         4.500000                1             0  


In [5]:
# Stylometric columns that form the model input
feature_columns = ['char_count', 'word_count', 'avg_word_length', 'capital_letters', 'number_count']

# Feature matrices, one per split
X_train, X_val, X_test = (
    frame[feature_columns].values
    for frame in (train_features, val_features, test_features)
)

# Targets: the 'troll' column of each split
y_train, y_val, y_test = (
    frame['troll'].values
    for frame in (train_features, val_features, test_features)
)


In [None]:
# GBR
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.pipeline import Pipeline
from sklearn.metrics import mean_squared_error, r2_score

# Standardize the stylometric features, then fit a gradient-boosting regressor
model = Pipeline(steps=[
    ('scale', StandardScaler()),
    ('gbr', GradientBoostingRegressor(
        random_state=42,
        subsample=0.8,
        max_depth=3,
        learning_rate=0.05,
        n_estimators=400,
    )),
])

# Fit on the training split (left as the final expression of the cell)
model.fit(X_train, y_train)

In [7]:
# Print MSE and R² of the fitted pipeline for each split
for name, X, y in zip(('Train', 'Val', 'Test'),
                      (X_train, X_val, X_test),
                      (y_train, y_val, y_test)):
    y_hat = model.predict(X)
    print('{:5s} | MSE {:.4f} R² {:.4f}'.format(
        name, mean_squared_error(y, y_hat), r2_score(y, y_hat)))

Train | MSE 0.1782 R² 0.0904
Val   | MSE 0.2343 R² 0.0005
Test  | MSE 0.1616 R² 0.1193


In [8]:
from sklearn.svm import LinearSVR
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.preprocessing import StandardScaler

# Standardize with statistics learned on the training split only
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_val_scaled, X_test_scaled = (scaler.transform(matrix) for matrix in (X_val, X_test))

# Linear support-vector regression on the scaled stylometric features
svr = LinearSVR(
    C=1.0,
    epsilon=0.1,
    tol=1e-4,
    max_iter=50000,
    random_state=42,
)
svr.fit(X_train_scaled, y_train)

# Predictions for every split
train_pred, val_pred, test_pred = (
    svr.predict(matrix)
    for matrix in (X_train_scaled, X_val_scaled, X_test_scaled)
)

# Print MSE and R² per split
for split, y_true, y_hat in zip(("Train", "Val  ", "Test "),
                                (y_train, y_val, y_test),
                                (train_pred, val_pred, test_pred)):
    print("{} | MSE {:.4f} R² {:.4f}".format(
        split, mean_squared_error(y_true, y_hat), r2_score(y_true, y_hat)))

Train | MSE 0.2240 R² -0.1432
Val   | MSE 0.3100 R² -0.3226
Test  | MSE 0.2036 R² -0.1100


In [9]:

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import Ridge
from sklearn.model_selection import train_test_split

# Tweet-level baseline: uni-/bi-gram TF-IDF features fed into a ridge regressor
tfidf_pipe = Pipeline([
    ('tfidf', TfidfVectorizer(max_features=50_000,
                              ngram_range=(1,2),
                              min_df=3,
                              max_df=0.9,
                              dtype=float)),
    ('reg',   Ridge(alpha=1.0, random_state=42))
])

# Take the targets from the same frames that supply the text. The globals
# y_train / y_test are rebound to English-only labels further down the
# notebook, so using them here would pair full-split text with subset labels
# whenever this cell is re-run after those cells.
tfidf_pipe.fit(train_df['text'], train_df['troll'].values)

y_pred = tfidf_pipe.predict(test_df['text'])
print('TF-IDF | Test MSE {:.4f}  R² {:.4f}'
      .format(mean_squared_error(test_df['troll'].values, y_pred),
              r2_score(test_df['troll'].values, y_pred)))




TF-IDF | Test MSE 0.1362  R² 0.2578


In [11]:
# Keep only rows whose 'language' value is 'en' or 'English' (independent copies)
train_df_en, val_df_en, test_df_en = (
    frame.loc[frame['language'].isin(['en', 'English'])].copy()
    for frame in (train_df, val_df, test_df)
)

# Report each subset's row count and its share of the corresponding full split
print("English-only dataset sizes:")
for heading, noun, subset, full in (
    ("Training:   ", "training", train_df_en, train_df),
    ("Validation: ", "validation", val_df_en, val_df),
    ("Test:       ", "test", test_df_en, test_df),
):
    print(f"{heading}{len(subset):,} ({len(subset)/len(full):.1%} of full {noun} set)")


English-only dataset sizes:
Training:   346,079 (55.3% of full training set)
Validation: 117,871 (69.5% of full validation set)
Test:       48,815 (47.7% of full test set)


In [12]:
# Recompute the stylometric features on the English-only frames
# (this rebinds train_features / val_features / test_features)
print("Extracting stylometric features...")
train_features, val_features, test_features = (
    extract_stylometric_features(english_frame)
    for english_frame in (train_df_en, val_df_en, test_df_en)
)

Extracting stylometric features...


In [13]:

# Stylometric columns that form the model input
feature_columns = ['char_count', 'word_count', 'avg_word_length', 'capital_letters', 'number_count']

# Feature matrix and 'troll' targets for each split, built from the current
# train_features / val_features / test_features frames
X_train, X_val, X_test = (
    frame[feature_columns].values
    for frame in (train_features, val_features, test_features)
)
y_train, y_val, y_test = (
    frame['troll'].values
    for frame in (train_features, val_features, test_features)
)

In [14]:
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.pipeline import Pipeline
from sklearn.metrics import mean_squared_error, r2_score

# Same scaler + gradient-boosting configuration, refit on the current
# X_train / y_train arrays
model = Pipeline(steps=[
    ('scale', StandardScaler()),
    ('gbr', GradientBoostingRegressor(
        n_estimators=400,
        max_depth=3,
        learning_rate=0.05,
        subsample=0.8,
        random_state=42,
    )),
])

# Fit (left as the final expression of the cell)
model.fit(X_train, y_train)

In [15]:
# Print MSE and R² of the refitted pipeline for each split
for name, X, y in zip(('Train', 'Val', 'Test'),
                      (X_train, X_val, X_test),
                      (y_train, y_val, y_test)):
    y_hat = model.predict(X)
    print('{:5s} | MSE {:.4f} R² {:.4f}'.format(
        name, mean_squared_error(y, y_hat), r2_score(y, y_hat)))

Train | MSE 0.1948 R² 0.0999
Val   | MSE 0.2624 R² -0.0637
Test  | MSE 0.1834 R² 0.1063


In [17]:
# --- 1. Helper: collapse all comments of an author into one string ----------
def make_author_dataframe(df: pd.DataFrame) -> pd.DataFrame:
    """
    Collapse comment-level rows into one row per author.

    Args:
        df: DataFrame with 'author', 'text' and 'troll' columns.

    Returns:
        DataFrame indexed by author with two columns: 'text', the author's
        comments joined with single spaces, and 'label', the mean of the
        author's 'troll' values.
    """
    by_author = df.groupby('author')

    # One space-joined string per author
    author_df = by_author['text'].apply(' '.join).to_frame(name='text')

    # Author-level score; assignment aligns on the shared author index
    author_df['label'] = by_author['troll'].mean()
    return author_df

# One author-level frame per English-only split
train_authors, val_authors, test_authors = (
    make_author_dataframe(english_frame)
    for english_frame in (train_df_en, val_df_en, test_df_en)
)

In [18]:
from sklearn.svm import LinearSVR
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline

# Create author-level features by aggregating tweet-level features
def aggregate_features(features_df):
    """
    Average tweet-level stylometric features per author.

    Args:
        features_df: DataFrame with an 'author' column plus the columns
            'char_count', 'word_count', 'avg_word_length', 'capital_letters'
            and 'number_count'. Any other columns are ignored.

    Returns:
        DataFrame indexed by author with one column per stylometric feature,
        each holding that author's mean value.
    """
    agg_dict = {
        'char_count': 'mean',
        'word_count': 'mean',
        'avg_word_length': 'mean',
        'capital_letters': 'mean',
        'number_count': 'mean'
    }
    # Select exactly the columns being aggregated. The selection used to read
    # the notebook-level `feature_columns` list, which made the function depend
    # on whichever cell last defined that global.
    return features_df.groupby('author')[list(agg_dict)].agg(agg_dict)

# Aggregate tweet-level features to one row per author
train_author_features = aggregate_features(train_features)
val_author_features = aggregate_features(val_features)
test_author_features = aggregate_features(test_features)

# Get author-level labels. They are looked up by the feature frame's author
# index so every label belongs to the author of the matching feature row,
# rather than relying on both frames happening to share the same row order.
# A missing author raises KeyError instead of silently misaligning.
train_author_labels = train_authors.loc[train_author_features.index, 'label'].values
val_author_labels = val_authors.loc[val_author_features.index, 'label'].values
test_author_labels = test_authors.loc[test_author_features.index, 'label'].values

# Create and train pipeline: standardize, then linear support-vector regression
svr_pipe = Pipeline([
    ('scale', StandardScaler()),
    ('svr', LinearSVR(C=1.0, epsilon=0.1, random_state=42, max_iter=50000, tol=1e-4))
])

svr_pipe.fit(train_author_features, train_author_labels)

# Make predictions
train_pred = svr_pipe.predict(train_author_features)
val_pred = svr_pipe.predict(val_author_features)
test_pred = svr_pipe.predict(test_author_features)

# Print MSE and R² per split
for split, y_true, y_hat in [
    ("Train", train_author_labels, train_pred),
    ("Val  ", val_author_labels, val_pred),
    ("Test ", test_author_labels, test_pred)
]:
    print(f"{split} | MSE {mean_squared_error(y_true, y_hat):.4f} "
          f"R² {r2_score(y_true, y_hat):.4f}")

Train | MSE 0.2815 R² -0.2557
Val   | MSE 0.2602 R² -0.2106
Test  | MSE 0.2603 R² -0.2107


In [22]:
# --- 2. Define and train the pipeline --------------------------------------
# Author-level model: uni-/bi-gram TF-IDF over each author's concatenated
# comments, followed by ridge regression on the mean 'troll' label.
author_pipe = Pipeline(steps=[
    ('tfidf', TfidfVectorizer(
        ngram_range=(1, 2),
        max_features=50_000,
        min_df=3,
        max_df=0.9,
        dtype=float,
    )),
    ('reg', Ridge(alpha=1.0, random_state=42)),
])

author_pipe.fit(train_authors['text'], train_authors['label'])

# --- 3. Evaluate on train, validation and test authors ----------------------------
for split, X, y in zip(
    ('Train', 'Validation', 'Test'),
    (train_authors['text'], val_authors['text'], test_authors['text']),
    (train_authors['label'], val_authors['label'], test_authors['label']),
):
    y_hat = author_pipe.predict(X)

    # Regression metrics
    mse = mean_squared_error(y, y_hat)
    r2 = r2_score(y, y_hat)

    # Binary cross-entropy against the author-level labels. Predictions are
    # clipped into [0, 1] first, and 1e-10 inside each log avoids log(0).
    y_hat_clipped = np.clip(y_hat, 0, 1)
    log_likelihood = (y * np.log(y_hat_clipped + 1e-10) +
                      (1-y) * np.log(1 - y_hat_clipped + 1e-10))
    bce = -np.mean(log_likelihood)

    print('{:10s} | MSE {:.4f}  R² {:.4f}  BCE {:.4f}'.format(split, mse, r2, bce))



Train      | MSE 0.0360  R² 0.8394  BCE 0.1677
Validation | MSE 0.0904  R² 0.5793  BCE 0.3181
Test       | MSE 0.0974  R² 0.5470  BCE 0.3507
