In [48]:
import logging
import pandas as pd
from pathlib import Path
import optuna
from optuna.pruners import MedianPruner
from optuna.samplers import TPESampler
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import accuracy_score, precision_recall_fscore_support, matthews_corrcoef
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
import xgboost as xgb
import os
import torch

# Set up logging
logging.basicConfig(format='%(asctime)s - %(message)s',
                    datefmt='%Y-%m-%d %H:%M:%S',
                    level=logging.INFO)

In [49]:
# Path configuration
DATA_DIR = Path('./data')
TRAIN_PATH = DATA_DIR / 'train.csv'
DEV_PATH = DATA_DIR / 'dev.csv'
MODEL_SAVE_PATH = Path('./models')
MODEL_SAVE_PATH.mkdir(parents=True, exist_ok=True)

# Optuna config
N_TRIALS = 100  # Number of Optuna trials

In [50]:
def get_device() -> torch.device:
    """Determine the device to use for computations."""
    if torch.cuda.is_available():
        return torch.device('cuda')
    elif hasattr(torch.backends, 'mps') and torch.backends.mps.is_available():
        return torch.device('mps')
    else:
        return torch.device('cpu')

In [51]:
def prepare_data(train_df, dev_df):
    """Prepare data for XGBoost training."""
    # Combine claim and evidence into a single text feature for TF-IDF
    train_df['text'] = train_df['Claim'] + " [SEP] " + train_df['Evidence']
    dev_df['text'] = dev_df['Claim'] + " [SEP] " + dev_df['Evidence']
    
    # Extract labels
    train_labels = train_df['label'].values
    dev_labels = dev_df['label'].values
    
    return train_df, dev_df, train_labels, dev_labels

In [52]:
def calculate_all_metrics(y_true, y_pred):
    """
    Calculate comprehensive evaluation metrics.
    
    Returns:
        dict: Dictionary containing all metrics
    """
    # Basic accuracy
    accuracy = accuracy_score(y_true, y_pred)
    
    # Calculate precision, recall, f1 (macro)
    macro_precision, macro_recall, macro_f1, _ = precision_recall_fscore_support(
        y_true, y_pred, average='macro'
    )
    
    # Calculate precision, recall, f1 (weighted)
    weighted_precision, weighted_recall, weighted_f1, _ = precision_recall_fscore_support(
        y_true, y_pred, average='weighted'
    )
    
    # Matthews Correlation Coefficient
    mcc = matthews_corrcoef(y_true, y_pred)
    
    metrics = {
        'Accuracy': accuracy,
        'Macro-P': macro_precision,
        'Macro-R': macro_recall,
        'Macro-F1': macro_f1,
        'W Macro-P': weighted_precision,
        'W Macro-R': weighted_recall,
        'W Macro-F1': weighted_f1,
        'MCC': mcc
    }
    
    return metrics

In [53]:
def objective(trial):
    """Optuna objective function for hyperparameter optimization."""
    # Load data
    logging.info("Loading datasets...")
    train_df = pd.read_csv(TRAIN_PATH)
    dev_df = pd.read_csv(DEV_PATH)
    
    logging.info(f"Training data shape: {train_df.shape}")
    logging.info(f"Development data shape: {dev_df.shape}")
    
    # Prepare data
    train_df, dev_df, train_labels, dev_labels = prepare_data(train_df, dev_df)
    
    # TF-IDF vectorizer parameters
    max_features = trial.suggest_categorical("max_features", [5000, 10000, 15000, 20000])
    min_df = trial.suggest_categorical("min_df", [1, 2, 3])
    ngram_range = trial.suggest_categorical("ngram_range", [(1, 1), (1, 2), (1, 3)])
    
    # XGBoost hyperparameters
    xgb_params = {
        'max_depth': trial.suggest_int('max_depth', 3, 10),
        'learning_rate': trial.suggest_float('learning_rate', 1e-4, 0.3, log=True),
        'n_estimators': trial.suggest_int('n_estimators', 100, 1000),
        'min_child_weight': trial.suggest_int('min_child_weight', 1, 7),
        'gamma': trial.suggest_float('gamma', 1e-8, 1.0, log=True),
        'subsample': trial.suggest_float('subsample', 0.5, 1.0),
        'colsample_bytree': trial.suggest_float('colsample_bytree', 0.5, 1.0),
        'reg_alpha': trial.suggest_float('reg_alpha', 1e-8, 1.0, log=True),
        'reg_lambda': trial.suggest_float('reg_lambda', 1e-8, 1.0, log=True),
        'objective': 'binary:logistic',
        'eval_metric': 'logloss',
        'use_label_encoder': False,
        'tree_method': 'hist',
        'random_state': 42
    }
    
    # Create pipeline
    pipeline = Pipeline([
        ('tfidf', TfidfVectorizer(
            max_features=max_features,
            min_df=min_df,
            ngram_range=ngram_range,
            stop_words='english'
        )),
        ('scaler', StandardScaler(with_mean=False)),  # TF-IDF matrices are sparse
        ('xgb', xgb.XGBClassifier(
            **xgb_params,
            enable_categorical=True  # Enable categorical feature support
        ))
    ])
    
    # Train model
    logging.info(f"Training XGBoost with hyperparameters: {xgb_params}")
    
    X_train = pipeline.named_steps['tfidf'].fit_transform(train_df['text'])
    X_train = pipeline.named_steps['scaler'].fit_transform(X_train)
    
    X_dev = pipeline.named_steps['tfidf'].transform(dev_df['text'])
    X_dev = pipeline.named_steps['scaler'].transform(X_dev)
    
    # Create DMatrix objects
    dtrain = xgb.DMatrix(X_train, label=train_labels)
    ddev = xgb.DMatrix(X_dev, label=dev_labels)
    
    # Train model
    model = xgb.train(
        xgb_params,
        dtrain,
        num_boost_round=xgb_params['n_estimators'],
        evals=[(ddev, 'eval')],
        early_stopping_rounds=50,
        verbose_eval=False
    )
    
    # Make predictions
    dev_preds = (model.predict(ddev) >= 0.5).astype(int)    
    metrics = calculate_all_metrics(dev_labels, dev_preds)
    
    # Report intermediate values for pruning
    trial.report(metrics['W Macro-F1'], step=model.best_iteration)
    if trial.should_prune():
        raise optuna.exceptions.TrialPruned()
    
    return metrics['W Macro-F1']

In [54]:
if __name__ == "__main__":
    print("\nHYPERPARAMETER TUNING")
    print("=====================")
    print(f"Running {N_TRIALS} trials...")
    
    # Check if GPU is available
    device = get_device()
    if torch.cuda.is_available():
        # Enable GPU acceleration for XGBoost
        os.environ['CUDA_VISIBLE_DEVICES'] = '0'
    logging.info(f"Using device: {device}")
    
    # Create a study with TPE sampler and MedianPruner
    sampler = TPESampler(seed=42)
    pruner = MedianPruner(n_startup_trials=5, n_warmup_steps=5, interval_steps=2)
    
    study = optuna.create_study(
        direction='maximize',
        sampler=sampler,
        pruner=pruner,
        study_name='xgboost_evidence_detection'
    )
    
    try:
        study.optimize(objective, n_trials=N_TRIALS, n_jobs=-1)
    except KeyboardInterrupt:
        print("Hyperparameter tuning interrupted.")
    
    print("\nBest trial:")
    trial = study.best_trial
    print(f"  Value (Accuracy): {trial.value}")
    print("  Params:")
    for key, value in trial.params.items():
        print(f"    {key}: {value}")

2025-03-26 19:00:23 - Using device: cpu
[I 2025-03-26 19:00:23,209] A new study created in memory with name: xgboost_evidence_detection
2025-03-26 19:00:23 - Loading datasets...
2025-03-26 19:00:23 - Loading datasets...
2025-03-26 19:00:23 - Loading datasets...
2025-03-26 19:00:23 - Loading datasets...
2025-03-26 19:00:23 - Loading datasets...
2025-03-26 19:00:23 - Loading datasets...
2025-03-26 19:00:23 - Loading datasets...
2025-03-26 19:00:23 - Loading datasets...
2025-03-26 19:00:23 - Loading datasets...
2025-03-26 19:00:23 - Loading datasets...
2025-03-26 19:00:23 - Loading datasets...



HYPERPARAMETER TUNING
Running 100 trials...


2025-03-26 19:00:23 - Loading datasets...
2025-03-26 19:00:23 - Loading datasets...
2025-03-26 19:00:23 - Loading datasets...
2025-03-26 19:00:23 - Loading datasets...
2025-03-26 19:00:23 - Loading datasets...
2025-03-26 19:00:23 - Training data shape: (21508, 3)
2025-03-26 19:00:23 - Training data shape: (21508, 3)
2025-03-26 19:00:23 - Training data shape: (21508, 3)
2025-03-26 19:00:23 - Development data shape: (5926, 3)
2025-03-26 19:00:23 - Training data shape: (21508, 3)
2025-03-26 19:00:23 - Training data shape: (21508, 3)
2025-03-26 19:00:23 - Training data shape: (21508, 3)
2025-03-26 19:00:23 - Training data shape: (21508, 3)
2025-03-26 19:00:23 - Development data shape: (5926, 3)
2025-03-26 19:00:23 - Training data shape: (21508, 3)
2025-03-26 19:00:24 - Development data shape: (5926, 3)
2025-03-26 19:00:24 - Training data shape: (21508, 3)
2025-03-26 19:00:24 - Training data shape: (21508, 3)
2025-03-26 19:00:24 - Training XGBoost with hyperparameters: {'max_depth': 4, 'lea


Best trial:
  Value (Accuracy): 0.7862936515178504
  Params:
    max_features: 15000
    min_df: 1
    ngram_range: (1, 2)
    max_depth: 6
    learning_rate: 0.10437109564495202
    n_estimators: 949
    min_child_weight: 7
    gamma: 9.957755835836044e-08
    subsample: 0.8633107968594791
    colsample_bytree: 0.9196679493967628
    reg_alpha: 0.0007388500492709971
    reg_lambda: 4.991341546894774e-05
