In [None]:
import logging
import pandas as pd
import numpy as np
from pathlib import Path

# Hyperparameter tuning
import optuna
from optuna.pruners import MedianPruner
from optuna.samplers import TPESampler

# Embeddings
import gensim.downloader as api
from sklearn.base import BaseEstimator, TransformerMixin


from sklearn.metrics import accuracy_score, precision_recall_fscore_support, matthews_corrcoef
from sklearn.preprocessing import StandardScaler
import xgboost as xgb

import warnings
warnings.filterwarnings('ignore')

# Set up logging
logging.basicConfig(format='%(asctime)s - %(message)s',
                    datefmt='%Y-%m-%d %H:%M:%S',
                    level=logging.INFO)

  from .autonotebook import tqdm as notebook_tqdm


In [3]:
# Path configuration
DATA_DIR = Path('../../data')
TRAIN_PATH = DATA_DIR / 'train.csv'
AUGMENTED_DATA_PATH = DATA_DIR / 'train_augmented.csv'
DEV_PATH = DATA_DIR / 'dev.csv'
MODEL_SAVE_PATH = Path('./models')
MODEL_SAVE_PATH.mkdir(parents=True, exist_ok=True)

# Optuna config
N_TRIALS = 50  # Number of Optuna trials

In [4]:
glove_embeddings = api.load('glove-wiki-gigaword-300')

2025-03-27 13:10:46 - loading projection weights from C:\Users\willi/gensim-data\glove-wiki-gigaword-300\glove-wiki-gigaword-300.gz
2025-03-27 13:13:25 - KeyedVectors lifecycle event {'msg': 'loaded (400000, 300) matrix of type float32 from C:\\Users\\willi/gensim-data\\glove-wiki-gigaword-300\\glove-wiki-gigaword-300.gz', 'binary': False, 'encoding': 'utf8', 'datetime': '2025-03-27T13:13:25.780208', 'gensim': '4.3.3', 'python': '3.11.2 (tags/v3.11.2:878ead1, Feb  7 2023, 16:38:35) [MSC v.1934 64 bit (AMD64)]', 'platform': 'Windows-10-10.0.26100-SP0', 'event': 'load_word2vec_format'}


In [5]:
# class GloveVectorizer(BaseEstimator, TransformerMixin):
#     def __init__(self, sep_token: str = '[SEP]'):
#         self.glove = glove_embeddings
#         self.vector_size = 300
#         self.sep_token = sep_token
        
#     @staticmethod
#     def _pre_process(doc: str) -> str:
#         # Remove any unrepresentable characters
#         doc = doc.encode('ascii', 'ignore').decode('ascii')
#         # Remove any double quotes at the beginning and end of the document
#         doc = doc.strip('"')
        
#         return doc
    
#     def fit(self, X, y=None):
#         return self
    
#     def transform(self, X):
#         doc_vectors = []
        
#         for doc in X:
#             # Split on [SEP] token to separate claim and evidence
#             try:
#                 claim, evidence = doc.split(self.sep_token)
#             except ValueError as ve:
#                 raise ValueError(f"Document splitting error: Expected 2 parts separated by '{self.sep_token}', but got an error: {ve}")
#             except Exception as e:
#                 raise Exception(f"An unexpected error occurred while splitting the document: {e}")
            
#             # Pre-process the claim and evidence
#             claim = self._pre_process(claim)
#             evidence = self._pre_process(evidence)
            
#             # Get vectors for claim
#             claim_vectors = [self.glove[word] for word in claim.split() 
#                            if word in self.glove]
            
#             # Get vectors for evidence
#             evidence_vectors = [self.glove[word] for word in evidence.split() 
#                               if word in self.glove]
            
#             # Combine vectors (if either part has no vectors, use empty list)
#             combined_vectors = []
#             if claim_vectors:
#                 combined_vectors.extend(claim_vectors)
#             if evidence_vectors:
#                 combined_vectors.extend(evidence_vectors)
                
#             # If no vectors found, use zero vector
#             if not combined_vectors:
#                 combined_vectors = [np.zeros(self.vector_size)]
            
#             doc_vectors.append(combined_vectors)
            
#         # Pad sequences to same length (use longest document as reference)
#         max_length = max(len(vectors) for vectors in doc_vectors)
#         padded_vectors = []
        
#         for vectors in doc_vectors:
#             # Pad with zero vectors if needed
#             padding = [np.zeros(self.vector_size)] * (max_length - len(vectors))
#             padded_vectors.append(vectors + padding)
            
#         return np.array(padded_vectors)

In [6]:
class GloveVectorizer(BaseEstimator, TransformerMixin):
    def __init__(self, sep_token: str = '[SEP]'):
        self.glove = glove_embeddings
        self.vector_size = 300
        self.sep_token = sep_token
        
    @staticmethod
    def _pre_process(doc: str) -> str:
        # Remove any unrepresentable characters
        doc = doc.encode('ascii', 'ignore').decode('ascii')
        # Remove any double quotes at the beginning and end of the document
        doc = doc.strip('"')
        return doc
    
    def _get_mean_vector(self, text: str) -> np.ndarray:
        # Get vectors for all words in text and return their mean
        vectors = [self.glove[word] for word in text.split() 
                  if word in self.glove]
        if vectors:
            return np.mean(vectors, axis=0)
        return np.zeros(self.vector_size)
        
    def fit(self, X, y=None):
        return self
    
    def transform(self, X):
        doc_vectors = []
        
        for doc in X:
            # Split on [SEP] token to separate claim and evidence
            try:
                claim, evidence = doc.split(self.sep_token)
            except ValueError as ve:
                raise ValueError(f"Document splitting error: Expected 2 parts separated by '{self.sep_token}', but got an error: {ve}")
            
            # Pre-process the claim and evidence
            claim = self._pre_process(claim)
            evidence = self._pre_process(evidence)
            
            # Get mean vectors for claim and evidence
            claim_vector = self._get_mean_vector(claim)
            evidence_vector = self._get_mean_vector(evidence)
            
            # Concatenate claim and evidence vectors
            doc_vectors.append(np.concatenate([claim_vector, evidence_vector]))
            
        return np.array(doc_vectors)

In [7]:
def prepare_data(train_df, aug_train_df, dev_df):
    """Prepare data for XGBoost training."""
    # Combine claim and evidence into a single text feature for TF-IDF
    train_df['text'] = train_df['Claim'] + " [SEP] " + train_df['Evidence']
    aug_train_df['text'] = aug_train_df['Claim'] + " [SEP] " + aug_train_df['Evidence']
    dev_df['text'] = dev_df['Claim'] + " [SEP] " + dev_df['Evidence']
    
    # Extract labels
    train_labels = train_df['label'].values
    aug_train_labels = aug_train_df['label'].values
    dev_labels = dev_df['label'].values
    
    # Combine the augmented training data with the original training data
    train_df = pd.concat([train_df, aug_train_df])
    train_labels = np.concatenate([train_labels, aug_train_labels])
    
    return train_df, dev_df, train_labels, dev_labels

In [8]:
def calculate_all_metrics(y_true, y_pred):
    """
    Calculate comprehensive evaluation metrics.
    
    Returns:
        dict: Dictionary containing all metrics
    """
    # Basic accuracy
    accuracy = accuracy_score(y_true, y_pred)
    
    # Calculate precision, recall, f1 (macro)
    macro_precision, macro_recall, macro_f1, _ = precision_recall_fscore_support(
        y_true, y_pred, average='macro'
    )
    
    # Calculate precision, recall, f1 (weighted)
    weighted_precision, weighted_recall, weighted_f1, _ = precision_recall_fscore_support(
        y_true, y_pred, average='weighted'
    )
    
    # Matthews Correlation Coefficient
    mcc = matthews_corrcoef(y_true, y_pred)
    
    metrics = {
        'Accuracy': accuracy,
        'Macro-P': macro_precision,
        'Macro-R': macro_recall,
        'Macro-F1': macro_f1,
        'W Macro-P': weighted_precision,
        'W Macro-R': weighted_recall,
        'W Macro-F1': weighted_f1,
        'MCC': mcc
    }
    
    return metrics

In [9]:
train_df = pd.read_csv(TRAIN_PATH)
aug_train_df = pd.read_csv(AUGMENTED_DATA_PATH)
dev_df = pd.read_csv(DEV_PATH)
   
train_df, dev_df, train_labels, dev_labels = prepare_data(train_df, aug_train_df, dev_df)

In [10]:
def objective(trial):    
    # XGBoost hyperparameters
    xgb_params = {
        'max_depth': trial.suggest_int('max_depth', 3, 8),
        'learning_rate': trial.suggest_float('learning_rate', 1e-3, 0.3, log=True),
        'n_estimators': trial.suggest_int('n_estimators', 100, 500),
        'min_child_weight': trial.suggest_int('min_child_weight', 1, 7),
        'gamma': trial.suggest_float('gamma', 1e-8, 1.0, log=True),
        'subsample': trial.suggest_float('subsample', 0.5, 1.0),
        'colsample_bytree': trial.suggest_float('colsample_bytree', 0.5, 1.0),
        'reg_alpha': trial.suggest_float('reg_alpha', 1e-8, 1.0, log=True),
        'reg_lambda': trial.suggest_float('reg_lambda', 1e-8, 1.0, log=True),
        'objective': 'binary:logistic',
        'eval_metric': 'logloss',
        'use_label_encoder': False,
        'tree_method': 'hist',
        'max_bin': 256,
        'random_state': 42
    }
    
    logging.info(f"Training XGBoost with hyperparameters: {xgb_params}")
    
    CHUNK_SIZE = 1000
    
    # Get embeddings for training data in chunks
    X_train_chunks = []
    for i in range(0, len(train_df), CHUNK_SIZE):
        
        chunk = train_df['text'].iloc[i:i + CHUNK_SIZE]
        X_chunk = GloveVectorizer().fit_transform(chunk)
        X_chunk = StandardScaler(with_mean=False).fit_transform(X_chunk)
        
        X_train_chunks.append(X_chunk)
    
    X_train = np.vstack(X_train_chunks)
    del X_train_chunks
    
    # Process dev data
    X_dev_chunks = []
    for i in range(0, len(dev_df), CHUNK_SIZE):
        chunk = dev_df['text'].iloc[i:i + CHUNK_SIZE]
        
        X_chunk = GloveVectorizer().transform(chunk)
        X_chunk = StandardScaler(with_mean=False).fit_transform(X_chunk)
        
        X_dev_chunks.append(X_chunk)
        
    X_dev = np.vstack(X_dev_chunks)
    del X_dev_chunks
    
    dtrain = xgb.DMatrix(X_train, label=train_labels, 
                        enable_categorical=True,
                        nthread=-1)
    ddev = xgb.DMatrix(X_dev, label=dev_labels,
                       enable_categorical=True,
                       nthread=-1)
    del X_train, X_dev
    
    # Train model
    model = xgb.train(
        xgb_params,
        dtrain,
        num_boost_round=xgb_params['n_estimators'],
        evals=[(ddev, 'eval')],
        early_stopping_rounds=25,
        verbose_eval=False
    )
    
    # Make predictions
    dev_preds = (model.predict(ddev) >= 0.5).astype(int)    
    metrics = calculate_all_metrics(dev_labels, dev_preds)
    
    # Report intermediate values for pruning
    trial.report(metrics['W Macro-F1'], step=model.best_iteration)
    if trial.should_prune():
        raise optuna.exceptions.TrialPruned()
    
    return metrics['W Macro-F1']

In [None]:
if __name__ == "__main__":
    print("\nHYPERPARAMETER TUNING")
    print("=====================")
    print(f"Running {N_TRIALS} trials...")
    
    # Create a study with TPE sampler and MedianPruner
    sampler = TPESampler(seed=42, 
                         n_startup_trials=int(N_TRIALS / 10), # First 10% of trials are random, then TPE
                         multivariate=True, 
                         constant_liar=True) # constant_liar = True as we are doing distributed optimisation

    pruner = MedianPruner(n_startup_trials=5, 
                          n_warmup_steps=5, 
                          interval_steps=2)
    
    study = optuna.create_study(
        direction='maximize',
        sampler=sampler,
        pruner=pruner,
        study_name='xgboost_evidence_detection'
    )
    
    try:
        study.optimize(objective, n_trials=N_TRIALS, n_jobs=-1)
    except KeyboardInterrupt:
        print("Hyperparameter tuning interrupted.")
    
    print("\nBest trial:")
    trial = study.best_trial
    print(f"  Value (Accuracy): {trial.value}")
    print("  Params:")
    for key, value in trial.params.items():
        print(f"    {key}: {value}")

[I 2025-03-27 13:13:26,182] A new study created in memory with name: xgboost_evidence_detection



HYPERPARAMETER TUNING
Running 50 trials...


[I 2025-03-27 13:19:14,403] Trial 7 finished with value: 0.7935391242811073 and parameters: {'max_depth': 3, 'learning_rate': 0.10229155080951147, 'n_estimators': 125, 'min_child_weight': 4, 'gamma': 1.8876263249109634e-08, 'subsample': 0.8245508706032039, 'colsample_bytree': 0.8709318045473746, 'reg_alpha': 0.022362500447227703, 'reg_lambda': 4.574416947170543e-08}. Best is trial 7 with value: 0.7935391242811073.
[I 2025-03-27 13:19:18,176] Trial 15 finished with value: 0.7981341165677212 and parameters: {'max_depth': 6, 'learning_rate': 0.263546115378612, 'n_estimators': 142, 'min_child_weight': 3, 'gamma': 2.000319653606339e-08, 'subsample': 0.8895640669209626, 'colsample_bytree': 0.7062913286508585, 'reg_alpha': 0.00025935398203485, 'reg_lambda': 0.005711922093594107}. Best is trial 15 with value: 0.7981341165677212.
[I 2025-03-27 13:20:48,994] Trial 2 finished with value: 0.8039039428668125 and parameters: {'max_depth': 4, 'learning_rate': 0.21725135682260818, 'n_estimators': 382,

In [None]:
best_f1 = 0.796359627026355
best_params = {
    "max_features": 5000,
    "min_df": 3,
    "ngram_range": (1, 3),
    "max_depth": 8,
    "learning_rate": 0.22260998418525665,
    "n_estimators": 666,
    "min_child_weight": 2,
    "gamma": 0.005341979637780388,
    "subsample": 0.9900049863839708,
    "colsample_bytree": 0.5918673659371315,
    "reg_alpha": 6.834558057096682e-08,
    "reg_lambda": 0.008776003604199037,
}