In [1]:
!pip install transformers datasets huggingface_hub optuna tensorboard peft



In [3]:
import logging
import os
import pandas as pd
import torch
import numpy as np
import matplotlib.pyplot as plt
import optuna
from optuna.samplers import TPESampler
import json
from tqdm import tqdm
from pathlib import Path
from transformers import (
    AutoTokenizer,
    AutoModelForSequenceClassification,
    AutoConfig,
    TrainingArguments,
    Trainer,
    EarlyStoppingCallback,
    DataCollatorWithPadding
)
from peft import get_peft_model, LoraConfig, TaskType, PeftModel
from sklearn.metrics import (
    accuracy_score,
    precision_recall_fscore_support,
    matthews_corrcoef,
    confusion_matrix,
)
from datasets import Dataset as HFDataset
from torch.optim import AdamW
from torch.utils.data import DataLoader
from torch import nn
from tqdm import tqdm

# Logging: timestamped messages at INFO level and above
logging.basicConfig(
    level=logging.INFO,
    datefmt='%Y-%m-%d %H:%M:%S',
    format='%(asctime)s - %(message)s',
)

# Turn off the Weights & Biases integration
os.environ.update(WANDB_DISABLED='true')

# Input CSV locations
TRAIN_FILE = "/kaggle/input/ed-uom/train.csv"
DEV_FILE = "/kaggle/input/ed-uom/dev.csv"
AUG_TRAIN_FILE = "/kaggle/input/ed-uom/train_augmented.csv"
NEW_AUG = "/kaggle/input/ed-uom/train_augmented_new.csv"

# Working directory and everything derived from it
DATA_DIR = Path("/kaggle/working/")
AUG_TRAIN_HIGH_REPLACEMENT_FILE = DATA_DIR.joinpath("train_augmented_high_replacement_fraction.csv")
SAVE_DIR = DATA_DIR.joinpath("results", "transformer")

# Make sure the results directory exists before anything tries to write to it
SAVE_DIR.mkdir(parents=True, exist_ok=True)

In [7]:
# --- Model / tokenization -------------------------------------------------
BASE_MODEL = 'microsoft/deberta-v3-large'
MAX_SEQ_LENGTH = 512

# --- Optimisation defaults -------------------------------------------------
BATCH_SIZE = 8
NUM_EPOCHS = 3
LEARNING_RATE = 5e-5
WEIGHT_DECAY = 0.03
WARMUP_RATIO = 0.11

# --- Regularisation --------------------------------------------------------
# Encoder dropout and classifier-head dropout share the same value.
DROPOUT_RATE = FF_DROPOUT_RATE = 0.05

# --- Optuna search ---------------------------------------------------------
N_TRIALS = 10

# [low, high] bounds sampled by the Optuna objective
WEIGHT_DECAYS = [1e-3, 1e-1]
WARMUP_RATIOS = [0.05, 0.15]

# Candidate lists; not referenced by the objective in this notebook
DROPOUT_RATES = [0.05]
FF_DROPOUT_RATES = [0.05]

In [8]:
def get_device() -> torch.device:
    """Pick the compute device: CUDA if available, else Apple MPS, else CPU.

    Returns:
        A ``torch.device`` for ``'cuda'``, ``'mps'`` or ``'cpu'``, checked in
        that order of preference.
    """
    # Older torch builds have no ``torch.backends.mps`` attribute at all.
    mps_backend = getattr(torch.backends, 'mps', None)

    if torch.cuda.is_available():
        device_name = 'cuda'
    elif mps_backend is not None and mps_backend.is_available():
        device_name = 'mps'
    else:
        device_name = 'cpu'
    return torch.device(device_name)

def preprocess_function(examples, tokenizer, max_seq_length):
    """Tokenize a batch of claim/evidence pairs for sequence classification.

    Args:
        examples: Batch mapping with ``'Claim'``, ``'Evidence'`` and ``'label'``
            entries (one item per example).
        tokenizer: Callable invoked as ``tokenizer(claims, evidences, ...)``.
        max_seq_length: Value passed to the tokenizer as ``max_length``.

    Returns:
        The tokenizer output with an extra ``"labels"`` entry holding
        ``examples['label']``.
    """
    # Pair the two columns up once; zip stops at the shorter of the two.
    pairs = list(zip(examples['Claim'], examples['Evidence']))

    # Prefix each segment so the model sees which text plays which role.
    claim_texts = [f"Claim: {claim_text}" for claim_text, _ in pairs]
    evidence_texts = [f"Evidence: {evidence_text}" for _, evidence_text in pairs]

    # No padding here: batches are padded later, at collation time.
    encoded = tokenizer(
        claim_texts,
        evidence_texts,
        max_length=max_seq_length,
        padding=False,
        truncation=True,
    )

    # Attach the classification targets under the key the model expects.
    encoded["labels"] = examples['label']
    return encoded

def convert_to_hf_dataset(dataframe):
    """Build a HuggingFace ``Dataset`` from a pandas dataframe.

    Args:
        dataframe: The pandas dataframe to convert.

    Returns:
        The result of ``HFDataset.from_pandas(dataframe)``.
    """
    hf_dataset = HFDataset.from_pandas(dataframe)
    return hf_dataset

def load_data(tokenizer, max_seq_length):
    """Load the train/dev CSVs and return them as tokenized HuggingFace datasets.

    The training set is read from ``AUG_TRAIN_FILE`` and the development set
    from ``DEV_FILE``. Shapes and label balance of both are printed, both are
    tokenized with ``preprocess_function`` and switched to torch format.

    Args:
        tokenizer: Tokenizer forwarded to ``preprocess_function``.
        max_seq_length: Truncation length forwarded to ``preprocess_function``.

    Returns:
        A ``(train_dataset, dev_dataset, dev_df)`` tuple. ``dev_df`` is the dev
        dataframe including an ``original_index`` column, and ``dev_dataset``
        keeps that column next to the tokenizer outputs so predictions can be
        re-aligned with the CSV rows.
    """
    logging.info("Loading datasets...")

    # Load CSV files into pandas dataframes
    train_df = pd.read_csv(AUG_TRAIN_FILE)
    dev_df = pd.read_csv(DEV_FILE)

    print(f"Training data shape: {train_df.shape}")
    print(f"Development data shape: {dev_df.shape}")

    # Report the class balance of each split
    for split_name, split_df in (("Training", train_df), ("Dev", dev_df)):
        positive = (split_df['label'] == 1).sum()
        negative = (split_df['label'] == 0).sum()
        print(f"{split_name} data distribution: Positive: {positive} ({positive/len(split_df)*100:.1f}%), "
              f"Negative: {negative} ({negative/len(split_df)*100:.1f}%)")

    # Add a sequential index to keep track of original order (if not already present)
    if 'original_index' not in dev_df.columns:
        dev_df['original_index'] = list(range(len(dev_df)))

    # Convert to HuggingFace datasets
    train_dataset = convert_to_hf_dataset(train_df)
    dev_dataset = convert_to_hf_dataset(dev_df)

    def tokenize_batch(examples):
        return preprocess_function(examples, tokenizer, max_seq_length)

    # Training split: drop every input column, keeping only the tokenizer
    # outputs and "labels". Deriving the list from the dataset (instead of
    # hard-coding three names) also covers any extra column in the CSV.
    train_dataset = train_dataset.map(
        tokenize_batch,
        batched=True,
        batch_size=1000,
        remove_columns=train_dataset.column_names
    )

    # Dev split: same, but keep 'original_index' for re-aligning predictions.
    dev_columns_to_remove = [col for col in dev_dataset.column_names if col != 'original_index']
    dev_dataset = dev_dataset.map(
        tokenize_batch,
        batched=True,
        batch_size=1000,
        remove_columns=dev_columns_to_remove
    )

    # Set format for pytorch
    train_dataset.set_format(type='torch')
    dev_dataset.set_format(type='torch')

    return train_dataset, dev_dataset, dev_df

def compute_metrics(eval_pred):
    """Compute binary-classification metrics from logits and gold labels.

    Args:
        eval_pred: A ``(predictions, labels)`` pair. ``predictions`` holds the
            per-class scores with classes on axis 1 (or a tuple whose first
            element does); ``labels`` holds the gold class ids (0 or 1).

    Returns:
        A dict with accuracy, precision/recall/F1 of the positive class (1),
        support-weighted precision/recall/F1 and the Matthews correlation
        coefficient. The ``'W Macro-*'`` keys hold the *weighted* averages;
        the key names are kept because they are used as metric names
        elsewhere.
    """
    predictions, labels = eval_pred

    # If the model returned several outputs, the class scores come first.
    if isinstance(predictions, tuple):
        predictions = predictions[0]

    # For binary classification, get the predicted class (0 or 1)
    predictions = predictions.argmax(axis=1)

    accuracy = accuracy_score(labels, predictions)

    # Per-class metrics. Passing labels=[0, 1] fixes the output to two entries
    # in that order, so index 1 is always the positive class. Without it, a
    # batch containing only one class yields a single entry and the positive
    # metrics would be reported as 0 even when every example is a correct
    # positive.
    precision, recall, f1, _ = precision_recall_fscore_support(
        labels, predictions, labels=[0, 1], average=None, zero_division=0
    )

    # Support-weighted averages over the classes
    weighted_precision, weighted_recall, weighted_f1, _ = precision_recall_fscore_support(
        labels, predictions, average='weighted', zero_division=0
    )

    # Matthews Correlation Coefficient
    mcc = matthews_corrcoef(labels, predictions)

    metrics = {
        'Accuracy': accuracy,
        'Positive_Precision': float(precision[1]),
        'Positive_Recall': float(recall[1]),
        'Positive_F1': float(f1[1]),
        'W Macro-P': weighted_precision,
        'W Macro-R': weighted_recall,
        'W Macro-F1': weighted_f1,
        'MCC': mcc
    }

    return metrics

def plot_confusion_matrix(y_true, y_pred, save_path):
    """Plot a 2x2 confusion matrix (counts plus row-normalised rates) and save it.

    Args:
        y_true: Gold class ids (0 = Negative, 1 = Positive).
        y_pred: Predicted class ids, same length as ``y_true``.
        save_path: Destination passed to ``savefig``.
    """
    classes = ['Negative', 'Positive']

    # Fix the label set so the matrix is always 2x2 and matches the tick
    # labels, even when one class never occurs in y_true/y_pred.
    cm = confusion_matrix(y_true, y_pred, labels=list(range(len(classes))))

    # Row-normalise; rows with no samples stay at 0 instead of dividing by zero.
    row_totals = cm.sum(axis=1, keepdims=True)
    cm_norm = np.divide(
        cm,
        row_totals,
        out=np.zeros(cm.shape, dtype=float),
        where=row_totals != 0,
    )

    fig, ax = plt.subplots(figsize=(8, 6))
    try:
        image = ax.imshow(cm, interpolation='nearest', cmap=plt.cm.Blues)
        ax.set_title('Confusion Matrix')
        fig.colorbar(image, ax=ax)

        tick_marks = np.arange(len(classes))
        ax.set_xticks(tick_marks)
        ax.set_xticklabels(classes, rotation=45)
        ax.set_yticks(tick_marks)
        ax.set_yticklabels(classes)

        # Annotate each cell; switch to white text on dark cells.
        thresh = cm.max() / 2.
        for i, j in np.ndindex(cm.shape):
            ax.text(j, i, f'{cm[i, j]}\n({cm_norm[i, j]:.2f})',
                    horizontalalignment="center",
                    color="white" if cm[i, j] > thresh else "black")

        ax.set_ylabel('True label')
        ax.set_xlabel('Predicted label')

        # Lay out after the axis labels exist so they are not clipped.
        fig.tight_layout()
        fig.savefig(save_path)
    finally:
        # Always release the figure, even if saving fails.
        plt.close(fig)

def train_model(
    model,
    train_dataset,
    eval_dataset,
    output_dir,
    tokenizer,
    **kwargs
):
    """Fine-tune ``model`` with the HF ``Trainer``, save it and export dev predictions.

    Steps: train with early stopping, merge PEFT adapters if the trained model
    is a ``PeftModel``, save model and tokenizer to ``output_dir``, evaluate
    and predict on ``eval_dataset``, write the predictions as CSV and save a
    confusion-matrix plot.

    Args:
        model: Model to train.
        train_dataset: Tokenized training dataset.
        eval_dataset: Tokenized evaluation dataset. If it has an
            ``original_index`` column, predictions are re-ordered by it.
        output_dir: Directory for checkpoints, the final model and the
            prediction/plot artefacts.
        tokenizer: Tokenizer used for padding and saved next to the model.
        **kwargs: Forwarded to ``TrainingArguments``. Two keys are handled
            here: ``early_stopping_patience`` (default 8, consumed by the
            early-stopping callback) and ``greater_is_better`` (default True).

    Returns:
        The metrics dict returned by ``trainer.evaluate(eval_dataset)``.
    """
    logging.info("Starting training...")

    # Free up CUDA memory before training
    torch.cuda.empty_cache()

    # ``kwargs`` is this call's own dict, so adjusting it does not touch the caller's.
    early_stopping_patience = kwargs.pop('early_stopping_patience', 8)
    kwargs.setdefault('greater_is_better', True)

    # Pad each batch to its longest sequence (dynamic padding)
    data_collator = DataCollatorWithPadding(
        tokenizer=tokenizer,
        padding='longest'
    )

    training_args = TrainingArguments(
        output_dir=str(output_dir),
        **kwargs
    )

    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=train_dataset,
        eval_dataset=eval_dataset,
        processing_class=tokenizer,
        data_collator=data_collator,
        compute_metrics=compute_metrics,
        callbacks=[EarlyStoppingCallback(early_stopping_patience=early_stopping_patience)]
    )

    trainer.train()

    # Get the final model from the trainer
    model = trainer.model

    # If using PEFT, merge adapters before saving
    if isinstance(model, PeftModel):
        logging.info("Merging PEFT adapters into the base model...")
        model = model.merge_and_unload()
        logging.info("Adapters merged.")

    # Save the potentially merged model and tokenizer
    logging.info(f"Saving final model to {output_dir}")
    model.save_pretrained(output_dir)
    tokenizer.save_pretrained(output_dir)
    logging.info(f"Model and tokenizer saved to {output_dir}")

    eval_results = trainer.evaluate(eval_dataset)
    dev_preds = trainer.predict(eval_dataset)
    y_true = dev_preds.label_ids
    y_pred = dev_preds.predictions.argmax(axis=1)

    # The original dev CSV is re-read so predictions can be saved next to the rows.
    # NOTE(review): this assumes eval_dataset was built from DEV_FILE; the
    # length check below is the only guard if it was not.
    dev_df = pd.read_csv(DEV_FILE)

    predictions_df = pd.DataFrame({'prediction': y_pred})

    # Re-order predictions by the original row index when the dataset carries one.
    # The column is read in one go as plain Python ints (not per-row 0-d
    # tensors from the torch-formatted dataset) so the sort is well defined.
    if 'original_index' in (getattr(eval_dataset, 'features', None) or {}):
        original_indices = [
            int(idx) for idx in eval_dataset.with_format(None)['original_index']
        ]
        if len(original_indices) == len(predictions_df):
            predictions_df = (
                predictions_df
                .assign(original_index=original_indices)
                .sort_values('original_index', kind='stable')
                .drop(columns='original_index')
                .reset_index(drop=True)
            )
        else:
            logging.warning(
                f"Couldn't use original indices: got {len(original_indices)} indices "
                f"for {len(predictions_df)} predictions"
            )

    # Ensure the predictions align with the original data
    if len(dev_df) == len(predictions_df):
        # Add predictions to the original dev dataframe
        dev_df['prediction'] = predictions_df['prediction'].values
        predictions_csv_path = os.path.join(output_dir, "predictions_with_data.csv")
        dev_df.to_csv(predictions_csv_path, index=False)
        print(f"Predictions with original data saved to {predictions_csv_path}")

        # Also save just the predictions for convenience
        predictions_only_path = os.path.join(output_dir, "predictions.csv")
        predictions_df.to_csv(predictions_only_path, index=False)
    else:
        print(f"Prediction count ({len(predictions_df)}) doesn't match dev data count ({len(dev_df)})")
        # Save just the predictions
        predictions_csv_path = os.path.join(output_dir, "predictions.csv")
        predictions_df.to_csv(predictions_csv_path, index=False)
        print(f"Predictions saved to {predictions_csv_path}")

    # Plot and save confusion matrix (uses the predictions in evaluation order)
    cm_save_path = os.path.join(output_dir, "confusion_matrix.png")
    plot_confusion_matrix(y_true, y_pred, cm_save_path)

    print(eval_results)

    return eval_results

In [9]:
def objective(trial):
    """Optuna objective: train one LoRA-adapted model and return its dev MCC.

    Samples ``weight_decay`` (log-uniform within ``WEIGHT_DECAYS``) and
    ``warmup_ratio`` (uniform within ``WARMUP_RATIOS``), builds the base model
    with a custom classifier head wrapped in a LoRA adapter, trains it with
    ``train_model`` and writes the sampled values plus the evaluation metrics
    to ``trial_<n>/hyperparameters.json``.

    Args:
        trial: The Optuna trial supplying the hyperparameter suggestions.

    Returns:
        ``eval_MCC`` of the trained model, or ``-1.0`` if training raised.
    """
    import gc

    # Get hyperparameters from trial
    weight_decay = trial.suggest_float("weight_decay", WEIGHT_DECAYS[0], WEIGHT_DECAYS[1], log=True)
    warmup_ratio = trial.suggest_float("warmup_ratio", WARMUP_RATIOS[0], WARMUP_RATIOS[1])

    device = get_device()
    logging.info(f"Trial {trial.number}: Using device: {device}")

    # Free GPU memory. Collect garbage first so objects left over from the
    # previous trial that are only kept alive by reference cycles are released
    # before the CUDA cache is emptied.
    gc.collect()
    torch.cuda.empty_cache()

    peft_config = LoraConfig(
        task_type=TaskType.SEQ_CLS,
        inference_mode=False,
        r=8,
        lora_alpha=16,
        lora_dropout=0.1,
        # Attention projections plus the modules named "dense"
        target_modules=["query_proj", "key_proj", "value_proj", "dense"],
        init_lora_weights='pissa',
        # Adapt only layers 6..23
        layers_to_transform=list(range(6, 24))
    )

    # Initialize tokenizer and model
    tokenizer = AutoTokenizer.from_pretrained(BASE_MODEL)
    model = AutoModelForSequenceClassification.from_pretrained(
        BASE_MODEL,
        num_labels=2,
        hidden_dropout_prob=DROPOUT_RATE,
        attention_probs_dropout_prob=DROPOUT_RATE,
    )

    # Replace the linear classifier with a small MLP head
    hidden_size = model.classifier.in_features
    model.classifier = nn.Sequential(
        nn.Linear(hidden_size, hidden_size),
        nn.GELU(),
        nn.LayerNorm(hidden_size),
        nn.Dropout(FF_DROPOUT_RATE),
        nn.Linear(hidden_size, 2)
    )
    model.config.num_labels = 2

    model = get_peft_model(model, peft_config)

    model.to(device)

    # Load data with current max_seq_length
    train_dataset, dev_dataset, _ = load_data(tokenizer, MAX_SEQ_LENGTH)

    # Training parameters
    training_params = {
        'per_device_train_batch_size': BATCH_SIZE,
        'per_device_eval_batch_size': BATCH_SIZE,
        'learning_rate': LEARNING_RATE,
        'weight_decay': weight_decay,
        'num_train_epochs': NUM_EPOCHS,
        'warmup_ratio': warmup_ratio,
        'lr_scheduler_type': 'cosine',
        # 'eval_strategy' replaces the deprecated 'evaluation_strategy' alias
        'eval_strategy': 'steps',
        'eval_steps': 1000,
        'save_strategy': 'steps',
        'save_steps': 1000,
        'save_total_limit': 1,
        'load_best_model_at_end': True,
        'metric_for_best_model': 'MCC',
        'fp16': torch.cuda.is_available(),
        'optim': 'adamw_torch',
        'logging_steps': 100,
        'logging_first_step': True,
        'group_by_length': True,
        'seed': 42,
        'label_smoothing_factor': 0.1,
    }

    # Set trial output directory
    trial_dir = SAVE_DIR / f"trial_{trial.number}"

    try:
        # Train with current hyperparameters
        eval_results = train_model(
            model,
            train_dataset,
            dev_dataset,
            trial_dir,
            tokenizer,
            **training_params
        )

        # Log the hyperparameters and results
        params = {
            "weight_decay": weight_decay,
            "warmup_ratio": warmup_ratio,
        }

        with open(trial_dir / "hyperparameters.json", "w") as f:
            json.dump({**params, **eval_results}, f, indent=2)

        # Return Matthews Correlation Coefficient as the objective value
        return eval_results["eval_MCC"]

    except Exception:
        # Keep the study running, but record the full traceback so the failure
        # can be diagnosed afterwards.
        logging.exception(f"Trial {trial.number} failed")
        # Return very bad score for failed trials
        return -1.0

def run_optuna_experiment():
    """Run the Optuna hyperparameter search and persist its results.

    Creates (or re-opens) a SQLite-backed study under
    ``SAVE_DIR / "optuna_study"``, runs ``N_TRIALS`` trials of ``objective``
    with a seeded TPE sampler, writes the best parameters to
    ``best_params.json`` and, where possible, saves diagnostic plots as HTML.

    Returns:
        Dict with the best ``weight_decay`` and ``warmup_ratio``.
    """
    logging.info("Starting hyperparameter optimization with Optuna...")

    # Create output directory for study
    study_dir = SAVE_DIR / "optuna_study"
    study_dir.mkdir(parents=True, exist_ok=True)

    # NOTE(review): a pruner only acts on intermediate values reported through
    # trial.report(); ``objective`` reports none, so this looks inert - confirm.
    pruner = optuna.pruners.MedianPruner()

    # Create a storage for the study
    storage_name = f"sqlite:///{study_dir}/optuna_study.db"

    # Create TPE sampler for Bayesian optimization
    sampler = TPESampler(seed=42)

    # load_if_exists=True re-opens a study already stored under this name, so
    # the trials run below are added to whatever it already contains.
    study = optuna.create_study(
        direction="maximize",
        pruner=pruner,
        storage=storage_name,
        study_name="deberta_claim_evidence",
        load_if_exists=True,
        sampler=sampler
    )

    # Run optimization
    study.optimize(objective, n_trials=N_TRIALS)

    # Get best trial
    best_trial = study.best_trial

    logging.info("Using Bayesian optimization with TPE sampler")
    logging.info(f"Best trial: {best_trial.number}")
    logging.info(f"Best value: {best_trial.value}")
    logging.info("Best hyperparameters:")

    for param, value in best_trial.params.items():
        logging.info(f"\t{param}: {value}")

    # Save best parameters
    best_params = {
        "weight_decay": best_trial.params["weight_decay"],
        "warmup_ratio": best_trial.params["warmup_ratio"],
    }

    with open(study_dir / "best_params.json", "w") as f:
        json.dump(best_params, f, indent=2)

    # Diagnostic plots are a convenience: a missing plotting backend or a study
    # with too few usable trials must not throw away the search result.
    for plot_name in (
        "optimization_history",
        "param_importances",
        "parallel_coordinate",
        "contour",
    ):
        try:
            plot_fn = getattr(optuna.visualization, f"plot_{plot_name}")
            fig = plot_fn(study)
            fig.write_html(str(study_dir / f"{plot_name}.html"))
        except (ImportError, ValueError, RuntimeError) as exc:
            logging.warning(f"Skipping {plot_name} plot: {exc}")

    return best_params


In [10]:
def main(use_lora: bool = False):
    """Train the classifier once with the notebook's fixed hyperparameters.

    Builds ``BASE_MODEL`` with a custom MLP classifier head, optionally wraps
    it in a LoRA adapter, loads the data and calls ``train_model``, saving to
    ``SAVE_DIR / <model name>``.

    Args:
        use_lora: When True, wrap the model with the same LoRA configuration
            the Optuna objective uses. The default (False) fine-tunes all
            weights, which is what this function did before the flag existed.

    NOTE(review): in the run recorded below this cell, dev MCC dropped from
    0.46 at step 1000 to 0.0 from step 4000 on (all-negative predictions);
    the learning rate / full fine-tuning setup may deserve a second look.
    """
    device = get_device()
    logging.info(f"Using device: {device}")

    # Free GPU memory
    torch.cuda.empty_cache()

    # Initialize tokenizer and model
    tokenizer = AutoTokenizer.from_pretrained(BASE_MODEL)
    model = AutoModelForSequenceClassification.from_pretrained(
        BASE_MODEL,
        num_labels=2,
        hidden_dropout_prob=DROPOUT_RATE,
        attention_probs_dropout_prob=DROPOUT_RATE,
    )

    # Replace the linear classifier with a small MLP head
    hidden_size = model.classifier.in_features
    model.classifier = nn.Sequential(
        nn.Linear(hidden_size, hidden_size),
        nn.GELU(),
        nn.LayerNorm(hidden_size),
        nn.Dropout(FF_DROPOUT_RATE),
        nn.Linear(hidden_size, 2)
    )
    model.config.num_labels = 2

    if use_lora:
        peft_config = LoraConfig(
            task_type=TaskType.SEQ_CLS,
            inference_mode=False,
            r=8,
            lora_alpha=16,
            lora_dropout=0.1,
            # Attention projections plus the modules named "dense"
            target_modules=["query_proj", "key_proj", "value_proj", "dense"],
            init_lora_weights='pissa',
            # Adapt only layers 6..23
            layers_to_transform=list(range(6, 24))
        )
        model = get_peft_model(model, peft_config)

    model.to(device)

    # Load data
    train_dataset, dev_dataset, _ = load_data(tokenizer, MAX_SEQ_LENGTH)

    # Training parameters with focus on preventing overfitting
    training_params = {
        'per_device_train_batch_size': BATCH_SIZE,
        'per_device_eval_batch_size': BATCH_SIZE,
        'learning_rate': LEARNING_RATE,
        'weight_decay': WEIGHT_DECAY,
        # Deliberately kept at 2 (the value this run used), not NUM_EPOCHS.
        'num_train_epochs': 2,
        'warmup_ratio': WARMUP_RATIO,
        'lr_scheduler_type': 'cosine',
        # 'eval_strategy' replaces the deprecated 'evaluation_strategy' alias
        'eval_strategy': 'steps',
        'eval_steps': 1000,
        'save_strategy': 'steps',
        'save_steps': 1000,
        'save_total_limit': 1,
        'load_best_model_at_end': True,
        'metric_for_best_model': 'MCC',
        'fp16': torch.cuda.is_available(),
        'optim': 'adamw_torch',
        'logging_steps': 100,
        'logging_first_step': True,
        'group_by_length': True,
        'seed': 42,
        'label_smoothing_factor': 0.1,
    }

    # Train with the fixed parameters above
    model_save_path = SAVE_DIR / BASE_MODEL.split('/')[-1]
    train_model(
        model,
        train_dataset,
        dev_dataset,
        model_save_path,
        tokenizer,
        **training_params
    )

if __name__ == "__main__":
    main()

tokenizer_config.json:   0%|          | 0.00/52.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/580 [00:00<?, ?B/s]

spm.model:   0%|          | 0.00/2.46M [00:00<?, ?B/s]



pytorch_model.bin:   0%|          | 0.00/874M [00:00<?, ?B/s]

Some weights of DebertaV2ForSequenceClassification were not initialized from the model checkpoint at microsoft/deberta-v3-large and are newly initialized: ['classifier.bias', 'classifier.weight', 'pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Training data shape: (28927, 3)
Development data shape: (5926, 3)
Training data distribution: Positive: 11708 (40.5%), Negative: 17219 (59.5%)
Dev data distribution: Positive: 1640 (27.7%), Negative: 4286 (72.3%)


Map:   0%|          | 0/28927 [00:00<?, ? examples/s]

Map:   0%|          | 0/5926 [00:00<?, ? examples/s]

Using the `WANDB_DISABLED` environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for instance --report_to none).


Step,Training Loss,Validation Loss,Accuracy,Positive Precision,Positive Recall,Positive F1,W macro-p,W macro-r,W macro-f1,Mcc
1000,0.5853,0.497093,0.802565,0.745303,0.435366,0.549654,0.794704,0.802565,0.783926,0.459904
2000,0.6844,0.617761,0.733716,0.620155,0.097561,0.168599,0.706027,0.733716,0.655256,0.163764
3000,0.6795,0.620229,0.733547,0.61597,0.09878,0.170257,0.704957,0.733547,0.655584,0.163401
4000,0.6886,0.618846,0.723253,0.0,0.0,0.0,0.523096,0.723253,0.607102,0.0
5000,0.6794,0.641289,0.723253,0.0,0.0,0.0,0.523096,0.723253,0.607102,0.0
6000,0.682,0.638811,0.723253,0.0,0.0,0.0,0.523096,0.723253,0.607102,0.0
7000,0.6823,0.631794,0.723253,0.0,0.0,0.0,0.523096,0.723253,0.607102,0.0


Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Tr

Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.


Predictions with original data saved to /kaggle/working/results/transformer/deberta-v3-large/predictions_with_data.csv
{'eval_loss': 0.4970925450325012, 'eval_Accuracy': 0.8025649679379008, 'eval_Positive_Precision': 0.7453027139874739, 'eval_Positive_Recall': 0.4353658536585366, 'eval_Positive_F1': 0.5496535796766744, 'eval_W Macro-P': 0.794704086860165, 'eval_W Macro-R': 0.8025649679379008, 'eval_W Macro-F1': 0.7839259397561248, 'eval_MCC': 0.45990408263906535, 'eval_runtime': 61.5547, 'eval_samples_per_second': 96.272, 'eval_steps_per_second': 12.038, 'epoch': 2.0}


In [None]:
from safetensors.torch import load_file as load_safetensors_file


def prepare_input(claim: str, evidence: str, tokenizer, max_length: int, device: torch.device):
    """Format and tokenize one claim/evidence pair for inference.

    Uses the same ``"Claim: ..."`` / ``"Evidence: ..."`` prefixes as
    ``preprocess_function`` so inference inputs match the training format.

    Args:
        claim: Claim text.
        evidence: Evidence text.
        tokenizer: Tokenizer called as ``tokenizer(claim, evidence, ...)``.
        max_length: Truncation length.
        device: Device the returned tensors are moved to.

    Returns:
        Dict of tokenizer outputs as PyTorch tensors on ``device``.
    """
    formatted_claim = f"Claim: {claim}"
    formatted_evidence = f"Evidence: {evidence}"

    # A single pair needs no padding. Padding to ``max_length`` made every
    # forward pass process the full length regardless of the actual input,
    # whereas training padded each batch only to its longest sequence.
    inputs = tokenizer(
        formatted_claim,
        formatted_evidence,
        max_length=max_length,
        padding=False,
        truncation=True,
        return_tensors="pt"  # Return PyTorch tensors
    )
    return {name: tensor.to(device) for name, tensor in inputs.items()}

# --- Main Prediction Logic ---
def run_predictions(model_path: str, input_csv_path: str, output_csv_path: str):
    """Rebuild the trained model, predict every row of a CSV and save the result.

    The model is reconstructed by hand (config -> architecture -> custom
    classifier head -> saved weights) because the classifier head was replaced
    before training. Weights are read from ``model.safetensors`` if present,
    otherwise from ``pytorch_model.bin``.

    Args:
        model_path: Directory holding the saved tokenizer, config and weights.
        input_csv_path: CSV with ``Claim`` and ``Evidence`` columns.
        output_csv_path: Destination CSV with a single ``prediction`` column,
            one row per input row (empty for skipped rows).

    Returns:
        None. Failures are reported with ``print`` and end the function early.
    """

    # 1. Load tokenizer and config
    print(f"Loading tokenizer from: {model_path}")
    device = get_device()
    tokenizer = AutoTokenizer.from_pretrained(model_path)
    config = AutoConfig.from_pretrained(model_path)

    # 2. Build the architecture; the head must match the one used in training
    print("Constructing model architecture...")
    model = AutoModelForSequenceClassification.from_config(config)
    hidden_size = model.classifier.in_features
    model.classifier = nn.Sequential(
        nn.Linear(hidden_size, hidden_size),
        nn.GELU(),
        nn.LayerNorm(hidden_size),
        nn.Dropout(FF_DROPOUT_RATE),
        nn.Linear(hidden_size, config.num_labels)
    )
    print("Custom classifier head applied.")

    # 3. Load the saved weights (state dictionary)
    safetensors_path = os.path.join(model_path, "model.safetensors")
    pytorch_bin_path = os.path.join(model_path, "pytorch_model.bin")

    state_dict = None
    weights_loaded_from = None

    if os.path.exists(safetensors_path):
        print(f"Loading weights from SafeTensors file: {safetensors_path}...")
        try:
            state_dict = load_safetensors_file(safetensors_path, device='cpu')
            weights_loaded_from = safetensors_path
        except Exception as e:
            print(f"Error loading safetensors file: {e}")
            # Fall back to pytorch_model.bin if it exists
            if os.path.exists(pytorch_bin_path):
                print(f"Attempting to load pytorch_model.bin instead...")
            else:
                return  # Stop if neither format seems to work

    if state_dict is None and os.path.exists(pytorch_bin_path):
        print(f"Loading weights from PyTorch bin file: {pytorch_bin_path}...")
        try:
            # weights_only=True restricts unpickling to tensors/primitive types
            state_dict = torch.load(pytorch_bin_path, map_location='cpu', weights_only=True)
            weights_loaded_from = pytorch_bin_path
        except Exception as e:
            print(f"Error loading pytorch_model.bin file: {e}. This might indicate corruption.")
            print("Please ensure the model saving process completed successfully.")
            return  # Stop if loading fails

    if state_dict is None:
        print(f"Error: No weight file (model.safetensors or pytorch_model.bin) found or loaded successfully in {model_path}")
        return

    print(f"Weights loaded successfully from {weights_loaded_from}")

    # Load the state dict into the manually constructed model
    try:
        model.load_state_dict(state_dict)
    except RuntimeError as e:
        print(f"Error loading state dict into model: {e}")
        print("This often means the manually constructed architecture doesn't match the keys in the weights file.")
        print("Ensure the custom classifier definition EXACTLY matches the one used during training.")
        return

    model.to(device)  # Move the complete model to the target device
    model.eval()  # Disable dropout for inference
    print(f"Model constructed and weights loaded. Using device: {device}")

    # 4. Read input CSV
    print(f"Reading input CSV: {input_csv_path}")
    try:
        input_df = pd.read_csv(input_csv_path)
        if 'Claim' not in input_df.columns or 'Evidence' not in input_df.columns:
            raise ValueError("Input CSV must contain 'Claim' and 'Evidence' columns.")
        print(f"Loaded {len(input_df)} rows from {input_csv_path}")
    except FileNotFoundError:
        print(f"Error: Input CSV file not found at {input_csv_path}")
        return
    except Exception as e:
        print(f"Error reading CSV file: {e}")
        return

    # 5. Make predictions, one row at a time
    predictions = []
    print("Making predictions...")
    for index, row in tqdm(input_df.iterrows(), total=input_df.shape[0], desc="Predicting"):
        raw_claim = row['Claim']
        raw_evidence = row['Evidence']
        # Empty CSV cells are read as NaN, and str(NaN) is the non-empty string
        # 'nan', so missing values must be detected before converting to str.
        if pd.isna(raw_claim) or pd.isna(raw_evidence) or not str(raw_claim) or not str(raw_evidence):
            print(f"Warning: Skipping row {index} due to empty Claim or Evidence.")
            predictions.append(None)
            continue
        claim = str(raw_claim)
        evidence = str(raw_evidence)
        inputs = prepare_input(claim, evidence, tokenizer, MAX_SEQ_LENGTH, device)
        with torch.no_grad():
            outputs = model(**inputs)
            logits = outputs.logits
            predicted_class_id = torch.argmax(logits, dim=-1).item()
            predictions.append(predicted_class_id)

    # 6. Save predictions. The nullable integer dtype keeps class ids written
    # as integers even when some rows were skipped (None).
    output_df = pd.DataFrame({'prediction': pd.array(predictions, dtype="Int64")})
    print(f"Saving predictions to: {output_csv_path}")
    try:
        output_df.to_csv(output_csv_path, index=False)
        print("Predictions saved successfully.")
    except Exception as e:
        print(f"Error saving predictions: {e}")

# Release cached GPU memory left over from training before loading the model again
torch.cuda.empty_cache()

# Same directory main() saves to, derived from the shared constants instead of
# repeating the absolute path.
model_save_path = str(SAVE_DIR / BASE_MODEL.split('/')[-1])

# Predict the dev set; the CSV is written to the current working directory
run_predictions(model_save_path, DEV_FILE, 'predictions.csv')

torch.cuda.empty_cache()

Loading tokenizer from: /kaggle/working/results/transformer/deberta-v3-large
Constructing model architecture...
Custom classifier head applied.
Loading weights from SafeTensors file: /kaggle/working/results/transformer/deberta-v3-large/model.safetensors...
Weights loaded successfully from /kaggle/working/results/transformer/deberta-v3-large/model.safetensors
Model constructed and weights loaded. Using device: cuda
Reading input CSV: /kaggle/input/ed-uom/dev.csv
Loaded 5926 rows from /kaggle/input/ed-uom/dev.csv
Making predictions...


Predicting:  46%|████▌     | 2736/5926 [04:43<05:31,  9.63it/s]