# Group 26 - Deep learning with transformers -  Demo Code
## DeBERTa

#### Harvey Dennis and William Asbery

### Dependencies

__PLEASE RUN THE CELLS BELOW__

In [None]:
!pip install transformers datasets huggingface_hub optuna tensorboard peft

In [None]:
import logging
import os
import pandas as pd
import torch
import numpy as np
import matplotlib.pyplot as plt
import optuna
from optuna.samplers import TPESampler
import json
from tqdm import tqdm
from pathlib import Path
from transformers import (
    AutoTokenizer,
    AutoModelForSequenceClassification,
    AutoConfig,
    TrainingArguments,
    Trainer,
    EarlyStoppingCallback,
    DataCollatorWithPadding
)
from peft import get_peft_model, LoraConfig, TaskType, PeftModel
from sklearn.metrics import (
    accuracy_score,
    precision_recall_fscore_support,
    matthews_corrcoef,
    confusion_matrix,
)
from datasets import Dataset as HFDataset
from torch import nn
from tqdm import tqdm
from safetensors.torch import load_file as load_safetensors_file

# Configure logging
# `level` must be a logging level (an int such as logging.INFO, or a level
# name string); INFO lets informational messages through with the timestamped
# format below.
logging.basicConfig(
    format='%(asctime)s - %(message)s',
    datefmt='%Y-%m-%d %H:%M:%S',
    level=logging.INFO,
)

# Disable wandb
os.environ['WANDB_DISABLED'] = 'true'

### Constants

This section specifies the paths of the files used to train, validate and test the model.

Please add the relative or absolute paths (from Kaggle) to the train, dev, test and augmented train files. We provide you with an augmented train file as the augmentation pipeline takes about 2 hours to run.

__PLEASE RUN THE CELL BELOW.__

### Path constants

In [None]:
# Path configuration
# Base working directory; the results directory below is created beneath it.
DATA_DIR = Path("/kaggle/working/")

# Input CSV locations (plain strings, not Path objects).
TRAIN_FILE = "/kaggle/input/ed-uom/train.csv"
DEV_FILE = "/kaggle/input/ed-uom/dev.csv"
AUG_TRAIN_FILE = "/kaggle/input/ed-uom/train_augmented.csv"
NEW_AUG = "/kaggle/input/ed-uom/train_augmented_new.csv"

# Results directory for the transformer model; created up front (with any
# missing parents) and left untouched if it already exists.
SAVE_DIR = DATA_DIR.joinpath("results", "transformer")
SAVE_DIR.mkdir(parents=True, exist_ok=True)

### Best params from tuning

In [None]:
# Training parameters
# Per the section heading, these are the best values found during tuning.
# In the code visible here only FF_DROPOUT_RATE and MAX_SEQ_LENGTH are read
# (both by run_predictions); the remaining values are presumably consumed by
# training code that is not part of this demo -- TODO confirm.
BATCH_SIZE = 8
NUM_EPOCHS = 5
LEARNING_RATE = 5e-5
WEIGHT_DECAY = 0.03
WARMUP_RATIO = 0.11
DROPOUT_RATE = 0.05
# Dropout probability inside the custom classifier head built in run_predictions.
FF_DROPOUT_RATE = 0.05
# Passed as max_length to the tokenizer; inputs are padded/truncated to this length.
MAX_SEQ_LENGTH = 512
# Model identifier string; not referenced by the prediction code visible here.
BASE_MODEL = 'microsoft/deberta-v3-large'

### Demo code: run predictions

In [None]:

def get_device() -> torch.device:
    """Select the compute device, preferring CUDA, then Apple MPS, then CPU.

    Returns:
        A ``torch.device`` for the first backend that reports itself available.
    """
    if torch.cuda.is_available():
        chosen = 'cuda'
    else:
        # Older torch builds have no `mps` backend attribute at all, so probe
        # for it before asking whether it is available.
        mps_ready = hasattr(torch.backends, 'mps') and torch.backends.mps.is_available()
        chosen = 'mps' if mps_ready else 'cpu'
    return torch.device(chosen)

def prepare_input(claim: str, evidence: str, tokenizer, max_length: int, device: torch.device):
    """Tokenize one claim/evidence pair and move the tensors to ``device``.

    Args:
        claim: Claim text; sent to the tokenizer prefixed with "Claim: ".
        evidence: Evidence text; sent to the tokenizer prefixed with "Evidence: ".
        tokenizer: Callable taking the two texts plus tokenization keyword
            arguments and returning a mapping of name -> tensor.
        max_length: Length the pair is padded/truncated to.
        device: Device every returned tensor is moved to.

    Returns:
        A plain dict mapping each tokenizer output name to its tensor on ``device``.
    """
    encoded = tokenizer(
        f"Claim: {claim}",
        f"Evidence: {evidence}",
        max_length=max_length,
        padding="max_length",  # always pad up to max_length
        truncation=True,
        return_tensors="pt",  # PyTorch tensors
    )

    on_device = {}
    for name, tensor in encoded.items():
        on_device[name] = tensor.to(device)
    return on_device

# --- Main Prediction Logic ---
def _is_missing_text(value) -> bool:
    """Return True when a CSV cell holds no usable text (NaN/None or an empty string)."""
    # pandas reads empty CSV cells as NaN; str(NaN) is the truthy string "nan",
    # so the missing-value test has to happen before any str() conversion.
    return bool(pd.isna(value)) or str(value) == ""


def _load_saved_state_dict(model_path: str):
    """Load model weights from ``model_path``, preferring SafeTensors over the PyTorch bin file.

    Returns:
        ``(state_dict, source_path)`` on success, or ``None`` when no weight
        file could be loaded (the reason has already been printed).
    """
    safetensors_path = os.path.join(model_path, "model.safetensors")
    pytorch_bin_path = os.path.join(model_path, "pytorch_model.bin")

    if os.path.exists(safetensors_path):
        print(f"Loading weights from SafeTensors file: {safetensors_path}...")
        try:
            return load_safetensors_file(safetensors_path, device='cpu'), safetensors_path
        except Exception as e:
            print(f"Error loading safetensors file: {e}")
            # Fall back to pytorch_model.bin only if it is actually present.
            if not os.path.exists(pytorch_bin_path):
                return None
            print("Attempting to load pytorch_model.bin instead...")

    if os.path.exists(pytorch_bin_path):
        print(f"Loading weights from PyTorch bin file: {pytorch_bin_path}...")
        try:
            # weights_only=True restricts unpickling to tensor data.
            return torch.load(pytorch_bin_path, map_location='cpu', weights_only=True), pytorch_bin_path
        except Exception as e:
            print(f"Error loading pytorch_model.bin file: {e}. This might indicate corruption.")
            print("Please ensure the model saving process completed successfully.")
            return None

    print(f"Error: No weight file (model.safetensors or pytorch_model.bin) found or loaded successfully in {model_path}")
    return None


# --- Main Prediction Logic ---
def run_predictions(model_path: str, input_csv_path: str, output_csv_path: str):
    """Rebuild the fine-tuned classifier, predict a label per CSV row, and save the results.

    The model architecture is constructed manually from the saved config (with
    the custom classifier head) and the saved weights are then loaded into it.

    Args:
        model_path: Directory containing the tokenizer, config and a weight
            file (``model.safetensors`` or ``pytorch_model.bin``).
        input_csv_path: CSV file with ``Claim`` and ``Evidence`` columns.
        output_csv_path: Destination CSV; receives a single ``prediction``
            column with one row per input row. Rows whose claim or evidence is
            missing get an empty prediction.

    Returns:
        None. Failures while loading weights, reading the CSV or saving the
        output are reported with ``print`` and end the run early instead of
        raising.
    """
    # 1. Load tokenizer and config
    print(f"Loading tokenizer from: {model_path}")
    device = get_device()
    tokenizer = AutoTokenizer.from_pretrained(model_path)
    config = AutoConfig.from_pretrained(model_path)

    # 2. Manually construct the model architecture. The custom head must match
    #    the one used in training, otherwise the state dict keys will not line up.
    print("Constructing model architecture...")
    model = AutoModelForSequenceClassification.from_config(config)
    hidden_size = model.classifier.in_features
    model.classifier = nn.Sequential(
        nn.Linear(hidden_size, hidden_size),
        nn.GELU(),
        nn.LayerNorm(hidden_size),
        nn.Dropout(FF_DROPOUT_RATE),
        nn.Linear(hidden_size, config.num_labels)
    )
    print("Custom classifier head applied.")

    # 3. Load the saved weights (state dictionary)
    loaded = _load_saved_state_dict(model_path)
    if loaded is None:
        return
    state_dict, weights_loaded_from = loaded
    print(f"Weights loaded successfully from {weights_loaded_from}")

    try:
        model.load_state_dict(state_dict)
    except RuntimeError as e:
        print(f"Error loading state dict into model: {e}")
        print("This often means the manually constructed architecture doesn't match the keys in the weights file.")
        print("Ensure the custom classifier definition EXACTLY matches the one used during training.")
        return

    model.to(device)
    model.eval()  # inference mode for dropout layers
    print(f"Model constructed and weights loaded. Using device: {device}")

    # 4. Read input CSV
    print(f"Reading input CSV: {input_csv_path}")
    try:
        input_df = pd.read_csv(input_csv_path)
        if 'Claim' not in input_df.columns or 'Evidence' not in input_df.columns:
            raise ValueError("Input CSV must contain 'Claim' and 'Evidence' columns.")
        print(f"Loaded {len(input_df)} rows from {input_csv_path}")
    except FileNotFoundError:
        print(f"Error: Input CSV file not found at {input_csv_path}")
        return
    except Exception as e:
        print(f"Error reading CSV file: {e}")
        return

    # 5. Make predictions, one row at a time
    predictions = []
    print("Making predictions...")
    rows = zip(input_df.index, input_df['Claim'], input_df['Evidence'])
    with torch.no_grad():
        for index, claim, evidence in tqdm(rows, total=len(input_df), desc="Predicting"):
            if _is_missing_text(claim) or _is_missing_text(evidence):
                print(f"Warning: Skipping row {index} due to empty Claim or Evidence.")
                # Keep a placeholder so output rows stay aligned with input rows.
                predictions.append(None)
                continue
            inputs = prepare_input(str(claim), str(evidence), tokenizer, MAX_SEQ_LENGTH, device)
            logits = model(**inputs).logits
            predictions.append(torch.argmax(logits, dim=-1).item())

    # 6. Save predictions. The nullable Int64 dtype keeps class ids as integers
    #    even when skipped rows contribute missing values (a plain column would
    #    be upcast to float and written as e.g. "1.0").
    output_df = pd.DataFrame({'prediction': pd.array(predictions, dtype="Int64")})
    print(f"Saving predictions to: {output_csv_path}")
    try:
        output_df.to_csv(output_csv_path, index=False)
        print("Predictions saved successfully.")
    except Exception as e:
        print(f"Error saving predictions: {e}")

# Release cached GPU memory before the model is loaded for inference.
torch.cuda.empty_cache()

# Directory that run_predictions loads the tokenizer, config and weights from.
model_save_path = "/kaggle/working/results/transformer/deberta-v3-large"

# Predict on the dev set; the output path is relative to the working directory.
run_predictions(
    model_path=model_save_path,
    input_csv_path=DEV_FILE,
    output_csv_path='predictions.csv',
)

# Release cached GPU memory again after the prediction run.
torch.cuda.empty_cache()