# Evidence Detection

## Data format: each row contains a `Claim`, an `Evidence` passage, and a binary `label`

## Labels
- **1 (Relevant)** - The evidence supports or is related to the claim.
- **0 (Not Relevant)** - The evidence does not support or is unrelated to the claim.

### Dependency Management

In [1]:
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader
from torch.nn.utils.rnn import pad_sequence, pack_padded_sequence, pad_packed_sequence
import numpy as np
import pandas as pd
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score
from sklearn.model_selection import train_test_split
from tqdm import tqdm
import nltk
from nltk.tokenize import word_tokenize
from collections import Counter
import pickle
import os

# Keep NLTK resources in a folder next to the notebook (current working directory)
nltk_data_dir = os.path.join(os.getcwd(), 'nltk_data')
os.makedirs(nltk_data_dir, exist_ok=True)

# Register the folder on NLTK's resource search path. The membership check keeps
# this cell idempotent: re-running it does not append a duplicate entry.
if nltk_data_dir not in nltk.data.path:
    nltk.data.path.append(nltk_data_dir)

# Download the 'punkt_tab' package into the folder created above
nltk.download('punkt_tab', download_dir=nltk_data_dir)

[nltk_data] Downloading package punkt_tab to /content/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt_tab.zip.


True

In [5]:
from evidence_detection.bilstm_with_attention import BiLSTMAttention
from evidence_detection.evidence_detection_dataset import EvidenceDetectionDataset
from evidence_detection.vocabulary import Vocabulary
from evidence_detection.trainer import Trainer

In [6]:
# Single seed shared by every random number generator configured below
SEED = 42

# Set random seed for reproducibility (PyTorch, NumPy and, when present, all CUDA devices)
torch.manual_seed(SEED)
np.random.seed(SEED)
if torch.cuda.is_available():
    torch.cuda.manual_seed_all(SEED)

# Device configuration: use the GPU when one is visible, otherwise fall back to the CPU
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(f"Using device: {device}")

# Create output directory for saved models
os.makedirs('models', exist_ok=True)

Using device: cuda


### Define the GloVe Embedding Loader

In [7]:
def load_glove_embeddings(vocab, glove_path, embedding_dim=300):
    """Build an embedding matrix for ``vocab`` from a GloVe text file.

    Every row starts as a random vector drawn from a normal distribution with
    standard deviation 0.1. Rows whose word occurs in the GloVe file are then
    overwritten with the pretrained vector; all other rows keep their random
    initialisation.

    Args:
        vocab: Object that supports ``len()`` and exposes ``stoi``, a mapping
            from word to row index.
        glove_path: Path to a UTF-8 text file with one entry per line: the
            word followed by its whitespace-separated vector components.
        embedding_dim: Number of vector components expected on each line.

    Returns:
        A ``numpy.ndarray`` of shape ``(len(vocab), embedding_dim)``. If
        ``glove_path`` does not exist, the purely random matrix is returned.
    """
    # Random fallback for every row, drawn in one vectorised call instead of
    # one Python-level call per vocabulary entry.
    embeddings = np.random.normal(scale=0.1, size=(len(vocab), embedding_dim))

    # Load pretrained embeddings
    if not os.path.exists(glove_path):
        print(f"GloVe embeddings not found at {glove_path}. Using random embeddings.")
        return embeddings

    print(f"Loading GloVe embeddings from {glove_path}...")

    expected_fields = embedding_dim + 1  # +1 for the word itself

    with open(glove_path, 'r', encoding='utf-8') as f:
        for i, line in enumerate(tqdm(f, desc="Loading GloVe")):
            values = line.split()

            # A wrong field count means a malformed line or a file of another
            # dimension. Passing this check also guarantees that values[1:]
            # has exactly embedding_dim entries, so no second check is needed.
            if len(values) != expected_fields:
                print(f"Warning: Line {i} has {len(values)} values, expected {embedding_dim + 1}. Skipping.")
                continue

            word = values[0]
            if word not in vocab.stoi:
                continue

            try:
                vector = np.array(values[1:], dtype='float32')
            except ValueError as e:
                # A component that is not a valid float: report it and move on
                print(f"Error processing line {i}: {e}")
                continue

            embeddings[vocab.stoi[word]] = vector

    print(f"Loaded {embedding_dim}-dimensional GloVe embeddings.")
    return embeddings

### Load Data, Build Vocabulary, and Load GloVe Embeddings

In [8]:
# Data sources: train.csv supplies both the training and validation splits,
# while dev.csv is kept aside as the test set.
print("Loading datasets...")
print("Using train.csv for train and validation")
print("Using dev.csv for testing")

# 80/20 split of train.csv; the fixed random_state keeps the split repeatable
train_df, val_df = train_test_split(
    pd.read_csv('./data/train.csv'),
    test_size=0.2,
    random_state=42,
)
test_df = pd.read_csv('./data/dev.csv')

# Quick sanity check on split sizes and class balance
print(f"Train shape: {train_df.shape}, Validation shape: {val_df.shape}")
print(f"Label distribution in train: {train_df['label'].value_counts().to_dict()}")
print(f"Label distribution in val: {val_df['label'].value_counts().to_dict()}")

Loading datasets...
Using train.csv for train and validation
Using dev.csv for testing
Train shape: (17206, 3), Validation shape: (4302, 3)
Label distribution in train: {0: 12504, 1: 4702}
Label distribution in val: {0: 3150, 1: 1152}


In [9]:
# Vocabulary cache: build it from the training split on the first run,
# afterwards reuse the pickled copy on disk.
# NOTE(review): an existing vocab.pkl is reused as-is, even if train_df or
# freq_threshold changed since it was written - delete the file to rebuild.
vocab_path = 'vocab.pkl'
if not os.path.exists(vocab_path):
    print("Creating new vocabulary")
    # Only the training split's claims and evidence feed the vocabulary
    all_texts = train_df['Claim'].tolist() + train_df['Evidence'].tolist()
    vocab = Vocabulary(freq_threshold=3)
    vocab.build_vocabulary(all_texts)

    # Persist the vocabulary for later runs
    with open(vocab_path, 'wb') as f:
        pickle.dump(vocab, f)
else:
    print(f"Loading vocabulary from {vocab_path}")
    # NOTE(review): pickle.load can execute arbitrary code; only load a
    # vocab.pkl that this notebook produced.
    with open(vocab_path, 'rb') as f:
        vocab = pickle.load(f)

print(f"Vocabulary size: {len(vocab)}")

Creating new vocabulary


Building vocabulary: 100%|██████████| 34412/34412 [00:04<00:00, 8333.98it/s]

Vocabulary size: 12686
Vocabulary size: 12686





In [10]:
# Load GloVe embeddings
embedding_dim = 300
# Derive the file name from embedding_dim so the vector file and the expected
# dimension cannot drift apart (evaluates to "glove.6B.300d.txt" here)
glove_path = f"glove.6B.{embedding_dim}d.txt"
embeddings = load_glove_embeddings(vocab, glove_path, embedding_dim)

Loading GloVe embeddings from glove.6B.300d.txt...


Loading GloVe: 200582it [00:03, 50552.73it/s]

Loaded 300-dimensional GloVe embeddings.





### Train Model

In [11]:
# Training cell: wrap the three DataFrames in datasets, build the model,
# train it through the project's Trainer, report validation metrics and
# save the model.

# Create datasets (all three splits are encoded with the same vocabulary object)
train_dataset = EvidenceDetectionDataset(train_df, vocab)
val_dataset = EvidenceDetectionDataset(val_df, vocab)
test_dataset = EvidenceDetectionDataset(test_df, vocab)

print(f"Train dataset size: {len(train_dataset)}")
print(f"Validation dataset size: {len(val_dataset)}")
print(f"Test dataset size: {len(test_dataset)}")

# Model hyperparameters
hidden_dim = 256
num_layers = 1
dropout = 0.4

# Initialize model; pretrained_embeddings receives the matrix returned by
# load_glove_embeddings, and embedding_dim is the value set in the GloVe cell
model = BiLSTMAttention(
    vocab_size=len(vocab),
    embedding_dim=embedding_dim,
    hidden_dim=hidden_dim,
    num_layers=num_layers,
    dropout=dropout,
    pretrained_embeddings=embeddings
)

# Print model architecture summary
print("\nModel Architecture:")
print(f"Vocabulary Size: {len(vocab)}")
print(f"Embedding Dimension: {embedding_dim}")
print(f"Hidden Dimension: {hidden_dim}")
print(f"Number of LSTM Layers: {num_layers}")
print(f"Dropout Rate: {dropout}")

# Count parameters; the requires_grad filter restricts the second total to
# parameters that will receive gradient updates
total_params = sum(p.numel() for p in model.parameters())
trainable_params = sum(p.numel() for p in model.parameters() if p.requires_grad)
print(f"Total Parameters: {total_params:,}")
print(f"Trainable Parameters: {trainable_params:,}")

# Training hyperparameters
batch_size = 32
learning_rate = 1e-3
weight_decay = 1e-5
num_epochs = 15

# Initialize trainer; only the train and validation datasets are handed over,
# test_dataset is not passed to the Trainer
trainer = Trainer(
    model=model,
    train_dataset=train_dataset,
    val_dataset=val_dataset,
    batch_size=batch_size,
    learning_rate=learning_rate,
    weight_decay=weight_decay,
    device=device
)

# Train model (num_epochs is an upper bound)
# NOTE(review): the recorded log shows early stopping and a reload of the best
# checkpoint inside train() - confirm against the Trainer implementation
print("\nStarting training...")
trainer.train(num_epochs=num_epochs)

# Final evaluation
# evaluate() takes no arguments here; presumably it scores the validation
# dataset given to the Trainer - TODO confirm
print("\nPerforming final evaluation on validation set...")
val_metrics = trainer.evaluate()

# val_metrics is indexed with the keys 'loss', 'accuracy', 'precision',
# 'recall' and 'f1'
print("\nFinal Validation Metrics:")
print(f"Loss: {val_metrics['loss']:.4f}")
print(f"Accuracy: {val_metrics['accuracy']:.4f}")
print(f"Precision: {val_metrics['precision']:.4f}")
print(f"Recall: {val_metrics['recall']:.4f}")
print(f"F1 Score: {val_metrics['f1']:.4f}")

# Save final model into the 'models' directory created in the setup cell
final_model_path = 'models/final_model.pt'
trainer.save_model(final_model_path)

print("\nTraining and evaluation completed!")

Train dataset size: 17206
Validation dataset size: 4302
Test dataset size: 5926

Model Architecture:
Vocabulary Size: 12686
Embedding Dimension: 300
Hidden Dimension: 256
Number of LSTM Layers: 1
Dropout Rate: 0.4
Total Parameters: 4,884,332
Trainable Parameters: 4,884,332

Starting training...
Starting training on device: cuda
Training set size: 17206
Validation set size: 4302


Training: 100%|██████████| 538/538 [00:16<00:00, 31.95it/s]
Validating: 100%|██████████| 135/135 [00:01<00:00, 68.76it/s]


Epoch 1/15 - Time: 18.88s
Train Loss: 0.4447, Train F1: 0.5409
Val Loss: 0.3782, Val F1: 0.6112
Val Precision: 0.7617, Val Recall: 0.5104
Validation F1 increased (0.000000 --> 0.611227). Saving model...


Training: 100%|██████████| 538/538 [00:15<00:00, 35.48it/s]
Validating: 100%|██████████| 135/135 [00:01<00:00, 68.86it/s]


Epoch 2/15 - Time: 17.19s
Train Loss: 0.3416, Train F1: 0.7053
Val Loss: 0.3596, Val F1: 0.6746
Val Precision: 0.7135, Val Recall: 0.6398
Validation F1 increased (0.611227 --> 0.674600). Saving model...


Training: 100%|██████████| 538/538 [00:15<00:00, 35.22it/s]
Validating: 100%|██████████| 135/135 [00:01<00:00, 69.93it/s]


Epoch 3/15 - Time: 17.29s
Train Loss: 0.2478, Train F1: 0.8011
Val Loss: 0.4162, Val F1: 0.6708
Val Precision: 0.6465, Val Recall: 0.6970
EarlyStopping counter: 1 out of 3


Training: 100%|██████████| 538/538 [00:15<00:00, 35.63it/s]
Validating: 100%|██████████| 135/135 [00:01<00:00, 71.29it/s]


Epoch 4/15 - Time: 17.06s
Train Loss: 0.1505, Train F1: 0.8925
Val Loss: 0.5544, Val F1: 0.6344
Val Precision: 0.6689, Val Recall: 0.6033
EarlyStopping counter: 2 out of 3


Training: 100%|██████████| 538/538 [00:14<00:00, 35.89it/s]
Validating: 100%|██████████| 135/135 [00:01<00:00, 68.94it/s]


Epoch 5/15 - Time: 17.04s
Train Loss: 0.0785, Train F1: 0.9501
Val Loss: 0.7167, Val F1: 0.6092
Val Precision: 0.6417, Val Recall: 0.5799
EarlyStopping counter: 3 out of 3
Early stopping triggered
Loaded best model from 'best_bilstm_attention_model.pt'

Performing final evaluation on validation set...


Validating: 100%|██████████| 135/135 [00:01<00:00, 69.80it/s]



Final Validation Metrics:
Loss: 0.3596
Accuracy: 0.8347
Precision: 0.7135
Recall: 0.6398
F1 Score: 0.6746
Model saved to models/final_model.pt

Training and evaluation completed!


### Test the model

In [12]:
# Batches for the test split, in file order; the trainer's collate function is
# reused so each batch dictionary has the keys read in the loop below
test_loader = DataLoader(
    test_dataset,
    batch_size=batch_size,
    shuffle=False,
    collate_fn=trainer.collate_fn
)

# Score the test set with the model in evaluation mode
print("\nPerforming evaluation on test set (dev.csv)...")
model.eval()
test_loss = 0.0
all_predictions = []
all_labels = []

with torch.no_grad():
    for batch in tqdm(test_loader, desc="Testing"):
        # Token ids and labels are moved to the device; the length entries
        # are forwarded exactly as the collate function produced them
        claim_ids = batch['claim_ids'].to(device)
        evidence_ids = batch['evidence_ids'].to(device)
        labels = batch['labels'].to(device)

        # Forward pass; the model's second return value is not needed here
        logits, _ = model(
            claim_ids,
            batch['claim_lengths'],
            evidence_ids,
            batch['evidence_lengths'],
        )

        # Running sum of per-batch losses plus the predicted class per sample
        test_loss += trainer.criterion(logits, labels).item()
        all_predictions.extend(logits.argmax(dim=1).cpu().numpy())
        all_labels.extend(labels.cpu().numpy())

# Aggregate metrics over the whole test set
accuracy = accuracy_score(all_labels, all_predictions)
precision, recall, f1 = (
    metric(all_labels, all_predictions, average='binary')
    for metric in (precision_score, recall_score, f1_score)
)

print("\nTest Set Metrics:")
print(f"Loss: {test_loss / len(test_loader):.4f}")
print(f"Accuracy: {accuracy:.4f}")
print(f"Precision: {precision:.4f}")
print(f"Recall: {recall:.4f}")
print(f"F1 Score: {f1:.4f}")


Performing evaluation on test set (dev.csv)...


Testing: 100%|██████████| 186/186 [00:02<00:00, 68.23it/s]



Test Set Metrics:
Loss: 0.3728
Accuracy: 0.8237
Precision: 0.7105
Recall: 0.6122
F1 Score: 0.6577
