# Group 5 - ED(B) - Deep Learning Approach without the use of Transformers - Demo

### Dependency Management

In [None]:
import torch
from torch.utils.data import DataLoader
import pandas as pd
import numpy as np
import pickle
import os
import nltk
from tqdm import tqdm
from torch.nn.utils.rnn import pad_sequence

In [None]:
# Keep NLTK resources in an 'nltk_data' folder under the current working
# directory; create the folder if it does not exist yet.
nltk_data_dir = os.path.join(os.getcwd(), 'nltk_data')
os.makedirs(nltk_data_dir, exist_ok=True)

# Register that folder on NLTK's resource search path.
# NOTE: re-running this cell appends the same entry again.
nltk.data.path.append(nltk_data_dir)

# Download the 'punkt_tab' resource into the folder created above.
# Presumably needed by the tokenizer used inside the dataset/vocabulary
# code - verify against evidence_detection.
nltk.download('punkt_tab', download_dir=nltk_data_dir)

In [None]:
# Seed NumPy and PyTorch (plus every CUDA device when one is present) with
# the same fixed value so repeated runs are reproducible.
np.random.seed(42)
torch.manual_seed(42)
if torch.cuda.is_available():
    torch.cuda.manual_seed_all(42)

# Run on the GPU when CUDA is available, otherwise fall back to the CPU.
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
print(f"Using device: {device}")

# Make sure the 'models' output directory exists.
os.makedirs('models', exist_ok=True)

In [None]:
from evidence_detection.bilstm_with_attention import BiLSTMAttention
from evidence_detection.vocabulary import Vocabulary
from evidence_detection.evidence_detection_dataset import EvidenceDetectionDataset

### Load test set

In [None]:
# Location of the test split, relative to the current working directory.
test_path = 'data/test.csv'
# Read the whole CSV into a DataFrame; it is handed to the dataset class later.
test_df = pd.read_csv(test_path)

# Last expression of the cell: shows the first rows as a quick sanity check.
test_df.head()

### Load the vocabulary and set the embedding dimension

The vocabulary "vocab.pkl" can be downloaded from the following link: https://drive.google.com/drive/folders/1TWv5UKNsNeQGxafx3GQf87Dc8vcp5V8t

In [None]:
# Load the pickled vocabulary from 'vocab.pkl' in the working directory.
# NOTE(security): pickle.load can execute arbitrary code while
# deserializing - only open a vocab.pkl obtained from a trusted source.
# NOTE(review): presumably this is a pickled Vocabulary instance, in which
# case the Vocabulary import above must have run first - confirm.
with open('vocab.pkl', 'rb') as f:
    vocab = pickle.load(f)

In [None]:
# Use the same embedding dimension used in training; it is passed to the
# model constructor below and must match the saved weights.
embedding_dim = 300

### Create test dataset and dataloader

In [None]:
def collate_fn(batch):
    """
    Merge unlabeled dataset samples into a single padded batch.

    Claims and evidences vary in length, so their id sequences are padded
    with 0 up to the longest sequence in the batch. No label entry is
    produced, which makes this collate function suitable for test data.

    Args:
        batch: Sequence of sample dicts, each providing 'claim_ids',
            'claim_length', 'evidence_ids' and 'evidence_length'.
            (Assumes the '*_ids' entries are tensors, as pad_sequence
            requires.)

    Returns:
        Dict with 'claim_ids' and 'evidence_ids' (batch-first padded
        tensors) and 'claim_lengths' and 'evidence_lengths' (one entry
        per sample, holding the lengths reported by the samples).
    """
    def gather_lengths(field):
        # One length per sample, in batch order.
        return torch.tensor([sample[field] for sample in batch])

    def pad_field(field):
        # Right-pad every sequence with 0 to the batch maximum.
        sequences = [sample[field] for sample in batch]
        return pad_sequence(sequences, batch_first=True, padding_value=0)

    claim_lengths = gather_lengths('claim_length')
    evidence_lengths = gather_lengths('evidence_length')

    return {
        'claim_ids': pad_field('claim_ids'),
        'claim_lengths': claim_lengths,
        'evidence_ids': pad_field('evidence_ids'),
        'evidence_lengths': evidence_lengths,
    }

In [None]:
# Wrap the test DataFrame and the loaded vocabulary in the project's dataset
# class. NOTE(review): is_test=True presumably tells the dataset that no
# label column is available - confirm in EvidenceDetectionDataset.
test_dataset = EvidenceDetectionDataset(test_df, vocab, is_test=True)
print(f"Test dataset created: {len(test_dataset)} samples")

Test dataset created: 5926 samples


In [None]:
# Batch the test set 32 samples at a time. Shuffling stays off so the
# predictions come out in the same order as the rows of the test set, and
# collate_fn pads each batch to a common length.
test_loader = DataLoader(test_dataset, batch_size=32, collate_fn=collate_fn, shuffle=False)

### Load the model architecture and weights

The model weights "ED_model_B.pt" and architecture "model_architecture_parameters/parameters.pkl" can be downloaded from the following link:
https://drive.google.com/drive/folders/1TWv5UKNsNeQGxafx3GQf87Dc8vcp5V8t

In [None]:
model_path = "ED_model_B.pt"

# Load the best hyperparameters saved alongside the model.
# NOTE(security): pickle.load (and torch.load below) can execute arbitrary
# code while deserializing - only use files obtained from a trusted source.
with open('model_architecture_parameters/parameters.pkl', 'rb') as f:
    best_params = pickle.load(f)

print("Loaded hyperparameters:", best_params)

# Rebuild the architecture so it matches the saved weights: vocabulary size
# and embedding dimension come from the cells above, hidden size and dropout
# from the loaded hyperparameters, and the number of layers is fixed at 1.
model = BiLSTMAttention(
    vocab_size=len(vocab),
    embedding_dim=embedding_dim,
    num_layers=1,
    hidden_dim=best_params['hidden_dim'],
    dropout=best_params['dropout'],
)

# Load the trained weights. map_location remaps the stored tensors onto the
# device selected earlier; without it, a checkpoint saved from a GPU fails to
# load on a CPU-only machine.
model.load_state_dict(torch.load(model_path, map_location=device))
model.to(device)
print("\nModel loaded successfully.\n")
print(model)

### Test the model

In [None]:
# Put the model in evaluation mode before predicting.
model.eval()

print("\nMaking predictions on test data...\n")
all_predictions = []

# Gradients are not needed for inference, so disable their tracking.
with torch.no_grad():
    for batch in tqdm(test_loader, desc="Testing"):
        # Only the id tensors are moved to the model's device; the length
        # tensors are passed on exactly as the collate function built them.
        claim_ids, evidence_ids = (
            batch[key].to(device) for key in ('claim_ids', 'evidence_ids')
        )
        claim_lengths = batch['claim_lengths']
        evidence_lengths = batch['evidence_lengths']

        # The model returns a pair; only the logits are used here.
        logits, _ = model(claim_ids, claim_lengths, evidence_ids, evidence_lengths)

        # Predicted class = index of the largest logit along dim 1
        # (0 or 1, assuming a two-class output).
        predictions = logits.argmax(dim=1).cpu().numpy()
        all_predictions.extend(predictions)

# Write one prediction per row, in test-set order, under a 'prediction' header.
predictions_df = pd.DataFrame({'prediction': all_predictions})
predictions_df.to_csv('Group_5_B.csv', index=False)
print(f"Predictions saved to 'Group_5_B.csv'")