In [1]:
import os
import random
import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
from pathlib import Path
import xml.etree.ElementTree as ET
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, classification_report
import matplotlib.pyplot as plt
import seaborn as sns
from tqdm import tqdm
from transformers import AutoTokenizer, AutoModel
import umap.umap_ as umap
from collections import Counter

# Set seeds for reproducibility
random.seed(42)
np.random.seed(42)
torch.manual_seed(42)

# Define paths
XML_DIR = Path("./../GNN/xml_files")
MODEL_DIR = Path("./../second_classifier_premise-vs-conclusion/results/secondAttemptWithMetrics/RoBERTa_prem_conc_finetuned")
OUTPUT_DIR = Path("classifier_results")
OUTPUT_DIR.mkdir(parents=True, exist_ok=True)

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")

# Load XML files
xml_files = [f for f in XML_DIR.glob("*.xml") if f.is_file()]
print(f"Found {len(xml_files)} XML files")


  from .autonotebook import tqdm as notebook_tqdm
2025-04-14 16:34:31.747512: I tensorflow/core/util/port.cc:153] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2025-04-14 16:34:31.758778: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:467] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1744628671.772061 1446952 cuda_dnn.cc:8579] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1744628671.776085 1446952 cuda_blas.cc:1407] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
W0000 00:00:1744628671.786576 1446952 computation_placer.cc:177] computation placer already r

Using device: cpu
Found 40 XML files


  return torch._C._cuda_getDeviceCount() > 0


In [2]:
def process_xml(xml_path):
    try:
        tree = ET.parse(xml_path)
        root = tree.getroot()
        
        nodes = []
        id_registry = set()
        for elem in root.iter():
            if elem.tag not in ('prem', 'conc'):
                continue
                
            node_id = elem.attrib.get('ID', '').strip()
            if not node_id or node_id in id_registry:
                continue
                
            nodes.append({
                'id': node_id,
                'text': elem.text.strip() if elem.text else '',
                'type': elem.tag,
                'xml_file': xml_path.name
            })
            id_registry.add(node_id)
            
        id_to_idx = {node['id']: idx for idx, node in enumerate(nodes)}
        
        # Extract relations
        relations = []
        for elem in root.iter():
            if elem.tag not in ('prem', 'conc'):
                continue
                
            source_id = elem.attrib.get('ID', '').strip()
            if not source_id or source_id not in id_to_idx:
                continue

            def clean_split(value):
                return [t.strip() for t in value.strip().split('|') if t.strip()]
                
            for rel_type in ['SUP', 'ATT']:
                if rel_type in elem.attrib:
                    targets = clean_split(elem.attrib[rel_type])
                    relation_label = 0 if rel_type == 'SUP' else 1  # 0=Support, 1=Attack
                    
                    for target_id in targets:
                        if target_id in id_to_idx:
                            source_idx = id_to_idx[source_id]
                            target_idx = id_to_idx[target_id]
                            relations.append({
                                'source_id': source_id,
                                'target_id': target_id,
                                'source_idx': source_idx,
                                'target_idx': target_idx,
                                'label': relation_label,
                                'xml_file': xml_path.name
                            })
        
        return nodes, relations, id_to_idx
        
    except ET.ParseError as e:
        print(f"XML parse error in {xml_path.name}: {e}")
        return [], [], {}


In [3]:
def generate_no_relation_samples(nodes, relations, id_map):
    # Create set of existing relations
    existing_relations = set()
    for rel in relations:
        existing_relations.add((rel['source_idx'], rel['target_idx']))
    
    # Count support and attack relations
    support_count = sum(1 for rel in relations if rel['label'] == 0)
    attack_count = sum(1 for rel in relations if rel['label'] == 1)
    
    # Determine sample size based on class frequencies
    sample_size = support_count + attack_count * 5  # Prioritize attack relations
    
    # Generate all possible No-Relation pairs
    no_relation_candidates = []
    for i, source in enumerate(nodes):
        for j, target in enumerate(nodes):
            if i != j and (i, j) not in existing_relations:
                # Prioritize premise-conclusion pairs as they're more meaningful
                priority = 2 if source['type'] == 'prem' and target['type'] == 'conc' else 1
                no_relation_candidates.append({
                    'source_id': source['id'],
                    'target_id': target['id'],
                    'source_idx': i,
                    'target_idx': j,
                    'label': 2,  # 2=No-Relation
                    'priority': priority,
                    'xml_file': source['xml_file']
                })
    
    # Sort by priority and sample
    no_relation_candidates.sort(key=lambda x: x['priority'], reverse=True)
    sample_size = min(sample_size, len(no_relation_candidates))
    sampled_no_relations = no_relation_candidates[:sample_size]
    
    print(f"Sampled {len(sampled_no_relations)} No-Relation pairs from {len(no_relation_candidates)} candidates")
    return sampled_no_relations


In [4]:
# Load RoBERTa model
print("Loading RoBERTa model...")
tokenizer = AutoTokenizer.from_pretrained(MODEL_DIR)
model = AutoModel.from_pretrained(MODEL_DIR)
model = model.to(device)
model.eval()

def generate_embeddings(texts, batch_size=8):
    embeddings = []
    
    for i in range(0, len(texts), batch_size):
        batch = texts[i:i+batch_size]
        inputs = tokenizer(
            batch, 
            padding=True, 
            truncation=True, 
            max_length=512, 
            return_tensors="pt"
        ).to(device)
        
        with torch.no_grad():
            outputs = model(**inputs)
        
        # Get CLS token embeddings
        embeddings.append(outputs.last_hidden_state[:,0,:].cpu())
    
    return torch.cat(embeddings, dim=0)


Loading RoBERTa model...


Some weights of RobertaModel were not initialized from the model checkpoint at ../second_classifier_premise-vs-conclusion/results/secondAttemptWithMetrics/RoBERTa_prem_conc_finetuned and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [5]:
# Process all XML files first to gather argument pairs
all_nodes = []
all_relations = []
all_no_relations = []
doc_to_pairs = {}  # Track pairs by document

for xml_file in tqdm(xml_files, desc="Processing XML files"):
    nodes, relations, id_map = process_xml(xml_file)
    
    if not nodes:
        print(f"Skipping {xml_file.name}: No valid nodes found")
        continue
    
    # Generate no-relation samples
    no_relations = generate_no_relation_samples(nodes, relations, id_map)
    
    # Add to global collections
    start_idx = len(all_nodes)
    all_nodes.extend(nodes)
    all_relations.extend(relations)
    all_no_relations.extend(no_relations)
    
    # Store pairs for this document
    doc_pairs = []
    
    # Extract relation pairs
    for rel in relations:
        source_node = nodes[rel['source_idx']]
        target_node = nodes[rel['target_idx']]
        
        doc_pairs.append({
            'source_text': source_node['text'],
            'target_text': target_node['text'],
            'label': rel['label'],  # 0=Support, 1=Attack
            'document': xml_file.name
        })
    
    # Extract no-relation pairs
    for rel in no_relations:
        source_node = nodes[rel['source_idx']]
        target_node = nodes[rel['target_idx']]
        
        doc_pairs.append({
            'source_text': source_node['text'],
            'target_text': target_node['text'],
            'label': 2,  # 2=No-Relation
            'document': xml_file.name
        })
    
    # Only store if we have pairs for this document
    if doc_pairs:
        doc_to_pairs[xml_file.name] = doc_pairs

# Get documents that have pairs
valid_docs = list(doc_to_pairs.keys())
print(f"Documents with valid pairs: {len(valid_docs)}")

# Split documents into train and test
train_docs, test_docs = train_test_split(valid_docs, test_size=0.2, random_state=42)
print(f"Train documents: {len(train_docs)}")
print(f"Test documents: {len(test_docs)}")

# Flatten pairs by document
train_pairs = []
for doc in train_docs:
    train_pairs.extend(doc_to_pairs[doc])

test_pairs = []
for doc in test_docs:
    test_pairs.extend(doc_to_pairs[doc])

print(f"Train samples: {len(train_pairs)}")
print(f"Test samples: {len(test_pairs)}")

# Verify we have samples in both sets
if len(test_pairs) == 0:
    print("WARNING: No test samples! Using sample-level split instead of document-level split")
    # Fall back to sample-level split
    all_pairs = []
    for pairs in doc_to_pairs.values():
        all_pairs.extend(pairs)
    
    train_pairs, test_pairs = train_test_split(all_pairs, test_size=0.2, random_state=42)
    print(f"New split - Train: {len(train_pairs)}, Test: {len(test_pairs)}")


Processing XML files: 100%|██████████| 40/40 [00:00<00:00, 440.18it/s]

Sampled 48 No-Relation pairs from 1592 candidates
Sampled 99 No-Relation pairs from 9427 candidates
Sampled 56 No-Relation pairs from 2701 candidates
Sampled 98 No-Relation pairs from 5328 candidates
Sampled 64 No-Relation pairs from 2592 candidates
Sampled 114 No-Relation pairs from 9030 candidates
Sampled 74 No-Relation pairs from 2928 candidates
Sampled 65 No-Relation pairs from 5635 candidates
Sampled 57 No-Relation pairs from 2117 candidates
Sampled 43 No-Relation pairs from 1293 candidates
Sampled 108 No-Relation pairs from 5786 candidates
Sampled 114 No-Relation pairs from 4083 candidates
Sampled 75 No-Relation pairs from 3961 candidates
Sampled 226 No-Relation pairs from 21324 candidates
Sampled 29 No-Relation pairs from 1031 candidates
Sampled 77 No-Relation pairs from 2494 candidates
Sampled 65 No-Relation pairs from 4627 candidates
Sampled 121 No-Relation pairs from 12323 candidates
Sampled 38 No-Relation pairs from 1298 candidates
Sampled 35 No-Relation pairs from 839 candi




In [6]:
# Process in batches to avoid memory issues
def generate_pair_embeddings(pairs, batch_size=32):
    if not pairs:
        return None  # Return None for empty sets
    
    all_embeddings = []
    
    for i in tqdm(range(0, len(pairs), batch_size)):
        batch = pairs[i:i+batch_size]
        source_texts = [pair['source_text'] for pair in batch]
        target_texts = [pair['target_text'] for pair in batch]
        
        source_embs = generate_embeddings(source_texts)
        target_embs = generate_embeddings(target_texts)
        
        # Concatenate source and target embeddings
        for j in range(len(batch)):
            combined = torch.cat([source_embs[j], target_embs[j]], dim=0)
            all_embeddings.append(combined)
    
    return torch.stack(all_embeddings)

train_embeddings = generate_pair_embeddings(train_pairs)
test_embeddings = generate_pair_embeddings(test_pairs)

# Make sure we have embeddings before continuing
if train_embeddings is None or test_embeddings is None:
    raise ValueError("Failed to generate embeddings for both train and test sets")

print(f"Generated embeddings: Train {train_embeddings.shape}, Test {test_embeddings.shape}")


100%|██████████| 137/137 [07:36<00:00,  3.33s/it]
100%|██████████| 34/34 [01:45<00:00,  3.11s/it]

Generated embeddings: Train torch.Size([4374, 1536]), Test torch.Size([1078, 1536])





In [7]:
# Create a linear layer to increase dimensions to 1024
expand_layer = nn.Linear(768*2, 1024)

# Apply linear transformation
train_expanded = expand_layer(train_embeddings)
test_expanded = expand_layer(test_embeddings)

print("Applying UMAP dimensionality reduction...")
# Use UMAP to reduce to 512 dimensions
reducer = umap.UMAP(n_components=512, random_state=42)
train_reduced = reducer.fit_transform(train_expanded.detach().numpy())
test_reduced = reducer.transform(test_expanded.detach().numpy())

# Convert back to PyTorch tensors
train_reduced = torch.tensor(train_reduced, dtype=torch.float32)
test_reduced = torch.tensor(test_reduced, dtype=torch.float32)

print(f"Reduced dimensions: {train_reduced.shape}")


  warn(


Applying UMAP dimensionality reduction...
Reduced dimensions: torch.Size([4374, 512])


In [8]:
# Define the MLP classifier
class RelationClassifier(nn.Module):
    def __init__(self, input_dim=512, hidden_dims=[256, 128]):
        super(RelationClassifier, self).__init__()
        
        layers = []
        prev_dim = input_dim
        
        for hidden_dim in hidden_dims:
            layers.append(nn.Linear(prev_dim, hidden_dim))
            layers.append(nn.ReLU())
            layers.append(nn.Dropout(0.2))
            prev_dim = hidden_dim
        
        # Output layer
        layers.append(nn.Linear(prev_dim, 3))  # 3 classes: Support, Attack, No-Relation
        
        self.mlp = nn.Sequential(*layers)
    
    def forward(self, x):
        return self.mlp(x)

def calculate_balanced_class_weights(train_pairs):
    # Count instances of each class
    support_count = sum(1 for pair in train_pairs if pair['label'] == 0)
    attack_count = sum(1 for pair in train_pairs if pair['label'] == 1)
    no_relation_count = sum(1 for pair in train_pairs if pair['label'] == 2)
    
    total_samples = support_count + attack_count + no_relation_count
    
    # Calculate balanced weights
    support_w = total_samples / (3 * support_count)
    attack_w = total_samples / (3 * attack_count)
    no_rel_w = total_samples / (3 * no_relation_count)
    
    class_weights = torch.tensor([support_w, attack_w, no_rel_w], dtype=torch.float)
    
    print(f"Class weights - Support: {support_w:.2f}, Attack: {attack_w:.2f}, No Relation: {no_rel_w:.2f}")
    return class_weights

import torch.nn.functional as F

class FocalLoss(torch.nn.Module):
    def __init__(self, weight=None, gamma=2.0, reduction='mean'):
        super(FocalLoss, self).__init__()
        self.weight = weight
        self.gamma = gamma
        self.reduction = reduction

    def forward(self, input, target):
        ce_loss = F.cross_entropy(input, target, reduction='none', weight=self.weight)
        pt = torch.exp(-ce_loss)
        focal_loss = (1 - pt) ** self.gamma * ce_loss
        
        if self.reduction == 'mean':
            return focal_loss.mean()
        elif self.reduction == 'sum':
            return focal_loss.sum()
        else:
            return focal_loss



In [9]:
# Custom dataset
class ArgumentPairDataset(Dataset):
    def __init__(self, embeddings, labels):
        self.embeddings = embeddings
        self.labels = labels
    
    def __len__(self):
        return len(self.labels)
    
    def __getitem__(self, idx):
        return {
            'embedding': self.embeddings[idx],
            'label': self.labels[idx]
        }

# Create datasets
train_labels = torch.tensor([pair['label'] for pair in train_pairs])
test_labels = torch.tensor([pair['label'] for pair in test_pairs])

train_dataset = ArgumentPairDataset(train_reduced, train_labels)
test_dataset = ArgumentPairDataset(test_reduced, test_labels)

train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=32, shuffle=False)

# Training function
def train_model(model, train_loader, val_loader, criterion, optimizer, num_epochs=20):
    model = model.to(device)
    best_val_loss = float('inf')
    train_losses = []
    val_losses = []
    
    for epoch in range(num_epochs):
        # Training
        model.train()
        epoch_loss = 0
        correct = 0
        total = 0
        
        print(f"Epoch {epoch+1}/{num_epochs}")
        progress_bar = tqdm(train_loader, desc="Training")
        
        for batch in progress_bar:
            embeddings = batch['embedding'].to(device)
            labels = batch['label'].to(device)
            
            # Forward pass
            outputs = model(embeddings)
            loss = criterion(outputs, labels)
            
            # Backward pass
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()
            
            # Statistics
            epoch_loss += loss.item()
            _, predicted = torch.max(outputs.data, 1)
            total += labels.size(0)
            correct += (predicted == labels).sum().item()
            
            progress_bar.set_postfix({
                'loss': f"{loss.item():.4f}",
                'acc': f"{100 * correct / total:.2f}%"
            })
        
        avg_train_loss = epoch_loss / len(train_loader)
        train_losses.append(avg_train_loss)
        print(f"Train Loss: {avg_train_loss:.4f}, Accuracy: {100 * correct / total:.2f}%")
        
        # Validation
        model.eval()
        val_loss = 0
        all_preds = []
        all_labels = []
        correct = 0
        total = 0
        
        with torch.no_grad():
            for batch in tqdm(val_loader, desc="Validation"):
                embeddings = batch['embedding'].to(device)
                labels = batch['label'].to(device)
                
                outputs = model(embeddings)
                loss = criterion(outputs, labels)
                val_loss += loss.item()
                
                _, predicted = torch.max(outputs.data, 1)
                total += labels.size(0)
                correct += (predicted == labels).sum().item()
                
                all_preds.extend(predicted.cpu().numpy())
                all_labels.extend(labels.cpu().numpy())
        
        avg_val_loss = val_loss / len(val_loader)
        val_losses.append(avg_val_loss)
        
        print(f"Val Loss: {avg_val_loss:.4f}, Accuracy: {100 * correct / total:.2f}%")
        
        # Save best model
        if avg_val_loss < best_val_loss:
            best_val_loss = avg_val_loss
            torch.save(model.state_dict(), OUTPUT_DIR / "best_model.pt")
            print("Saved new best model")
        
        # Generate confusion matrix every 5 epochs
        if (epoch + 1) % 5 == 0 or epoch == num_epochs - 1:
            cm = confusion_matrix(all_labels, all_preds)
            plt.figure(figsize=(10, 8))
            sns.heatmap(cm, annot=True, fmt='d', cmap='Blues',
                       xticklabels=['Support', 'Attack', 'No-Relation'],
                       yticklabels=['Support', 'Attack', 'No-Relation'])
            plt.xlabel('Predicted')
            plt.ylabel('True')
            plt.title(f'Confusion Matrix (Epoch {epoch+1})')
            plt.savefig(OUTPUT_DIR / f"confusion_matrix_epoch_{epoch+1}.png")
            plt.close()
    
    # Plot loss curves
    plt.figure(figsize=(10, 6))
    plt.plot(train_losses, label='Training Loss')
    plt.plot(val_losses, label='Validation Loss')
    plt.xlabel('Epochs')
    plt.ylabel('Loss')
    plt.title('Training and Validation Loss')
    plt.legend()
    plt.savefig(OUTPUT_DIR / "loss_curves.png")
    plt.close()
    
    return train_losses, val_losses


In [10]:
label_counts = Counter([pair['label'] for pair in train_pairs])
print(f"Class distribution - Support: {label_counts[0]}, Attack: {label_counts[1]}, No-Relation: {label_counts[2]}")


Class distribution - Support: 1782, Attack: 135, No-Relation: 2457


In [11]:
# Calculate class weights based on training data
class_weights = calculate_balanced_class_weights(train_pairs)
class_weights = class_weights.to(device)

# Initialize model, criterion, and optimizer
model = RelationClassifier(input_dim=512, hidden_dims=[256, 128])
criterion = FocalLoss(weight=class_weights, gamma=2.0)
optimizer = optim.Adam(model.parameters(), lr=1e-4)


Class weights - Support: 0.82, Attack: 10.80, No Relation: 0.59


In [12]:

# Train model
print("Training model...")
train_losses, val_losses = train_model(
    model, train_loader, test_loader, criterion, optimizer, num_epochs=20
)

# Load best model for final evaluation
model.load_state_dict(torch.load(OUTPUT_DIR / "best_model.pt"))
model.eval()

# Final evaluation
all_preds = []
all_labels = []

with torch.no_grad():
    for batch in tqdm(test_loader, desc="Final evaluation"):
        embeddings = batch['embedding'].to(device)
        labels = batch['label'].to(device)
        
        outputs = model(embeddings)
        _, predicted = torch.max(outputs.data, 1)
        
        all_preds.extend(predicted.cpu().numpy())
        all_labels.extend(labels.cpu().numpy())

# Generate final confusion matrix
cm = confusion_matrix(all_labels, all_preds)
plt.figure(figsize=(10, 8))
sns.heatmap(cm, annot=True, fmt='d', cmap='Blues',
           xticklabels=['Support', 'Attack', 'No-Relation'],
           yticklabels=['Support', 'Attack', 'No-Relation'])
plt.xlabel('Predicted')
plt.ylabel('True')
plt.title('Final Confusion Matrix')
plt.savefig(OUTPUT_DIR / "final_confusion_matrix.png")
plt.close()

# Print final classification report
report = classification_report(
    all_labels, all_preds,
    target_names=['Support', 'Attack', 'No-Relation'],
    digits=4
)
print("Final Classification Report:")
print(report)

# Save the report
with open(OUTPUT_DIR / "classification_report.txt", "w") as f:
    f.write(report)

print(f"All results saved to {OUTPUT_DIR}")


Training model...
Epoch 1/20


Training: 100%|██████████| 137/137 [00:00<00:00, 507.33it/s, loss=0.2137, acc=24.97%]


Train Loss: 0.5706, Accuracy: 24.97%


Validation: 100%|██████████| 34/34 [00:00<00:00, 1892.44it/s]


Val Loss: 0.2770, Accuracy: 52.13%
Saved new best model
Epoch 2/20


Training: 100%|██████████| 137/137 [00:00<00:00, 493.37it/s, loss=0.1448, acc=31.57%]


Train Loss: 0.5313, Accuracy: 31.57%


Validation: 100%|██████████| 34/34 [00:00<00:00, 1952.12it/s]


Val Loss: 0.2583, Accuracy: 51.02%
Saved new best model
Epoch 3/20


Training: 100%|██████████| 137/137 [00:00<00:00, 465.08it/s, loss=0.2132, acc=38.16%]


Train Loss: 0.4913, Accuracy: 38.16%


Validation: 100%|██████████| 34/34 [00:00<00:00, 1836.48it/s]


Val Loss: 0.2538, Accuracy: 5.84%
Saved new best model
Epoch 4/20


Training: 100%|██████████| 137/137 [00:00<00:00, 498.69it/s, loss=0.1569, acc=44.49%]


Train Loss: 0.4649, Accuracy: 44.49%


Validation: 100%|██████████| 34/34 [00:00<00:00, 1893.92it/s]


Val Loss: 0.2085, Accuracy: 51.02%
Saved new best model
Epoch 5/20


Training: 100%|██████████| 137/137 [00:00<00:00, 496.95it/s, loss=0.5280, acc=43.85%]


Train Loss: 0.4496, Accuracy: 43.85%


Validation: 100%|██████████| 34/34 [00:00<00:00, 1966.74it/s]


Val Loss: 0.2150, Accuracy: 51.02%
Epoch 6/20


Training: 100%|██████████| 137/137 [00:00<00:00, 459.36it/s, loss=1.1245, acc=46.00%]


Train Loss: 0.4510, Accuracy: 46.00%


Validation: 100%|██████████| 34/34 [00:00<00:00, 1902.41it/s]


Val Loss: 0.2036, Accuracy: 51.02%
Saved new best model
Epoch 7/20


Training: 100%|██████████| 137/137 [00:00<00:00, 439.22it/s, loss=0.1856, acc=48.06%]


Train Loss: 0.4444, Accuracy: 48.06%


Validation: 100%|██████████| 34/34 [00:00<00:00, 1986.63it/s]


Val Loss: 0.2304, Accuracy: 51.02%
Epoch 8/20


Training: 100%|██████████| 137/137 [00:00<00:00, 389.10it/s, loss=0.2538, acc=45.34%]


Train Loss: 0.4394, Accuracy: 45.34%


Validation: 100%|██████████| 34/34 [00:00<00:00, 2386.56it/s]


Val Loss: 0.2285, Accuracy: 51.02%
Epoch 9/20


Training: 100%|██████████| 137/137 [00:00<00:00, 341.36it/s, loss=0.1532, acc=47.33%]


Train Loss: 0.4350, Accuracy: 47.33%


Validation: 100%|██████████| 34/34 [00:00<00:00, 1759.72it/s]


Val Loss: 0.2157, Accuracy: 51.02%
Epoch 10/20


Training: 100%|██████████| 137/137 [00:00<00:00, 415.52it/s, loss=0.4536, acc=45.38%]


Train Loss: 0.4352, Accuracy: 45.38%


Validation: 100%|██████████| 34/34 [00:00<00:00, 1903.45it/s]


Val Loss: 0.1940, Accuracy: 51.02%
Saved new best model
Epoch 11/20


Training: 100%|██████████| 137/137 [00:00<00:00, 317.04it/s, loss=0.5754, acc=48.10%]


Train Loss: 0.4252, Accuracy: 48.10%


Validation: 100%|██████████| 34/34 [00:00<00:00, 1585.94it/s]


Val Loss: 0.1949, Accuracy: 51.02%
Epoch 12/20


Training: 100%|██████████| 137/137 [00:00<00:00, 389.80it/s, loss=0.8410, acc=48.49%]


Train Loss: 0.4266, Accuracy: 48.49%


Validation: 100%|██████████| 34/34 [00:00<00:00, 1602.71it/s]


Val Loss: 0.1958, Accuracy: 51.02%
Epoch 13/20


Training: 100%|██████████| 137/137 [00:00<00:00, 394.73it/s, loss=0.6400, acc=48.17%]


Train Loss: 0.4248, Accuracy: 48.17%


Validation: 100%|██████████| 34/34 [00:00<00:00, 1978.01it/s]


Val Loss: 0.1957, Accuracy: 51.02%
Epoch 14/20


Training: 100%|██████████| 137/137 [00:00<00:00, 347.52it/s, loss=0.8389, acc=51.78%]


Train Loss: 0.4131, Accuracy: 51.78%


Validation: 100%|██████████| 34/34 [00:00<00:00, 1487.37it/s]


Val Loss: 0.1841, Accuracy: 51.02%
Saved new best model
Epoch 15/20


Training: 100%|██████████| 137/137 [00:00<00:00, 266.53it/s, loss=0.1199, acc=47.42%]


Train Loss: 0.4190, Accuracy: 47.42%


Validation: 100%|██████████| 34/34 [00:00<00:00, 1296.39it/s]


Val Loss: 0.1822, Accuracy: 51.02%
Saved new best model
Epoch 16/20


Training: 100%|██████████| 137/137 [00:00<00:00, 311.27it/s, loss=0.1372, acc=49.50%]


Train Loss: 0.4115, Accuracy: 49.50%


Validation: 100%|██████████| 34/34 [00:00<00:00, 1247.08it/s]


Val Loss: 0.1782, Accuracy: 51.02%
Saved new best model
Epoch 17/20


Training: 100%|██████████| 137/137 [00:00<00:00, 289.09it/s, loss=0.5091, acc=47.94%]


Train Loss: 0.4134, Accuracy: 47.94%


Validation: 100%|██████████| 34/34 [00:00<00:00, 1408.30it/s]


Val Loss: 0.1687, Accuracy: 89.42%
Saved new best model
Epoch 18/20


Training: 100%|██████████| 137/137 [00:00<00:00, 277.19it/s, loss=0.2291, acc=53.86%]


Train Loss: 0.4168, Accuracy: 53.86%


Validation: 100%|██████████| 34/34 [00:00<00:00, 1502.88it/s]


Val Loss: 0.1757, Accuracy: 51.02%
Epoch 19/20


Training: 100%|██████████| 137/137 [00:00<00:00, 278.12it/s, loss=0.1496, acc=44.51%]


Train Loss: 0.4083, Accuracy: 44.51%


Validation: 100%|██████████| 34/34 [00:00<00:00, 1314.49it/s]


Val Loss: 0.1679, Accuracy: 65.03%
Saved new best model
Epoch 20/20


Training: 100%|██████████| 137/137 [00:00<00:00, 250.93it/s, loss=0.2573, acc=51.30%]


Train Loss: 0.4131, Accuracy: 51.30%


Validation: 100%|██████████| 34/34 [00:00<00:00, 1190.21it/s]


Val Loss: 0.1803, Accuracy: 51.02%


Final evaluation: 100%|██████████| 34/34 [00:00<00:00, 1754.64it/s]

Final Classification Report:
              precision    recall  f1-score   support

     Support     1.0000    0.2967    0.4576       509
      Attack     0.0234    0.9000    0.0456        10
 No-Relation     0.9982    0.9678    0.9827       559

    accuracy                         0.6503      1078
   macro avg     0.6738    0.7215    0.4953      1078
weighted avg     0.9900    0.6503    0.7261      1078

All results saved to classifier_results





In [13]:
from sklearn.preprocessing import StandardScaler
from sklearn.svm import LinearSVC
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import joblib

# Convert embeddings and labels to numpy arrays
X_train = train_reduced.numpy()
X_test = test_reduced.numpy()
y_train = train_labels.numpy()
y_test = test_labels.numpy()

print(f"Training data shape: {X_train.shape}")
print(f"Testing data shape: {X_test.shape}")

# Standardize features
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Train SVM with class weights
svm = LinearSVC(
    class_weight='balanced',  # Automatically adjusts weights inversely proportional to class frequencies
    max_iter=10000,  # Ensure convergence
    random_state=42,
    verbose=1
)

print("Training SVM...")
svm.fit(X_train_scaled, y_train)

# Evaluate
y_pred = svm.predict(X_test_scaled)

print("\nEvaluation Metrics:")
print(f"Accuracy: {accuracy_score(y_test, y_pred):.4f}")
print(classification_report(y_test, y_pred, target_names=['Support', 'Attack', 'No-Relation']))

# Confusion matrix
cm = confusion_matrix(y_test, y_pred)
plt.figure(figsize=(10, 8))
sns.heatmap(cm, annot=True, fmt='d', cmap='Blues',
           xticklabels=['Support', 'Attack', 'No-Relation'],
           yticklabels=['Support', 'Attack', 'No-Relation'])
plt.xlabel('Predicted')
plt.ylabel('True')
plt.title('SVM Confusion Matrix')
plt.savefig(OUTPUT_DIR / "svm_confusion_matrix.png")
plt.close()

# Save model and scaler
joblib.dump(svm, OUTPUT_DIR / 'svm_model.pkl')
joblib.dump(scaler, OUTPUT_DIR / 'svm_scaler.pkl')

print(f"SVM results saved to {OUTPUT_DIR}")


Training data shape: (4374, 512)
Testing data shape: (1078, 512)
Training SVM...
[LibLinear]iter  1 act 2.975e+03 pre 2.965e+03 delta 2.358e-01 f 4.050e+03 |g| 3.686e+04 CG   6
cg reaches trust region boundary
iter  2 act -7.096e+01 pre 1.458e+02 delta 9.470e-02 f 1.075e+03 |g| 2.869e+03 CG  13
cg reaches trust region boundary
iter  2 act 5.543e+01 pre 8.473e+01 delta 7.052e-02 f 1.075e+03 |g| 2.869e+03 CG   5
cg reaches trust region boundary
iter  3 act 2.228e+01 pre 6.348e+01 delta 4.294e-02 f 1.019e+03 |g| 5.369e+03 CG   7
cg reaches trust region boundary
iter  4 act 4.313e+01 pre 5.302e+01 delta 4.294e-02 f 9.970e+02 |g| 5.576e+03 CG   6
cg reaches trust region boundary
iter  5 act 1.441e+01 pre 1.690e+01 delta 4.294e-02 f 9.538e+02 |g| 2.561e+03 CG   7
cg reaches trust region boundary
iter  6 act 5.851e+00 pre 8.878e+00 delta 3.210e-02 f 9.394e+02 |g| 1.804e+03 CG   6
cg reaches trust region boundary
iter  7 act 6.944e+00 pre 6.978e+00 delta 3.381e-02 f 9.336e+02 |g| 1.476e+03 CG 



In [14]:
from sklearn.cluster import KMeans
from sklearn.decomposition import PCA
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# Load embeddings from previous steps
try:
    # Combine train and test embeddings
    X = np.concatenate([train_reduced.numpy(), test_reduced.numpy()], axis=0)
    
    # Reduce to 2D for visualization
    pca = PCA(n_components=2)
    X_pca = pca.fit_transform(X)
    
    # Apply K-means clustering
    kmeans = KMeans(n_clusters=3, random_state=42, n_init=10)
    cluster_labels = kmeans.fit_predict(X)
    
    # Visualize clusters
    plt.figure(figsize=(10, 6))
    sns.scatterplot(x=X_pca[:,0], y=X_pca[:,1], hue=cluster_labels, palette='viridis', s=50)
    plt.title('K-means Clustering (k=3) - PCA Visualization')
    plt.xlabel('PCA Component 1')
    plt.ylabel('PCA Component 2')
    plt.savefig(OUTPUT_DIR / "kmeans_clusters.png")
    plt.close()
    
    # Save cluster results
    np.save(OUTPUT_DIR / "cluster_labels.npy", cluster_labels)
    print(f"Cluster centers shape: {kmeans.cluster_centers_.shape}")
    print(f"Cluster distribution: {np.bincount(cluster_labels)}")
    
except NameError as e:
    print("Error: Make sure you've run the embedding generation steps first")
    print(f"Required variables: {e}")
except Exception as e:
    print(f"Error during clustering: {str(e)}")


Cluster centers shape: (3, 512)
Cluster distribution: [2652 2306  494]
