In [1]:
import json
import os
import numpy as np
import pickle  
import torch
import pandas as pd
from tqdm import tqdm
from transformers import BertTokenizer, BertModel
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.multioutput import MultiOutputClassifier
from xgboost import XGBClassifier
from sklearn.metrics import f1_score, jaccard_score, hamming_loss, classification_report, precision_recall_curve, accuracy_score
from torch.utils.data import DataLoader, Dataset
import xgboost as xgb
import numpy as np
import torch
import os
import pickle  # For saving/loading model
from sklearn.multioutput import MultiOutputClassifier
from sklearn.metrics import (
    f1_score, precision_recall_curve
)
from tqdm import tqdm

In [2]:
DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {DEVICE}")

Using device: cuda


In [3]:
# ------------------- Load Dataset -------------------
def load_data(file_path):
    """Load JSON data from a file."""
    with open(file_path, "r", encoding="utf-8") as f:
        return json.load(f)

train_data = load_data("train_revised.json")
dev_data = load_data("dev_revised.json")
test_data = load_data("test_revised.json")

print(f"\nLoaded datasets: Train({len(train_data)}), Dev({len(dev_data)}), Test({len(test_data)})\n")

# ------------------- Extract and Filter Relations -------------------
def get_unique_relations(data):
    """Extract unique relation types from the dataset."""
    return set(rel.get("r", "UNKNOWN") for entry in data for rel in entry.get("labels", []))

train_relations = get_unique_relations(train_data)
dev_relations = get_unique_relations(dev_data)
test_relations = get_unique_relations(test_data)

# Identify missing relations across sets
missing_in_train = (dev_relations | test_relations) - train_relations
missing_in_dev = (train_relations | test_relations) - dev_relations
missing_in_test = (train_relations | dev_relations) - test_relations

print("\nDebug: Checking for missing relations")
print(f"No missing relations in Train" if not missing_in_train else f"Missing in Train: {missing_in_train}")
print(f"No missing relations in Dev" if not missing_in_dev else f"Missing in Dev: {missing_in_dev}")
print(f"No missing relations in Test" if not missing_in_test else f"Missing in Test: {missing_in_test}")

# Remove relations missing in dev or test sets
relations_to_remove = missing_in_dev | missing_in_test
print(f"\nRelations removed due to being missing: {relations_to_remove}\n")

def filter_data_fixed(data, remove_rels):
    """Filter out entries with relations in remove_rels, keeping only valid entries."""
    new_data = []
    removed_entries_count = 0
    for entry in data:
        entry["labels"] = [rel for rel in entry.get("labels", []) if rel.get("r") not in remove_rels]
        if entry["labels"]:  # Keep only entries with valid labels
            new_data.append(entry)
        else:
            removed_entries_count += 1
    print(f"Removed {removed_entries_count} entries due to missing relations: {remove_rels}")
    return new_data

# Apply filtering
train_data = filter_data_fixed(train_data, relations_to_remove)
dev_data = filter_data_fixed(dev_data, relations_to_remove)
test_data = filter_data_fixed(test_data, relations_to_remove)

print(f"\nAfter filtering: Train({len(train_data)}), Dev({len(dev_data)}), Test({len(test_data)})\n")


Loaded datasets: Train(3053), Dev(500), Test(500)


Debug: Checking for missing relations
No missing relations in Train
Missing in Dev: {'P1198'}
Missing in Test: {'P190'}

Relations removed due to being missing: {'P1198', 'P190'}

Removed 3 entries due to missing relations: {'P1198', 'P190'}
Removed 1 entries due to missing relations: {'P1198', 'P190'}
Removed 1 entries due to missing relations: {'P1198', 'P190'}

After filtering: Train(3050), Dev(499), Test(499)



In [4]:
# ------------------- Multi-Label Encoding -------------------
all_relations = sorted(set(rel.get('r', 'UNKNOWN') for entry in train_data for rel in entry.get('labels', [])))
mlb = MultiLabelBinarizer(classes=all_relations)

def extract_labels_multi(data):
    """Convert relation labels to multi-label binary format."""
    relation_labels = []
    for entry in tqdm(data, desc="Processing Labels"):
        labels = [rel.get('r', 'UNKNOWN') for rel in entry.get('labels', [])]
        relation_labels.append(labels)
    return mlb.fit_transform(relation_labels)

# Directory to save/load embeddings and mlb
embedding_dir = "bert_embeddings"
os.makedirs(embedding_dir, exist_ok=True)  # Create directory if it doesn't exist

# File paths for saved embeddings and mlb
train_embedding_file = os.path.join(embedding_dir, "X_train.npy")
dev_embedding_file = os.path.join(embedding_dir, "X_dev.npy")
test_embedding_file = os.path.join(embedding_dir, "X_test.npy")
mlb_file = os.path.join(embedding_dir, "mlb.pkl")

# Check if mlb and embeddings already exist; load them if they do, otherwise compute and save
if (os.path.exists(mlb_file) and os.path.exists(train_embedding_file) and 
    os.path.exists(dev_embedding_file) and os.path.exists(test_embedding_file)):
    print("Loading existing MultiLabelBinarizer and BERT embeddings...")
    with open(mlb_file, 'rb') as f:
        mlb = pickle.load(f)
    X_train = np.load(train_embedding_file)
    X_dev = np.load(dev_embedding_file)
    X_test = np.load(test_embedding_file)
    y_train = extract_labels_multi(train_data)  # Still need to compute y_train, y_dev, y_test with loaded mlb
    y_dev = extract_labels_multi(dev_data)
    y_test = extract_labels_multi(test_data)
    print(f"Multi-label encoding done with loaded MLB! BERT embeddings loaded! Shape: {X_train.shape}")
else:
    print("Computing multi-label encoding and extracting BERT embeddings...")
    y_train = extract_labels_multi(train_data)
    y_dev = extract_labels_multi(dev_data)
    y_test = extract_labels_multi(test_data)
    print("Multi-label encoding done!")

    # ------------------- BERT Feature Extraction with Saving/Loading -------------------
    tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")
    model = BertModel.from_pretrained("bert-base-uncased").to(DEVICE)

    def extract_features(data):
        """Extract BERT embeddings one by one."""
        embeddings = []
        for entry in tqdm(data, desc="Extracting BERT Embeddings"):
            # Combine sentences into a single text string
            text = " ".join([" ".join(sent) for sent in entry.get("sents", [])])
            # Tokenize the text, return as PyTorch tensors, and move to device
            inputs = tokenizer(text, return_tensors="pt", truncation=True, padding=True, max_length=512).to(DEVICE)
            with torch.no_grad():
                # Pass through BERT model
                outputs = model(**inputs)
            # Extract CLS token embedding (shape: [1, hidden_size]), move to CPU, and convert to NumPy
            embedding = outputs.last_hidden_state[:, 0, :].cpu().numpy()
            embeddings.append(embedding)
        # Stack all embeddings into a single NumPy array
        return np.vstack(embeddings)

    print("Extracting and saving BERT embeddings...")
    # Extract embeddings for train, dev, and test sets
    X_train = extract_features(train_data)
    X_dev = extract_features(dev_data)
    X_test = extract_features(test_data)
    
    # Save embeddings and mlb to files
    np.save(train_embedding_file, X_train)
    np.save(dev_embedding_file, X_dev)
    np.save(test_embedding_file, X_test)
    with open(mlb_file, 'wb') as f:
        pickle.dump(mlb, f)
    print(f"BERT embeddings and MultiLabelBinarizer extracted and saved! Shape: {X_train.shape}")

Loading existing MultiLabelBinarizer and BERT embeddings...


Processing Labels: 100%|███████████████████████████████████████████████████████| 3050/3050 [00:00<00:00, 295810.65it/s]
Processing Labels: 100%|█████████████████████████████████████████████████████████| 499/499 [00:00<00:00, 158641.53it/s]
Processing Labels: 100%|█████████████████████████████████████████████████████████| 499/499 [00:00<00:00, 194386.34it/s]

Multi-label encoding done with loaded MLB! BERT embeddings loaded! Shape: (3050, 768)





In [5]:
# Assume X_train, X_dev, X_test, y_train, y_dev, y_test, and mlb are already loaded from preprocessing
if 'X_train' not in globals() or 'X_dev' not in globals() or 'X_test' not in globals() or \
   'y_train' not in globals() or 'y_dev' not in globals() or 'y_test' not in globals() or 'mlb' not in globals():
    raise NameError("Required variables (X_train, X_dev, X_test, y_train, y_dev, y_test, mlb) not found in memory. Ensure preprocessing code has run.")

print(f"Using pre-loaded BERT embeddings! Shape: {X_train.shape}")

# ------------------- Compute Class Weights for Imbalance Handling -------------------
class_counts = np.sum(y_train, axis=0)
scale_pos_weights = (len(y_train) - class_counts) / (class_counts + 1e-6)  # Avoid division by zero

# Directly use the provided best hyperparameters from your output
# These values are taken directly from your provided output: {'n_estimators': 700, ...}
xgb_params = {
    "objective": "binary:logistic",
    "eval_metric": "logloss",
    "tree_method": "gpu_hist",
    "device": "cuda" if torch.cuda.is_available() else "cpu",
    "verbosity": 0,
    "n_estimators": 700,
    "max_depth": 6,
    "learning_rate": 0.042104737232958024,
    "subsample": 0.6782658070692928,
    "colsample_bytree": 0.6900286928740272,
    "reg_lambda": 3.2413716914686463,
    "reg_alpha": 0.683997563864875,
    "min_child_weight": 3,
}

# Directory to save/load model and related variables
model_dir = "XGB_model_and_embeddings"
os.makedirs(model_dir, exist_ok=True)

# File paths for saved items
model_file = os.path.join(model_dir, "xgb_multi.pkl")
thresholds_file = os.path.join(model_dir, "optimal_thresholds.npy")

# Check if model and thresholds exist; load them if they do, otherwise train and tune
if os.path.exists(model_file) and os.path.exists(thresholds_file):
    print("Loading existing model and thresholds...")
    with open(model_file, 'rb') as f:
        xgb_multi = pickle.load(f)
    optimal_thresholds = np.load(thresholds_file)
    print(" Model and thresholds loaded!")
else:
    print("Training Multi-Label XGBoost with Tuned Hyperparameters...")

    def train_classifier(i, X_train, y_train, params, scale_pos_weight):
        params = params.copy()
        params["scale_pos_weight"] = scale_pos_weight[i]
        clf = xgb.XGBClassifier(**params)
        clf.fit(X_train, y_train[:, i], verbose=False)
        return clf

    n_jobs = 4
    estimators = [
        train_classifier(i, X_train, y_train, xgb_params, scale_pos_weights)
        for i in tqdm(range(y_train.shape[1]), desc="Training Classifiers")
    ]

    xgb_multi = MultiOutputClassifier(xgb.XGBClassifier(**xgb_params))
    xgb_multi.estimators_ = estimators
    xgb_multi.n_outputs_ = y_train.shape[1]

    print(" Training complete!")

    # ------------------- Refined Threshold Tuning on Dev Set -------------------
    print(" Optimizing thresholds on dev set with refinement...")

    y_dev_pred_prob_raw = xgb_multi.predict_proba(X_dev)
    y_dev_pred_prob = np.array([prob[:, 1] for prob in y_dev_pred_prob_raw]).T  # Shape: (n_samples, n_classes)

    optimal_thresholds = np.zeros(y_train.shape[1])
    beta = 2.0  # Weight for recall (beta > 1 favors recall, beta < 1 favors precision)

    for i in range(y_train.shape[1]):
        precisions, recalls, thresholds = precision_recall_curve(y_dev[:, i], y_dev_pred_prob[:, i])
        f_beta_scores = ((1 + beta**2) * precisions * recalls) / (beta**2 * precisions + recalls + 1e-6)
        optimal_thresholds[i] = thresholds[np.argmax(f_beta_scores)] if len(thresholds) > 0 else 0.5

    # Global threshold adjustment: Shift thresholds to balance micro/macro F1
    global_shift = np.linspace(-0.1, 0.1, 21)  # Test shifts from -0.1 to 0.1
    best_shift = 0.0
    best_macro_f1 = 0.0

    print(" Applying global threshold adjustment...")
    for shift in tqdm(global_shift, desc="Tuning global shift"):
        adjusted_thresholds = np.clip(optimal_thresholds + shift, 0.05, 0.95)
        y_dev_pred = (y_dev_pred_prob >= adjusted_thresholds).astype(int)
        macro_f1 = f1_score(y_dev, y_dev_pred, average="macro")
        if macro_f1 > best_macro_f1:
            best_macro_f1 = macro_f1
            best_shift = shift

    # Apply the best shift
    optimal_thresholds = np.clip(optimal_thresholds + best_shift, 0.05, 0.95)
    print(f"Optimal thresholds tuned with best shift: {best_shift:.3f}, Dev Macro F1: {best_macro_f1:.4f}")

    # Save the model and thresholds
    with open(model_file, 'wb') as f:
        pickle.dump(xgb_multi, f)
    np.save(thresholds_file, optimal_thresholds)
    print(f"Model and thresholds saved to {model_dir}")

Using pre-loaded BERT embeddings! Shape: (3050, 768)
Loading existing model and thresholds...
 Model and thresholds loaded!


In [6]:
import numpy as np
from sklearn.metrics import f1_score, precision_score, recall_score

# Assume X_dev, X_test, y_dev, y_test, xgb_multi, and optimal_thresholds are already defined from your model code
print("\nEvaluating on Dev and Test sets...")

# Predictions for Dev set
y_dev_pred_prob_raw = xgb_multi.predict_proba(X_dev)
y_dev_pred_prob = np.array([prob[:, 1] for prob in y_dev_pred_prob_raw]).T  # Shape: (n_samples, n_classes)
y_dev_pred = (y_dev_pred_prob >= optimal_thresholds).astype(int)

# Predictions for Test set
y_test_pred_prob_raw = xgb_multi.predict_proba(X_test)
y_test_pred_prob = np.array([prob[:, 1] for prob in y_test_pred_prob_raw]).T  # Shape: (n_samples, n_classes)
y_test_pred = (y_test_pred_prob >= optimal_thresholds).astype(int)

# Evaluation metrics function
def compute_metrics(y_true, y_pred, set_name):
    """
    Compute evaluation metrics for multi-label classification.
    
    Parameters:
    - y_true: Ground truth labels (n_samples, n_classes)
    - y_pred: Predicted labels (n_samples, n_classes)
    - set_name: String indicating the dataset (e.g., "Dev" or "Test")
    """
    # Compute F1 scores with different averaging methods
    f1_weighted = f1_score(y_true, y_pred, average="weighted", zero_division=0)
    f1_micro = f1_score(y_true, y_pred, average="micro", zero_division=0)
    f1_micro_ign = f1_score(y_true, y_pred, average="micro", zero_division=0)  # Same as micro, placeholder for consistency
    f1_weighted_ign = f1_score(y_true, y_pred, average="weighted", zero_division=0)  # Same as weighted

    # Compute precision and recall per class and average them
    precisions = []
    recalls = []
    for i in range(y_true.shape[1]):
        precision = precision_score(y_true[:, i], y_pred[:, i], zero_division=0)
        recall = recall_score(y_true[:, i], y_pred[:, i], zero_division=0)
        if not np.isnan(precision) and precision > 0:  # Exclude invalid or zero values
            precisions.append(precision)
        if not np.isnan(recall) and recall > 0:
            recalls.append(recall)

    precision_avg = np.mean(precisions) if precisions else 0.0
    recall_avg = np.mean(recalls) if recalls else 0.0

    # Print results
    print(f"\n {set_name} Set Evaluation Results:")
    print(f"  Micro F1:          {f1_micro:.4f}")
    print(f"  Weighted F1:       {f1_weighted:.4f}")
    print(f"  Micro Ign F1:      {f1_micro_ign:.4f}")  # Ignored divisions handled by zero_division=0
    print(f"  Weighted Ign F1:   {f1_weighted_ign:.4f}")
    print(f"  Average Precision: {precision_avg:.4f}")
    print(f"  Average Recall:    {recall_avg:.4f}")

# Evaluate both sets
compute_metrics(y_dev, y_dev_pred, "Dev")
compute_metrics(y_test, y_test_pred, "Test")


Evaluating on Dev and Test sets...

 Dev Set Evaluation Results:
  Micro F1:          0.6605
  Weighted F1:       0.6708
  Micro Ign F1:      0.6605
  Weighted Ign F1:   0.6708
  Average Precision: 0.4997
  Average Recall:    0.6743

 Test Set Evaluation Results:
  Micro F1:          0.6400
  Weighted F1:       0.6555
  Micro Ign F1:      0.6400
  Weighted Ign F1:   0.6555
  Average Precision: 0.4516
  Average Recall:    0.6572
