# Phase 1: Pipeline Test

Test all modules in the Screener-Reasoner pipeline:
1. Data Loader
2. Normalizer
3. Screener (AllLinLog)
4. Evidence Store
5. Retriever (BM25)

In [None]:
# Put the parent directory first on the module search path so the `src.*`
# imports below can be resolved. The entry is relative, so it depends on the
# working directory the notebook is started from.
# NOTE(review): presumably the notebook lives one level below the project
# root that contains `src/` — confirm.
import sys
sys.path.insert(0, '..')

# Import all modules
# (data loading, normalization, screening, evidence storage, retrieval).
# Some of these names are not used by every cell; they are imported here so
# that a broken module surfaces immediately.
from src.data_loader import BGLDataLoader, HDFSDataLoader, Session
from src.normalizer import LogNormalizer, get_normalizer
from src.screener import Screener, ScreenerOutput
from src.evidence_store import EvidenceStore, build_evidence_store
from src.retriever import BM25Retriever

# Reaching this line means every import above succeeded.
print("All modules imported successfully!")

## 1. Test Data Loader (BGL)

In [None]:
# Load BGL dataset.
# The settings are collected in one mapping so they can be read (and tweaked)
# in a single place; they are forwarded unchanged to BGLDataLoader, whose
# module defines what each keyword means.
bgl_config = dict(
    log_file="../logs/BGL.log",
    windows_size=10,
    step_size=10,
    train_ratio=0.7,
    val_ratio=0.15,
    seed=42,
)
bgl_loader = BGLDataLoader(**bgl_config)

# Load the data, then print the loader's own summary statistics.
bgl_loader.load()
bgl_loader.print_stats()

In [None]:
# Sample sessions: fetch the train and test splits from the loader.
train_sessions, test_sessions = bgl_loader.get_train(), bgl_loader.get_test()

print(f"Train sessions: {len(train_sessions)}")
print(f"Test sessions: {len(test_sessions)}")

# Look at a sample session: the first training session and a short preview
# of its first three log lines (each preview is cut at 100 characters).
sample = train_sessions[0]
print(f"\nSample session: {sample}")
print(f"\nFirst 3 log lines:")
head_lines = sample.lines[:3]
for raw_line in head_lines:
    preview = raw_line[:100]
    print(f"  {preview}...")

## 2. Test Normalizer

In [None]:
# Get BGL normalizer and run it on the sample session.
normalizer = get_normalizer("BGL")
result = normalizer.normalize_session(sample)

# Gather the report lines first, then print them in order.
report = (
    f"Original length: {result.original_length}",
    f"Normalized length: {result.normalized_length}",
    f"Compression ratio: {result.compression_ratio:.2%}",
    f"\nParam stats: {result.param_stats}",
    f"\nNormalized text (first 500 chars):",
    result.normalized_text[:500],
)
for entry in report:
    print(entry)

## 3. Test Screener (AllLinLog)

In [None]:
# Load screener with pre-trained model.
# The checkpoint path is kept in a named variable so it is easy to swap.
screener_checkpoint = "../best_model/best_model_20250724_072857.pth"
screener = Screener(model_path=screener_checkpoint, windows_size=10)

screener.load()

# Report the device exposed by the loaded screener.
device_in_use = screener.device
print(f"Screener loaded on device: {device_in_use}")

In [None]:
# Test inference on a few sessions
from tqdm import tqdm

# Get some normal and anomaly samples (at most five of each label).
normal_samples = [sess for sess in test_sessions if sess.label == 0][:5]
anomaly_samples = [sess for sess in test_sessions if sess.label == 1][:5]

# One pass per group: (heading to print, sessions, label the screener should
# predict). A check mark means the prediction matched the expected label.
sample_groups = (
    ("Testing on normal samples:", normal_samples, 0),
    ("\nTesting on anomaly samples:", anomaly_samples, 1),
)
for heading, group, expected_pred in sample_groups:
    print(heading)
    for s in group:
        output = screener.predict(s)
        if output.pred == expected_pred:
            status = "✓"
        else:
            status = "✗"
        print(f"  {s.session_id}: pred={output.pred}, prob={output.anomaly_prob:.4f}, margin={output.margin:.4f} {status}")

In [None]:
# Full evaluation on test set
print("Running full evaluation on test set...")
metrics = screener.evaluate(test_sessions)

# Print each metric with four decimals; the table pairs the display title
# with the key read from the metrics mapping.
print(f"\n=== Test Set Metrics ===")
metric_rows = (
    ("Accuracy", "accuracy"),
    ("Precision", "precision"),
    ("Recall", "recall"),
    ("F1", "f1"),
)
for title, key in metric_rows:
    print(f"{title}: {metrics[key]:.4f}")

## 4. Test Evidence Store

In [None]:
# Build evidence store from train sessions
evidence_store = EvidenceStore(dataset="BGL")
evidence_store.build_from_sessions(train_sessions)

# Print stats: one indented "name: value" line per entry the store reports.
stats = evidence_store.stats()
print(f"\n=== Evidence Store Stats ===")
for stat_name, stat_value in stats.items():
    print(f"  {stat_name}: {stat_value}")

In [None]:
# Save evidence store for later use
store_path = "../results/evidence_store_BGL.json"
evidence_store.save(store_path)

# Sample evidence document: show the first stored document's identifiers,
# its label from the metadata, and the first 300 characters of its text.
sample_doc = evidence_store[0]
print(f"\nSample evidence document:")
print(f"  ID: {sample_doc.evidence_id}")
print(f"  Session: {sample_doc.session_id}")
doc_label = sample_doc.metadata.get('label')
print(f"  Label: {doc_label}")
text_preview = sample_doc.text[:300]
print(f"  Text (first 300 chars): {text_preview}...")

## 5. Test BM25 Retriever

In [None]:
# Build BM25 retriever
# The retriever is constructed over the evidence store built from the train
# sessions; build_index() must run before the retrieval cells below.
# NOTE(review): presumably build_index() tokenizes and indexes every stored
# document — confirm in src.retriever.
retriever = BM25Retriever(evidence_store)
retriever.build_index()

In [None]:
# Test retrieval for an anomaly session
# Take the first anomalous test session without building the whole filtered
# list, and fail with an explicit message when the split contains none.
test_anomaly = next((s for s in test_sessions if s.label == 1), None)
if test_anomaly is None:
    raise IndexError("No anomaly sessions (label == 1) found in test_sessions")

print(f"Query session: {test_anomaly.session_id}")
print(f"Query label: {test_anomaly.label}")
print("\nQuery log lines:")
for line in test_anomaly.lines[:3]:
    print(f"  {line[:80]}...")

# Retrieve top-5 evidence
hits = retriever.retrieve_for_session(test_anomaly, top_k=5)

print("\n=== Top-5 Retrieved Evidence ===")
for hit in hits:
    label = hit.metadata.get('label', 'unknown')
    # Only 1 and 0 are known labels. A missing or unexpected label is shown
    # as-is (upper-cased, e.g. "UNKNOWN") instead of being reported as NORMAL.
    if label == 1:
        label_str = "ANOMALY"
    elif label == 0:
        label_str = "NORMAL"
    else:
        label_str = str(label).upper()
    print(f"\n[{hit.rank}] {hit.evidence_id} (score={hit.score:.4f}, label={label_str})")
    print(f"    {hit.text[:150]}...")

In [None]:
# Analyze retrieval quality: what fraction of top-k are same-label?
from collections import Counter

# Single source of truth for k, so the retrieval call, the printed label and
# the denominator cannot drift apart.
top_k = 5

# Sample 20 anomaly sessions
test_anomalies = [s for s in test_sessions if s.label == 1][:20]

# For each query session, count how many retrieved documents carry the same
# label as the query.
same_label_counts = []
for session in test_anomalies:
    hits = retriever.retrieve_for_session(session, top_k=top_k)
    same_label = sum(1 for h in hits if h.metadata.get('label') == session.label)
    same_label_counts.append(same_label)

if same_label_counts:
    avg_same_label = sum(same_label_counts) / len(same_label_counts)
    print(f"Average same-label hits in top-{top_k}: {avg_same_label:.2f} / {top_k} ({avg_same_label/top_k:.1%})")
    print(f"Distribution: {Counter(same_label_counts)}")
else:
    # With no anomaly sessions there is nothing to average; avoid dividing
    # by zero and say so explicitly.
    print("No anomaly sessions in the test set; skipping retrieval-quality analysis.")

## Summary

Phase 1 modules are working:
- ✅ Data Loader (BGL sessions)
- ✅ Normalizer (IP, HEX, etc. → placeholders)
- ✅ Screener (AllLinLog inference)
- ✅ Evidence Store (train corpus)
- ✅ BM25 Retriever (top-k evidence)