In [None]:
import sys
sys.path.append('../..')

import torch
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.manifold import TSNE

from src.models.embeddings import EnergyDomainEmbedding
from src.data.preprocessing import DocumentProcessor
from src.vector_db.milvus_client import MilvusClient
from src.utils.metrics import calculate_metrics


In [None]:
# Initialize components: embedding model, document preprocessor, vector-DB client
model = EnergyDomainEmbedding()
processor = DocumentProcessor()
db_client = MilvusClient()

# Load a trained checkpoint if one is available; otherwise fall back to the
# freshly initialised weights so the rest of the notebook can still run.
model_path = '../results/best_model.pt'
try:
    # map_location='cpu' lets a checkpoint that was saved from a GPU be read on
    # a CPU-only machine; load_state_dict then copies the values into the
    # model's own parameters.
    state_dict = torch.load(model_path, map_location='cpu')
    model.load_state_dict(state_dict)
    print("Loaded trained model")
except FileNotFoundError:
    # Expected case: no checkpoint has been produced yet.
    print("Using untrained model")
except Exception as exc:
    # Still best-effort, but surface the reason (corrupt file, mismatched keys,
    # ...) instead of hiding it behind a bare `except:`.
    print(f"Using untrained model (failed to load {model_path}: {exc!r})")

# Set to evaluation mode
model.eval()


In [None]:
# Placeholder corpus -- swap in real documents here
texts = [
    "Example energy infrastructure text 1",
    "Example energy infrastructure text 2",
    # Add more texts
]

# Run every text through the document processor, then flatten the per-document
# 'chunks' entries into one list (document order, then chunk order).
processed_data = list(map(processor.process_document, texts))
chunks = [
    piece
    for record in processed_data
    for piece in record['chunks']
]

# Embed all chunks in one batch; gradient tracking is disabled for inference
with torch.no_grad():
    embeddings = model.encode_batch(chunks)

n_vectors = len(embeddings)
vector_dim = embeddings.shape[1]
print(f"Generated {n_vectors} embeddings of dimension {vector_dim}")


In [None]:
# Reduce dimensionality for visualization.
# Tensor.numpy() only works for CPU tensors that do not require grad, so move
# the embeddings to host memory explicitly before handing them to scikit-learn.
embeddings_np = embeddings.detach().cpu().numpy()
n_samples = embeddings_np.shape[0]

if n_samples < 2:
    # t-SNE needs at least two points to define any neighbourhood.
    print(f"Skipping t-SNE: need at least 2 embeddings, got {n_samples}")
else:
    # scikit-learn requires perplexity < n_samples. The default of 30 raises a
    # ValueError on small inputs such as the example texts, so cap it.
    perplexity = min(30.0, float(n_samples - 1))
    tsne = TSNE(n_components=2, perplexity=perplexity, random_state=42)
    embeddings_2d = tsne.fit_transform(embeddings_np)

    # Plot
    plt.figure(figsize=(12, 8))
    plt.scatter(embeddings_2d[:, 0], embeddings_2d[:, 1], alpha=0.5)
    plt.title('t-SNE Visualization of Embeddings')
    plt.xlabel('Component 1')
    plt.ylabel('Component 2')
    plt.show()


In [None]:
from torch.nn.functional import cosine_similarity

# Calculate the pairwise similarity matrix.
# unsqueeze(1) / unsqueeze(0) give shapes (N, 1, D) and (1, N, D), which
# broadcast to (N, N, D). The reduction must therefore run over the LAST axis
# (the embedding dimension). cosine_similarity defaults to dim=1, which would
# reduce over the wrong axis and return an (N, D) tensor instead of (N, N).
sim_matrix = cosine_similarity(
    embeddings.unsqueeze(1),
    embeddings.unsqueeze(0),
    dim=-1,
)

# Plot similarity distribution.
# Note: the matrix includes the diagonal (self-similarity, equal to 1) and each
# pair twice, since the matrix is symmetric.
plt.figure(figsize=(10, 6))
# .detach().cpu() so the conversion also works when embeddings live on a GPU.
sns.histplot(sim_matrix.detach().cpu().numpy().flatten(), bins=50)
plt.title('Distribution of Cosine Similarities')
plt.xlabel('Cosine Similarity')
plt.ylabel('Count')
plt.show()


In [None]:
# Imported once at the top of the cell instead of on every loop iteration.
import time

# Simulate some queries: use the first few chunk embeddings as query vectors
# and record both the search results and the latency of each search.
NUM_QUERIES = 5   # upper bound on how many embeddings are used as queries
TOP_K = 5         # number of neighbours requested per query

query_times = []
query_results = []

for i in range(min(NUM_QUERIES, len(chunks))):
    # perf_counter() is a monotonic, high-resolution clock meant for measuring
    # short intervals; time.time() can jump if the system clock is adjusted.
    start_time = time.perf_counter()

    # Get query results
    results = db_client.search(embeddings[i], top_k=TOP_K)

    query_time = time.perf_counter() - start_time  # elapsed seconds
    query_times.append(query_time)
    query_results.append(results)

# Calculate metrics from the embeddings, the search results and their timings
metrics = calculate_metrics(
    embeddings=embeddings,
    query_results=query_results,
    query_times=query_times,
    chunks=chunks,
    original_text=' '.join(texts)
)

# Display metrics.
# The loops below treat `metrics` as {category: {metric_name: value}} and
# format each value with a float format spec.
for category, category_metrics in metrics.items():
    print(f"\n{category.upper()} METRICS:")
    for metric_name, value in category_metrics.items():
        print(f"{metric_name}: {value:.4f}")


In [None]:
# Count how many processed documents were tagged with each language
languages = [record['language'] for record in processed_data]
lang_dist = pd.Series(languages).value_counts()

# Bar chart: one bar per language, height = number of documents
plt.figure(figsize=(8, 6))
lang_axes = lang_dist.plot.bar()
lang_axes.set_title('Distribution of Languages')
lang_axes.set_xlabel('Language')
lang_axes.set_ylabel('Count')
plt.xticks(rotation=45)
plt.show()


In [None]:
# Save metrics to CSV.
# Build the table in a single constructor call: one row per metric category,
# one column per metric name (missing combinations become NaN). This replaces
# growing an empty DataFrame one cell at a time via .loc, which is quadratic
# and leaves column dtypes dependent on insertion order.
metrics_df = pd.DataFrame(
    [dict(category_metrics) for category_metrics in metrics.values()],
    index=list(metrics.keys()),
)

metrics_df.to_csv('../results/embedding_metrics.csv')
print("Saved metrics to embedding_metrics.csv")
