In [0]:
# Databricks notebook source

# 1. Setup
# Install the project's dependencies from the repo's requirements.txt using the
# %pip magic. This must stay in its own cell at the very top of the notebook.
%pip install -r /Workspace/Repos/vthedataeng@gmail.com/wfa_profile_analyzer/requirements.txt
# Restart the Python process after the install. NOTE(review): per the Databricks
# docs this discards all Python state, so nothing should be defined above this
# line that later cells rely on.
dbutils.library.restartPython()


In [0]:
# COMMAND ----------
# 2. Imports & config

import logging
import sys
from pathlib import Path
import pandas as pd

# Notebook-level settings. CONFIG_PATH is derived from REPO_ROOT so the repo
# location only has to be edited in one place.
REPO_ROOT = Path("/Workspace/Repos/vthedataeng@gmail.com/wfa_profile_analyzer")
CONFIG_PATH = REPO_ROOT / "config" / "config.yaml"
LOG_LEVEL = "INFO"

# The `src.*` imports below are resolved relative to the repo root. Make sure
# it is importable even when this notebook is run from outside the repo folder
# (guarded so re-running the cell does not add duplicate entries).
if str(REPO_ROOT) not in sys.path:
    sys.path.append(str(REPO_ROOT))

# Import our modules
from src.utils.logger import setup_logging
from src.config import load_config
from src.processing.batch_processor import ProfileBatchProcessor
from src.evaluation.compare import evaluate_multi_model_dataframe

# Translate the LOG_LEVEL string into a `logging` constant; names that are not
# listed here fall back to INFO.
level_map = {
    "DEBUG": logging.DEBUG,
    "INFO": logging.INFO,
    "WARNING": logging.WARNING,
    "ERROR": logging.ERROR,
    "CRITICAL": logging.CRITICAL,
}
setup_logging(level=level_map.get(LOG_LEVEL.upper(), logging.INFO))


In [0]:
# COMMAND ----------
# 3. Load config (includes HF token)

config = load_config(CONFIG_PATH)
print("✅ Config loaded")

# Report only whether a token is present -- never echo the secret itself.
token_present = bool(config.hf_token)
print(f"HF Token loaded from YAML: {token_present}")

# List every enabled model with a 1-based position.
print(f"Models enabled: {len(config.enabled_models)}")
for position, model in enumerate(config.enabled_models, start=1):
    print(f"  {position}. {model.name} -> {model.model_id}")

# Echo the generation settings that the config exposes.
print("\nGeneration:")
generation = config.generation
print(f"  batch_size={generation.batch_size}, max_new_tokens={generation.max_new_tokens}")


In [0]:
# COMMAND ----------
# 4. Initialize processor

# Build the batch processor from the loaded config, then print its model summary.
processor = ProfileBatchProcessor(config)
model_summary = processor.get_model_summary()
print(model_summary)


In [0]:
# COMMAND ----------
# 5. Provide input DataFrame
# Read the input CSV into a DataFrame.
data_path = "/Workspace/Users/vthedataeng@gmail.com/wfa_profile_analyzer/about_me_quality_dataset.csv"
sample_df = pd.read_csv(data_path)

n_rows = len(sample_df)
n_models = len(config.enabled_models)
print(f"Processing {n_rows} rows with {n_models} model(s)...")

# Run the processor over the column named in the config, asking it to parse outputs.
result_df = processor.process_dataframe(
    sample_df,
    input_column=config.data.dataset_column,
    parse_outputs=True,
)

display(result_df)


In [0]:
# COMMAND ----------
# 6. Evaluate (if you have human labels)
# Passes the processed results, the full config, and the list of enabled models
# to the project's evaluation helper.
# NOTE(review): presumably result_df must contain the human-label column(s) the
# helper compares against -- confirm in src.evaluation.compare.

# Left as the cell's last bare expression so that whatever the helper returns
# is rendered as the cell output.
evaluate_multi_model_dataframe(result_df, config, config.enabled_models)


In [0]:
# COMMAND ----------
# 7. (Optional) Save results

# Keep the destination in a single variable so the write and the confirmation
# message cannot disagree (the message previously named a /dbfs/tmp path that
# was never written to).
results_path = "/Workspace/Users/vthedataeng@gmail.com/wfa_profile_analyzer/results.csv"
result_df.to_csv(results_path, index=False)
print(f"Saved to {results_path}")

print("🎉 Done")
