In [None]:
# Setup
# Install requirements (uncomment if needed)
# !pip install -r requirements.txt

import pandas as pd
from src.analyzer import analyze_profiles, load_config
from src.evaluate import evaluate_all_models
from dotenv import load_dotenv
from datetime import datetime

# Pull variables from the local .env file into the environment first
load_dotenv()

# Parse the notebook configuration
config = load_config('config.yaml')

# Input and output locations all live under the `paths` section of the config
INPUT_PATH, RESULTS_OUTPUT_PATH, METRICS_OUTPUT_PATH = (
    config['paths'][key]
    for key in ('input_data', 'results_output', 'metrics_output')
)

# A model entry counts as enabled unless it sets `enabled` to a falsy value
enabled_models = [
    model_cfg
    for model_cfg in config['models']
    if model_cfg.get('enabled', True)
]
print(f"Enabled models: {len(enabled_models)}")
for m in enabled_models:
    print(f"  - {m['name']}: {m['model_id']}")

# Read the input CSV and report its row count and column names
df = pd.read_csv(INPUT_PATH)
print(f"Loaded {len(df)} profiles")
print(f"Columns: {list(df.columns)}")


In [None]:
# Analyze the loaded profiles, then score each model against the human labels

# Tunables are read from the top level of the config; the second argument of
# each .get() is the fallback used when the key is absent
analysis_options = {
    'input_col': config.get('input_column', 'about_me'),
    'batch_size': config.get('batch_size', 10),
    'max_new_tokens': config.get('max_new_tokens', 2000),
}
results = analyze_profiles(df, config, **analysis_options)

# Evaluate only the models that were enabled in the setup cell
model_names = [model_cfg['name'] for model_cfg in enabled_models]

comparison = evaluate_all_models(
    results,
    model_names,
    true_col=config['human_label_column'],  # column holding the human labels
    print_individual_reports=False,  # Set to True if you want detailed reports
)

# Save metrics and results with Spark
from pyspark.sql import SparkSession


def _strip_csv_suffix(path):
    """Return `path` without a trailing '.csv'.

    Only the suffix is removed; a '.csv' occurring elsewhere in the path
    (for example in a directory name) is left untouched.
    """
    suffix = '.csv'
    return path[:-len(suffix)] if path.endswith(suffix) else path


# Initialize Spark session
spark = SparkSession.builder.appName("ProfileAnalyzer").getOrCreate()

try:
    # One timestamp shared by both outputs so the two runs can be paired up
    timestamp = datetime.now().strftime('%Y-%m-%d_%H%M%S')

    # Spark's CSV writer creates a directory at the target path (holding the
    # part file), so the '.csv' extension is dropped from the configured name.
    # coalesce(1) collapses the data into a single partition, i.e. one part file.
    metrics_filename = _strip_csv_suffix(METRICS_OUTPUT_PATH.format(timestamp=timestamp))
    spark_comparison = spark.createDataFrame(comparison)
    spark_comparison.coalesce(1).write.mode('overwrite').option('header', 'true').csv(metrics_filename)
    print(f"\nModel metrics saved to: {metrics_filename}")

    results_filename = _strip_csv_suffix(RESULTS_OUTPUT_PATH.format(timestamp=timestamp))
    spark_results = spark.createDataFrame(results)
    spark_results.coalesce(1).write.mode('overwrite').option('header', 'true').csv(results_filename)
    print(f"Results saved to {results_filename}")
finally:
    # Always release the session, even if a conversion or write fails above,
    # so a failed run does not leave Spark resources held by the kernel
    spark.stop()

