# Import Libraries

In [None]:
import sys
import os
from pathlib import Path

# Make the project root the working directory and put it on sys.path so that
# the `ai_population` package can be imported from this notebook.
# The root can be overridden with the AI_POPULATION_ROOT environment variable;
# when it is unset, the original hard-coded checkout location is used.
project_root = Path(
    os.environ.get(
        "AI_POPULATION_ROOT",
        '/Users/raymondlow/Documents/talking-to-machines/ai-population',
    )
).expanduser().resolve()
# os.chdir raises FileNotFoundError if the directory does not exist.
os.chdir(project_root)
if str(project_root) not in sys.path:
    sys.path.insert(0, str(project_root))

# Set __package__ so that relative imports work.
__package__ = "ai_population.analysis"

import pandas as pd
from sklearn.metrics import accuracy_score, f1_score, roc_auc_score
from ai_population.src.market_signals_x import (
    perform_x_profile_search,
    perform_x_profile_metadata_search,
    perform_x_onboarding_interview,
)

# Lift pandas' row and column display limits so DataFrames are shown in full.
for _display_option in ('display.max_rows', 'display.max_columns'):
    pd.set_option(_display_option, None)

# Run identifiers: passed to the market_signals_x helpers and used below to
# build the ai_population/data/<PROJECT_NAME>/<EXECUTION_DATE>/ paths.
# EXECUTION_DATE holds a run label here rather than a calendar date.
PROJECT_NAME = "market-signals-x"
EXECUTION_DATE = "ground-truth-v2"
# Previous start of the post-search window, kept for reference:
# START_DATE = "2024-12-01"
# NOTE(review): START_DATE is later than END_DATE as written, so the window
# passed to perform_x_profile_search is inverted. Confirm whether this is
# intended or whether one of the two dates is a typo.
START_DATE = "2025-06-13"
END_DATE = "2025-06-01"

# Download profile metadata and posts for ground truth finfluencers and non-finfluencers

In [None]:
# Run the profile-metadata search for the accounts listed in the ground-truth
# profile list and write the result to ground_truth_profile_metadata.csv.
# NOTE(review): the input path is prefixed with EXECUTION_DATE while the output
# is a bare file name; presumably the helper resolves both under the project's
# data directory. Confirm in ai_population.src.market_signals_x.
perform_x_profile_metadata_search(
    project_name=PROJECT_NAME,
    execution_date=EXECUTION_DATE,
    input_file_path=f"{EXECUTION_DATE}/ground_truth_profile_list.csv",
    output_file_path="ground_truth_profile_metadata.csv",
)

In [None]:
# Run the post search for the accounts in the ground-truth profile list,
# restricted to the START_DATE..END_DATE window.
# NOTE(review): START_DATE ("2025-06-13") is currently later than END_DATE
# ("2025-06-01"); confirm the helper accepts an inverted window.
# NOTE(review): this writes "ground_truth_profile_posts-v2.csv", but the
# onboarding-interview cell reads "ground_truth_profile_posts.csv". Confirm
# which file name is intended so the interview uses the posts fetched here.
perform_x_profile_search(
    project_name=PROJECT_NAME,
    execution_date=EXECUTION_DATE,
    input_file_path=f"{EXECUTION_DATE}/ground_truth_profile_list.csv",
    output_file_path="ground_truth_profile_posts-v2.csv",
    start_date=START_DATE,
    end_date=END_DATE,
)

# Conduct Onboarding Interview for ground truth finfluencers and non-finfluencers

In [None]:
# Run the onboarding interview using the profile metadata and post files and
# write the answers to ground_truth_onboarding_results.csv, which the
# evaluation cell loads afterwards.
# NOTE(review): post_file is "ground_truth_profile_posts.csv", whereas the
# post-search cell in this notebook writes "ground_truth_profile_posts-v2.csv".
# Confirm this is not reading a stale file from an earlier run.
perform_x_onboarding_interview(
    project_name=PROJECT_NAME, 
    execution_date=EXECUTION_DATE,
    profile_metadata_file="ground_truth_profile_metadata.csv", 
    post_file="ground_truth_profile_posts.csv", 
    output_file="ground_truth_onboarding_results.csv",
)

# Perform Identification Evaluation

In [None]:
def calculate_metrics(
    results_df: pd.DataFrame,
    ground_truth_finfluencer: list,
    *,
    excluded_accounts: tuple = ("TheStalwart", "AswathDamodaran", "LizAnnSonders"),
) -> tuple:
    """Score the predicted finfluencer labels against the ground truth.

    Rows belonging to ``excluded_accounts`` are removed, the
    'Is this a finfluencer? - category' answers are mapped to 1 ('Yes') and
    0 ('No'), and rows with any other answer (including missing values) are
    dropped before scoring. The input frame is not modified.

    Args:
        results_df: Onboarding results containing at least the columns
            'account_id' and 'Is this a finfluencer? - category'.
        ground_truth_finfluencer: Account ids labelled as finfluencers; every
            other remaining account is treated as a non-finfluencer.
        excluded_accounts: Account ids to leave out of the evaluation.
            Defaults to the three accounts that were provided as examples.

    Returns:
        A tuple ``(accuracy, macro_f1, auc)``. The AUC is computed from the
        hard 0/1 predictions, not from predicted probabilities.
    """
    prediction_col = 'Is this a finfluencer? - category'

    # Exclude provided examples. Take an explicit copy so the column
    # assignment below never writes to a slice of the caller's frame.
    is_excluded = results_df['account_id'].isin(list(excluded_accounts))
    scored = results_df.loc[~is_excluded].copy()

    # Convert the answers to binary values; anything other than 'Yes'/'No'
    # becomes NaN and is removed.
    scored[prediction_col] = scored[prediction_col].map({'Yes': 1, 'No': 0})
    scored = scored.dropna(subset=[prediction_col]).reset_index(drop=True)

    # True labels via a vectorised membership test instead of a per-row
    # lookup in a Python list.
    y_true = scored['account_id'].isin(list(ground_truth_finfluencer)).astype(int)
    # The NaN-capable mapping leaves floats (1.0/0.0); cast back to int labels.
    y_pred = scored[prediction_col].astype(int)

    accuracy = accuracy_score(y_true, y_pred)
    macro_f1 = f1_score(y_true, y_pred, average='macro')
    auc = roc_auc_score(y_true, y_pred)

    return accuracy, macro_f1, auc

# Load the onboarding results and the ground-truth profile list, both stored
# under ai_population/data/<PROJECT_NAME>/<EXECUTION_DATE>/.
_data_dir = os.path.join("ai_population/data", PROJECT_NAME, EXECUTION_DATE)
onboarding_results = pd.read_csv(os.path.join(_data_dir, "ground_truth_onboarding_results.csv"))
ground_truth = pd.read_csv(os.path.join(_data_dir, "ground_truth_profile_list.csv"))

# Accounts whose ground-truth label is "Yes" form the positive class.
_is_finfluencer = ground_truth["finfluencer"] == "Yes"
ground_truth_finfluencer = ground_truth.loc[_is_finfluencer, "account_id"].tolist()

# Score the predictions and report each metric to five decimal places.
accuracy, macro_f1, auc = calculate_metrics(onboarding_results, ground_truth_finfluencer)
print(f'Accuracy: {accuracy:.5f}')
print(f'Macro-averaged F1 score: {macro_f1:.5f}')
print(f'AUC score: {auc:.5f}')