# Import Libraries

In [None]:
import sys
import os
from pathlib import Path
import matplotlib.pyplot as plt
import numpy as np
from sklearn.model_selection import train_test_split

# Locate the project root. The default below is specific to the original
# author's machine; set the AI_POPULATION_ROOT environment variable to run the
# notebook from a different checkout without editing this cell.
project_root = Path(
    os.environ.get(
        "AI_POPULATION_ROOT",
        "/Users/raymondlow/Documents/talking-to-machines/ai-population",
    )
).expanduser().resolve()

# Work from the project root so that relative paths resolve against it, and
# put it on sys.path (once) so the `ai_population` package can be imported.
os.chdir(project_root)
if str(project_root) not in sys.path:
    sys.path.insert(0, str(project_root))

# Set __package__ so that relative imports work.
__package__ = "ai_population.analysis"

import pandas as pd
from sklearn.metrics import (
    accuracy_score,
    f1_score,
    roc_auc_score,
    precision_score,
    recall_score,
    precision_recall_curve,
    roc_curve,
    auc
)
from ai_population.src.market_signals_tiktok import (
    perform_tiktok_profile_search,
    perform_tiktok_profile_metadata_search,
    perform_tiktok_onboarding_interview,
    generate_expert_reflections,
)
from ai_population.src.utils import (
    perform_video_transcription,
)

# Lift pandas' row and column display limits so DataFrames print in full.
for _display_option in ("display.max_rows", "display.max_columns"):
    pd.set_option(_display_option, None)

# Run configuration passed to the pipeline helpers.
PROJECT_NAME = "market-signals-tiktok"
EXECUTION_DATE = "ground-truth"
# NOTE(review): whether these dates are MM-DD-YYYY or DD-MM-YYYY is decided by
# the helper that consumes them — confirm before changing.
START_DATE = "12-01-2024"
END_DATE = "06-01-2025"

# Download profile metadata and posts for ground truth finfluencers and non-finfluencers

In [None]:
# Run the profile-metadata search over the ground-truth profile list.
# NOTE(review): how the helper resolves these relative file names is not
# visible here — confirm against market_signals_tiktok.
metadata_search_args = {
    "project_name": PROJECT_NAME,
    "execution_date": EXECUTION_DATE,
    "input_file_path": f"{EXECUTION_DATE}/ground_truth_profile_list.csv",
    "output_file_path": "ground_truth_profile_metadata.csv",
}
perform_tiktok_profile_metadata_search(**metadata_search_args)

In [None]:
# Run the profile search over the same ground-truth profile list, bounded by
# the configured START_DATE / END_DATE.
profile_list_csv = f"{EXECUTION_DATE}/ground_truth_profile_list.csv"
perform_tiktok_profile_search(
    input_file_path=profile_list_csv,
    output_file_path="ground_truth_profile_posts.csv",
    start_date=START_DATE,
    end_date=END_DATE,
    project_name=PROJECT_NAME,
    execution_date=EXECUTION_DATE,
)

# Download and transcribe videos from ground truth finfluencers and non-finfluencers

In [None]:
# Run video transcription, pointing the helper at the posts file produced by
# the profile search.
transcription_args = {
    "project_name": PROJECT_NAME,
    "execution_date": EXECUTION_DATE,
    "video_file": "ground_truth_profile_posts.csv",
}
perform_video_transcription(**transcription_args)

# Generate Expert Reflections

In [None]:
# Arguments that are identical for every expert-role pass.
shared_reflection_args = {
    "project_name": PROJECT_NAME,
    "execution_date": EXECUTION_DATE,
    "post_file": "ground_truth_profile_posts.csv",
    "output_file": "ground_truth_expert_reflection.csv",
}

# The first pass takes the raw profile metadata as input. Every later pass
# takes the reflection file as input, which is the same file it writes to.
# NOTE(review): presumably each pass adds its role's reflections to that file
# — confirm in generate_expert_reflections.
generate_expert_reflections(
    role="portfolio_manager",
    profile_metadata_file="ground_truth_profile_metadata.csv",
    **shared_reflection_args,
)
generate_expert_reflections(
    role="investment_advisor",
    profile_metadata_file="ground_truth_expert_reflection.csv",
    **shared_reflection_args,
)
generate_expert_reflections(
    role="financial_analyst",
    profile_metadata_file="ground_truth_expert_reflection.csv",
    **shared_reflection_args,
)
generate_expert_reflections(
    role="economist",
    profile_metadata_file="ground_truth_expert_reflection.csv",
    **shared_reflection_args,
)

# Conduct Onboarding Interview for ground truth finfluencers and non-finfluencers

In [None]:
# Run the onboarding interview using the expert-reflection file as the profile
# metadata input and the ground-truth posts file as the post input.
onboarding_interview_args = {
    "project_name": PROJECT_NAME,
    "execution_date": EXECUTION_DATE,
    "profile_metadata_file": "ground_truth_expert_reflection.csv",
    "post_file": "ground_truth_profile_posts.csv",
    "output_file": "ground_truth_onboarding_results.csv",
}
perform_tiktok_onboarding_interview(**onboarding_interview_args)

# Perform Identification Evaluation

In [None]:
def _evaluate_split(y_true, y_score, threshold, split_name="Validation"):
    """
    Score one data split at a fixed threshold, plot its curves, print a summary.

    A sample is predicted positive when ``y_score >= threshold``. Accuracy,
    macro-F1, precision and recall are computed from those hard predictions,
    while ROC AUC is computed from the raw scores. A precision–recall plot and
    a ROC plot are then shown and the numeric metrics are printed.

    Args:
        y_true: Ground-truth binary labels for the split.
        y_score: Scores for the positive class. Must support element-wise
            ``>=`` followed by ``.astype`` (e.g. a NumPy array).
        threshold: Decision cut-off applied to ``y_score``.
        split_name: Label used in plot titles, legends and the printed header.

    Returns:
        dict with keys ``accuracy``, ``macro_f1``, ``precision``, ``recall``,
        ``auc``, ``threshold`` and ``split`` (in that order).
    """

    def _finish_plot(x_label, y_label, title):
        # Shared decoration for both figures, then render.
        plt.xlabel(x_label)
        plt.ylabel(y_label)
        plt.title(title)
        plt.legend()
        plt.grid(True)
        plt.show()

    # Hard labels at the chosen cut-off.
    hard_labels = (y_score >= threshold).astype(int)

    # Keyword arguments are evaluated left to right, so the metric calls run
    # in the order listed here.
    metrics = dict(
        accuracy=accuracy_score(y_true, hard_labels),
        macro_f1=f1_score(y_true, hard_labels, average="macro"),
        precision=precision_score(y_true, hard_labels, zero_division=0),
        recall=recall_score(y_true, hard_labels, zero_division=0),
        auc=roc_auc_score(y_true, y_score),
        threshold=threshold,
        split=split_name,
    )

    # Precision–recall curve.
    pr_precision, pr_recall, _ = precision_recall_curve(y_true, y_score)
    plt.figure(figsize=(6, 4))
    plt.plot(pr_recall, pr_precision, label=f"{split_name} PR curve")
    _finish_plot("Recall", "Precision", f"Precision–Recall ({split_name})")

    # ROC curve, with the chance diagonal for reference.
    false_pos_rate, true_pos_rate, _ = roc_curve(y_true, y_score)
    plt.figure(figsize=(6, 4))
    plt.plot(false_pos_rate, true_pos_rate,
             label=f"{split_name} ROC (AUC = {metrics['auc']:.2f})")
    plt.plot([0, 1], [0, 1], "k--", label="Random")
    _finish_plot("False Positive Rate", "True Positive Rate",
                 f"ROC Curve ({split_name})")

    # Text summary: numeric metrics only, then the threshold on its own line.
    print(f"── {split_name} metrics ──")
    for metric_name in ("accuracy", "macro_f1", "precision", "recall", "auc"):
        print(f"{metric_name:>10s}: {metrics[metric_name]:.5f}")
    print(f"Threshold used: {threshold:.3f}\n")

    return metrics


def calculate_metrics_with_split(results_df: pd.DataFrame,
                                 ground_truth_finfluencer: list,
                                 val_fraction: float = 0.50,
                                 random_state: int = 42) -> "tuple[pd.DataFrame, pd.DataFrame]":
    """
    Tune a finfluencer decision threshold on a validation split and evaluate it.

    The 0–100 likelihood answer in ``results_df`` is rescaled to a 0–1 score,
    rows without a numeric answer are dropped, and each row is labelled
    positive when its ``account_id`` is in ``ground_truth_finfluencer``. The
    rows are split (stratified on the label) into a validation and a hold-out
    set; the hard-coded example profiles are always placed in the validation
    set. The threshold that maximises F1 on the validation set is selected,
    and both sets are evaluated at that threshold via ``_evaluate_split``
    (which plots and prints).

    Args:
        results_df: Onboarding results. Must contain ``account_id`` and the
            0–100 finfluencer-likelihood answer column. Not modified.
        ground_truth_finfluencer: Account ids that are true finfluencers.
        val_fraction: Forwarded to ``train_test_split`` as ``test_size``, i.e.
            the fraction of non-example rows placed in the HOLD-OUT set; the
            validation set receives the remainder.
            NOTE(review): the name suggests the validation share; the two
            readings coincide only at the default 0.50.
        random_state: Seed for the split.

    Returns:
        ``(validate_df, holdout_df)``, each carrying the added columns
        ``finfluencer_prob``, ``finfluencer`` and ``predicted_label``.
    """
    # ─── 1.  Prep ────────────────────────────────────────────────────────
    # Copy first so the caller's DataFrame does not gain extra columns.
    results_df = results_df.copy()
    results_df["finfluencer_prob"] = pd.to_numeric(
        results_df["Indicate on a scale of 0 to 100, how likely this creator is a finfluencer (0 means most definitely not a finfluencer and 100 means most definitely a finfluencer)? - value"],
        errors="coerce"
    ) / 100.0
    # Non-numeric answers were coerced to NaN above; they cannot be scored.
    results_df = results_df.dropna(subset=["finfluencer_prob"]).reset_index(drop=True)
    results_df["finfluencer"] = results_df["account_id"].isin(ground_truth_finfluencer).astype(int)

    EXAMPLE_PROFILES = [
        "fung.money", "humphreytalks", "financialtimes",  # Finfluencers
        "cbsmornings", "grahamstephan", "bellapoarch"  # Non-Finfluencers
    ]
    # Example profiles are kept out of the random split and added to the
    # validation set afterwards, so they can never land in the hold-out set.
    is_example = results_df["account_id"].isin(EXAMPLE_PROFILES)
    example_df = results_df[is_example].reset_index(drop=True).copy()
    df = results_df[~is_example].reset_index(drop=True).copy()

    # ─── 2.  Split ───────────────────────────────────────────────────────
    validate_df, holdout_df = train_test_split(
        df,
        test_size=val_fraction,
        stratify=df["finfluencer"],
        random_state=random_state,
    )
    validate_df = pd.concat([validate_df, example_df], ignore_index=True)

    # ─── 3.  Threshold search on validation ──────────────────────────────
    y_val_true  = validate_df["finfluencer"].values
    y_val_score = validate_df["finfluencer_prob"].values

    prec, rec, thresh = precision_recall_curve(y_val_true, y_val_score)
    # prec[i] / rec[i] describe predictions with score >= thresh[i]. The final
    # (precision=1, recall=0) point has no threshold, so drop the LAST element
    # to align with thresh. Dropping the first element instead would pair each
    # F1 value with the threshold one position too low.
    prec, rec = prec[:-1], rec[:-1]
    f1 = 2 * prec * rec / (prec + rec + 1e-18)  # epsilon guards against 0/0
    best_threshold = thresh[np.argmax(f1)]

    # ─── 4.  Evaluate on validation & hold-out ───────────────────────────
    _ = _evaluate_split(y_val_true, y_val_score, best_threshold,
                        split_name="Validation")

    y_hold_true  = holdout_df["finfluencer"].values
    y_hold_score = holdout_df["finfluencer_prob"].values
    _ = _evaluate_split(y_hold_true, y_hold_score, best_threshold,
                        split_name="Hold-out")

    # ─── 5.  Annotate & return both splits ───────────────────────────────
    # assign() returns new frames, which avoids writing into a frame derived
    # from `df` by train_test_split (SettingWithCopyWarning).
    validate_df = validate_df.assign(
        predicted_label=(validate_df["finfluencer_prob"] >= best_threshold).astype(int)
    )
    holdout_df = holdout_df.assign(
        predicted_label=(holdout_df["finfluencer_prob"] >= best_threshold).astype(int)
    )

    return validate_df, holdout_df

In [None]:
# Every file read or written in this cell sits in the same directory:
# ai_population/data/<PROJECT_NAME>/<EXECUTION_DATE>/ (relative to the cwd).
ground_truth_dir = Path("ai_population/data") / PROJECT_NAME / EXECUTION_DATE

onboarding_results = pd.read_csv(ground_truth_dir / "ground_truth_onboarding_results.csv")
ground_truth = pd.read_csv(ground_truth_dir / "ground_truth_profile_list.csv")

# Accounts marked "Yes" in the ground-truth list form the positive class.
is_finfluencer = ground_truth["finfluencer"] == "Yes"
ground_truth_finfluencer = ground_truth.loc[is_finfluencer, "account_id"].tolist()

validate_df, holdout_df = calculate_metrics_with_split(
    onboarding_results,
    ground_truth_finfluencer,
    val_fraction=0.50,      # 50 % / 50 % split
    random_state=42         # reproducible
)

# Save both splits next to the inputs, without the index column.
for split_df, csv_name in (
    (validate_df, "ground_truth_validation_set.csv"),
    (holdout_df, "ground_truth_holdout_set.csv"),
):
    split_df.to_csv(ground_truth_dir / csv_name, index=False)