# Reinforcement Learning
- Trains two RL agents (BASE vs. FULL) and compares their results
- Simulates VC portfolio allocation

## Setup

### Install

In [None]:
!pip install --no-cache-dir stable-baselines3

In [None]:
# !pip install pandas numpy joblib scikit-learn gymnasium stable-baselines3 matplotlib seaborn scipy
!pip install gymnasium

In [None]:
!pip install catboost

### Imports

In [None]:
import os
import numpy as np
import pandas as pd
import joblib
import gymnasium
from gymnasium import spaces
from sklearn.preprocessing import MultiLabelBinarizer, StandardScaler
from sklearn.compose import ColumnTransformer
from stable_baselines3 import DQN
from stable_baselines3.common.monitor import Monitor
from stable_baselines3.common.callbacks import EvalCallback
from scipy.stats import mannwhitneyu
import matplotlib.pyplot as plt
from matplotlib.ticker import PercentFormatter
import seaborn as sns
from joblib import load
from sklearn.model_selection import train_test_split

from tqdm.notebook import tqdm
import warnings
warnings.filterwarnings("ignore")

# Ensure display options are set for pandas DataFrames
pd.set_option('display.max_columns', None)
pd.set_option('display.width', 1000)

from google.colab import drive, runtime
drive.mount('/content/drive', force_remount=True)

## Config

In [None]:
# === CONFIG ===
# Google Drive locations: input dataset, pre-trained classifier artifacts, and RL outputs.
INPUT_DIR = '/content/drive/MyDrive/Senior/Thesis/Code/Data/Input Data/model_data'
MODEL_FOLDER = '/content/drive/MyDrive/Senior/Thesis/Code/Data/Output Data/Model'
OUTPUT_DIR = '/content/drive/MyDrive/Senior/Thesis/Code/Data/Output Data/RL_Model'
DATA_PATH = os.path.join(INPUT_DIR, 'merged_startup_data.csv')

# Artifacts for the two agents: "A" uses the STATIC_MODEL files, "B" the FULL_MODEL files.
CLASSIFIER_A_PATH = os.path.join(MODEL_FOLDER, 'best_model_success_label_5y_STATIC_MODEL_CatBoost_20250421-004244.pkl')
CLASSIFIER_B_PATH = os.path.join(MODEL_FOLDER, 'best_model_success_label_5y_FULL_MODEL_CatBoost_20250421-004244.pkl')
PREPROCESSOR_A_PATH = os.path.join(MODEL_FOLDER, 'preprocessor_success_label_5y_STATIC_MODEL_CatBoost_20250421-004244.pkl')
PREPROCESSOR_B_PATH = os.path.join(MODEL_FOLDER, 'preprocessor_success_label_5y_FULL_MODEL_CatBoost_20250421-004244.pkl')
# Scalers applied to the full RL state vector (see StartupInvestmentEnv._get_state).
SCALER_A_PATH = os.path.join(MODEL_FOLDER,'state_scaler_A.pkl')
SCALER_B_PATH = os.path.join(MODEL_FOLDER, 'state_scaler_B.pkl')

# Sanity check: list the model folder so missing artifacts are visible early.
model_folder_contents = os.listdir(MODEL_FOLDER)
print(model_folder_contents)

# Fixed class list for the industry one-hot encoding in load_and_preprocess_data.
known_industries = [
   "Life Sciences", "Fintech", "Consumer Goods", "Technology",
   "Cleantech", "Transportation", "Media Entertainment and Gaming",
   "Telecom", "Real Estate"
]

# RL hyperparameters
# Simulation windows are selected by the startups' `founded_year` (inclusive bounds).
# Training window: 10 years
SIM_TRAIN_START_YEAR = 2004
SIM_TRAIN_END_YEAR = 2013
# Validation window: 3 years
SIM_VALID_START_YEAR = 2014
SIM_VALID_END_YEAR = 2016
# Test window: 4 years
SIM_TEST_START_YEAR = 2017
SIM_TEST_END_YEAR = 2020

# max number of investments is 20 over the time frame
# (INITIAL_CAPITAL / INVESTMENT_UNIT = 20 fixed-size tickets)
INITIAL_CAPITAL = 20_000_000
INVESTMENT_UNIT = 1_000_000
# Training episodes are truncated after this many simulated years (not applied in eval mode).
EPISODE_LENGTH_YEARS = 5

# Reward handed to the agent when it invests, keyed by the startup's `outcome_type`.
# Outcomes missing from the map yield a reward of 0 in the environment.
REWARD_MAP = {
    'ipo': +15,
    'acquisition': +12,
    # 'funded_no_exit': +1,
    'closed': -20,
    'still_active': 0,
    'too_young': 0
}

# Return multiple applied to INVESTMENT_UNIT per portfolio company in evaluate_agent.
RETURNS_MAP = {
    'ipo': 10.0,
    'acquisition': 8.0,
    # 'funded_no_exit': 4.0,
    'closed': 0.0,
    'still_active': 0.0,
    'too_young': 0.0
}

# DQN training budget and evaluation settings.
N_TRAIN_TIMESTEPS = 100000
N_EVAL_EPISODES = 50  # number of test episodes run by evaluate_agent
DQN_GAMMA = 0.99
DQN_LR = 1e-4
DQN_BUFFER_SIZE = 50000
DQN_BATCH_SIZE = 64
DQN_EXPLORATION_FRACTION = 0.3
DQN_EXPLORATION_FINAL_EPS = 0.05

## Loading

In [None]:
def load_and_preprocess_data(data_path, preprocessor_path, scaler_path, only_resolved=False, still_active_sample=0):
    """Load the startup dataset, one-hot encode industries, and load the fitted transformers.

    Args:
        data_path: Path to the CSV. It must contain an ``industry`` column
            (comma-separated labels) and ``founded_year``; ``outcome_type`` is
            also required when ``only_resolved`` is True.
        preprocessor_path: Path to the pickled feature preprocessor.
        scaler_path: Path to the pickled RL state scaler.
        only_resolved: When True, keep only startups whose outcome is
            ``ipo``, ``acquisition`` or ``closed`` (plus the optional sample below).
        still_active_sample: When > 0 (and ``only_resolved`` is True), add up to
            this many ``still_active`` rows AND up to this many ``too_young``
            rows, sampled with a fixed seed. If fewer rows exist than requested,
            all available rows are taken and a warning is printed.

    Returns:
        tuple: ``(df, preprocessor, state_scaler, industry_columns)`` where ``df``
        is sorted by ``founded_year`` with a fresh 0..n-1 index and
        ``industry_columns`` lists the generated one-hot column names.
    """
    df = pd.read_csv(data_path)

    # Multi-hot encode the comma-separated industry labels against the fixed class list.
    industry_lists = df["industry"].fillna("").apply(lambda x: [s.strip() for s in x.split(",")])
    mlb = MultiLabelBinarizer(classes=known_industries)
    industry_df = pd.DataFrame(
        mlb.fit_transform(industry_lists),
        columns=[f"industry_{c.replace(' ', '_').replace('_and_', '_')}" for c in mlb.classes_],
        index=df.index,  # keep rows aligned with df for the column-wise concat
    )
    industry_columns = industry_df.columns.tolist()
    df = pd.concat([df.drop(columns=["industry"]), industry_df], axis=1)

    # --- Optional trimming for resolved outcomes only ---
    if only_resolved:
        initial_count = len(df)

        # Step 1: Keep all startups with known outcomes
        resolved_outcomes = ['ipo', 'acquisition', 'closed']
        df_sampled = df[df['outcome_type'].isin(resolved_outcomes)].copy()

        # Step 2: Add a fixed-seed sample of unresolved startups if requested
        if still_active_sample > 0:
            unresolved_samples = []
            for outcome in ('still_active', 'too_young'):
                pool = df[df['outcome_type'] == outcome]
                # DataFrame.sample raises when n exceeds the population size, so clamp it.
                n_take = min(still_active_sample, len(pool))
                if n_take < still_active_sample:
                    print(f"[WARN] Only {len(pool)} '{outcome}' rows available; "
                          f"requested {still_active_sample}. Using all of them.")
                unresolved_samples.append(pool.sample(n=n_take, random_state=42))
            df_sampled = pd.concat([df_sampled, *unresolved_samples])

        df = df_sampled.sort_values(by="founded_year").reset_index(drop=True)
        print(f"[INFO] Filtered to resolved outcomes + sampled still_active ({still_active_sample}): {len(df)} rows out of {initial_count}")

    else:
        df = df.sort_values(by="founded_year").reset_index(drop=True)

    preprocessor = joblib.load(preprocessor_path)
    state_scaler = joblib.load(scaler_path)

    return df, preprocessor, state_scaler, industry_columns

# Both agents read the same CSV with the same filtering and fixed-seed sampling;
# only the preprocessor and state scaler differ between A and B.
df_A, preprocessor_A, state_scaler_A, industry_columns = load_and_preprocess_data(DATA_PATH, PREPROCESSOR_A_PATH, SCALER_A_PATH, only_resolved=True, still_active_sample=4000)
df_B, preprocessor_B, state_scaler_B, _ = load_and_preprocess_data(DATA_PATH, PREPROCESSOR_B_PATH, SCALER_B_PATH, only_resolved=True, still_active_sample=4000)

# def load_and_preprocess_data(data_path, preprocessor_path):
#     df = pd.read_csv(data_path)
#     df["industry_split"] = df["industry"].fillna("").apply(lambda x: [s.strip() for s in x.split(",")])
#     mlb = MultiLabelBinarizer(classes=known_industries)
#     industry_df = pd.DataFrame(
#         mlb.fit_transform(df["industry_split"]),
#         columns=[f"industry_{c.replace(' ', '_').replace('_and_', '_')}" for c in mlb.classes_]
#     )
#     industry_columns = industry_df.columns.tolist()
#     df = pd.concat([df.drop(columns=["industry", "industry_split"]), industry_df], axis=1)

#     df = df.sort_values(by='founded_year').reset_index(drop=True)
#     preprocessor = load(preprocessor_path)

#     return df, preprocessor, industry_columns

# df_A, preprocessor_A, industry_columns = load_and_preprocess_data(DATA_PATH, PREPROCESSOR_A_PATH)
# df_B, preprocessor_B, _ = load_and_preprocess_data(DATA_PATH, PREPROCESSOR_B_PATH)

# Feature set for Agent A: founder/company/funding columns plus the generated
# industry one-hot columns.
features_static = [
    'founded_year', 'city', 'founder_count', 'founder_gender_diversity',
    'has_top_school_founder', 'num_optimal_degrees', 'optimal_degree_ratio',
    'is_repeat_founder', 'founder_gender_missing', 'founder_degree_missing',
    'founder_school_missing', 'founder_desc_missing', 'has_funding_data',
    'num_disclosed_rounds', 'has_disclosed_funding', 'first_funding_amount_bucket',
    'is_startup_hub', 'cohort_funding_density', 'investor_count', 'has_known_investor'
] + industry_columns

# Static features extended with funding-dynamics and macro/market columns
# (going by the column names). Only used as the base of features_full below.
features_macro = features_static + [
    'age_at_first_funding', 'first_funding_delay', 'early_series_count',
    'avg_time_to_early_round_months', 'avg_time_between_rounds', 'burn_rate',
    'funding_velocity', 'gdp_growth_avg_15m', 'gdp_growth_delta_3m',
    'interest_rate_fed_funds_avg_15m', 'interest_rate_fed_funds_delta_3m',
    'fed_funds_rate_latest', 'yield_curve_10y_2y_avg_15m', 'yield_curve_inversion_flag',
    'unemployment_rate_avg_15m', 'cpi_inflation_avg_15m', 'consumer_sentiment_avg_15m',
    'consumer_sentiment_z_latest', 'vix_index_max_15m', 'vix_spike_flag', 'vix_latest',
    'sp500_price_change_3m', 'sp500_volatility_3m', 'sp500_momentum_latest',
    'avg_etf_return_3m', 'avg_etf_volatility_3m', 'avg_etf_momentum_latest',
    'avg_etf_golden_cross_flag', 'avg_etf_ma50_to_price_ratio'
]

# Feature set for Agent B: everything above plus text-sentiment / similarity /
# LLM-score columns.
features_full = features_macro + [
    'org_desc_sentiment_finbert', 'founder_desc_sentiment_finbert',
    'org_desc_sim_exemplar', 'founder_desc_sim_exemplar', 'llm_founder_score',
    'industry_outlook_sentiment_finbert', 'industry_timing_sentiment_finbert',
    'llm_outlook_align_score_avg', 'llm_outlook_align_score_binned'
]

def load_classifier(path):
    """Best-effort load of a pickled classifier.

    Args:
        path: Location of the joblib-serialised model.

    Returns:
        The deserialised model, or ``None`` if anything goes wrong while
        loading (the error is printed rather than raised).
    """
    try:
        print(f"Loading classifier from {path}")
        model = joblib.load(path)
    except Exception as err:
        # Deliberately non-fatal: callers check for None and skip the agent.
        print(f"Failed to load classifier: {err}")
        return None
    else:
        return model

# Either value is None when loading failed; the run cells check this before training.
classifier_A = load_classifier(CLASSIFIER_A_PATH)
classifier_B = load_classifier(CLASSIFIER_B_PATH)

In [None]:
print(df_A["outcome_type"].value_counts())
# print(df_A.info())

## RL Environment

In [None]:
class StartupInvestmentEnv(gymnasium.Env):
    """Sequential invest/pass environment over startups grouped by founding year.

    Each step presents one startup (the "candidate") from the current simulation
    year. The agent chooses ``1`` (invest one ``investment_unit``) or ``0`` (pass).
    Candidates of a year are shown in shuffled order; the simulation moves to the
    next year once the year's candidates are exhausted or
    ``max_investments_per_year`` investments have been made.

    Observation layout (before scaling with ``state_scaler``)::

        [preprocessed candidate features..., classifier P(success),
         capital_ratio, year_progress, portfolio_ratio]

    The observation returned by ``reset``/``step`` always describes the candidate
    that the NEXT action will be applied to.
    """
    metadata = {"render_modes": ["human"], "render_fps": 4}

    def __init__(self, df, classifier, feature_list, preprocessor, state_scaler, reward_map,
                 sim_start_year, sim_end_year, initial_capital, investment_unit,
                 episode_length_years=None, is_eval=False):
        """Build the environment.

        Args:
            df: Startup table; must contain every column in ``feature_list`` plus
                ``founded_year`` and ``outcome_type``.
            classifier: Model exposing ``predict_proba`` on preprocessed features.
            feature_list: Columns fed to ``preprocessor`` (differs per agent).
            preprocessor: Fitted transformer exposing ``transform``.
            state_scaler: Fitted scaler applied to the full state vector.
            reward_map: ``outcome_type`` -> reward for investing.
            sim_start_year: First ``founded_year`` in the simulation (inclusive).
            sim_end_year: Last ``founded_year`` in the simulation (inclusive).
            initial_capital: Capital available at the start of each episode.
            investment_unit: Fixed ticket size per investment.
            episode_length_years: If set and ``is_eval`` is False, episodes are
                truncated once this many years have elapsed.
            is_eval: Disables truncation so the full window is always played.

        Raises:
            ValueError: If a required column is missing or no startup falls into
                the simulation window.
        """
        super().__init__()
        self.df = df
        self.classifier = classifier
        self.feature_list = feature_list  # this one varies between agents
        self.preprocessor = preprocessor
        self.state_scaler = state_scaler
        self.reward_map = reward_map
        self.sim_start_year = sim_start_year
        self.sim_end_year = sim_end_year
        self.initial_capital = initial_capital
        self.investment_unit = investment_unit
        self.episode_length_years = episode_length_years
        self.is_eval = is_eval

        self.max_investments_per_year = 5
        self.context_features_dim = 3

        # Validate inputs BEFORE touching the preprocessor so that problems surface
        # as the intended ValueError rather than a KeyError/IndexError from pandas.
        required_cols = list(feature_list) + ['founded_year', 'outcome_type']
        for col in required_cols:
            if col not in self.df.columns:
                raise ValueError(f"Missing required column: {col}")

        in_window = ((self.df['founded_year'] >= self.sim_start_year) &
                     (self.df['founded_year'] <= self.sim_end_year))
        self.sim_data = self.df[in_window].copy()
        if self.sim_data.empty:
            raise ValueError("No data found for specified simulation window.")
        # Keep the original df index in an 'index' column; it identifies startups
        # in the portfolio, the investment log and the reward lookup.
        self.sim_data.reset_index(drop=False, inplace=True)

        self.state_dim = self._estimate_state_dim()

        self.action_space = spaces.Discrete(2)
        self.observation_space = spaces.Box(low=-np.inf, high=np.inf, shape=(self.state_dim,), dtype=np.float32)

    def _estimate_state_dim(self):
        """Return the state length: preprocessed features + 1 probability + context features."""
        dummy_row = self.df.iloc[0][self.feature_list]
        transformed = self.preprocessor.transform(pd.DataFrame([dummy_row.to_dict()]))
        return transformed.shape[1] + 1 + self.context_features_dim

    def _get_state(self):
        """Build the scaled observation for the current candidate.

        Returns an all-zero vector when no candidate is left in the current year.
        Preprocessor/classifier failures are printed and replaced by zeros so a
        single bad row does not abort a training run.
        """
        if self.candidate_idx_in_year >= len(self.current_year_candidates):
            return np.zeros(self.state_dim, dtype=np.float32)

        row = self.current_year_candidates.iloc[self.candidate_idx_in_year]
        try:
            row_data = row[self.feature_list].replace([np.inf, -np.inf], np.nan).fillna(0)
            candidate_df = pd.DataFrame([row_data.to_dict()])
            transformed_features = self.preprocessor.transform(candidate_df).flatten()
        except Exception as e:
            print(f"Preprocessor failed at idx {row['index']}: {e}")
            transformed_features = np.zeros(self.state_dim - 1 - self.context_features_dim)

        try:
            clf_input = transformed_features.reshape(1, -1)
            prob_success = self.classifier.predict_proba(clf_input)[0, 1]
        except Exception as e:
            print(f"Classifier failed at idx {row['index']}: {e}")
            prob_success = 0.0

        # NOTE: a considered alternative is to build the RL state from a fixed
        # (static-only) feature list for both agents and use the agent-specific
        # feature list only for the classifier probability. Not implemented.

        capital_ratio = self.available_capital / self.initial_capital
        year_progress = (self.current_sim_year - self.start_year_episode) / max(1, self.sim_end_year - self.start_year_episode)
        portfolio_ratio = len(self.portfolio) / max(1, (self.initial_capital // self.investment_unit))

        context_features = np.array([capital_ratio, year_progress, portfolio_ratio], dtype=np.float32)
        raw_state = np.concatenate([transformed_features, [prob_success], context_features])
        state = self.state_scaler.transform(raw_state.reshape(1, -1)).flatten()

        return state.astype(np.float32)

    def _calculate_reward(self, idx):
        """Look up the reward for investing in the startup at original df index ``idx``."""
        outcome_type = self.df.loc[idx, 'outcome_type']
        return self.reward_map.get(outcome_type, 0)

    def reset(self, seed=None, options=None):
        """Start a new episode at ``sim_start_year`` with full capital and an empty portfolio.

        Passing ``seed`` makes the candidate shuffling reproducible.
        """
        super().reset(seed=seed)
        self.current_sim_year = self.sim_start_year
        self.start_year_episode = self.sim_start_year
        self.available_capital = self.initial_capital
        self.portfolio = {}
        self.investment_log = []
        self.investments_this_year = 0

        # Refresh and shuffle candidates for this year
        self._refresh_candidates_for_year()
        if len(self.current_year_candidates) == 0:
            # The start year has no startups: skip ahead so the first observation
            # describes a real candidate. __init__ guarantees that at least one
            # year inside the window is non-empty.
            self._advance_year()
        return self._get_state(), {}

    def _refresh_candidates_for_year(self):
        """Load and shuffle the startups founded in ``current_sim_year``."""
        # Draw the shuffle seed from the env's own generator (seeded via reset)
        # instead of the global numpy RNG, so reset(seed=...) is honoured.
        shuffle_seed = int(self.np_random.integers(0, 1_000_000))
        self.current_year_candidates = (
            self.sim_data[self.sim_data['founded_year'] == self.current_sim_year]
            .sample(frac=1, random_state=shuffle_seed)  # shuffle candidates
            .reset_index(drop=False)  # 'index' is taken, so this adds 'level_0'
        )
        self.candidate_idx_in_year = 0

    def _year_exhausted(self):
        """Return True when no further decision can be taken in the current year."""
        return (self.candidate_idx_in_year >= len(self.current_year_candidates)
                or self.investments_this_year >= self.max_investments_per_year)

    def _advance_year(self):
        """Move to the next year that has candidates.

        Returns:
            bool: False when the simulation window is exhausted, True otherwise.
        """
        while True:
            self.current_sim_year += 1
            if self.current_sim_year > self.sim_end_year:
                return False
            self.investments_this_year = 0
            self._refresh_candidates_for_year()
            if len(self.current_year_candidates) > 0:
                return True

    def step(self, action):
        """Apply ``action`` to the current candidate and move to the next one.

        Args:
            action: ``1`` to invest one ``investment_unit`` (only honoured while
                capital and the yearly investment quota allow it), anything else
                to pass.

        Returns:
            tuple: ``(observation, reward, terminated, truncated, info)``.
            ``terminated`` is True once the last simulation year is finished;
            ``truncated`` is True in training mode once ``episode_length_years``
            years have elapsed. The observation is all zeros in both cases.
        """
        terminated = False
        truncated = False
        reward = 0

        # Defensive guard: step() called although nothing is left in this year
        # (e.g. stepping again after the episode ended).
        if self._year_exhausted() and not self._advance_year():
            return np.zeros(self.state_dim, dtype=np.float32), 0, True, False, {}

        row = self.current_year_candidates.iloc[self.candidate_idx_in_year]
        idx = row['index']
        decision_year = self.current_sim_year
        self.candidate_idx_in_year += 1

        invested = False
        if action == 1 and self.available_capital >= self.investment_unit and self.investments_this_year < self.max_investments_per_year:
            self.available_capital -= self.investment_unit
            reward = self._calculate_reward(idx)
            self.portfolio[idx] = {
                'invest_year': decision_year,
                'status': 'active',
                'reward': reward,
                'outcome_type': row['outcome_type']
            }
            invested = True
            self.investments_this_year += 1

        self.investment_log.append({
            'year': decision_year,
            'startup_idx': idx,
            'decision': 'invest' if invested else 'pass'
        })

        # Roll over to the next year BEFORE building the next observation, so the
        # agent always observes the candidate its next action is applied to.
        if self._year_exhausted():
            terminated = not self._advance_year()

        if not terminated and not self.is_eval and self.episode_length_years is not None:
            if (self.current_sim_year - self.start_year_episode) >= self.episode_length_years:
                truncated = True

        if terminated or truncated:
            next_state = np.zeros(self.state_dim, dtype=np.float32)
        else:
            next_state = self._get_state()

        info = {
            'capital': self.available_capital,
            'portfolio_size': len(self.portfolio),
            'current_year': decision_year,
            'invested_this_step': invested,
            'startup_idx': idx,
            'true_outcome': row['outcome_type']
        }

        return next_state, reward, terminated, truncated, info

    def render(self, mode="human"):
        """Print a one-line status summary; only the ``human`` mode is supported."""
        if mode == "human":
            print(f"Year: {self.current_sim_year}, Capital: {self.available_capital:.0f}, Portfolio: {len(self.portfolio)}")
        else:
            # gymnasium.Env.render takes no `mode` argument, so delegating to
            # super() would fail with a TypeError; be explicit instead.
            raise NotImplementedError(f"Unsupported render mode: {mode}")

    def close(self):
        """Nothing to release."""
        pass

## Training & Eval

In [None]:
def train_agent(env_id, agent_name, df, classifier, feature_list, preprocessor, state_scaler, reward_map,
                train_start, train_end, valid_start, valid_end, episode_length):
    """Train a DQN agent on the training window and validate it periodically.

    Args:
        env_id: Accepted for interface compatibility; not used by this function.
        agent_name: Prefix for every file and folder written under OUTPUT_DIR.
        df, classifier, feature_list, preprocessor, state_scaler, reward_map:
            Forwarded unchanged to ``StartupInvestmentEnv``.
        train_start, train_end: Inclusive year window of the training env.
        valid_start, valid_end: Inclusive year window of the validation env.
        episode_length: Truncation length (years) for training episodes.

    Returns:
        str: Path where the evaluation callback stores the best checkpoint
        (``<OUTPUT_DIR>/<agent_name>_best/best_model.zip``).
    """
    print(f"\n--- Training Agent: {agent_name} ---")

    final_model_path = os.path.join(OUTPUT_DIR, f'{agent_name}_dqn')
    best_model_dir = os.path.join(OUTPUT_DIR, f'{agent_name}_best')
    log_dir = os.path.join(OUTPUT_DIR, f'{agent_name}_logs')
    os.makedirs(log_dir, exist_ok=True)

    def _make_env(start_year, end_year, **mode_kwargs):
        # Both environments share everything except the year window and the mode flags.
        return StartupInvestmentEnv(df, classifier, feature_list, preprocessor, state_scaler, reward_map,
                                    start_year, end_year, INITIAL_CAPITAL, INVESTMENT_UNIT,
                                    **mode_kwargs)

    train_env = Monitor(
        _make_env(train_start, train_end, episode_length_years=episode_length, is_eval=False),
        log_dir,
    )
    eval_env = Monitor(
        _make_env(valid_start, valid_end, is_eval=True),
        os.path.join(log_dir, 'eval'),
    )

    # Validate 20 times over the training budget, but never more often than every 500 steps.
    eval_every = max(N_TRAIN_TIMESTEPS // 20, 500)
    eval_callback = EvalCallback(
        eval_env,
        best_model_save_path=best_model_dir,
        log_path=os.path.join(log_dir, 'eval_results'),
        eval_freq=eval_every,
        n_eval_episodes=5,
        deterministic=True,
        render=False,
    )

    dqn_settings = dict(
        gamma=DQN_GAMMA,
        learning_rate=DQN_LR,
        buffer_size=DQN_BUFFER_SIZE,
        batch_size=DQN_BATCH_SIZE,
        exploration_fraction=DQN_EXPLORATION_FRACTION,
        exploration_final_eps=DQN_EXPLORATION_FINAL_EPS,
        verbose=1,
        tensorboard_log=log_dir,
    )
    model = DQN("MlpPolicy", train_env, **dqn_settings)

    model.learn(total_timesteps=N_TRAIN_TIMESTEPS, callback=eval_callback, log_interval=100)
    model.save(final_model_path)

    print(f"Final model saved to {final_model_path}")
    print(f"Best model saved to {best_model_dir}")

    return os.path.join(best_model_dir, 'best_model.zip')

def evaluate_agent(model_path, agent_name, df, classifier, feature_list, preprocessor, state_scaler,
                   reward_map, returns_map, test_start, test_end):
    """Roll out a saved DQN policy on the test window and collect financial metrics.

    Runs ``N_EVAL_EPISODES`` deterministic-policy episodes in an eval-mode
    ``StartupInvestmentEnv`` and values each portfolio company at
    ``returns_map[outcome] * INVESTMENT_UNIT`` (multiplier 1.0 for outcomes
    missing from ``returns_map``).

    Args:
        model_path: Path of the saved DQN model.
        agent_name: Label used in the printed header.
        df, classifier, feature_list, preprocessor, state_scaler, reward_map:
            Forwarded unchanged to ``StartupInvestmentEnv``.
        returns_map: ``outcome_type`` -> return multiple.
        test_start, test_end: Inclusive year window of the test env.

    Returns:
        tuple: ``(results_df, investments_by_episode)`` - one metrics row per
        episode, and a dict mapping the 1-based episode number to the list of
        startup indices invested in during that episode.
    """
    print(f"\n--- Evaluating Agent: {agent_name} on Test Set ---")
    policy = DQN.load(model_path)

    test_env = StartupInvestmentEnv(
        df, classifier, feature_list, preprocessor, state_scaler, reward_map,
        test_start, test_end, INITIAL_CAPITAL, INVESTMENT_UNIT, is_eval=True
    )

    n_years = test_end - test_start + 1  # holding period used for the IRR approximation
    episode_rows = []
    investments_by_episode = {}  # tracking the startups invested in

    for episode_no in range(1, N_EVAL_EPISODES + 1):
        obs, _ = test_env.reset()
        capital_invested = 0
        reward_total = 0
        episode_over = False

        # --- Roll out one episode ---
        while not episode_over:
            action, _ = policy.predict(obs, deterministic=True)
            obs, reward, terminated, truncated, info = test_env.step(action)
            reward_total += reward
            if info.get("invested_this_step", False):
                capital_invested += INVESTMENT_UNIT
            episode_over = terminated or truncated

        # --- Outcome-based returns ---
        total_return = 0
        outcome_counts = dict.fromkeys(returns_map, 0)
        for holding in test_env.portfolio.values():
            outcome = holding['outcome_type']
            # fallback to 1.0 (capital returned) if the outcome is unknown
            total_return += returns_map.get(outcome, 1.0) * INVESTMENT_UNIT

            if outcome not in outcome_counts:
                print(f"Warning: Unexpected outcome '{outcome}'")
                continue
            outcome_counts[outcome] += 1

        # --- Metrics (denominator floored at 1 to avoid dividing by zero) ---
        denominator = max(capital_invested, 1)
        moic = total_return / denominator
        roi = (total_return - capital_invested) / denominator
        if capital_invested > 0:
            approx_irr = (moic ** (1 / n_years)) - 1
        else:
            approx_irr = 0

        n_holdings = len(test_env.portfolio)
        investment_rate = n_holdings / max(1, len(test_env.investment_log))

        # track investments
        investments_by_episode[episode_no] = list(test_env.portfolio.keys())

        episode_rows.append({
            "Episode": episode_no,
            "Investments": n_holdings,
            "Investment Rate": investment_rate,
            "Capital Invested": capital_invested,
            "Total Return": total_return,
            "MOIC": moic,
            "ROI": roi,
            "Approx IRR": approx_irr,
            "Reward Score (non-financial)": reward_total,
            **outcome_counts
        })

        print(f"Episode {episode_no}: MOIC = {moic:.2f}, ROI = {roi:.2%}, IRR ≈ {approx_irr:.2%}, Total Return = ${total_return:,.0f}")

    return pd.DataFrame(episode_rows), investments_by_episode

In [None]:
def extract_best_episode_portfolio(eval_df, df, all_episode_investments, metric="Total Return", agent_name="Agent"):
    """
    Extracts the startup rows from the best-performing evaluation episode.

    The selected rows are also written to
    ``<OUTPUT_DIR>/best_episode_portfolio_<agent_name>.csv``.

    Args:
        eval_df (pd.DataFrame): Output of evaluate_agent(); needs an 'Episode'
            column and the ``metric`` column.
        df (pd.DataFrame): Startup table whose index labels match the indices
            stored in ``all_episode_investments``.
        all_episode_investments (dict): Mapping from episode number → list of invested startup indices.
        metric (str): Column to use for selecting best episode (e.g. 'Total Return').
        agent_name (str): Used for file naming and labeling.

    Returns:
        pd.DataFrame: DataFrame of startup rows for the best episode, with
        'Episode' and 'Agent' columns added. Empty when ``eval_df`` has no rows
        or the best episode made no investments.
    """
    # Nothing to rank: avoid the IndexError that .iloc[0] would raise on an empty frame.
    if eval_df is None or eval_df.empty:
        print(f"[Warning] No evaluation episodes available for {agent_name}.")
        return pd.DataFrame()

    # Selecting a row from a mixed int/float frame upcasts it to float, so the
    # episode id comes back as e.g. 3.0; cast it back to the int used as dict key.
    best_ep = int(eval_df.sort_values(by=metric, ascending=False).iloc[0]["Episode"])
    best_indices = all_episode_investments.get(best_ep, [])

    if not best_indices:
        print(f"[Warning] No investments found for best episode {best_ep} of {agent_name}.")
        return pd.DataFrame()

    portfolio_df = df.loc[best_indices].copy()
    portfolio_df["Episode"] = best_ep
    portfolio_df["Agent"] = agent_name

    # Save to CSV (creating the output folder if this is the first artifact written)
    os.makedirs(OUTPUT_DIR, exist_ok=True)
    filename = f"best_episode_portfolio_{agent_name.replace(' ', '_')}.csv"
    filepath = os.path.join(OUTPUT_DIR, filename)
    portfolio_df.to_csv(filepath, index=False)
    print(f"[Saved] Best episode portfolio for {agent_name} → {filepath}")

    return portfolio_df

## Running

### Agent A

#### Run

In [None]:
# === Run Full Training + Evaluation Pipeline ===
# Skipped entirely when the static classifier failed to load (load_classifier returned None);
# in that case df_eval_A / df_best_A are never defined and the cells below will fail.
if classifier_A:
  # Train on the training window, validating on the validation window; returns the best checkpoint path.
  best_model_A_path = train_agent('Agent_A_Static', 'Agent_A_Static', df_A, classifier_A, features_static,
                                  preprocessor_A, state_scaler_A, REWARD_MAP, SIM_TRAIN_START_YEAR, SIM_TRAIN_END_YEAR,
                                  SIM_VALID_START_YEAR, SIM_VALID_END_YEAR, EPISODE_LENGTH_YEARS)

  # Evaluate the best checkpoint on the held-out test window.
  df_eval_A, investments_A = evaluate_agent(best_model_A_path, 'Agent_A_Static', df_A, classifier_A, features_static,
                              preprocessor_A, state_scaler_A, REWARD_MAP, RETURNS_MAP, SIM_TEST_START_YEAR, SIM_TEST_END_YEAR)

  # Startup rows of the episode with the highest total return (also saved as CSV).
  df_best_A = extract_best_episode_portfolio(df_eval_A, df_A, investments_A, metric="Total Return", agent_name="Agent A")

In [None]:
df_eval_A.to_csv(os.path.join(OUTPUT_DIR, 'STATIC_RL_evaluation.csv'), index=False)

#### Preview

In [None]:
# print(df_eval_A.info())
print(df_eval_A.head(50))

In [None]:
# print(df_best_A.info())
print(df_best_A.head(50))

### Agent B

#### Run

In [None]:
# Skipped entirely when the full classifier failed to load (load_classifier returned None);
# in that case df_eval_B / df_best_B are never defined and the cells below will fail.
if classifier_B:
  # === Run Full Training + Evaluation Pipeline ===
  # Train on the training window, validating on the validation window; returns the best checkpoint path.
  best_model_B_path = train_agent('Agent_B_Full', 'Agent_B_Full', df_B, classifier_B, features_full,
                                  preprocessor_B, state_scaler_B, REWARD_MAP, SIM_TRAIN_START_YEAR, SIM_TRAIN_END_YEAR,
                                  SIM_VALID_START_YEAR, SIM_VALID_END_YEAR, EPISODE_LENGTH_YEARS)

  # Evaluate the best checkpoint on the held-out test window.
  df_eval_B, investments_B = evaluate_agent(best_model_B_path, 'Agent_B_Full', df_B, classifier_B, features_full,
                              preprocessor_B, state_scaler_B, REWARD_MAP, RETURNS_MAP, SIM_TEST_START_YEAR, SIM_TEST_END_YEAR)

  # Startup rows of the episode with the highest total return (also saved as CSV).
  df_best_B = extract_best_episode_portfolio(df_eval_B, df_B, investments_B, metric="Total Return", agent_name="Agent B")

In [None]:
df_eval_B.to_csv(os.path.join(OUTPUT_DIR, 'FULL_RL_evaluation.csv'), index=False)

#### Preview

In [None]:
# print(df_eval_B.info())
print(df_eval_B.head(50))

In [None]:
# print(df_best_B.info())
print(df_best_B.head(50))

## Compare and Visualization

### Functions

In [None]:
def compare_portfolio_returns(df_A, df_B, agent_name_A="Agent A", agent_name_B="Agent B"):
    """
    Compares the evaluation results of two agents.

    Prints per-agent summary statistics, runs a one-sided Mann-Whitney U test on
    'Total Return' (alternative: agent A < agent B), writes
    ``agent_comparison_summary.csv`` to OUTPUT_DIR, and saves/shows four
    histogram comparisons (total return, IRR, reward score, MOIC).

    Args:
        df_A (pd.DataFrame): Evaluation results from agent A.
        df_B (pd.DataFrame): Evaluation results from agent B.
            Both must include 'Total Return', 'Approx IRR',
            'Reward Score (non-financial)', 'MOIC' and 'ROI'.
        agent_name_A (str): Display name of agent A.
        agent_name_B (str): Display name of agent B.
    """
    # Pull every plotted series up front so a missing column fails before any output.
    plotted_columns = ("Total Return", "Approx IRR", "Reward Score (non-financial)", "MOIC")
    series_pairs = {name: (df_A[name], df_B[name]) for name in plotted_columns}

    known_outcomes = ['ipo', 'acquisition', 'closed', 'still_active', 'too_young']
    summary_rows = []

    print("\n--- Total Return + Financial Metrics Comparison ---")

    for frame, label in ((df_A, agent_name_A), (df_B, agent_name_B)):
        returns = frame["Total Return"]
        mean_return = returns.mean()
        std_return = returns.std()
        mean_roi = frame["ROI"].mean()
        mean_reward = frame["Reward Score (non-financial)"].mean()
        mean_moic = frame["MOIC"].mean()
        mean_irr = frame["Approx IRR"].mean()

        # Outcome columns (based on known outcome types)
        present_outcomes = [c for c in frame.columns if c.lower() in known_outcomes]
        outcome_means = frame[present_outcomes].mean().to_dict()

        # Store for CSV
        row = {
            "Agent": label,
            "Mean Total Return": mean_return,
            "Std Total Return": std_return,
            "Mean Reward Score": mean_reward,
            "Mean ROI": mean_roi,
            "Mean MOIC": mean_moic,
            "Mean IRR": mean_irr,
        }
        for outcome, avg_count in outcome_means.items():
            row[f"Avg {outcome.title()}"] = avg_count
        summary_rows.append(row)

        # Print summary
        print(f"\n{label}")
        print(f"- Mean Total Return: ${mean_return:,.0f} (±{std_return:,.0f})")
        print(f"- Mean Total Reward Score: {mean_reward:.2f}")
        print(f"- Mean ROI: {mean_roi:.2%}")
        print(f"- Mean MOIC: {mean_moic:.3f}")
        print(f"- Mean IRR: {mean_irr:.2%}")
        print("\nAverage Outcome Counts per Episode:")
        ranked = sorted(outcome_means.items(), key=lambda item: -item[1])
        for outcome, avg_count in ranked:
            print(f"  - {outcome}: {avg_count:.2f}")

    # Run statistical test (one-sided: is A's return distribution lower than B's?)
    u_stat, p_val = mannwhitneyu(df_A["Total Return"], df_B["Total Return"], alternative='less')
    is_significant = p_val < 0.05
    print(f"\nMann-Whitney U test ({agent_name_A} < {agent_name_B}): U = {u_stat:.2f}, p = {p_val:.4f}")
    if is_significant:
        print(f"→ Statistically significant: {agent_name_B} outperforms {agent_name_A}.")
    else:
        print("→ No significant difference detected.")

    if is_significant:
        significance_text = f"Statistically significant: {agent_name_B} outperforms {agent_name_A}."
    else:
        significance_text = "→ No significant difference detected."

    # The test row reuses the "Std Total Return" column to carry the p-value.
    summary_rows.append({
        "Agent": "Statistical Test",
        "Mann-Whitney U Test": f"\nMann-Whitney U test ({agent_name_A} < {agent_name_B}): U = {u_stat:.2f}, p = {p_val:.4f}",
        "Significance": significance_text,
        "Std Total Return": p_val
    })

    # Save to CSV
    csv_path = os.path.join(OUTPUT_DIR, "agent_comparison_summary.csv")
    pd.DataFrame(summary_rows).to_csv(csv_path, index=False)
    print(f"\n[Saved] Comparison summary CSV → {csv_path}")

    # --- Plotting helper ---
    def plot_metric(dist_A, dist_B, title, xlabel, filename, is_percentage=False):
        """Overlay both agents' histograms (with KDE) for one metric, then save and show."""
        mean_fmt = ".2%" if is_percentage else ",.2f"
        plt.figure(figsize=(10, 5))
        overlays = (
            (dist_A, 'skyblue', agent_name_A),
            (dist_B, 'lightcoral', agent_name_B),
        )
        for dist, color, name in overlays:
            sns.histplot(dist, kde=True, color=color, label=f'{name}\nMean={np.mean(dist):{mean_fmt}}')
        plt.title(title)
        plt.xlabel(xlabel)
        plt.ylabel('Frequency')
        if is_percentage:
            plt.gca().xaxis.set_major_formatter(PercentFormatter(1.0))
        plt.legend()
        plt.tight_layout()
        save_path = os.path.join(OUTPUT_DIR, filename)
        plt.savefig(save_path)
        print(f"{title} plot saved to {save_path}")
        plt.show()

    # --- Generate all four plots ---
    plot_specs = [
        ("Total Return", "Distribution of Total Returns", "Total Return ($)", "return_comparison.png", False),
        ("Approx IRR", "Distribution of Approximate IRRs", "Approximate IRR", "irr_comparison.png", True),
        ("Reward Score (non-financial)", "Distribution of Reward Scores", "Reward Score (Non-Financial)", "reward_score_comparison.png", False),
        ("MOIC", "Distribution of MOIC", "MOIC", "moic_comparison.png", False),
    ]
    for column, title, xlabel, filename, as_percentage in plot_specs:
        series_A, series_B = series_pairs[column]
        plot_metric(series_A, series_B, title, xlabel, filename, is_percentage=as_percentage)


In [None]:
def analyze_evaluation_results(eval_df, agent_name):
    """
    Print outcome totals and chart per-episode metrics for one agent's evaluation run.

    One line chart is drawn for each of Total Return, MOIC, ROI and Approx IRR
    against Episode, followed by a bar chart of the summed outcome columns.

    Args:
        eval_df (pd.DataFrame): Evaluation table (presumably the output of
            evaluate_agent()). A None or empty frame is reported and skipped.
        agent_name (str): Label prefixed to every chart title.
    """
    if eval_df is None or eval_df.empty:
        print("Evaluation DataFrame is empty. Skipping analysis.")
        return

    sns.set(style="whitegrid")

    # Any column that is not one of these known metric columns is counted as an outcome type.
    metric_columns = (
        "Episode", "Investments", "Investment Rate", "Capital Invested", "Total Return",
        "MOIC", "ROI", "Approx IRR", "Reward Score (non-financial)",
    )
    outcome_cols = [name for name in eval_df.columns if name not in metric_columns]
    outcome_totals = eval_df[outcome_cols].sum().sort_values(ascending=False)
    print("\nOutcome Type Distribution Across All Episodes:")
    print(outcome_totals)

    # (column to plot, title suffix, y-axis label) for each per-episode line chart, in display order.
    line_charts = (
        ("Total Return", "Total Return per Evaluation Episode", "Total Return ($)"),
        ("MOIC", "MOIC per Evaluation Episode", "MOIC"),
        ("ROI", "ROI per Evaluation Episode", "ROI"),
        ("Approx IRR", "IRR per Evaluation Episode (approx.)", "IRR"),
    )
    for column, title_suffix, y_label in line_charts:
        plt.figure(figsize=(10, 4))
        sns.lineplot(data=eval_df, x="Episode", y=column, marker="o")
        plt.title(f"{agent_name} – {title_suffix}")
        plt.xlabel("Episode")
        plt.ylabel(y_label)
        plt.show()

    # Outcome distribution: totals are already sorted in descending order.
    plt.figure(figsize=(10, 4))
    sns.barplot(x=outcome_totals.index, y=outcome_totals.values)
    plt.title(f"{agent_name} – Total Outcome Counts Across Evaluation")
    plt.ylabel("Count")
    plt.xticks(rotation=45)
    plt.show()


### Visualizations

In [None]:
# Read the STATIC and FULL RL evaluation CSVs from OUTPUT_DIR (A = STATIC, B = FULL).
df_eval_A = pd.read_csv(os.path.join(OUTPUT_DIR, 'STATIC_RL_evaluation.csv'))
df_eval_B = pd.read_csv(os.path.join(OUTPUT_DIR, 'FULL_RL_evaluation.csv'))

In [None]:
# Head-to-head comparison of the two evaluation tables: writes
# agent_comparison_summary.csv to OUTPUT_DIR and plots the return, IRR,
# reward-score and MOIC distributions.
compare_portfolio_returns(df_eval_A, df_eval_B)

In [None]:
# Per-agent diagnostics: printed outcome totals plus per-episode metric charts.
analyze_evaluation_results(df_eval_A, 'Agent_A_Static')
analyze_evaluation_results(df_eval_B, 'Agent_B_Full')

## (Run Once) Scaler

In [None]:
# === CONFIG ===
TRAIN_YEARS = (2010, 2018) # RL training period; used as inclusive (first, last) bounds on founded_year
SCALER_STATIC_PATH = os.path.join(MODEL_FOLDER,'state_scaler_A.pkl') # Output file for the STATIC state scaler (keeps the A/B naming convention)
SCALER_FULL_PATH = os.path.join(MODEL_FOLDER, 'state_scaler_B.pkl') # Output file for the FULL state scaler (keeps the A/B naming convention)

# Maximum number of rows used to fit each scaler; a larger training slice is randomly
# down-sampled to this size (to avoid memory issues if data is huge)
MAX_SAMPLES_FOR_SCALER = 50000

# === Helper function (Corrected) ===
def fit_and_save_state_scaler(df, feature_list, preprocessor, classifier, scaler_save_path, label,
                              *, train_years=None, max_samples=None):
    """
    Fit a StandardScaler on synthetic RL state vectors and save it with joblib.

    The state matrix is assembled column-wise as
    [preprocessed features | classifier probability of class 1 | 3 context features],
    where the context columns (capital_ratio, year_progress, portfolio_ratio) are
    drawn uniformly at random so the scaler sees varying values for them.

    Every failure is reported with print() and the function returns early
    instead of raising.

    Args:
        df (pd.DataFrame): Source data; must contain 'founded_year' and every
            column named in feature_list.
        feature_list (list[str]): Raw feature columns passed to the preprocessor.
        preprocessor: Fitted transformer exposing transform(); output that has a
            toarray() method is densified.
        classifier: Fitted model exposing predict_proba(); column 1 is used.
        scaler_save_path (str): Destination file for the fitted scaler.
        label (str): Name used in log messages (e.g. "STATIC" / "FULL").
        train_years (tuple[int, int] | None): Inclusive (first, last) founded_year
            bounds. Defaults to the module-level TRAIN_YEARS.
        max_samples (int | None): Row cap applied before fitting. Defaults to the
            module-level MAX_SAMPLES_FOR_SCALER.

    Returns:
        None
    """
    # Resolve the defaults at call time so the notebook-level constants still apply.
    if train_years is None:
        train_years = TRAIN_YEARS
    if max_samples is None:
        max_samples = MAX_SAMPLES_FOR_SCALER

    print(f"\n--- Fitting state scaler for {label} model ---")
    print(f"Filtering data to years {train_years[0]}-{train_years[1]}...")
    df_train = df[(df['founded_year'] >= train_years[0]) & (df['founded_year'] <= train_years[1])].copy()

    if df_train.empty:
        print(f"No training data found for years {train_years}. Cannot fit scaler for {label}.")
        return

    # Sample if the dataset slice is very large
    if len(df_train) > max_samples:
        print(f"Sampling {max_samples} from {len(df_train)} rows for scaler fitting.")
        df_train = df_train.sample(n=max_samples, random_state=42)

    # Check if feature list is empty or None
    if not feature_list:
        print(f"Error: Feature list for {label} is empty or None. Skipping.")
        return

    # Ensure all features are in the dataframe before selection
    missing_features = [f for f in feature_list if f not in df_train.columns]
    if missing_features:
        print(f"Error: Missing features in dataframe for {label}: {missing_features}. Skipping.")
        return

    # Non-inplace replace: mutating a column-subset slice in place is the
    # chained-assignment pattern pandas warns about.
    X_raw = df_train[feature_list].replace([np.inf, -np.inf], np.nan)

    print(f"Preprocessing {len(X_raw)} samples...")
    try:
        X_transformed = preprocessor.transform(X_raw)
        # Handle sparse matrix output if necessary
        if hasattr(X_transformed, "toarray"):
            X_transformed = X_transformed.toarray()
    except Exception as e:
        print(f"Error during preprocessing transform for {label}: {e}. Skipping.")
        return

    print(f"Getting classifier probabilities for {label}...")
    # Purely informational: not every classifier wrapper exposes n_features_in_,
    # so a missing attribute must not abort the run.
    print("Classifier expected:", getattr(classifier, "n_features_in_", "unknown"))
    print("Transformed features:", X_transformed.shape[1])

    try:
        probs = classifier.predict_proba(X_transformed)[:, 1].reshape(-1, 1)
    except Exception as e:
        print(f"Error getting probabilities for {label}: {e}. Skipping.")
        return

    print("Generating sampled context features...")
    n_samples = X_transformed.shape[0]
    # --- Generate VARYING Context Features ---
    # A private generator seeded with 42 yields the same draws as the former
    # np.random.seed(42) + np.random.uniform calls, without reseeding the global
    # NumPy RNG that other notebook cells may rely on.
    rng = np.random.RandomState(42)
    context = np.hstack([
        rng.uniform(0.1, 1.0, size=(n_samples, 1)),  # capital_ratio varies
        rng.uniform(0.0, 1.0, size=(n_samples, 1)),  # year_progress varies
        rng.uniform(0.0, 0.8, size=(n_samples, 1)),  # portfolio_ratio varies
    ])

    print("Concatenating state matrix...")
    try:
        state_matrix = np.hstack([X_transformed, probs, context])
    except ValueError as e:
        print(f"Error during hstack (check shapes): {e}")
        print(f"X_transformed shape: {X_transformed.shape}")
        print(f"probs shape: {probs.shape}")
        print(f"context shape: {context.shape}")
        return

    print(f"Fitting StandardScaler on state matrix with shape {state_matrix.shape}...")
    try:
        state_scaler = StandardScaler().fit(state_matrix)
    except Exception as e:
        print(f"Error fitting StandardScaler for {label}: {e}. Skipping.")
        return

    print(f"Saving scaler for {label}...")

    try:
        # os.makedirs('') raises, so only create the directory when the path has one.
        save_dir = os.path.dirname(scaler_save_path)
        if save_dir:
            os.makedirs(save_dir, exist_ok=True)
        joblib.dump(state_scaler, scaler_save_path)
        print(f"{label} state scaler saved to: {scaler_save_path}")
        print(f"Scaler expects input shape: ({state_scaler.n_features_in_},)")  # Info
    except Exception as e:
        print(f"Error saving scaler for {label} to {scaler_save_path}: {e}")

# === Run for both STATIC and FULL ===
# Each call fits a state scaler from that variant's dataframe, feature list,
# preprocessor and classifier, and saves it to the matching SCALER_*_PATH.
fit_and_save_state_scaler(
  df=df_A,
  feature_list=features_static,
  preprocessor=preprocessor_A,
  classifier=classifier_A,
  scaler_save_path=SCALER_STATIC_PATH,
  label="STATIC"
)

fit_and_save_state_scaler(
  df=df_B,
  feature_list=features_full,
  preprocessor=preprocessor_B,
  classifier=classifier_B,
  scaler_save_path=SCALER_FULL_PATH,
  label="FULL"
)
