### Synthetic data code, visualizations and some performance analysis on initial weight matrix.

**Equation:**  
The composite confidence score is defined as:

$$
y = x^T W^* x + \alpha\,\log(\text{exit\_value}+1) + \beta\,\log(\text{funding\_amount}+1) + \epsilon,
$$

 where:
 - $x$ is the 26-feature one-hot encoded vector (from 8 founder categories),
- $W^*$ is a ground-truth symmetric weight matrix,
- $\alpha, \beta$ are scalar weights on the exit and funding terms,
- $\epsilon \sim \mathcal{N}(0,\text{noise\_std}^2)$.

**Funding/Exit Model:**  
 - A startup receives funding with probability $p_{funding}$. If funded, the funding amount is sampled from 
    $\text{Funding} \sim \text{LogNormal}(\mu_{\text{funding}}, \sigma_{\text{funding}})$.
 - Conditional on funding, an exit occurs with probability $p_{exit}$; if so, the exit value is sampled from:
   $\text{Exit} \sim \text{LogNormal}(\mu_{\text{exit}}, \sigma_{\text{exit}})$.  
- Otherwise (or if unfunded), exit = 0.

**Success Metric:**  
Higher composite scores should correspond to higher funding and exit values.

All hyperparameters (for founder traits, funding, exit, target scaling) are defined below.


In [265]:
# ! pip install seaborn plotly scipy -q

In [266]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px
from scipy import stats
import sys

sys.path.append('..')
from src.config import cfg

In [None]:
# Pull the synthetic-data settings out of the project config once, so the
# functions below can refer to them as plain module-level names.
MATRIX = cfg.MATRIX  # per-category spec; entries are read below via their 'DIMENSION' and 'WEIGHT' keys
POPULATIONS = cfg.SYNTH['POPULATIONS']  # sub-population configs, keyed by population name
alpha = cfg.SYNTH['alpha']  # multiplies log(exit_value + 1) in the target
beta = cfg.SYNTH['beta']  # multiplies log(funding_amount + 1) in the target
noise_std = cfg.SYNTH['noise_std']  # std of the Gaussian noise added to the target

def one_hot_encode_column(values, dimension):
    """One-hot encode ordinal tier value(s) into ``dimension`` columns.

    Categories with ``dimension == 3`` are coded 1..3 and are shifted down by
    one; every other dimension is treated as already 0-based. Out-of-range
    positions are clipped into ``[0, dimension - 1]`` rather than rejected.

    Args:
        values: Integer scalar or integer array of tier values.
        dimension: Number of one-hot columns.

    Returns:
        Integer array with a trailing axis of length ``dimension``; each row
        holds a single 1.
    """
    zero_based = values - 1 if dimension == 3 else values
    positions = np.clip(zero_based, 0, dimension - 1)
    # Row i of the identity matrix is exactly the one-hot vector for tier i.
    identity = np.identity(dimension, dtype=int)
    return identity[positions]

# Total width of the one-hot feature vector: the sum of every category's DIMENSION.
K = sum(spec['DIMENSION'] for spec in MATRIX.values())
K

## Target def
 
We want the quadratic form $x^T W x$ to approximate a confidence score that is related to exit and funding.
$$
 y = x^T W^* x + \alpha \,\log(\text{exit\_value}+1) + \beta \,\log(\text{funding\_amount}+1) + \epsilon
$$

where:
- $x^T W^* x$ is the intrinsic score computed using a ground-truth weight matrix $W^*$ (simulating the combined effect of the founder’s features),
- The exit and funding components capture external signals of success (using log to compress scales),
- $\alpha, \beta$ are scalars
- $\epsilon$ is Gaussian noise.

In [268]:
def sample_ordinal_for_category(cat, sampling_probs):
    """Draw one ordinal tier for category ``cat``.

    Args:
        cat: Category key; looked up in both ``MATRIX`` and ``sampling_probs``.
        sampling_probs: Mapping from category key to the probability vector
            passed to ``np.random.choice``.

    Returns:
        A tier from 1..3 when the category's DIMENSION is 3, otherwise a tier
        from 0..3.
    """
    n_levels = MATRIX[cat]['DIMENSION']
    weights = sampling_probs[cat]
    # 3-level categories are 1-based; every other category is 0-based with four tiers.
    levels = [1, 2, 3] if n_levels == 3 else [0, 1, 2, 3]
    return np.random.choice(levels, p=weights)

def sample_exit_and_funding(
    p_fund, mu_fund, sig_fund,
    p_exit, mu_exit, sig_exit,
):
    """Sample one (exit value, funding amount) pair.

    Funding happens with probability ``p_fund`` and is then drawn from
    LogNormal(mu_fund, sig_fund). Only funded startups can exit: with
    probability ``p_exit`` the exit value is drawn from
    LogNormal(mu_exit, sig_exit), otherwise it is 0.

    Returns:
        ``(exit_val, funding_amt)``; both are 0 for an unfunded startup.
    """
    # Unfunded: stop here, consuming exactly one uniform draw.
    if not (np.random.rand() < p_fund):
        return 0, 0

    funding_amt = np.random.lognormal(mu_fund, sig_fund)
    has_exit = np.random.rand() < p_exit
    exit_val = np.random.lognormal(mu_exit, sig_exit) if has_exit else 0
    return exit_val, funding_amt

def compute_target(x, W_star, pop_cfg):
    """Compute the noisy composite score for a single feature vector.

    score = x^T W* x + alpha*log(exit + 1) + beta*log(funding + 1) + noise,
    where exit and funding are sampled from the population's funding/exit
    settings and noise ~ N(0, noise_std).

    Args:
        x: Feature vector.
        W_star: Ground-truth weight matrix.
        pop_cfg: Population config providing the "p_funding", "mu_funding",
            "sigma_funding", "p_exit", "mu_exit" and "sigma_exit" entries.

    Returns:
        ``(score, exit_value, funding_amount)``.
    """
    # Intrinsic score: the quadratic form x^T W^* x.
    quadratic = (x @ W_star) @ x

    outcome_keys = (
        "p_funding", "mu_funding", "sigma_funding",
        "p_exit", "mu_exit", "sigma_exit",
    )
    exit_value, funding_amount = sample_exit_and_funding(
        *(pop_cfg[key] for key in outcome_keys)
    )

    exit_term = alpha * np.log(exit_value + 1)
    funding_term = beta * np.log(funding_amount + 1)
    # The noise is drawn last, after the funding/exit draws.
    jitter = np.random.normal(0, noise_std)

    score = quadratic + exit_term + funding_term + jitter
    return score, exit_value, funding_amount

def generate_subpopulation(num_samples, pop_cfg, W_star):
    """Generate ``num_samples`` synthetic rows for one sub-population.

    For every row, one tier is sampled per category in ``MATRIX`` (using
    ``pop_cfg["sampling_probs"]``), the tiers are one-hot encoded and
    concatenated into a feature vector, and the target plus its exit/funding
    values are produced by ``compute_target``.

    Args:
        num_samples: Number of rows to generate; 0 is allowed.
        pop_cfg: Population config (sampling probabilities and funding/exit
            settings).
        W_star: Ground-truth weight matrix passed on to ``compute_target``.

    Returns:
        ``(X, y, exit_vals, fund_vals)``. ``X`` is two-dimensional with one
        column per one-hot feature, and stays that way when no rows are
        generated so callers can still ``np.vstack`` it with other
        sub-populations.
    """
    n_features = sum(MATRIX[cat]['DIMENSION'] for cat in MATRIX)

    X_list, y_list, e_list, f_list = [], [], [], []
    for _ in range(num_samples):
        # One one-hot segment per category, concatenated in MATRIX order.
        x = np.concatenate([
            one_hot_encode_column(
                sample_ordinal_for_category(cat, pop_cfg["sampling_probs"]),
                MATRIX[cat]['DIMENSION'],
            )
            for cat in MATRIX
        ])

        y_val, e_val, f_val = compute_target(x, W_star, pop_cfg)
        X_list.append(x)
        y_list.append(y_val)
        e_list.append(e_val)
        f_list.append(f_val)

    if X_list:
        X = np.array(X_list)
    else:
        # np.array([]) would be 1-D with shape (0,), which cannot be stacked
        # against (n, n_features) blocks; keep the column count explicit.
        X = np.empty((0, n_features), dtype=int)
    return X, np.array(y_list), np.array(e_list), np.array(f_list)

def generate_synthetic_dataset(total_samples, populations, seed=42):
    """Build the ground-truth matrix W* and sample every sub-population.

    W* carries, for each category, ``WEIGHT`` times an ascending run of tier
    values ending at 3 on its diagonal block, plus small symmetric Gaussian
    noise (std 0.005) off the diagonal.

    Args:
        total_samples: Nominal dataset size; each population gets
            ``round(fraction * total_samples)`` rows, so the realised total
            can differ slightly from this value.
        populations: Mapping from population name to its config (must
            include "fraction").
        seed: Seed for NumPy's global RNG; ``None`` leaves the RNG untouched.

    Returns:
        ``(X, y, exit_values, funding_values, labels, W_star)``.
    """
    if seed is not None:
        np.random.seed(seed)

    K = sum(MATRIX[c]['DIMENSION'] for c in MATRIX)

    # Diagonal of W*: one block of weighted tier values per category.
    W_star = np.zeros((K, K))
    offset = 0
    for cat in MATRIX:
        weight = MATRIX[cat]['WEIGHT']
        dim = MATRIX[cat]['DIMENSION']
        diag_idx = np.arange(offset, offset + dim)
        # Tiers always end at 3: (1, 2, 3) for dim 3, (0, 1, 2, 3) for dim 4.
        W_star[diag_idx, diag_idx] = np.arange(4 - dim, 4) * weight
        offset += dim

    # Small off-diagonal interaction noise, then symmetrise.
    jitter = np.random.normal(0, 0.005, (K, K))
    np.fill_diagonal(jitter, 0)
    W_star = W_star + jitter
    W_star = 0.5 * (W_star + W_star.T)

    chunks = []
    labels = []
    for pop_name, pop_cfg in populations.items():
        n_sub = int(round(pop_cfg["fraction"] * total_samples))
        chunks.append(generate_subpopulation(n_sub, pop_cfg, W_star))
        labels.extend([pop_name] * n_sub)

    X_final = np.vstack([chunk[0] for chunk in chunks])
    y_final = np.concatenate([chunk[1] for chunk in chunks])
    exit_final = np.concatenate([chunk[2] for chunk in chunks])
    fund_final = np.concatenate([chunk[3] for chunk in chunks])
    labels = np.array(labels[:len(y_final)])

    return X_final, y_final, exit_final, fund_final, labels, W_star

In [269]:
def get_category_slice(category_name):
    """Return the column slice that ``category_name`` occupies in the feature vector.

    Categories are laid out back to back in ``MATRIX`` iteration order, each
    taking ``DIMENSION`` columns. An unknown name yields the empty
    ``slice(0, 0)`` instead of raising.
    """
    offset = 0
    for name, spec in MATRIX.items():
        width = spec['DIMENSION']
        if name == category_name:
            return slice(offset, offset + width)
        offset += width
    # Fallback for a name that is not in MATRIX.
    return slice(0, 0)

def plot_category_distribution_overall_and_by_pop(X, pop_labels, category_name):
    """Plot how one category's tiers are distributed, overall and per population.

    Draws two figures: (A) absolute counts per tier over all rows, and
    (B) grouped bars with the percentage of each tier within every population.

    Args:
        X: One-hot feature matrix (rows are samples).
        pop_labels: Array with one population label per row of ``X``.
        category_name: Key of the category in ``MATRIX``.
    """
    columns = get_category_slice(category_name)
    n_tiers = MATRIX[category_name]['DIMENSION']

    # The position of the 1 inside the category's block is the tier index, 0..(n_tiers-1).
    tier_of_row = np.argmax(X[:, columns], axis=1)
    overall_counts = np.bincount(tier_of_row, minlength=n_tiers)

    # ------ (A) Overall distribution (counts) ------
    plt.figure(figsize=(8,5))
    overall_bars = plt.bar(range(n_tiers), overall_counts,
                           color='cornflowerblue', edgecolor='black')
    plt.title(f"Overall Distribution of {category_name} (tiers=0..{n_tiers-1})", fontsize=14)
    plt.xlabel("Tier Index", fontsize=12)
    plt.ylabel("Count", fontsize=12)

    for rect in overall_bars:
        top = rect.get_height()
        plt.text(rect.get_x() + rect.get_width()/2., top + 0.5,
                 f'{int(top)}', ha='center', va='bottom', fontsize=9)

    plt.tight_layout()
    plt.show()

    # ------ (B) By-population distribution (percentage) ------
    unique_pops = np.unique(pop_labels)
    n_pops = len(unique_pops)
    plt.figure(figsize=(10,6))
    bar_w = 0.8 / n_pops

    for rank, pop_name in enumerate(unique_pops):
        in_pop = (pop_labels == pop_name)
        pop_counts = np.bincount(tier_of_row[in_pop], minlength=n_tiers)
        pop_pct = 100.0 * pop_counts / pop_counts.sum()

        # Centre the group of bars on each integer tier position.
        x_positions = np.arange(n_tiers) + (rank - n_pops/2)*bar_w + bar_w/2
        pop_bars = plt.bar(x_positions, pop_pct,
                           width=bar_w, label=f"{pop_name} (n={in_pop.sum()})")
        for rect in pop_bars:
            top = rect.get_height()
            if top <= 5:
                # Skip labels on very small bars to avoid clutter.
                continue
            plt.text(rect.get_x() + rect.get_width()/2., top + 0.5,
                     f'{top:.1f}%', ha='center', va='bottom', fontsize=8)

    plt.title(f"{category_name} Tier Distribution by Population (%)", fontsize=14)
    plt.xlabel("Tier Index", fontsize=12)
    plt.ylabel("Percentage within Population", fontsize=12)
    plt.xticks(np.arange(n_tiers), list(map(str, range(n_tiers))))
    plt.legend(loc='best')
    plt.grid(axis='y', linestyle='--', alpha=0.4)
    plt.tight_layout()
    plt.show()

def plot_hist_overall_and_by_pop(values, pop_labels, title, bins=50, log_scale=False):
    """Plot the distribution of ``values`` overall and split by population.

    Draws three figures: an overall histogram annotated with mean/median/std,
    overlaid per-population histograms on the same bin edges, and
    per-population Gaussian KDE curves.

    Args:
        values: 1-D NumPy array of values to plot.
        pop_labels: Array with one population label per entry of ``values``.
        title: Quantity name used in titles and axis labels.
        bins: Number of histogram bins.
        log_scale: If True, plot ``log(1 + x)`` instead of ``x``. The
            statistics shown in annotations and legends are always computed
            on the untransformed values.
    """
    bins = int(bins)

    raw_vals = values
    if log_scale:
        plot_vals = np.log1p(values)  # log(1 + x)
        xlabel_str = f"{title} [log(1 + x)]"
    else:
        plot_vals = values
        xlabel_str = title

    # -----------------------------------------------------------------------
    # Overall histogram
    # -----------------------------------------------------------------------
    plt.figure(figsize=(9,5))

    # Bin edges are computed on the transformed data and shared by all
    # histograms below. `bins` bins need `bins + 1` edges.
    min_val, max_val = plot_vals.min(), plot_vals.max()
    if min_val == max_val:
        # Constant data: widen the range so the single bar has a visible width.
        min_val, max_val = min_val - 0.5, max_val + 0.5
    bin_edges = np.linspace(min_val, max_val, bins + 1)

    plt.hist(plot_vals, bins=bin_edges,
             color='skyblue', edgecolor='black', alpha=0.7)
    plt.title(f"{title} - Overall Distribution", fontsize=14)
    plt.xlabel(xlabel_str, fontsize=12)
    plt.ylabel("Count", fontsize=12)

    # Annotate basic stats (raw scale)
    stats_text = (f"Mean: {raw_vals.mean():.2f}\n"
                  f"Median: {np.median(raw_vals):.2f}\n"
                  f"Std: {raw_vals.std():.2f}")
    plt.annotate(stats_text, xy=(0.70, 0.75), xycoords='axes fraction',
                 bbox=dict(boxstyle="round,pad=0.5", facecolor='white', alpha=0.8))

    plt.grid(axis='y', linestyle='--', alpha=0.3)
    plt.tight_layout()
    plt.show()

    # -----------------------------------------------------------------------
    # By Population - Hist
    # -----------------------------------------------------------------------
    unique_pops = np.unique(pop_labels)
    plt.figure(figsize=(10,6))

    # Distinct color palette, reused for the KDE figure.
    colors = plt.cm.tab10(np.linspace(0, 1, len(unique_pops)))

    for i, pop_name in enumerate(unique_pops):
        mask = (pop_labels == pop_name)
        sub_vals_raw = raw_vals[mask]
        sub_vals_plot = plot_vals[mask]  # log(1 + x) if log_scale

        label_str = (f"{pop_name} (n={mask.sum()}, "
                     f"mean={sub_vals_raw.mean():.2f}, "
                     f"std={sub_vals_raw.std():.2f})")

        plt.hist(sub_vals_plot, bins=bin_edges, alpha=0.6,
                 label=label_str, color=colors[i])

    plt.xlabel(xlabel_str, fontsize=12)
    plt.ylabel("Count", fontsize=12)
    plt.title(f"{title} by Population", fontsize=14)
    plt.legend(loc='best')
    plt.grid(axis='y', linestyle='--', alpha=0.3)
    plt.tight_layout()
    plt.show()

    # -----------------------------------------------------------------------
    # By Population - KDE
    # -----------------------------------------------------------------------
    plt.figure(figsize=(10,6))

    for i, pop_name in enumerate(unique_pops):
        mask = (pop_labels == pop_name)
        sub_vals_raw = raw_vals[mask]
        sub_vals_plot = plot_vals[mask]

        # gaussian_kde needs at least two points with non-zero spread; a
        # constant sample (e.g. a population whose values are all 0) has a
        # singular covariance and would raise, so such populations are skipped.
        if len(sub_vals_plot) > 1 and sub_vals_plot.max() > sub_vals_plot.min():
            kde = stats.gaussian_kde(sub_vals_plot)
            x_eval = np.linspace(sub_vals_plot.min(), sub_vals_plot.max(), 500)
            density = kde(x_eval)

            label_str = f"{pop_name} (mean={sub_vals_raw.mean():.2f})"
            plt.plot(x_eval, density, label=label_str,
                     color=colors[i], linewidth=2)

    plt.xlabel(xlabel_str, fontsize=12)
    plt.ylabel("Density", fontsize=12)
    plt.title(f"{title} Density by Population (KDE)", fontsize=14)
    plt.grid(True, alpha=0.3)
    plt.legend(loc='best')
    plt.tight_layout()
    plt.show()

def plot_scatter_overall_and_by_pop(xvals, yvals, pop_labels, x_title, y_title, log_y=False):
    """Scatter ``yvals`` against ``xvals`` per population, with a trend line each.

    Args:
        xvals: 1-D NumPy array of x values.
        yvals: 1-D NumPy array of y values, same length as ``xvals``.
        pop_labels: Array with one population label per point.
        x_title: Label for the x axis.
        y_title: Label for the y axis.
        log_y: If True, use a log-scaled y axis and fit the trend line to
            ``log(y)``. Only points with ``y > 0`` enter that fit, because
            ``log(0)`` is ``-inf`` and a log axis cannot display such points
            anyway.
    """
    plt.figure(figsize=(10,7))
    unique_pops = np.unique(pop_labels)
    colors = plt.cm.tab10(np.linspace(0, 1, len(unique_pops)))

    for i, pop_name in enumerate(unique_pops):
        mask = (pop_labels == pop_name)
        x_pop = xvals[mask]
        y_pop = yvals[mask]

        plt.scatter(x_pop, y_pop, alpha=0.5, label=pop_name, color=colors[i])

        # Choose the points and the response scale used for the linear fit.
        if log_y:
            positive = y_pop > 0
            x_fit = x_pop[positive]
            y_fit = np.log(y_pop[positive])
        else:
            x_fit = x_pop
            y_fit = y_pop

        # Fit a line only when there are enough usable points.
        if len(x_fit) > 3:
            trend = np.poly1d(np.polyfit(x_fit, y_fit, 1))
            x_range = np.linspace(x_pop.min(), x_pop.max(), 50)
            # Map the fitted log-values back to the data scale for plotting.
            y_line = np.exp(trend(x_range)) if log_y else trend(x_range)
            plt.plot(x_range, y_line, '--', color=colors[i],
                     linewidth=2, alpha=0.7)

    if log_y:
        plt.yscale('log')

    # Anchor the x axis at 0 only when no point would be cut off by doing so.
    if len(xvals) == 0 or np.min(xvals) >= 0:
        plt.xlim(left=0)

    plt.title(f"{x_title} vs. {y_title} by Population", fontsize=14)
    plt.xlabel(x_title, fontsize=12)
    plt.ylabel(y_title + (" [log scale]" if log_y else ""), fontsize=12)
    plt.grid(True, alpha=0.3)
    plt.legend(loc='best')
    plt.tight_layout()
    plt.show()

def plot_feature_importance(W_star):
    """Bar-plot the features with the largest total absolute weight in ``W_star``.

    A feature's importance is the sum of absolute values in its column of
    ``W_star``. At most the top 15 features are shown, labelled
    ``<category>_<tier index>`` in ``MATRIX`` order.
    """
    importance = np.abs(W_star).sum(axis=0)

    labels = [
        f"{cat_name}_{i}"
        for cat_name in MATRIX
        for i in range(MATRIX[cat_name]['DIMENSION'])
    ]

    # Descending order of importance.
    order = np.argsort(importance)[::-1]
    ranked_imp = importance[order]
    ranked_labels = [labels[idx] for idx in order]

    top_k = min(15, len(ranked_labels))
    positions = range(top_k)

    plt.figure(figsize=(10, 6))
    bars = plt.bar(positions, ranked_imp[:top_k], color='teal')
    plt.xticks(positions, ranked_labels[:top_k], rotation=45, ha='right')
    plt.title('Top Feature Importance (from W*)', fontsize=14)
    plt.xlabel('Feature', fontsize=12)
    plt.ylabel('Importance Score', fontsize=12)

    # Print each bar's value just above it.
    for rect in bars:
        top = rect.get_height()
        plt.text(rect.get_x() + rect.get_width()/2., top + 0.5,
                 f"{top:.2f}", ha='center', va='bottom', fontsize=8)

    plt.tight_layout()
    plt.show()

def plot_correlation_heatmap(X, y, exit_vals, fund_vals, feature_names=None):
    """Plot correlation heatmaps for the features and the three outcome columns.

    The first figure shows the lower triangle of the full correlation matrix
    (features plus 'Score', 'Exit', 'Funding'); the second shows only the
    3x3 block among those outcome columns.

    Args:
        X: Feature matrix (rows are samples).
        y: Composite scores.
        exit_vals: Exit values.
        fund_vals: Funding amounts.
        feature_names: Optional column names for ``X``; defaults to
            ``<category>_<tier index>`` in ``MATRIX`` order.
    """
    if feature_names is None:
        feature_names = [
            f"{cat_name}_{i}"
            for cat_name in MATRIX
            for i in range(MATRIX[cat_name]['DIMENSION'])
        ]

    target_cols = ['Score', 'Exit', 'Funding']
    table = pd.DataFrame(
        np.column_stack([X, y, exit_vals, fund_vals]),
        columns=feature_names + target_cols,
    )
    corr = table.corr()

    # Hide the upper triangle (diagonal included); it mirrors the lower one.
    upper = np.triu(np.ones(corr.shape, dtype=bool))
    cmap = sns.diverging_palette(230, 20, as_cmap=True)

    plt.figure(figsize=(12, 10))
    sns.heatmap(corr, mask=upper, cmap=cmap, center=0,
                square=True, linewidths=.5, cbar_kws={"shrink": .5},
                annot=True, fmt=".2f")
    plt.title('Correlation Heatmap: Features & Targets', fontsize=16)
    plt.tight_layout()
    plt.show()

    # Zoom in on the correlations among the outcome columns only.
    plt.figure(figsize=(6, 5))
    sns.heatmap(corr.loc[target_cols, target_cols], cmap=cmap, center=0, square=True,
                linewidths=.5, cbar_kws={"shrink": .5}, annot=True, fmt=".2f")
    plt.title('Correlation among Targets', fontsize=14)
    plt.tight_layout()
    plt.show()

def print_funding_exit_stats(fund_vals, exit_vals, label="Overall"):
    """Print descriptive statistics for funding amounts and exit values.

    Each series is summarised with ``describe`` at the 1/5/25/50/75/95/99
    percentiles, under a header naming ``label``.
    """
    quantiles = [0.01, 0.05, 0.25, 0.5, 0.75, 0.95, 0.99]
    table = pd.DataFrame({'Funding': fund_vals, 'Exit': exit_vals})

    print(f"----- {label} Summary -----")
    sections = (
        ("Funding Stats (in $):", 'Funding'),
        ("\nExit Stats (in $):", 'Exit'),
    )
    for heading, column in sections:
        print(heading)
        print(table[column].describe(percentiles=quantiles))
    print("\n")

In [270]:
# Nominal dataset size; each population receives round(fraction * N) rows.
N = 5000
# Generate the dataset with the default seed (42) so the run is reproducible.
X_syn, y_syn, exit_syn, fund_syn, pop_labels, W_star = generate_synthetic_dataset(N, POPULATIONS)

In [None]:
# (A) Tier distribution of every category, overall and by population
for cat_name in MATRIX.keys():
    plot_category_distribution_overall_and_by_pop(X_syn, pop_labels, cat_name)

# (B) Score histogram (linear scale)
plot_hist_overall_and_by_pop(y_syn, pop_labels, title="Composite Score", bins=50, log_scale=False)

# (C) Funding histogram (log scale in the sense log(1 + x) for plotting)
plot_hist_overall_and_by_pop(fund_syn, pop_labels, title="Funding Amount", bins=50, log_scale=True)

# (D) Exit histogram (log scale in the sense log(1 + x) for plotting)
plot_hist_overall_and_by_pop(exit_syn, pop_labels, title="Exit Value", bins=50, log_scale=True)

# (E) Scatter: Score vs. Funding
plot_scatter_overall_and_by_pop(y_syn, fund_syn, pop_labels, 
                                x_title="Composite Score", y_title="Funding", log_y=True)

# (F) Scatter: Score vs. Exit
plot_scatter_overall_and_by_pop(y_syn, exit_syn, pop_labels,
                                x_title="Composite Score", y_title="Exit", log_y=True)

# (G) Feature importance from W*
plot_feature_importance(W_star)

# (H) Correlation Heatmap (Features + Score/Exit/Funding)
plot_correlation_heatmap(X_syn, y_syn, exit_syn, fund_syn)


# (I) Percentile summary of the raw funding and exit values
print_funding_exit_stats(fund_syn, exit_syn, label="Overall")


In [None]:
# Column names for the one-hot features: "<category>_<tier index>", in MATRIX order.
feature_names = []
for cat in MATRIX:
    dim = MATRIX[cat]['DIMENSION']
    for i in range(dim):
        feature_names.append(f"{cat}_{i}")

# Only the encoded features and the composite score are exported; the sampled
# exit/funding values and the population labels are not written out.
df = pd.DataFrame(X_syn, columns=feature_names)
df["target"] = y_syn


# Path is relative to the notebook's working directory; to_csv does not create
# missing folders, so ../data/synth must already exist.
df.to_csv("../data/synth/encoded_founders_composites.csv", index=False)
# df
