This notebook contains all the code needed to generate a synthetic dataset.

All parameters are set in the `SYNTH` variable of `src/config/config.py`.
- Exit values and funding amounts are drawn from log-normal distributions.
- Ordinal founder attributes are sampled from the discrete probability distributions defined in the hyperparameter config.

Visualizations are provided at the end to compare the synthetic data with what we expect from real-world data.

Success is defined as an exit or reaching `cfg.SUCCESS_FUNDING_THRESHOLD` in funding. A raw score (which could be used to prioritize) and the probability of success are computed in `model.ipynb`.

In [1]:
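# Optional one-time setup; uncomment to install the plotting dependencies into the kernel's environment:
# %pip install -q seaborn plotly scipy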

In [2]:
import sys

sys.path.append("..")

from src.config.config import cfg
from src.datagen import datagen
from src.viz.data_viz import (
    plot_category_distribution_overall_and_by_pop,
    plot_hist_overall_and_by_pop, 
    visualize_successful_cases,
    load_batch_data,
    analyze_batch_data
)


### Generate dataset
Configure the parameters in the `SYNTH` variable of `src/config/config.py`.

In [3]:
# Sample size handed to the generator.
N = 5000

# Build the generator, then draw the synthetic sample using the population
# definitions stored under cfg.SYNTH["POPULATIONS"].
dg = datagen.DataGenerator()
X_syn, exit_syn, fund_syn, pop_labels = dg.generate_dataset(
    N,
    cfg.SYNTH["POPULATIONS"],
)

In [None]:


# One category-distribution plot per key of cfg.MATRIX.
for cat_name in cfg.MATRIX.keys():
    plot_category_distribution_overall_and_by_pop(X_syn, pop_labels, cat_name)

# Funding and exit values share the same histogram settings
# (50 bins, log scale), so plot them with a single loop.
for _values, _title in (
    (fund_syn, "Funding Amount"),
    (exit_syn, "Exit Value"),
):
    plot_hist_overall_and_by_pop(
        _values,
        pop_labels,
        title=_title,
        bins=50,
        log_scale=True,
    )

# Kept as the final bare expression so that whatever it returns (if anything)
# remains the cell's displayed output.
visualize_successful_cases(exit_syn, fund_syn)

### Save Dataset

In [None]:
# Write the synthetic sample to CSV, passing the category matrix and the
# funding threshold used for the success label.
#
# The result is bound to `synth_df` rather than the generic `df`: the YC EDA
# cell further down also assigns `df`, so sharing the name would make `df`
# refer to different data depending on which cell ran last.
# NOTE(review): the return value is presumably a DataFrame of the saved rows —
# confirm against DataGenerator.save_synthetic_dataset.
synth_df = dg.save_synthetic_dataset(
    X_syn,
    exit_syn,
    fund_syn,
    cfg.MATRIX,
    "../data/synth/encoded_founders_composites.csv",
    success_funding_threshold=cfg.SUCCESS_FUNDING_THRESHOLD,
)

# Bare last expression: rendered as the cell output.
synth_df

### EDA for YC data

In [None]:
# Identifiers of the YC data sets to load; the same list is handed to both
# the loader and the analysis helper.
batch_codes = [
    "S21",
    "W21",
    "S17",
    "W17",
    "top_companies",
]

# Load the selected batches, then run the analysis on the loaded data.
df = load_batch_data(batch_codes)

# Final bare expression: its return value (if any) is the cell output.
analyze_batch_data(df, batch_codes)