In [None]:
# Third-party / standard-library dependencies.
import pandas as pd
import numpy as np
import sys
import os
from dotenv import load_dotenv
from pathlib import Path

# Put the parent directory on the import path so the `src` package below can be
# imported. NOTE(review): this assumes the notebook is executed from a directory
# one level below the project root — confirm.
sys.path.append('..')

# Project-local modules (resolved through the sys.path entry added above).
from src.clients.perplexity_client import PerplexityClient
from src.clients.proxycurl_client import ProxycurlClient
from src.clients.yc_client import YCClient
from src.config.config import cfg
from src.data.profile_transforms import ProfileTransforms
from src.utils.profile_utils import load_existing_profiles, get_processed_urls, save_profiles
from src.data.yc_data import process_batch_data, process_batch_file, evaluate_batch_companies

# Load variables from a .env file into the process environment.
# Presumably this supplies the API credentials used by the clients — verify.
load_dotenv()

### Scraping (www.ycombinator.com/companies)

In [2]:
# ---------------------------------------------------------------------------
# Paths (all relative to the notebook's working directory)
# ---------------------------------------------------------------------------
# Per-batch CSVs written by the scraping cell and read by the processing cell.
OUTPUT_DIR = "../data/live/yc"
# JSON store of LinkedIn profiles, loaded and updated across runs.
LINKEDIN_PROFILES_PATH = "../data/linkedin_profiles.json"
# Where profile processing writes its output and where "<batch>_profiles.csv"
# is later read from.
ENCODED_DATA_DIR = "../data/parsed/"
# Destination of the final "<batch>_encoded_with_outcomes.csv" files.
ENCODED_OUTPUT_DIR = "../data/encoded"
# Synthetic founders dataset handed to process_batch_data.
SYNTH_DATA_PATH = '../data/synth/encoded_founders_composites.csv'

# Make sure both output folders exist before anything tries to write to them.
os.makedirs(OUTPUT_DIR, exist_ok=True)
os.makedirs(ENCODED_OUTPUT_DIR, exist_ok=True)

# Batch identifiers are "<season><two-digit year>", winter before summer for
# each year; the current range yields ['W21', 'S21'].
batch_codes = [
    f"{prefix}{str(yy).zfill(2)}"
    for yy in range(21, 20, -1)
    for prefix in ['W', 'S']
]
# batch_codes.append('top_companies')

# ---------------------------------------------------------------------------
# Shared clients and the profile transformer used by the cells below
# ---------------------------------------------------------------------------
yc = YCClient(headless=True)
pc = PerplexityClient()
px = ProxycurlClient()
T = ProfileTransforms({}, cfg.MATRIX)


In [None]:
# Load the profiles already stored at LINKEDIN_PROFILES_PATH (and the URLs
# derived from them); the processing cell below reads and updates both names.
linkedin_profiles = load_existing_profiles(LINKEDIN_PROFILES_PATH)
processed_urls = get_processed_urls(linkedin_profiles)

# Scrape only the batches that have no CSV on disk yet. `batch_codes` itself is
# deliberately left untouched: the processing cell iterates over it, so emptying
# it here (as this cell previously did) silently turned the rest of the
# notebook into a no-op. Delete a batch's CSV to have it scraped again.
batches_to_scrape = [
    code for code in batch_codes
    if not os.path.exists(f"{OUTPUT_DIR}/{code}.csv")
]

try:
    for batch_code in batches_to_scrape:
        print(f"\nProcessing batch {batch_code}")
        batch_file = f"{OUTPUT_DIR}/{batch_code}.csv"
        df = yc.scrape_batch(batch_code)
        # Do not write a CSV for a batch that returned no rows.
        if not df.empty:
            df.to_csv(batch_file, index=False)
finally:
    # Run the scraper's cleanup even when a batch fails part-way through.
    # NOTE(review): the direct __del__ call is kept from the original because
    # YCClient's public cleanup method is not visible from this notebook;
    # prefer an explicit close()/quit() if the client provides one.
    yc.__del__()


### Processing YC batches

Convert each scraped batch to raw profile data, then to encoded data.

In [None]:
# For every configured batch that has a scraped CSV: fetch/refresh LinkedIn
# profiles, transform the batch's profiles, evaluate its companies and finally
# build the encoded-with-outcomes file.
for batch_code in batch_codes:
    batch_file = f"{OUTPUT_DIR}/{batch_code}.csv"
    if not os.path.exists(batch_file):
        # No scraped CSV for this batch, so there is nothing to process.
        print(f"Skipping batch {batch_code}: {batch_file} not found")
        continue

    # The last two characters of a batch code are the year ('W21' -> 2021);
    # the 'top_companies' pseudo-batch carries no year.
    cutoff_date = int('20' + batch_code[-2:]) if batch_code != 'top_companies' else None

    # Returns the updated profile list and processed-URL collection, which are
    # carried over to the next iteration.
    linkedin_profiles, processed_urls = process_batch_file(
        batch_file,
        linkedin_profiles,
        processed_urls,
        px,
        output_path=LINKEDIN_PROFILES_PATH,
        batch_size=5
    )

    # Only the profiles tagged with this batch are transformed.
    batch_profiles = [p for p in linkedin_profiles if p.get('yc_batch') == batch_code]
    if batch_profiles:
        df = T.process_profiles(
            batch_profiles,
            perplexity_client=pc,
            cutoff_date=cutoff_date,
            output_dir=ENCODED_DATA_DIR,
            batch_code=batch_code
        )

    evaluate_batch_companies([batch_code], OUTPUT_DIR, pc)

    result_df, match_log = process_batch_data(
        batch_code=batch_code,
        synth_data_path=SYNTH_DATA_PATH,
        # os.path.join avoids the doubled separator that plain string
        # formatting produced, because ENCODED_DATA_DIR ends with '/'.
        # NOTE(review): presumably this file is written by T.process_profiles
        # above — confirm the file name it uses.
        profiles_path=os.path.join(ENCODED_DATA_DIR, f"{batch_code}_profiles.csv"),
        # Same file that was checked for existence at the top of the loop.
        funding_path=batch_file,
        output_path=f"{ENCODED_OUTPUT_DIR}/{batch_code}_encoded_with_outcomes.csv"
    )

In [None]:
# # Handle top companies separately
# if 'top_companies' in batch_codes:
#     top_companies_path = f"{OUTPUT_DIR}/top_companies.csv"
#     T.process_top_companies(
#         top_companies_path,
#         LINKEDIN_PROFILES_PATH,
#         pc,
#         output_path=f"{ENCODED_DATA_DIR}/top_companies_profiles.csv"
#     )