In [5]:
# ! pip install bs4 selenium webdriver-manager -q

In [None]:
import pandas as pd
import numpy as np
import sys
import os
import json
import requests
import time
from dotenv import load_dotenv
from pathlib import Path

sys.path.append('..')

from src.clients.perplexity_client import PerplexityClient
from src.clients.proxycurl_client import ProxycurlClient
from src.config.config import cfg
from src.processing.transforms import ProfileTransforms
from src.clients.yc_client import YCClient

load_dotenv()

### Scraping (www.ycombinator.com/companies)

In [7]:
# Client for the YC company directory; it is finalized by hand at the end of this cell.
yc = YCClient()

# Directory holding one CSV per batch; later cells read `<batch>.csv` files from here.
output_dir = "../data/live/yc"

# "W" and "S" batch codes for the two-digit years 21 down to 18, newest first:
# W21, S21, W20, S20, W19, S19, W18, S18.
batch_codes = []
for year in range(21, 17, -1):
    yr = f"{year:02d}"
    batch_codes += [f"W{yr}", f"S{yr}"]

# Scraping is currently disabled; uncomment to scrape the batches listed above.
# df = yc.scrape_batches(batch_codes, output_dir=None)

# NOTE(review): the finalizer is invoked explicitly here — presumably to release
# whatever the client holds open; confirm against YCClient.
yc.__del__()

In [8]:
# df

### Get exits and funding for companies

Some of these numbers, especially the exit values, are off, but the data should be close enough to map to ordinals and a success label.

In [None]:
# Ask Perplexity for the exit value and total funding of every company in the
# selected batches and write the answers back into the batch CSV files.
# Relies on `output_dir`, which is defined in the scraping cell above.
pc = PerplexityClient()

batch_codes = ['W17', 'S17']
for bc in batch_codes:
    try:
        filename = f'{output_dir}/{bc}.csv'

        batch_df = pd.read_csv(filename)
        # Keep one row per (Name, LinkedIn) pair and drop rows missing either field.
        # The cleaned frame is what gets written back to `filename` below.
        batch_df = batch_df.drop_duplicates(subset=['Name', 'LinkedIn'])
        batch_df = batch_df.dropna(subset=['Name', 'LinkedIn'])

        # Series.unique() reports NaN as a value; a missing company name would cost an
        # API call for "nan (YC ...)" whose result can never match a row, so drop it first.
        for company in batch_df['Company'].dropna().unique():
            try:
                evaluation = pc.eval_company(f'{company} (YC {bc})')

                # Every founder row of the same company receives the same figures.
                company_rows = batch_df['Company'] == company
                batch_df.loc[company_rows, 'exit_value_usd'] = evaluation.get('exit_value_usd')
                batch_df.loc[company_rows, 'total_funding_usd'] = evaluation.get('total_funding_usd')

                # Checkpoint after each company so an interrupted run keeps its progress.
                batch_df.to_csv(filename, index=False)
                print(f"Processed {company}")

            except Exception as e:
                # Best effort: one failed evaluation must not stop the rest of the batch.
                print(f"Error evaluating {company} (YC {bc}): {e}")

    except Exception as e:
        print(f"Error loading {bc}: {e}")

In [5]:
# Shared configuration for the profile fetching / encoding helpers defined in this cell.
MATRIX = cfg.MATRIX  # handed to ProfileTransforms and PerplexityClient.eval_person below
BATCH_SIZE = 5  # profiles are checkpointed to disk after every BATCH_SIZE newly fetched ones
LINKEDIN_PROFILES_PATH = "../data/linkedin_profiles.json"  # JSON file holding the list of fetched profiles
ENCODED_DATA_DIR = "../data/raw/"  # destination directory of the `<batch>_profiles.csv` files
YC_DATA_DIR = "../data/live/yc"  # directory scanned for per-batch CSV files
CUTOFF_YEAR = 2017  # passed as `cutoff_date` to ProfileTransforms.transform_person_endpt

def load_existing_profiles():
    """Read the previously saved profiles from LINKEDIN_PROFILES_PATH.

    Returns:
        The decoded JSON content of the file, or an empty list when the file
        does not exist or does not contain valid JSON.
    """
    try:
        with open(LINKEDIN_PROFILES_PATH, 'r') as handle:
            cached = json.load(handle)
    except (FileNotFoundError, json.JSONDecodeError):
        # A missing or unreadable cache is not an error: start from scratch.
        print("Starting with empty profiles list")
        return []

    print(f"Loaded {len(cached)} existing profiles")
    return cached

def get_processed_urls(profiles):
    """Collect one URL per profile so already-fetched profiles can be skipped.

    For each profile the first non-empty value among the known URL fields
    (checked in priority order) is used; profiles without any of them
    contribute nothing.

    Args:
        profiles: Iterable of profile mappings.

    Returns:
        A set with the selected URL of every profile that has one.
    """
    url_fields = ('input_linkedin_url', 'linkedin_url', 'public_url', 'url')
    seen = set()
    for p in profiles:
        # Only truthy values are candidates, so None is a safe "nothing found" marker.
        first_url = next((p[key] for key in url_fields if key in p and p[key]), None)
        if first_url is not None:
            seen.add(first_url)
    return seen

def fetch_linkedin_profile(url, api_key, timeout=60):
    """Fetch a single LinkedIn profile through the Proxycurl person endpoint.

    Args:
        url: LinkedIn profile URL to look up.
        api_key: Proxycurl API key, sent as a bearer token.
        timeout: Seconds to wait for the HTTP request before giving up.

    Returns:
        * the decoded JSON profile, with 'input_linkedin_url' set to `url`,
          when the API answers 200 with a non-empty body;
        * a placeholder dict carrying 'input_linkedin_url', 'public_identifier'
          and 'error' for any other response;
        * None when the request itself raises (connection error, timeout,
          undecodable body, ...).
    """
    headers = {'Authorization': 'Bearer ' + api_key}
    api_endpoint = 'https://nubela.co/proxycurl/api/v2/linkedin'
    params = {
        'linkedin_profile_url': url,
        'use_cache': 'if-present',
    }

    try:
        # requests applies no timeout by default; without one a stalled
        # connection would block the whole fetch loop indefinitely.
        response = requests.get(api_endpoint, params=params, headers=headers, timeout=timeout)

        if response.status_code == 200 and response.content:
            profile = response.json()
            # Remember the URL we asked for so the profile can be matched later.
            profile['input_linkedin_url'] = url
            return profile
        else:
            print(f"Error fetching profile {url}: Status {response.status_code}")
            return {
                'input_linkedin_url': url,
                # Drop the query string and any trailing slash before taking the
                # last path segment, so ".../in/jane/" still yields "jane".
                'public_identifier': url.split('?')[0].rstrip('/').split('/')[-1],
                'error': f'API Error: {response.status_code}'
            }
    except Exception as e:
        # Best effort: report the failure and let the caller move on.
        print(f"Exception fetching profile {url}: {e}")
        return None

def save_profiles(profiles):
    """Write the complete profile list to LINKEDIN_PROFILES_PATH as JSON.

    The data is first written to a sibling temporary file and then moved over
    the real file with os.replace. An interruption in the middle of a
    checkpoint therefore cannot leave a truncated JSON file behind, which
    load_existing_profiles would otherwise treat as "no profiles yet".

    Args:
        profiles: JSON-serialisable list of profile dicts.
    """
    tmp_path = f"{LINKEDIN_PROFILES_PATH}.tmp"
    with open(tmp_path, 'w') as f:
        json.dump(profiles, f)
    # Only swap the files once the new content has been written completely.
    os.replace(tmp_path, LINKEDIN_PROFILES_PATH)

def get_yc_batch_files():
    """Return the names of all entries in YC_DATA_DIR that end with '.csv'.

    Only the bare file names are returned, in the order os.listdir yields them.
    """
    csv_names = []
    for entry in os.listdir(YC_DATA_DIR):
        if entry.endswith('.csv'):
            csv_names.append(entry)
    return csv_names

def process_batch_file(file_path, linkedin_profiles, processed_urls):
    """Fetch the LinkedIn profile of every not-yet-processed founder in one batch CSV.

    Rows without a LinkedIn URL are ignored, as are rows whose exit value and
    total funding are both exactly 0 (when both columns exist). Every fetched
    profile is tagged with the batch code and the company figures of its row,
    appended to `linkedin_profiles` and checkpointed to disk every BATCH_SIZE
    new profiles.

    Note that fetch_linkedin_profile returns a placeholder dict for non-200
    responses; such placeholders are stored and marked as processed as well.

    Args:
        file_path: Path of the batch CSV; the file name without extension is
            used as the batch code.
        linkedin_profiles: List of profile dicts; extended in place.
        processed_urls: Set of already fetched URLs; extended in place.

    Returns:
        The (mutated) `linkedin_profiles` list and `processed_urls` set.
    """
    print(f"Processing {file_path}...")

    # Load founders data
    founders = pd.read_csv(file_path)
    founders = founders.dropna(subset=['LinkedIn'])

    if 'exit_value_usd' in founders.columns and 'total_funding_usd' in founders.columns:
        founders = founders[~((founders['exit_value_usd'] == 0) & (founders['total_funding_usd'] == 0))]

    batch_code = os.path.basename(file_path).split('.')[0]

    # The key does not change between rows, so read it once.
    api_key = os.getenv('PROXYCURL_API_KEY')

    new_profiles = []
    total = len(founders)

    # Count rows positionally: after the filtering above the index labels are no
    # longer 0..n-1, so using the label made the progress message wrong.
    for position, (_, row) in enumerate(founders.iterrows(), start=1):
        linkedin_url = row['LinkedIn']

        if linkedin_url in processed_urls:
            print(f"Skipping already processed profile: {linkedin_url}")
            continue

        print(f"Fetching profile {position}/{total}: {linkedin_url}")

        profile = fetch_linkedin_profile(linkedin_url, api_key)

        if profile:
            profile['yc_batch'] = batch_code

            profile['company_name'] = row.get('Company', '')
            if 'exit_value_usd' in row:
                profile['exit_value_usd'] = row['exit_value_usd']
            if 'total_funding_usd' in row:
                profile['total_funding_usd'] = row['total_funding_usd']

            linkedin_profiles.append(profile)
            new_profiles.append(profile)
            processed_urls.add(linkedin_url)

            # Periodic checkpoint so an interrupted run keeps what it already fetched.
            if len(new_profiles) % BATCH_SIZE == 0:
                print(f"Saving batch with {len(new_profiles)} new profiles (total: {len(linkedin_profiles)})...")
                save_profiles(linkedin_profiles)

        # Pause between API calls.
        time.sleep(1)

    # Final save covers the profiles fetched since the last checkpoint.
    if new_profiles:
        save_profiles(linkedin_profiles)
        print(f"Saved {len(linkedin_profiles)} profiles to {LINKEDIN_PROFILES_PATH} ({len(new_profiles)} new from {batch_code})")

    return linkedin_profiles, processed_urls

def transform_and_encode_profiles(linkedin_profiles, batch_code=None):
    """Transform fetched profiles, add AI evaluations and save the encoded table.

    Steps, in order:
      1. select the profiles of `batch_code` (or all profiles when it is falsy);
      2. keep only profiles that carry a non-None 'experiences' entry;
      3. build a DataFrame with ProfileTransforms.transform_person_endpt;
      4. evaluate every row with PerplexityClient.eval_person and store the
         results in the EXIT / FOUNDER / STARTUP columns (a fixed fallback is
         used when an evaluation raises);
      5. add the ordinal columns and the feature vectors;
      6. write the table to ENCODED_DATA_DIR.

    Args:
        linkedin_profiles: List of profile dicts as stored in the profiles JSON.
        batch_code: Batch to encode, e.g. "W17". When falsy, all profiles are
            encoded and the output file is named "all_profiles.csv".

    Returns:
        The encoded DataFrame.
    """
    T = ProfileTransforms(data={}, matrix=MATRIX)

    if batch_code:
        # `p and ...` skips null entries instead of failing on `.get`.
        batch_profiles = [p for p in linkedin_profiles if p and p.get('yc_batch') == batch_code]
        print(f"Processing {len(batch_profiles)} profiles for batch {batch_code}")
    else:
        batch_profiles = linkedin_profiles
        print(f"Processing all {len(batch_profiles)} profiles")

    valid_profiles = []
    for profile in batch_profiles:
        if profile and 'experiences' in profile and profile['experiences'] is not None:
            valid_profiles.append(profile)
        else:
            # This branch is also reached for None / empty entries, which have no `.get`.
            skipped_url = profile.get('input_linkedin_url', 'unknown URL') if isinstance(profile, dict) else 'unknown URL'
            print(f"Skipping invalid profile: {skipped_url}")

    print(f"Found {len(valid_profiles)} valid profiles out of {len(batch_profiles)}")

    df = T.transform_person_endpt(profile_list=valid_profiles, cutoff_date=CUTOFF_YEAR)
    # T.df and df are the same object, so the columns added to df below are
    # visible through T.df when the ordinal columns and feature matrix are built.
    T.df = df

    pc = PerplexityClient()

    print("Starting AI evaluations with Perplexity API...")
    total_profiles = len(df)

    ai_evaluations = []
    # Count positionally so the progress message does not depend on the index labels of df.
    for position, (_, row) in enumerate(df.iterrows(), start=1):
        print(f"Evaluating profile {position}/{total_profiles}: {row.get('Name', 'Unknown')}")
        try:
            evaluation = pc.eval_person(row, MATRIX)
            ai_evaluations.append(evaluation)
            time.sleep(0.5)
        except Exception as e:
            print(f"Error evaluating profile {position}: {e}")
            # Fallback keeps one evaluation per row so the columns below stay aligned.
            ai_evaluations.append({"exited_founder": 0, "previous_founder": 1, "startup_experience": 1})

    df["EXIT"] = [x.get("exited_founder", 0) for x in ai_evaluations]
    df["FOUNDER"] = [x.get("previous_founder", 1) for x in ai_evaluations]
    df["STARTUP"] = [x.get("startup_experience", 1) for x in ai_evaluations]

    print("AI evaluations completed.")

    T._add_ordinal_columns()

    print("Creating feature matrix...")
    feature_matrix = T.create_feature_matrix()
    df["feature_vector"] = list(feature_matrix)

    # Without a batch code the file used to be named "None_profiles.csv".
    output_label = batch_code if batch_code else "all"
    output_path = os.path.join(ENCODED_DATA_DIR, f"{output_label}_profiles.csv")
    df.to_csv(output_path, index=False)
    print(f"Saved encoded profiles to {output_path}")

    return df


# Driver: load the cached profiles, fetch the missing ones for a single batch
# and encode that batch.
linkedin_profiles = load_existing_profiles()
processed_urls = get_processed_urls(linkedin_profiles)
print(f"Found {len(linkedin_profiles)} existing profiles with {len(processed_urls)} unique URLs")

# Debug aid: show the first keys of the first cached profile and which of the
# known URL fields it carries.
if linkedin_profiles:
    print("\nDebug: First profile structure")
    first_profile = linkedin_profiles[0]
    print(f"Keys: {list(first_profile.keys())[:10]}...")
    
    url_fields = ['input_linkedin_url', 'linkedin_url', 'public_url', 'url']
    for field in url_fields:
        if field in first_profile:
            print(f"Found URL field: {field} = {first_profile[field]}")
batch_files = get_yc_batch_files()
print(f"\nFound {len(batch_files)} YC batch files: {batch_files}")


# Only this batch file is processed; nothing happens when it is not present in YC_DATA_DIR.
target_batch = "W17.csv"
if target_batch in batch_files:
    file_path = os.path.join(YC_DATA_DIR, target_batch)
    linkedin_profiles, processed_urls = process_batch_file(file_path, linkedin_profiles, processed_urls)
    
    # The batch code is the file name without its extension, e.g. "W17".
    batch_code = target_batch.split('.')[0]
    
    encoded_df = transform_and_encode_profiles(linkedin_profiles, batch_code)
    print(f"Encoded {len(encoded_df)} profiles for batch {batch_code}")

In [None]:
import pandas as pd
import json
from src.clients.perplexity_client import PerplexityClient

# Encode the founders listed in top_companies.csv from their cached LinkedIn profiles.
# Besides the imports at the top of this cell, it relies on `time`, `cfg` and
# `ProfileTransforms` imported in the first cell of the notebook.
with open('../data/linkedin_profiles.json', 'r') as f:
    linkedin_profiles = json.load(f)
print(f"Loaded {len(linkedin_profiles)} LinkedIn profiles")

top_companies = pd.read_csv('../data/live/yc/top_companies.csv')
print(f"Processing {len(top_companies)} top companies")

pc = PerplexityClient()
processed_profiles = []

for idx, company in top_companies.iterrows():
    print(f"\nProcessing {idx+1}/{len(top_companies)}: {company['Company']}")

    # A missing URL is read as NaN (a float), and an empty normalised URL is a
    # substring of every string, so it would "match" the first stored profile.
    # Skip such rows instead of crashing or pairing them with the wrong person.
    raw_url = company['LinkedIn']
    linkedin_url = raw_url.lower().strip().rstrip('/') if isinstance(raw_url, str) else ''
    if not linkedin_url:
        print(f"No LinkedIn URL for {company['Company']}")
        continue

    # Find the first cached profile one of whose URL-like fields contains the company's URL.
    profile = None
    for p in linkedin_profiles:
        if any(url and linkedin_url in str(url).lower().strip().rstrip('/')
               for url in [p.get('linkedin_profile_url'), p.get('public_identifier'),
                         p.get('input_linkedin_url'), p.get('public_url')]):
            profile = p
            break

    if not profile:
        print(f"No profile found for {linkedin_url}")
        continue

    transforms = ProfileTransforms({}, cfg.MATRIX)
    cutoff_date = company['cutoff_date'] if pd.notna(company['cutoff_date']) else None

    try:
        processed = transforms.process_person_endpt(profile, cutoff_date)
        if not processed:
            print("Failed to process profile")
            continue

        try:
            evaluation = pc.eval_person(processed, cfg.MATRIX)
            time.sleep(0.5)
        except Exception as e:
            print(f"Error in AI evaluation: {e}")
            # Same fallback values as used when encoding regular batches.
            evaluation = {"exited_founder": 0, "previous_founder": 1, "startup_experience": 1}

        # Founders of top companies get floor values: EXIT is at least 2 when the
        # company has a positive exit value, FOUNDER and STARTUP are at least 3.
        processed.update({
            'Company': company['Company'],
            'exit_value_usd': company['exit_value_usd'],
            'total_funding_usd': company['total_funding_usd'],
            'EXIT': max(2 if company['exit_value_usd'] > 0 else 0, evaluation.get('exited_founder', 0)),
            'FOUNDER': max(3, evaluation.get('previous_founder', 1)),  # Top company founder
            'STARTUP': max(3, evaluation.get('startup_experience', 1))  # Successful startup
        })

        # Encode this single profile through a one-row DataFrame.
        transforms.df = pd.DataFrame([processed])
        transforms._add_ordinal_columns()
        feature_matrix = transforms.create_feature_matrix()
        processed['feature_vector'] = feature_matrix[0].tolist()

        processed_profiles.append(processed)
        print(f"Successfully processed {processed['Name']}")

    except Exception as e:
        # Best effort: a profile that cannot be processed is reported and skipped.
        print(f"Error processing profile: {str(e)}")
        continue

results_df = pd.DataFrame(processed_profiles)

output_path = '../data/raw/top_companies_profiles.csv'
results_df.to_csv(output_path, index=False)
print(f"\nProcessed {len(results_df)} profiles successfully")
print(f"Saved to {output_path}")

In [None]:
import re
def process_batch_data(batch_code):
    """Combine encoded profiles with funding outcomes for one batch.

    Reads the encoded profiles (`../data/raw/<batch_code>_profiles.csv`) and the
    funding data (`../data/live/yc/<batch_code>.csv`), matches each profile to a
    funding row by name via find_matching_funding_data, and writes one output
    row per profile laid out like the columns of
    `../data/synth/encoded_founders_composites.csv`.

    A profile counts as a success when its matched exit value is positive or
    its matched funding exceeds cfg.SUCCESS_FUNDING_THRESHOLD.

    Args:
        batch_code: Name shared by the profile and funding files, e.g. "W17".

    Returns:
        (result_df, match_log_df): the combined table and a log with the
        profile name, matched funding name and match type of every profile.
    """
    # Only the column layout of this file is used; its rows are not copied.
    save_df = pd.read_csv('../data/synth/encoded_founders_composites.csv')

    profiles_path = f'../data/raw/{batch_code}_profiles.csv'
    profiles_df = pd.read_csv(profiles_path)

    funding_path = f'../data/live/yc/{batch_code}.csv'
    funding_df = pd.read_csv(funding_path)

    SUCCESS_FUNDING_THRESHOLD = cfg.SUCCESS_FUNDING_THRESHOLD

    funding_df['normalized_name'] = funding_df['Name'].apply(normalize_name)

    normalized_funding_names = list(funding_df['normalized_name'])
    # Positional lookup; empty names are left out so they can never be matched.
    name_to_idx = {name: idx for idx, name in enumerate(normalized_funding_names) if name}

    # The first 26 template columns are filled positionally from the feature vector.
    feature_columns = save_df.columns[:26]

    match_log = []
    rows = []

    for index, row in profiles_df.iterrows():
        feature_str = row['feature_vector']
        name = row['Name']
        normalized_name = normalize_name(name)

        # The vector was stored as text ("[1.0, 2.0 ...]"); strip brackets and
        # commas and parse the remaining whitespace-separated numbers.
        feature_str = feature_str.replace('[', '').replace(']', '').replace(',', ' ')
        feature_values = [float(x) for x in feature_str.split() if x.strip()]

        # zip stops at the shorter side, so a short vector fills only the leading columns.
        new_row = dict(zip(feature_columns, feature_values))

        funding_data, match_type, matched_name = find_matching_funding_data(
            normalized_name, funding_df, normalized_funding_names, name_to_idx
        )

        match_log.append({
            'profile_name': name,
            'matched_funding_name': matched_name,
            'match_type': match_type
        })

        matched_row = funding_data.iloc[0]
        exit_value = matched_row['exit_value_usd']
        funding_amount = matched_row['total_funding_usd']

        new_row['exit_value'] = exit_value
        new_row['funding_amount'] = funding_amount
        new_row['success'] = 1 if (exit_value > 0 or funding_amount > SUCCESS_FUNDING_THRESHOLD) else 0

        rows.append(new_row)

    # Build the frame once instead of concatenating inside the loop (quadratic).
    # Template columns come first, followed by any additional keys in order of
    # appearance, which is the layout the row-by-row concat produced.
    result_df = pd.DataFrame(rows)
    ordered_columns = list(save_df.columns) + [c for c in result_df.columns if c not in save_df.columns]
    result_df = result_df.reindex(columns=ordered_columns)

    print_summary_statistics(result_df, match_log)

    output_path = f'../data/encoded/{batch_code}_encoded_with_outcomes.csv'
    result_df.to_csv(output_path, index=False)
    print(f"Saved results to {output_path}")

    match_log_df = pd.DataFrame(match_log)

    return result_df, match_log_df
def normalize_name(name):
    """Return a lower-cased, punctuation-free, trimmed version of `name`.

    Anything that is not a string (for example NaN from a CSV) yields "".
    """
    if isinstance(name, str):
        # Drop every character that is neither a word character nor whitespace.
        without_punctuation = re.sub(r'[^\w\s]', '', name)
        return without_punctuation.lower().strip()
    return ""

def find_matching_funding_data(normalized_name, funding_df, normalized_funding_names, name_to_idx):
    """Find the funding row that belongs to a (normalized) profile name.

    Strategies are tried in this order:
      1. "exact"      - a row whose 'normalized_name' equals `normalized_name`;
      2. "fuzzy"      - the best difflib close match (cutoff 0.6);
      3. "first_word" - the first funding name starting with the profile's first word;
      4. "default"    - the first row of `funding_df` when nothing else matches.

    Args:
        normalized_name: Profile name as produced by normalize_name.
        funding_df: Funding table with 'Name' and 'normalized_name' columns.
        normalized_funding_names: The 'normalized_name' column as a list.
        name_to_idx: Maps a normalized funding name to its row position.

    Returns:
        (funding_data, match_type, matched_name): the matching row(s) as a
        DataFrame, the strategy that produced them and the matched 'Name'.
    """
    from difflib import get_close_matches

    def _row_at(position, kind):
        # Single-row frame at `position`, tagged with the strategy that selected it.
        selected = funding_df.iloc[[position]]
        return selected, kind, selected.iloc[0]['Name']

    exact_rows = funding_df[funding_df['normalized_name'] == normalized_name]
    if not exact_rows.empty:
        return exact_rows, "exact", exact_rows.iloc[0]['Name']

    if not normalized_name:
        return _row_at(0, "default")

    candidates = get_close_matches(normalized_name, normalized_funding_names, n=3, cutoff=0.6)
    if candidates:
        return _row_at(name_to_idx[candidates[0]], "fuzzy")

    words = normalized_name.split()
    first_word = words[0] if words else ""
    if first_word:
        prefixed = [candidate for candidate in normalized_funding_names if candidate.startswith(first_word)]
        if prefixed:
            return _row_at(name_to_idx[prefixed[0]], "first_word")

    return _row_at(0, "default")

def print_summary_statistics(result_df, match_log):
    """Print counts for the combined table and an overview of the name matching.

    Args:
        result_df: Table with 'funding_amount', 'exit_value' and 'success' columns.
        match_log: List of dicts with 'profile_name', 'matched_funding_name'
            and 'match_type' keys, one per profile.
    """
    print(f"Processed {len(result_df)} profiles")
    print(f"Found funding data for {result_df['funding_amount'].notna().sum()} profiles")
    print(f"Found exit data for {result_df['exit_value'].notna().sum()} profiles")
    print(f"Successful companies: {result_df['success'].sum()}")

    # Show the first matches so they can be checked by eye.
    print("\nMatch Verification (first 20 entries):")
    for position, entry in enumerate(match_log[:20], start=1):
        print(f"{position}. {entry['profile_name']} → {entry['matched_funding_name']} ({entry['match_type']})")

    # Share of each matching strategy over all profiles.
    kinds = [entry['match_type'] for entry in match_log]
    distribution = pd.Series(kinds).value_counts()
    total = len(match_log)
    print("\nMatch type distribution:")
    for kind, count in distribution.items():
        print(f"{kind}: {count} ({count/total*100:.1f}%)")


# Combine the encoded top-company profiles with their funding outcomes.
# process_batch_data reads ../data/raw/<batch_code>_profiles.csv and
# ../data/live/yc/<batch_code>.csv, so both files must exist for this name.
batch_code = "top_companies" 
result_df, match_log_df = process_batch_data(batch_code)