# LLM Classification on Startup Description, Founder Education, Industry Outlook

# Setup

#### Checks

In [None]:
# Environment check: show the attached GPU (nvidia-smi shell command) and total system RAM.
from psutil import virtual_memory
!nvidia-smi
# Divide by 1e9 so the total is reported in decimal gigabytes.
ram_gb = virtual_memory().total / 1e9
print(f"RAM: {ram_gb:.1f} GB")

#### TOKEN! PRIVATE!

In [None]:
# Placeholder for a Hugging Face access token; never commit a real token here.
# NOTE(review): nothing in the visible notebook reads this variable — the login
# cell runs `huggingface-cli login` without arguments. Confirm whether it is still needed.
HUGGING_FACE_CLI_TOKEN = '<token_here>'

#### Hugging Face

In [None]:
# Install necessary packages for LLM inference
# (-q keeps pip quiet; bitsandbytes backs the load_in_8bit option used in load_model)
!pip install -q transformers accelerate bitsandbytes sentencepiece

# Log in to Hugging Face to access gated models like Hermes
# NOTE(review): called without arguments, so presumably the token is entered interactively — confirm.
!huggingface-cli login

#### Imports and Folders

In [None]:
# imports
import os
import gc
import pandas as pd
import numpy as np

from transformers import AutoModelForCausalLM, AutoTokenizer, GenerationConfig

import torch
import re
import json
from tqdm.notebook import tqdm
import pprint

# Suppress verbose config warnings from Hugging Face
# (only ERROR and above from the generation-config logger will be shown)
import logging
logging.getLogger("transformers.generation.configuration_utils").setLevel(logging.ERROR)

# Colab-only: mount Google Drive under /content/drive (force_remount=True remounts
# even if it is already mounted). `runtime` is used later to release the VM.
from google.colab import drive, runtime
drive.mount('/content/drive', force_remount=True)

In [None]:
# RAM monitoring helper, used to watch memory pressure during long runs
import psutil


def print_memory():
    """Print used and total system RAM in GB, plus the percentage in use."""
    stats = psutil.virtual_memory()
    used_gb = stats.used / 1e9
    total_gb = stats.total / 1e9
    print(f"Used: {used_gb:.1f} GB / {total_gb:.1f} GB ({stats.percent}%)")

In [None]:
# Google Drive locations for raw input, cleaned data and outputs
input_folder = '/content/drive/MyDrive/Senior/Thesis/Code/Data/Input Data/'
cleaned_data_folder = input_folder + 'cleaned_data'  # input_folder already ends with '/'
output_folder = '/content/drive/MyDrive/Senior/Thesis/Code/Data/Output Data/'

# Disabled: inputs used when re-running only previously faulty rows
# faulty_founder_path = os.path.join(cleaned_data_folder, 'faulty_rows_founder.csv')
# faulty_outlook_path = os.path.join(cleaned_data_folder, 'faulty_rows_outlook.csv')

# Disabled: checkpoint files for those faulty-row re-runs
# checkpoint_path = os.path.join(output_folder, 'faulty_rows_outlook_alignment_v2.jsonl')
# founder_checkpoint_path = os.path.join(output_folder, 'faulty_rows_founder_score.jsonl')

# Active checkpoint (JSONL) and final CSV paths for the two scoring pipelines
checkpoint_path = os.path.join(output_folder, 'alignment_scores.jsonl')
# checkpoint_path = os.path.join(output_folder, 'alignment_scores_v2.jsonl')
founder_checkpoint_path = os.path.join(output_folder, 'founder_strength_scores.jsonl')
founder_final_output_path = os.path.join(output_folder,'cb_with_founder_score.csv')
final_output_path = os.path.join(output_folder,'cb_with_alignment_score.csv')
print(checkpoint_path)

# Check contents of folders
# (os.listdir raises if Drive is not mounted or a folder is missing)
cleaned_data_contents = os.listdir(cleaned_data_folder)
output_contents = os.listdir(output_folder)
print(cleaned_data_contents)
print(output_contents)

In [None]:
# File Paths
CB_PATH = os.path.join(cleaned_data_folder, 'cb_final_data.csv')
NLP_PATH = os.path.join(cleaned_data_folder, 'outlook.csv')
# Load the company table and the industry-outlook table into memory
cb_df = pd.read_csv(CB_PATH)
nlp_df = pd.read_csv(NLP_PATH)

# Sanity check: sizes and available columns of both tables
print(f"Crunchbase shape: {cb_df.shape}")
print(f"Perplexity shape: {nlp_df.shape}")
print("\nSample CB columns:", cb_df.columns.tolist())
print("\nSample NLP columns:", nlp_df.columns.tolist())

In [None]:
# # Loading faulty dfs
# faulty_founder_df = pd.read_csv(faulty_founder_path)
# faulty_outlook_df = pd.read_csv(faulty_outlook_path)

# print(f"Faulty Founder shape: {faulty_founder_df.shape}")
# print(f"Faulty Outlook shape: {faulty_outlook_df.shape}")
# print("\nSample Faulty Founder columns:", faulty_founder_df.columns.tolist())
# print("\nSample Faulty Outlook columns:", faulty_outlook_df.columns.tolist())

# Startup -> Outlook

## Step 0: Prepare Data

In [None]:
# === INDUSTRY LIST ===

# Only these industry labels are kept; every other label is discarded.
target_industries = {
    'Cleantech',
    'Consumer Goods',
    'Fintech',
    'Life Sciences',
    'Media Entertainment and Gaming',
    'Real Estate',
    'Technology',
    'Telecom',
    'Transportation',
}

# === CLEAN + MAP INDUSTRIES ===
def parse_industries(industry_str):
    """Split a comma-separated industry string and keep the target industries.

    Returns an empty list for missing values. Labels are whitespace-stripped
    and matched case-sensitively against ``target_industries``; input order
    is preserved.
    """
    if pd.isna(industry_str):
        return []
    kept = []
    for piece in industry_str.split(","):
        label = piece.strip()
        if label in target_industries:
            kept.append(label)
    return kept

# Attach, per company, the list of its industries that are in target_industries
cb_df["mapped_industries"] = cb_df["industry"].apply(parse_industries)

# === FILTER OUT BAD ROWS ===
# Drop companies with no target industry, then keep founding years 2004–2025.
cb_df = cb_df[cb_df["mapped_industries"].map(len) > 0]
# NOTE(review): relies on a founded_year column already being present in the loaded
# CSV; it is recomputed from founded_on in Step 1 — confirm this first filter is needed.
cb_df = cb_df[cb_df["founded_year"].between(2004, 2025)]

# === INSPECT CLEANED OUTPUT ===
print(f"Filtered Crunchbase shape: {cb_df.shape}")
print(cb_df[["org_uuid", "founded_year", "mapped_industries"]].head(5))

## Step 1: Create Joinable Dataframe

In [None]:
# Ensure founded_on is datetime
# (errors="coerce" turns unparseable dates into NaT instead of raising)
cb_df["founded_on"] = pd.to_datetime(cb_df["founded_on"], errors="coerce")

# Extract year into a clean column for join
# (overwrites any founded_year value that came from the CSV)
cb_df["founded_year"] = cb_df["founded_on"].dt.year

# Drop rows with missing or out-of-bounds years
cb_df = cb_df[cb_df["founded_year"].between(2004, 2025)]

# STEP 1A: Drop the original 'industry' string column (we'll replace it with the exploded list)
# explode() yields one row per (company, industry) pair, so org_uuid repeats across rows.
cb_exploded = cb_df.drop(columns=["industry"]).explode("mapped_industries")

# STEP 1B: Rename mapped_industries → industry and clean whitespace
cb_exploded = cb_exploded.rename(columns={"mapped_industries": "industry"})
# NOTE: astype(str) would turn a missing value into the literal string 'nan';
# rows with empty industry lists were already filtered out in Step 0.
cb_exploded["industry"] = cb_exploded["industry"].astype(str).str.strip()


In [None]:
# Spot-check a single organisation (matched on lower-cased org_uuid) after the explode step
res = cb_exploded[cb_exploded['org_uuid'].str.lower() == 'df662812-7f97-0b43-9d3e-12f64f504fbb']
print(res.head(10))

In [None]:
# STEP 1C: Merge with Perplexity outlooks
# Inner join on (industry, founding year): rows with no matching outlook are dropped.
cb_exploded = cb_exploded.merge(
    nlp_df,
    left_on=["industry", "founded_year"],
    right_on=["industry", "year"],
    how="inner"
)

In [None]:
# Drop 'year' (duplicate of founded_year after the join) and 'tokens_estimate'
# (presumably an outlook-table column not needed downstream — confirm).
cb_exploded = cb_exploded.drop(columns=["year", "tokens_estimate"])

In [None]:
# Preview
print(cb_exploded.shape)
print()
# info() prints its own report and returns None, so this line also prints "None".
print(cb_exploded.info())
print()
print(cb_exploded.head())


In [None]:
# Peek at the text columns that feed the prompt (shown as the cell's output)
res = cb_exploded[["org_description", "industry", "summary"]]
res.head(2)

In [None]:
# Persist the merged company × outlook table so Step 2 can start from the CSV
cb_nlp_path = os.path.join(output_folder, 'cb_nlp_merge.csv')
cb_exploded.to_csv(cb_nlp_path, index=False)

## Step 2: Using LLM to Compare Description with Outlook
Note: results are written to the checkpoint file in batches; the model itself scores one row at a time.

#### Load

In [None]:
# Load csv into df
cb_nlp_path = os.path.join(output_folder, 'cb_nlp_merge.csv')
cb_nlp_df = pd.read_csv(cb_nlp_path)
print(f"Loaded {len(cb_nlp_df)} rows.")
print()
# info() prints its own report and returns None, so this line also prints "None".
print(cb_nlp_df.info())

In [None]:
# """
# Temp for re-running on subsets
# """
# # Ensure org_uuid is string for safe matching
# cb_nlp_df["org_uuid"] = cb_nlp_df["org_uuid"].astype(str)
# faulty_outlook_df["org_uuid"] = faulty_outlook_df["org_uuid"].astype(str)

# # Subset the rows from cb_df that match the faulty sets
# outlook_subset = cb_nlp_df[cb_nlp_df["org_uuid"].isin(faulty_outlook_df["org_uuid"])]

# # Optional: check the counts
# print(f"Original cb_nlp_df: {len(cb_nlp_df)} rows")
# print(f"Faulty outlooks: {len(outlook_subset)} rows")

In [None]:
# Load the LLM model and tokenizer with 8-bit quantization to reduce GPU memory usage

def load_model(model_name="NousResearch/Hermes-2-Pro-Mistral-7B"):
    """Load a causal LM, its tokenizer and a greedy-decoding generation config.

    Args:
        model_name: Hugging Face model identifier passed to ``from_pretrained``.

    Returns:
        Tuple ``(model, tokenizer, gen_config)``.
    """
    print(f"Loading model: {model_name}")
    tokenizer = AutoTokenizer.from_pretrained(model_name, use_fast=True)

    # Fall back to EOS as the padding token when the tokenizer defines none.
    if tokenizer.pad_token is None:
        tokenizer.pad_token = tokenizer.eos_token
        tokenizer.pad_token_id = tokenizer.eos_token_id

    # NOTE(review): newer transformers releases prefer
    # quantization_config=BitsAndBytesConfig(load_in_8bit=True) over the bare
    # load_in_8bit flag — confirm against the installed version.
    model = AutoModelForCausalLM.from_pretrained(
        model_name,
        device_map="auto",
        torch_dtype=torch.float16,
        load_in_8bit=True
    )

    gen_config = GenerationConfig(
        max_new_tokens=20,
        # Greedy decoding. Sampling-only knobs such as temperature are not used
        # when do_sample=False, so none is set here.
        do_sample=False,
        # 1.0 means "no penalty". The previous value of 0.1 is below 1 and
        # therefore rewarded tokens already present in the sequence instead of
        # penalising them. NOTE: scores produced with the old setting may differ
        # from scores produced with this one.
        repetition_penalty=1.0,
        pad_token_id=tokenizer.pad_token_id,
        eos_token_id=tokenizer.eos_token_id
    )

    return model, tokenizer, gen_config

#### Prompt

In [None]:
# print(nlp_df.shape)

In [None]:
# # Build a cache of outlook text blocks by (industry, year)
# def build_outlook_cache(df):
#     cache = {}
#     for _, row in df.iterrows():
#         key = (row["industry"], row["founded_year"])
#         body = (
#             f"Industry Context – {row['industry']} in {row['founded_year']}:\n\n"
#             f"Summary:\n{row['summary']}\n\n"
#             f"Trends:\n{row['trends']}\n\n"
#             f"Infrastructure:\n{row['infrastructure']}\n\n"
#             f"Market Outlook:\n{row['outlook']}\n\n"
#             f"Timing Signal:\n{row['timing_signal']}\n"
#         )
#         cache[key] = body
#     return cache

# # Create the outlook cache (do this right after loading cb_df)
# outlook_cache = build_outlook_cache(cb_nlp_df)
# print(len(outlook_cache))


In [None]:
# Preprocess and format each prompt as a Hermes-style chat message

def clean_description(text, word_limit=150):
    """Normalise a free-text description and cap it at ``word_limit`` words.

    Args:
        text: The description; any non-string value (None, NaN, lists, ...)
            is treated as missing.
        word_limit: Maximum number of whitespace-separated words to keep.

    Returns:
        The cleaned text with non-breaking spaces replaced and words joined by
        single spaces, or ``""`` when ``text`` is not a string.
    """
    # A single isinstance check covers None/NaN/pd.NA (none of them are str) and,
    # unlike pd.isna, never yields an array whose truth value is ambiguous.
    if not isinstance(text, str):
        return ""
    words = text.replace('\xa0', ' ').split()
    return ' '.join(words[:word_limit])

def format_messages(row):
    """Build the chat messages asking the model to score one startup against its industry outlook.

    Args:
        row: Mapping/Series providing ``org_description``, ``industry``,
            ``founded_year``, ``summary``, ``trends``, ``infrastructure``,
            ``outlook`` and ``timing_signal``.

    Returns:
        A two-element list of ``{"role", "content"}`` dicts: the system prompt
        followed by the user prompt.
    """
    # Description is cleaned and truncated by clean_description before being embedded.
    desc = clean_description(row["org_description"])

    # Fixed instructions: role, scoring rubric (1–5) and strict output format.
    system_prompt = (
        "You are a seasoned venture capitalist evaluating startups **at the time they were founded**.\n"
        "\n"
        "Your goal is to rigorously assess whether a startup was likely to succeed **given the conditions of its founding year**.\n"
        "Base your judgement on:\n"
        "- How well the startup's idea aligns with the market conditions, trends, and infrastructure available in the founding year.\n"
        "- The feasibility and clarity of the startup based on its description — but ONLY as it would have been understood at the time.\n"
        "\n"
        "**Important:** Most startups fail. Be skeptical. Startups that sound good on paper often still fail due to poor timing or lack of supporting infrastructure.\n"
        "You should assign a score of **3 or below to at least two-thirds of startups (66%) you evaluate**.\n"
        "\n"
        "If the startup description includes future events (rebrands, acquisitions, IPOs, product launches years later), **ignore them**. Evaluate only what would have been visible to investors **at the time of founding**.\n"
        "\n"
        "FORMAT REQUIREMENTS:\n"
        "- You MUST ALWAYS respond with a numerical score.\n"
        "- Answer MUST be a single digit from 1 to 5\n"
        "- NO explanation, reasoning, jibberish, or special characters\n"
        "- NO full sentences\n"
        "- NO excerpts from the outlook\n"
        "- ONLY return the digit on its own line\n"
        "- Do not prefix with 'Answer:' or similar\n"
        "\n"
        "SCORE INTERPRETATION:\n"
        "1 = Weak or vague idea and description, AND clearly misaligned with the industry conditions (market not ready, infrastructure lacking)\n"
        "2 = Some promise in the idea, but misaligned with trends and poorly timed for success\n"
        "3 = Reasonably strong idea OR some alignment with market conditions, but not both\n"
        "4 = Good idea AND somewhat aligned with trends and infrastructure — has potential, but not perfect timing\n"
        "5 = Excellent idea AND perfectly aligned with the market, trends, and infrastructure at the time — ideal conditions for success\n"
        "\n"
        # NOTE(review): the example lines below start with '-' directly followed by the
        # digit, which could be read as negative numbers — confirm this is intended.
        "EXAMPLES:\n"
        "-1\n"
        "-2\n"
        "-3\n"
        "-4\n"
        "-5"
    )

    # Per-row content: the startup description followed by the outlook fields for
    # its industry and founding year.
    user_prompt = (
        f"Startup Description:\n{desc}\n\n"
        f"Industry Context – {row['industry']} in {row['founded_year']}:\n\n"
        f"Summary:\n{row['summary']}\n\n"
        f"Trends:\n{row['trends']}\n\n"
        f"Infrastructure:\n{row['infrastructure']}\n\n"
        f"Market Outlook:\n{row['outlook']}\n\n"
        f"Timing Signal:\n{row['timing_signal']}\n\n"
        "Based on this information, how likely is this startup to succeed in its founding year?\n"
        "Be rigorous. Respond ONLY with a single digit from 1 to 5."
    )

    return [
        {"role": "system", "content": system_prompt},
        {"role": "user", "content": user_prompt}
    ]


#### Helper Functions

In [None]:
# Pull the first standalone 1–5 digit out of the raw model output
def parse_score(output_text):
    """Return the first stand-alone digit 1-5 in ``output_text`` as an int.

    A digit counts only when it is delimited by word boundaries, so "10" or
    "a3" do not match. Returns ``None`` when no such digit is found.
    """
    found = re.search(r"\b([1-5])\b", output_text)
    if found is None:
        return None
    return int(found.group(1))

# Append intermediate results to a JSONL file on Drive (one JSON object per line)
def save_checkpoint(results, filename):
    """Append every dict in ``results`` to ``filename`` as one JSON line each."""
    print(f"Saving {len(results)} rows to {filename}")
    with open(filename, "a") as out:
        out.writelines(json.dumps(item) + "\n" for item in results)

# Track which org_uuids have already been scored
def load_existing_checkpoint(filename):
    """Return the set of ``org_uuid`` values found in a JSONL checkpoint file.

    Args:
        filename: Path to the checkpoint; it may not exist yet.

    Returns:
        A set of the ``org_uuid`` values of all readable lines. Lines that are
        not valid JSON, are not objects, or lack ``org_uuid`` are skipped.
        An empty set is returned when the file does not exist.
    """
    seen = set()
    if not os.path.exists(filename):
        return seen
    with open(filename, "r") as f:
        for line in f:
            try:
                seen.add(json.loads(line)["org_uuid"])
            except (json.JSONDecodeError, KeyError, TypeError):
                # Best-effort: tolerate truncated or foreign lines, but do not
                # swallow KeyboardInterrupt/SystemExit as a bare `except:` would.
                continue
    return seen

#### Run Inference

In [None]:
# Main inference loop that resumes from checkpoint and logs raw output + parsed score

def _load_completed_pairs(path):
    """Return the (org_uuid, industry) pairs already stored in the JSONL checkpoint."""
    pairs = set()
    if not os.path.exists(path):
        return pairs
    with open(path, "r") as f:
        for line in f:
            try:
                entry = json.loads(line)
                pairs.add((entry["org_uuid"], entry["industry"]))
            except (json.JSONDecodeError, KeyError, TypeError):
                # Skip truncated or malformed lines; such rows are simply re-scored.
                continue
    return pairs


def run_inference(df, model, tokenizer, gen_config, checkpoint_path=checkpoint_path, batch_size=25):
    """Score every row of ``df`` with the LLM, checkpointing results as JSONL.

    Rows already present in the checkpoint are skipped. Results are appended
    to ``checkpoint_path`` every ``batch_size`` scored rows and once more at
    the end. A failure on one row is printed and the loop moves on.

    Args:
        df: Frame with ``org_uuid``, ``industry``, ``founded_year`` and the
            columns read by ``format_messages``.
        model: Causal LM used for generation.
        tokenizer: Tokenizer providing ``apply_chat_template`` and ``decode``.
        gen_config: Generation settings passed to ``model.generate``.
        checkpoint_path: JSONL file used for resuming and for output.
        batch_size: Number of scored rows buffered before each write.

    Returns:
        None. Results are only written to the checkpoint file.
    """
    # The frame holds one row per (organisation, industry), so org_uuid alone is
    # not unique. Keying the resume set on org_uuid only would skip an
    # organisation's remaining industries after an interrupted run.
    completed = _load_completed_pairs(checkpoint_path)
    print(f"Resuming from {len(completed)} completed rows.")

    results = []
    total = len(df)

    for i, row in tqdm(df.iterrows(), total=total):
        if (row["org_uuid"], row["industry"]) in completed:
            continue

        try:
            messages = format_messages(row)
            input_ids = tokenizer.apply_chat_template(messages, return_tensors="pt", tokenize=True).to(model.device)

            with torch.no_grad():
                output = model.generate(
                    input_ids=input_ids,
                    generation_config=gen_config
                )
            # Remove prompt tokens to isolate just the generated output
            output_ids = output[0][input_ids.shape[-1]:]
            decoded = tokenizer.decode(output_ids, skip_special_tokens=True)
            score = parse_score(decoded)

            results.append({
                "org_uuid": row["org_uuid"],
                "industry": row["industry"],
                "founded_year": row["founded_year"],
                "score": score,
                "raw_output": decoded.strip()
            })

            if len(results) >= batch_size:
                save_checkpoint(results, checkpoint_path)
                results = []

            # i is the DataFrame index label, not a running counter.
            if i % 500 == 0:
                print_memory()

        except Exception as e:
            # Deliberate best-effort: report the row and keep going.
            print(f"Error at index {i}: {e}")
            continue

    # Flush whatever is left in the final partial batch.
    if results:
        save_checkpoint(results, checkpoint_path)


#### Run

In [None]:
# Load the default model, its tokenizer and the generation config
model, tokenizer, gen_config = load_model()

##### Test

In [None]:
# Test inference on one row
# NOTE: despite its name, org_name holds an org_uuid here (it is matched against the org_uuid column).
org_name = "1663f36c-d3b6-0e52-c79a-65a866c90b58"
# sample_row = cb_nlp_df[cb_nlp_df["org_name"] == org_name].iloc[0]
# .iloc[0] raises IndexError when no row matches.
sample_row = cb_nlp_df[cb_nlp_df["org_uuid"] == org_name].iloc[0]
messages = format_messages(sample_row)
input_ids = tokenizer.apply_chat_template(messages, return_tensors="pt", tokenize=True).to(model.device)

with torch.no_grad():
    output = model.generate(
        input_ids=input_ids,
        generation_config=gen_config
    )

# Keep only the newly generated tokens (drop the echoed prompt)
output_ids = output[0][input_ids.shape[-1]:]
decoded = tokenizer.decode(output_ids, skip_special_tokens=True)
score = parse_score(decoded)

# messages[1] is the user message dict (role + content)
print("Input:\n", messages[1])
print("Raw output:\n", decoded.strip())
print()
print("Parsed score:", score)

##### Real

In [None]:
# Score the full merged table; results are appended to checkpoint_path as JSONL
# run_inference(cb_nlp_df, model, tokenizer, gen_config)
run_inference(cb_nlp_df, model, tokenizer, gen_config, checkpoint_path=checkpoint_path)

In [None]:
# Release the Colab runtime once scoring is done (presumably to stop consuming
# GPU allocation — confirm); in-memory state is lost after this call.
runtime.unassign()

# Founder Description

In [None]:
# """
# Temp for re-running on subsets
# """
# # Ensure org_uuid is string for safe matching
# cb_df["org_uuid"] = cb_df["org_uuid"].astype(str)
# faulty_founder_df["org_uuid"] = faulty_founder_df["org_uuid"].astype(str)

# # Subset the rows from cb_df that match the faulty sets
# founder_subset = cb_df[cb_df["org_uuid"].isin(faulty_founder_df["org_uuid"])]

# # Optional: check the counts
# print(f"Original cb_df: {len(cb_nlp_df)} rows")
# print(f"Faulty founders: {len(founder_subset)} rows")

### Code

In [None]:
# ========== CONFIG ========== #
MODEL_NAME = "NousResearch/Hermes-2-Pro-Mistral-7B"  # default model id for load_model
CHECKPOINT_PATH = founder_checkpoint_path  # JSONL file that founder scores are appended to
BATCH_SIZE = 25  # default number of scored rows buffered per checkpoint write
MAX_NEW_TOKENS = 20  # generation cap passed to GenerationConfig

In [None]:
# ========== LOAD MODEL & TOKENIZER ========== #
def load_model(model_name=MODEL_NAME):
    """Load a causal LM, its tokenizer and a greedy-decoding generation config.

    Args:
        model_name: Hugging Face model identifier passed to ``from_pretrained``.

    Returns:
        Tuple ``(model, tokenizer, gen_config)``.
    """
    print(f"Loading model: {model_name}")
    tokenizer = AutoTokenizer.from_pretrained(model_name, use_fast=True)

    # Fall back to EOS as the padding token when the tokenizer defines none.
    if tokenizer.pad_token is None:
        tokenizer.pad_token = tokenizer.eos_token
        tokenizer.pad_token_id = tokenizer.eos_token_id

    # NOTE(review): newer transformers releases prefer
    # quantization_config=BitsAndBytesConfig(load_in_8bit=True) over the bare
    # load_in_8bit flag — confirm against the installed version.
    model = AutoModelForCausalLM.from_pretrained(
        model_name,
        device_map="auto",
        torch_dtype=torch.float16,
        load_in_8bit=True
    )

    # --- Generation Configuration ---
    # We want deterministic, concise output (just the score)
    gen_config = GenerationConfig(
        max_new_tokens=MAX_NEW_TOKENS,  # Only need a few tokens for the score
        # Greedy decoding is what makes the output deterministic. Sampling-only
        # knobs such as temperature are not used when do_sample=False.
        do_sample=False,
        # 1.0 means "no penalty". The previous value of 0.1 is below 1 and
        # therefore rewarded repetition instead of penalising it. NOTE: scores
        # produced with the old setting may differ from scores produced now.
        repetition_penalty=1.0,
        pad_token_id=tokenizer.pad_token_id,
        eos_token_id=tokenizer.eos_token_id,
    )

    return model, tokenizer, gen_config

In [None]:
# ========== PROMPT FORMATTING ========== #
def clean_description(text, word_limit=400):
    """Normalise a founder description and cap it at ``word_limit`` words.

    Args:
        text: The description; any non-string value (None, NaN, lists, ...)
            is treated as missing.
        word_limit: Maximum number of whitespace-separated words to keep.

    Returns:
        The cleaned text with non-breaking spaces replaced and words joined by
        single spaces, or ``""`` when ``text`` is not a string.
    """
    # A single isinstance check covers None/NaN/pd.NA (none of them are str) and,
    # unlike pd.isna, never yields an array whose truth value is ambiguous.
    if not isinstance(text, str):
        return ""
    words = text.replace('\xa0', ' ').split()
    return ' '.join(words[:word_limit])

def format_messages(row):
    """Build the chat messages asking the model to rate the founders' background.

    Args:
        row: Mapping/Series providing ``founder_description_blob``.

    Returns:
        A two-element list of ``{"role", "content"}`` dicts: the system prompt
        followed by the user prompt.
    """
    # Founder text is cleaned and truncated by clean_description before being embedded.
    founder_blob = clean_description(row["founder_description_blob"])

    # Fixed instructions: evaluation criteria, 1–5 rubric and strict output format.
    system_prompt = (
        "You are an expert analyst evaluating the strength of startup founders' backgrounds.\n"
        "Your task is to assess the collective background strength based *only* on the provided text description.\n"
        "Consider factors like:\n"
        "- **Education:** Prestige of institutions (e.g., Ivy League, Stanford, MIT), relevant degrees (e.g., CS, Engineering, MBA).\n"
        "- **Prior Experience:** Roles at well-known successful companies (e.g., FAANG, successful startups), relevant industry experience, leadership positions.\n"
        "- **Entrepreneurial Track Record:** Previous founding experience, successful exits.\n"
        "- **Accolades/Achievements:** Mention of significant accomplishments, awards, patents etc.\n\n"
        "Assign a score from 1 to 5 based on the overall impression of the founder(s)' potential, derived *strictly* from the text:\n"
        "1 = **Very Weak:** Little to no relevant education or experience mentioned. Background seems unsuitable or irrelevant.\n"
        "2 = **Weak:** Some basic education or unrelated experience. Lacks notable achievements or prestigious affiliations.\n"
        "3 = **Average:** Decent education (may not be top-tier) OR some relevant industry experience. Meets basic expectations but isn't outstanding.\n"
        "4 = **Strong:** Prestigious education (e.g., top university) OR significant relevant experience (e.g., key roles at known companies, prior founding experience). Clear indicators of high potential.\n"
        "5 = **Exceptional:** Multiple strong indicators. Combination of top-tier education, significant relevant experience at high-profile companies, proven entrepreneurial success (e.g., previous exits), clear leadership. Outstanding potential visible from the description.\n\n"
        "FORMAT REQUIREMENTS:\n"
        "- You MUST ALWAYS respond with a score\n"
        "- Answer MUST be a single digit from 1 to 5\n"
        "- NO explanation, reasoning, or special characters\n"
        "- NO full sentences\n"
        "- NO reiteration of the background\n"
        "- ONLY return the digit on its own line\n"
        "- Do not prefix with 'Answer:' or similar\n"
        "\n"
        "EXAMPLE OUTPUTS:\n"
        "- 1\n"
        "- 2\n"
        "- 3\n"
        "- 4\n"
        "- 5"
    )

    # Per-row content: the founder text wrapped in a fenced block.
    user_prompt = (
        f"Founder Description(s):\n```\n{founder_blob}\n```\n\n"
        "Based *only* on the description above, rate the overall strength of the founder(s)' background. Respond ONLY with a single digit from 1 to 5."
    )

    # Format for Hermes-2-Pro (ChatML-like structure)
    # See: https://huggingface.co/NousResearch/Hermes-2-Pro-Mistral-7B#prompt-format
    messages = [
        {"role": "system", "content": system_prompt},
        {"role": "user", "content": user_prompt}
    ]
    return messages


In [None]:
# ========== SCORING & PARSING ========== #
def parse_score(output_text):
    """Return the first stand-alone digit 1-5 in ``output_text`` as an int.

    A digit counts only when it is delimited by word boundaries, so "10" or
    "a3" do not match. Returns ``None`` when no such digit is found.
    """
    found = re.search(r"\b([1-5])\b", output_text)
    if found is None:
        return None
    return int(found.group(1))

# ========== IO & CHECKPOINTING ========== #
def save_checkpoint(results, filename):
    """Append every dict in ``results`` to ``filename`` as one JSON line each."""
    print(f"Saving {len(results)} rows to {filename}")
    with open(filename, "a") as out:
        out.writelines(json.dumps(item) + "\n" for item in results)

def load_existing_checkpoint(filename):
    """Return the set of ``org_uuid`` values found in a JSONL checkpoint file.

    Args:
        filename: Path to the checkpoint; it may not exist yet.

    Returns:
        A set of the ``org_uuid`` values of all readable lines. Lines that are
        not valid JSON, are not objects, or lack ``org_uuid`` are skipped.
        An empty set is returned when the file does not exist.
    """
    seen = set()
    if not os.path.exists(filename):
        return seen
    with open(filename, "r") as f:
        for line in f:
            try:
                seen.add(json.loads(line)["org_uuid"])
            except (json.JSONDecodeError, KeyError, TypeError):
                # Best-effort: tolerate truncated or foreign lines, but do not
                # swallow KeyboardInterrupt/SystemExit as a bare `except:` would.
                continue
    return seen

In [None]:
# ========== MAIN INFERENCE LOOP ========== #
def run_inference(df, model, tokenizer, gen_config, checkpoint_path=CHECKPOINT_PATH, batch_size=BATCH_SIZE):
    """Score the founders of every row in ``df``, checkpointing results as JSONL.

    Rows whose ``org_uuid`` is already in the checkpoint are skipped. Results
    are appended to ``checkpoint_path`` every ``batch_size`` scored rows and
    once more at the end. A failure on one row is printed and the loop moves on.
    """
    already_scored = load_existing_checkpoint(checkpoint_path)
    print(f"Resuming from {len(already_scored)} completed rows.")

    pending = []

    for idx, record in tqdm(df.iterrows(), total=len(df)):
        if record["org_uuid"] in already_scored:
            continue

        try:
            chat = format_messages(record)
            prompt_ids = tokenizer.apply_chat_template(chat, return_tensors="pt", tokenize=True)
            prompt_ids = prompt_ids.to(model.device)

            with torch.no_grad():
                generated = model.generate(input_ids=prompt_ids, generation_config=gen_config)

            # Everything past the prompt length is the model's reply.
            prompt_len = prompt_ids.shape[-1]
            reply = tokenizer.decode(generated[0][prompt_len:], skip_special_tokens=True)
            rating = parse_score(reply)

            pending.append({
                "org_uuid": record["org_uuid"],
                "org_name": record["org_name"],
                "score": rating,
                "raw_output": reply.strip(),
            })

            if len(pending) >= batch_size:
                save_checkpoint(pending, checkpoint_path)
                pending = []

        except Exception as e:
            print(f"Error at row {idx}: {e}")
            continue

    # Flush the final partial batch.
    if pending:
        save_checkpoint(pending, checkpoint_path)

### Load Model

In [None]:
# ========== RUN EVERYTHING ========== #
# Uses the load_model defined in this section (default model: MODEL_NAME).
model, tokenizer, gen_config = load_model()

### Running

In [None]:
# Score founders for every row of cb_df, using the default CHECKPOINT_PATH and BATCH_SIZE.
# NOTE(review): cb_df is used in whatever state earlier cells left it (e.g. after the
# industry/year filters, if those cells were run) — confirm this is the intended input.
run_inference(cb_df, model, tokenizer, gen_config)

In [None]:
# runtime.unassign()

### Testing

In [None]:
# Single-row smoke test of the founder prompt; .iloc[0] raises IndexError when no row matches.
sample_row = cb_df[cb_df["org_name"] == "Zuora"].iloc[0]
# sample_row = founder_subset[founder_subset["org_name"] == "Fabric"].iloc[0]
messages = format_messages(sample_row)

input_ids = tokenizer.apply_chat_template(messages, return_tensors="pt", tokenize=True).to(model.device)
with torch.no_grad():
    output = model.generate(input_ids=input_ids, generation_config=gen_config)

# Keep only the newly generated tokens (drop the echoed prompt)
output_ids = output[0][input_ids.shape[-1]:]
decoded = tokenizer.decode(output_ids, skip_special_tokens=True)
score = parse_score(decoded)

# messages[1] is the user message; print just its text
print("Prompt:\n", messages[1]["content"])
print()
print("Raw Output:\n", decoded.strip())
print("Score:", score)
