# NLP: 
- Startup Description
- Founder Description
- Industry Outlook

## Setup

In [None]:
import nltk
# Fetch the "vader_lexicon" resource through NLTK's downloader
# (presumably required by the nltk SentimentIntensityAnalyzer imported in the setup cell).
nltk.download("vader_lexicon")

In [None]:
import os
import pandas as pd
import numpy as np
import pprint
from tqdm import tqdm
import matplotlib.pyplot as plt
import json
import torch

# python -m textblob.download_corpora
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer
from nltk.sentiment import SentimentIntensityAnalyzer
from textblob import TextBlob
from transformers import pipeline, BertTokenizer, BertForSequenceClassification
from sentence_transformers import SentenceTransformer, util

In [None]:
# HUGGING FACE TOKEN
# Read from the environment; os.getenv returns None when the variable is unset.
HUGGING_FACE_TOKEN = os.getenv("HUGGING_FACE_CLI_TOKEN")

# FOLDERS
# Relative paths: they resolve against the notebook's working directory.
OUTPUT_FOLDER = "../Data/Output"
INPUT_FOLDER = "../Data/Input"

OUTPUT_CB_FOLDER = os.path.join(OUTPUT_FOLDER, "Crunchbase")
OUTPUT_NLP_FOLDER = os.path.join(OUTPUT_FOLDER, "NLP")
# NOTE(review): CHECKPOINT_NLP_FOLDER is not referenced by the checkpoint paths below,
# which are placed in OUTPUT_NLP_FOLDER -- confirm whether that is intended.
CHECKPOINT_NLP_FOLDER = os.path.join(OUTPUT_FOLDER, "Checkpoint")

# Checkpoint / output files for the sentiment scoring steps.
checkpoint_path = os.path.join(OUTPUT_NLP_FOLDER, "nlp_checkpoint.jsonl")
outlook_checkpoint_path = os.path.join(OUTPUT_NLP_FOLDER, "nlp_outlook_checkpoint.csv")
sentiment_score_path = os.path.join(OUTPUT_NLP_FOLDER, "nlp_sentiment_features.csv")

# Listing the folders doubles as an existence check: os.listdir raises if a folder is missing.
output_contents_cb = os.listdir(OUTPUT_CB_FOLDER)
output_contents_nlp = os.listdir(OUTPUT_NLP_FOLDER)

print(output_contents_cb)
print(output_contents_nlp)

In [None]:
# Input file locations.
nlp_df_path = os.path.join(OUTPUT_NLP_FOLDER, "perplexity.csv")
nlp_scores_path = os.path.join(OUTPUT_NLP_FOLDER, "alignment_scores_v2.jsonl")
cb_df_path = os.path.join(OUTPUT_CB_FOLDER, "cb_final_data.csv")
# NOTE: this name holds the *path* here and is rebound to the loaded DataFrame below.
nlp_founder_df = os.path.join(OUTPUT_NLP_FOLDER, "founder_strength_scores.jsonl")

# CSV inputs are read with read_csv; the .jsonl inputs are read as JSON Lines (lines=True).
cb_df = pd.read_csv(cb_df_path)
nlp_df = pd.read_csv(nlp_df_path)
alignment_df = pd.read_json(nlp_scores_path, lines=True)
nlp_founder_df = pd.read_json(nlp_founder_df, lines=True)

In [None]:
# Print the column/dtype/non-null summary of each loaded frame, separated by blank lines.
cb_df.info()
print()
nlp_df.info()
print()
alignment_df.info()
print()
nlp_founder_df.info()

## Startup + Founder Description

### Load Model

In [None]:
# ==========================
# 1. Load Models
# ==========================
# NOTE(review): SentimentIntensityAnalyzer is imported twice in the setup cell
# (vaderSentiment, then nltk.sentiment); the later nltk import is the one bound here.
vader = SentimentIntensityAnalyzer()
# Hugging Face sentiment pipelines: FinBERT is used for startup descriptions,
# the CardiffNLP Twitter RoBERTa model for founder descriptions.
finbert_sentiment = pipeline("sentiment-analysis", model="ProsusAI/finbert")
founder_sentiment = pipeline("sentiment-analysis", model="cardiffnlp/twitter-roberta-base-sentiment")
# Sentence embedder used for the similarity-to-exemplar features.
bert_embedder = SentenceTransformer("sentence-transformers/all-mpnet-base-v2")

### Functions

In [None]:
# ==========================
# 2. Scoring Functions
# ==========================

def get_textblob_scores(text):
    """Score ``text`` with TextBlob.

    Returns a dict with the keys ``"polarity"`` and ``"subjectivity"``.
    Both values are ``None`` when ``text`` is not a string (e.g. NaN).
    """
    if isinstance(text, str):
        sentiment = TextBlob(text).sentiment
        return {
            "polarity": sentiment.polarity,
            "subjectivity": sentiment.subjectivity,
        }
    return dict.fromkeys(("polarity", "subjectivity"))

def get_vader_scores(text):
    """Score ``text`` with the module-level VADER analyzer.

    Returns whatever ``vader.polarity_scores`` produces for a string input.
    For any non-string input, returns a dict with the keys ``"compound"``,
    ``"pos"``, ``"neg"`` and ``"neu"`` all set to ``None``.
    """
    if isinstance(text, str):
        return vader.polarity_scores(text)
    return dict.fromkeys(("compound", "pos", "neg", "neu"))

def get_finbert_score(text):
    """Map the FinBERT pipeline label for ``text`` onto two numeric scales.

    Only the first 512 characters of ``text`` are sent to the pipeline.

    Returns:
        For a string: ``{"numeric_5scale": 5/3/1, "numeric_signed": 1/0/-1}``
        for positive/neutral/negative (both ``None`` for an unknown label).
        For a non-string: a dict with ``"label"``, ``"score"``,
        ``"numeric_5scale"`` and ``"numeric_signed"`` all ``None``.
    """
    if not isinstance(text, str):
        return dict.fromkeys(("label", "score", "numeric_5scale", "numeric_signed"))

    # label -> (5-point value, signed value)
    scales = {
        "positive": (5, 1),
        "neutral": (3, 0),
        "negative": (1, -1),
    }
    prediction = finbert_sentiment(text[:512])[0]
    five_point, signed = scales.get(prediction["label"].lower(), (None, None))
    return {"numeric_5scale": five_point, "numeric_signed": signed}

def get_founder_sentiment_score(text):
    """Map the founder-sentiment pipeline label for ``text`` onto two numeric scales.

    Only the first 512 characters of ``text`` are sent to the pipeline. Both
    the ``POS``/``NEU``/``NEG`` and the ``LABEL_2``/``LABEL_1``/``LABEL_0``
    label spellings are recognised (case-insensitively).

    Returns:
        For a string: ``{"numeric_5scale": 5/3/1, "numeric_signed": 1/0/-1}``
        (both ``None`` for an unknown label).
        For a non-string: a dict with ``"label"``, ``"score"``,
        ``"numeric_5scale"`` and ``"numeric_signed"`` all ``None``.
    """
    if not isinstance(text, str):
        return dict.fromkeys(("label", "score", "numeric_5scale", "numeric_signed"))

    positive, neutral, negative = (5, 1), (3, 0), (1, -1)
    # label -> (5-point value, signed value)
    scales = {
        "POS": positive, "NEU": neutral, "NEG": negative,
        "LABEL_2": positive, "LABEL_1": neutral, "LABEL_0": negative,
    }
    prediction = founder_sentiment(text[:512])[0]
    five_point, signed = scales.get(prediction["label"].upper(), (None, None))
    return {"numeric_5scale": five_point, "numeric_signed": signed}

def get_bert_similarity(text, exemplar_vector):
    """Cosine similarity between the embedding of ``text`` and ``exemplar_vector``.

    ``text`` is embedded with the module-level ``bert_embedder``. Returns a
    Python float, or ``None`` when ``text`` is not a string.
    """
    if isinstance(text, str):
        embedding = bert_embedder.encode(text, convert_to_tensor=True)
        return util.cos_sim(embedding, exemplar_vector).item()
    return None

In [None]:
# ==========================
# 3. Checkpoint Handling
# ==========================
def load_checkpoint(path):
    """Read a JSONL checkpoint file.

    Returns:
        ``(seen_ids, rows)``: the set of ``"org_uuid"`` values found and the
        list of parsed rows, in file order. Blank lines are ignored. Both are
        empty when ``path`` does not exist.
    """
    done_ids, records = set(), []
    if os.path.exists(path):
        with open(path, "r") as handle:
            # Parse every non-blank line first, then collect the ids.
            for raw in handle:
                if not raw.strip():
                    continue
                records.append(json.loads(raw))
            done_ids.update(record["org_uuid"] for record in records)
    return done_ids, records

def save_checkpoint(path, results):
    """Append each dict in ``results`` to ``path`` as one JSON line."""
    with open(path, "a") as handle:
        handle.writelines(json.dumps(record) + "\n" for record in results)

In [None]:
# ==========================
# 4. Exemplar Setup
# ==========================
startup_exemplars = [
    "Instagram is a free photosharing application that enables its users to take photos apply filters and share them on social networks such as Facebook Twitter Foursquare Tumblr Flickr and Posterous The app allows its users to capture and customize their photos and videos with several custombuilt filter effects It is also compatible with iOS and Android devices Instagram was founded by Kevin Systrom and Mike Krieger in 2010 in San Francisco California",
    "Stripe is a technology company that builds economic infrastructure for the Internet Stripe is a platform for commercial finance infrastructure Stripe is used by millions of businesses ranging from the biggest corporations in the world to the most ambitious startups to take payments increase revenue and open up new business prospects",
    "Airbnb operates an online platform that connects hosts with guests seeking shortterm accommodations The company facilitates bookings and provides tools for both parties to manage their transactions Airbnb focuses on various types of lodging including homes and unique stays",
    "Uber develops and operates a ridesharing mobile application that connects consumers with partner drivers The application allows users to submit trip requests which are then routed to available drivers It facilitates the arrangement and scheduling of transportation and logistics services through thirdparty providers Uber operates in various cities worldwide providing applications for multiple platforms including Windows Phone iPhone Blackberry and Android",
    "Liquid Death is the first bold hilarious beverage focused on health and sustainability and it is one of the fastestgrowing nonalcoholic beverage brands of all time They take the healthiest beverage available water and package it in infinitely recyclable tallboy cans that can compete with the fun marketing of unhealthy brands in energy drinks beer and junk food In addition Liquid Death donates 10 of its profits from each can sold to nonprofits that work to eliminate plastic pollution and provide clean drinking water to those in need",
    "WhatsApp is a crossplatform mobile messaging app that allows users to exchange messages without paying for SMS WhatsApp Messenger is available for iPhone Blackberry Android Windows Phone Nokia and Symbian platforms Since WhatsApp Messenger uses the same internet data plan that subscribers use for email and web browsing there is no cost to message allowing them to stay in touch with their friends In addition to basic messaging WhatsApp users can create groups and send each other unlimited images video and audio media messages",
    "Moderna Therapeutics is a biotechnology company that develops messenger RNA therapeutics Every cell in the body uses mRNA to provide realtime instructions to make the proteins necessary to drive all aspects of biology including in human health and disease It provides in vivo drug modality that produces human proteins or antibodies inside patient cells  Moderna Therapeutics also develops various patent applications with various claims ranging from novel nucleotide chemistries to specific drug compositions It focuses on disease areas such as inherited genetic disorders hemophilia and blood factors and oncology  Moderna Therapeutics in 2010 and is headquartered in Cambridge in Massachusetts It has strategic option agreements with AstraZeneca and Alexion Pharmaceuticals and strategic collaborations with Karolinska Institutet Institut Pasteur Karolinska University Hospital and Merck",
    "Roblox is a platform for online gaming and entertainment that provides a communal digital experience that unites individuals via play It enables anyone to imagine create and have fun with friends as they explore interactive 3D experiences produced by developers using their desktop design tool Roblox Studio",
    "Waymo stands for a new way forward in mobility it is a selfdriving technology company with a mission to make it safe and easy for people and things to move around Waymo improves transportation by building software and sensor technology developed in Googles labs since 2009 In October 2015 they achieved the worlds first fully selfdriving trip on public roads in a car without a steering wheel or pedals They refine Waymo technology through one billion miles of simulation testing each year and the cars have selfdriven over two million miles on public roads across four US cities",
    "Brimstone developed a deeply decarbonized process to make ordinary portland cement and another key concrete ingredient supplementary cementitious material The Brimstone Process uses carbonfree calcium silicate rock instead of limestone to deliver a product that is identical to conventional materials and costcompetitive at scale",
    "Suno is a music startup that enables anyone to make the songs they want The company aims to help people rediscover the joy of play and exploration",
    "Snappr is the onestopshop for visual content creation including the largest ondemand photography and photo editing marketplace Snappr Shoots is a selfservice application to book photographers Snappr Workflows is a SaaS product for enterprises to automate their visual content pipeliness Snappr also provides free tools such as the Snappr Photo Analyzer an AI portrait photoanalysis tool Snappr was founded in 2017 and is headquartered in San Francisco California"
]
# Embed every startup exemplar and average the embeddings over dim 0
# to get a single reference vector for the similarity feature.
startup_exemplar_vector = bert_embedder.encode(startup_exemplars, convert_to_tensor=True).mean(dim=0)

founder_blob_1 = "Erik Hazzard is the Founder of Meta Erik Hazzard attended Florida State University Andrew Gadson is the Founder  New Product Experimentation of Meta He attended Stanford University Mike LeBeau is a Founder and Product Lead Horizon Workrooms at Meta He attended Stanford University Mark Zuckerberg is the Founder Chairman and Chief Executive Officer of Facebook He is also the CoFounder of the Breakthrough Energy Coalition Mr Zuckerberg attended Harvard University Eduardo Saverin is a Cofounder at Meta Eduardo attended Harvard University"
founder_blob_2 = "Elon Musk has cofounded companies such as SpaceX and tunneling startup The Boring Company and has played served as CEO of Tesla Motors since 2008 He previously worked as a sales manager at many enterprise LLCs Elon Musk attended the University of Pennsylvania and received bachelors degrees in economics and physics"
founder_blob_3 = "Max Levchin is the Founder and CEO of Affirm He previously worked at the Consumer Financial Protection Bureau as an Advisory Board Member Max Levchin attended the University of Illinois UrbanaChampaign"
founder_blob_4 = "Peter Thiel Is the Managing Partner of The Founders Fund He previously worked at Stanford Law School as a Visitor Peter Thiel attended Stanford University"
founder_blob_5 = "John Collison is a President and Cofounder at Stripe He attended Harvard University John and his brother Patrick Collison started Stripe in 2010 while John was studying physics at Harvard Their goal was to make accepting payments online simpler and more inclusive after learning firsthand how difficult it was Today the 100 person and growing Stripe team powers online businesses around the world Before Stripe John cofounded Auctomatic which was acquired by Live Current Media in March 2008 Originally from Limerick Ireland John lives in San Francisco California where Stripe is based Patrick Collison is the CoFounder CEO and Content Strategist of Stripe He previously worked at Auctomatic as a CoFounder Patrick Collison attended Massachusetts Institute of Technology"
founder_blob_6 = "Kevin is the CEO and cofounder of Instagram a community of more than 300 million who capture and share the worlds moments on the service He is responsible for the companys overall vision and strategy as well as daytoday operations   Prior to founding Instagram Kevin was part of the startup Odeo which later became Twitter and spent two years at Google working on products such as Gmail and Google Reader He graduated from Stanford University with a bachelor of science in management science and engineering"
founder_blob_7 = "Nathan Blecharczyk is the CoFounder and Chief Strategy Officer at Airbnb a trusted community marketplace that connects people with unique accommodations in more than 34000 cities and 191 countries Blecharczyk oversaw the creation of Airbnbs engineering data science and performance marketing teams and currently plays a leading role in the companys business and product strategy He became an entrepreneur early on running a business while he was in high school that sold to clients in more than 20 countries He earned a degree in Computer Science from Harvard University and held several engineering positions before cofounding Airbnb As a guest Blecharczyk has stayed in hundreds of homes using Airbnb and he is also a host in San Francisco where he lives with his family Joe Gebbia is a CoFounder and Chairman of Airbnb An entrepreneur from an early age Airbnbs groundbreaking service began in his San Francisco apartment and spread to 2000000 listings in over 191 countries creating a new economy for thousands of people around the world He is involved in crafting the company culture shaping the design aesthetic and innovating future growth opportunities Joe has spoken globally about both entrepreneurship and design and received numerous distinctions such as the Inc 30 under 30 and Fortune 40 under 40 His lifelong appreciation for art and design led him to the Rhode Island School of Design RISD where he earned dual degrees in Graphic Design and Industrial Design Gebbia now serves on the institutions Board of Trustees Brian is the CoFounder and Chief Executive Officer at Airbnb He attended Rhode Island School of Design"

# Founder exemplar texts (the variable name is spelled "exmplars" in this notebook).
founder_exmplars = [
    founder_blob_1,
    founder_blob_2,
    founder_blob_3,
    founder_blob_4,
    founder_blob_5,
    founder_blob_6,
    founder_blob_7
]
# Embed every founder exemplar and average the embeddings over dim 0
# to get a single reference vector for the similarity feature.
founder_exemplar_vector = bert_embedder.encode(founder_exmplars, convert_to_tensor=True).mean(dim=0)

In [None]:
# ==========================
# 5. Main Scoring Function
# ==========================
def score_descriptions_with_checkpoint(df, startup_exemplar_vector, founder_exemplar_vector,
                                    checkpoint_path=checkpoint_path, batch_size=25):
    """
    Score the startup and founder descriptions of every row in ``df``.

    Results are appended to the JSONL checkpoint every ``batch_size`` newly
    scored rows (plus once at the end for a partial batch). If the checkpoint
    already exists, rows whose ``org_uuid`` is stored there are skipped, so an
    interrupted run can be resumed.

    Args:
        df: DataFrame with ``org_uuid`` and ``org_name`` columns; the text is
            read from ``org_description`` and ``founder_description_blob``
            when present.
        startup_exemplar_vector: Reference embedding for the startup similarity.
        founder_exemplar_vector: Reference embedding for the founder similarity.
        checkpoint_path: JSONL checkpoint file (the default is bound when the
            function is defined).
        batch_size: Number of newly scored rows between checkpoint writes.

    Returns:
        DataFrame holding the rows loaded from the checkpoint followed by
        every row scored during this call.
    """
    seen_ids, saved_rows = load_checkpoint(checkpoint_path)
    # `scored_now` keeps everything scored in this call for the return value;
    # `pending` only holds rows not yet flushed to disk. Keeping them separate
    # ensures flushed batches are not lost from the returned DataFrame.
    scored_now = []
    pending = []

    for _, row in tqdm(df.iterrows(), total=len(df)):
        org_uuid = row["org_uuid"]
        if org_uuid in seen_ids:
            continue

        org_text = row.get("org_description", "")
        founder_text = row.get("founder_description_blob", "")

        # --- Startup Scoring ---
        org_vader = get_vader_scores(org_text)
        org_blob = get_textblob_scores(org_text)
        org_finbert = get_finbert_score(org_text)
        org_sim = get_bert_similarity(org_text, startup_exemplar_vector)

        # --- Founder Scoring ---
        founder_vader = get_vader_scores(founder_text)
        founder_blob = get_textblob_scores(founder_text)
        founder_bert = get_founder_sentiment_score(founder_text)
        founder_sim = get_bert_similarity(founder_text, founder_exemplar_vector)

        result = {
            "org_uuid": org_uuid,
            "org_name": row["org_name"],

            "org_vader_compound": org_vader["compound"],
            "org_vader_pos": org_vader["pos"],
            "org_vader_neg": org_vader["neg"],
            "org_vader_neu": org_vader["neu"],
            "org_blob_polarity": org_blob["polarity"],
            "org_blob_subjectivity": org_blob["subjectivity"],
            "org_finbert_numeric_5scale": org_finbert["numeric_5scale"],
            "org_finbert_numeric_signed": org_finbert["numeric_signed"],
            "org_sim_to_exemplar": org_sim,

            "founder_vader_compound": founder_vader["compound"],
            "founder_vader_pos": founder_vader["pos"],
            "founder_vader_neg": founder_vader["neg"],
            "founder_vader_neu": founder_vader["neu"],
            "founder_blob_polarity": founder_blob["polarity"],
            "founder_blob_subjectivity": founder_blob["subjectivity"],
            "founder_sentiment_numeric_5scale": founder_bert["numeric_5scale"],
            "founder_sentiment_numeric_signed": founder_bert["numeric_signed"],
            "founder_sim_to_exemplar": founder_sim,
        }

        pending.append(result)
        scored_now.append(result)
        # Mark the id as done so a duplicate org_uuid later in `df` is skipped,
        # exactly as it would be on a resumed run.
        seen_ids.add(org_uuid)

        if len(pending) >= batch_size:
            save_checkpoint(checkpoint_path, pending)
            pending = []

    # Flush the final partial batch.
    if pending:
        save_checkpoint(checkpoint_path, pending)

    return pd.DataFrame(saved_rows + scored_now)

### Running

In [None]:
# Run the description-scoring pipeline over the Crunchbase frame.
# Results are also written to `checkpoint_path`, which the next cell reloads.
scored_df = score_descriptions_with_checkpoint(
    df=cb_df,
    startup_exemplar_vector=startup_exemplar_vector,
    founder_exemplar_vector=founder_exemplar_vector,
    checkpoint_path=checkpoint_path,
    batch_size=25
)

In [None]:
# Reload the full set of scored rows from the JSONL checkpoint on disk.
sentiment_df = pd.read_json(checkpoint_path, lines=True)

In [None]:
# Column/dtype/non-null summary of the reloaded sentiment features.
sentiment_df.info()

In [None]:
# Save the sentiment features as CSV (without the index) for training or merging.
sentiment_df.to_csv(sentiment_score_path, index=False)
print(f"Saved to {sentiment_score_path}")

## Outlook

In [None]:
def load_finbert():
    """Load the ProsusAI/finbert model and tokenizer; return ``(tokenizer, model)``."""
    checkpoint_name = "ProsusAI/finbert"
    classifier = BertForSequenceClassification.from_pretrained(checkpoint_name)
    return BertTokenizer.from_pretrained(checkpoint_name), classifier

def finbert_score(text, tokenizer, model):
    """Return the class probabilities the model assigns to ``text``.

    The label names are taken from ``model.config.id2label`` in class-index
    order, so each probability is paired with the class the model actually
    uses for that index (for ProsusAI/finbert this is positive, negative,
    neutral -- not negative, neutral, positive).

    Args:
        text: Text to score. Non-string values (e.g. NaN from a DataFrame)
            and blank strings are not scored.
        tokenizer: Callable tokenizer returning PyTorch tensors.
        model: Sequence-classification model exposing ``config.id2label``.

    Returns:
        Dict mapping each lower-cased label to its softmax probability, or to
        ``None`` for every label when ``text`` is not a non-blank string.
    """
    id2label = model.config.id2label
    # key=int tolerates both int keys and the string keys of a raw config dict.
    labels = [str(id2label[idx]).lower() for idx in sorted(id2label, key=int)]

    # Missing text cannot be tokenized; report "no score" rather than crash.
    if not isinstance(text, str) or not text.strip():
        return {label: None for label in labels}

    inputs = tokenizer(text, return_tensors="pt", truncation=True, padding=True)
    with torch.no_grad():
        outputs = model(**inputs)
    probs = torch.nn.functional.softmax(outputs.logits, dim=-1)
    return dict(zip(labels, probs[0].tolist()))

def analyze_text(text):
    """Compute VADER and TextBlob sentiment features for ``text``.

    Returns a dict with the keys ``vader_neg``, ``vader_neu``, ``vader_pos``,
    ``vader_compound``, ``textblob_polarity`` and ``textblob_subjectivity``.
    Every value is ``None`` when ``text`` is not a string or is blank.
    """
    vader_parts = ("neg", "neu", "pos", "compound")

    if isinstance(text, str) and text.strip() != "":
        vader_scores = SentimentIntensityAnalyzer().polarity_scores(text)
        blob_sentiment = TextBlob(text).sentiment
        features = {f"vader_{part}": vader_scores[part] for part in vader_parts}
        features["textblob_polarity"] = blob_sentiment.polarity
        features["textblob_subjectivity"] = blob_sentiment.subjectivity
        return features

    empty_keys = [f"vader_{part}" for part in vader_parts]
    empty_keys += ["textblob_polarity", "textblob_subjectivity"]
    return dict.fromkeys(empty_keys)

def score_nlp_df(nlp_df, checkpoint_path=outlook_checkpoint_path, batch_size=25):
    """Compute sentiment features for each row of ``nlp_df``, with CSV checkpointing.

    Each output row carries ``industry`` and ``year``, the ``analyze_text``
    features for the fields summary, trends, infrastructure, outlook and
    timing_signal, and FinBERT probabilities for summary, outlook and
    timing_signal.

    If ``checkpoint_path`` exists, its rows are reused and processing resumes
    at the row position equal to the number of checkpointed rows. The
    checkpoint is rewritten in full every ``batch_size`` rows and after the
    last row.

    Returns:
        DataFrame of all result rows (checkpointed plus newly computed).
    """
    tokenizer, model = load_finbert()

    text_fields = ("summary", "trends", "infrastructure", "outlook", "timing_signal")
    finbert_fields = ("summary", "outlook", "timing_signal")

    scored_rows = []
    if os.path.exists(checkpoint_path):
        scored_rows = pd.read_csv(checkpoint_path).to_dict(orient="records")
    first_pending = len(scored_rows)
    total_rows = len(nlp_df)

    for pos in tqdm(range(first_pending, total_rows)):
        record = nlp_df.iloc[pos]
        features = {"industry": record["industry"], "year": record["year"]}

        for name in text_fields:
            text = record.get(name, "")
            for key, value in analyze_text(text).items():
                features[f"{name}_{key}"] = value
            if name not in finbert_fields:
                continue
            for key, value in finbert_score(text, tokenizer, model).items():
                features[f"{name}_finbert_{key}"] = value

        scored_rows.append(features)

        # Checkpoint every N rows and once more after the final row.
        done = pos + 1
        if done % batch_size == 0 or done == total_rows:
            pd.DataFrame(scored_rows).to_csv(checkpoint_path, index=False)

    return pd.DataFrame(scored_rows)

In [None]:
# Score the industry-outlook texts; resumes from the CSV checkpoint if it exists.
outlook_sentiment_df = score_nlp_df(nlp_df, checkpoint_path=outlook_checkpoint_path, batch_size=25)

In [None]:
# Summary and first rows of the outlook sentiment features.
outlook_sentiment_df.info()
print(outlook_sentiment_df.head())

## Cleaning Up Null Values from LLM Classification (Mistral) datasets

#### Outlook

In [None]:
# Rows that need re-scoring: the score is missing, or raw_output (ignoring
# surrounding whitespace) does not start with "Answer:".
# na=False makes a missing raw_output count as malformed; without it
# startswith yields NaN for those rows and `~` fails on the resulting
# object-dtype Series.
faulty_rows_outlook = alignment_df[
    (alignment_df['score'].isna()) |
    (~alignment_df['raw_output'].str.strip().str.startswith("Answer:", na=False))
]

In [None]:
# How many outlook rows were flagged, plus a preview of them.
print(faulty_rows_outlook.shape)
print(faulty_rows_outlook.head())

In [None]:
# Export the flagged outlook rows as CSV (without the index).
faulty_rows_outlook_path = os.path.join(OUTPUT_NLP_FOLDER, "faulty_rows_outlook.csv")
faulty_rows_outlook.to_csv(faulty_rows_outlook_path, index=False)

#### Founder

In [None]:
# Founder rows that need re-scoring: only a missing score is checked here
# (unlike the outlook filter, raw_output is not inspected).
faulty_rows_founder = nlp_founder_df[
    (nlp_founder_df['score'].isna())
]

In [None]:
# How many founder rows were flagged, plus a preview of them.
print(faulty_rows_founder.shape)
print(faulty_rows_founder.head())

In [None]:
# Export the flagged founder rows as CSV (without the index).
faulty_rows_founder_path = os.path.join(OUTPUT_NLP_FOLDER, "faulty_rows_founder.csv")
faulty_rows_founder.to_csv(faulty_rows_founder_path, index=False)

#### Re-merge dataset

In [None]:
# Output locations for the cleaned (re-merged) score tables.
nlp_scores_clean_path = os.path.join(OUTPUT_NLP_FOLDER, "alignment_scores_clean.csv")
nlp_founder_clean_path = os.path.join(OUTPUT_NLP_FOLDER, "founder_strength_scores_clean.csv")

In [None]:
# JSONL files read by the next cell as the reprocessed scores for the flagged rows.
# NOTE(review): nothing in this notebook writes these files -- presumably they
# are produced by a separate re-scoring run; confirm.
faulty_founder_new_score_path = os.path.join(OUTPUT_NLP_FOLDER, "faulty_rows_founder_score.jsonl")
print(faulty_founder_new_score_path)

faulty_outlook_new_score_path = os.path.join(OUTPUT_NLP_FOLDER, "faulty_rows_outlook_alignment_v2.jsonl")
print(faulty_outlook_new_score_path)

In [None]:
# --- Load reprocessed data ---
faulty_founder_df = pd.read_json(faulty_founder_new_score_path, lines=True)
faulty_outlook_df = pd.read_json(faulty_outlook_new_score_path, lines=True)

# --- Fill missing scores with 2 ---
# Rows that still have no score after reprocessing get the fixed value 2.
# NOTE(review): the reason for choosing 2 is not documented here -- confirm.
faulty_founder_df["score"] = faulty_founder_df["score"].fillna(2)
faulty_outlook_df["score"] = faulty_outlook_df["score"].fillna(2)

# # --- Reload original full dataframes ---
# alignment_df = pd.read_csv("your_path_to_alignment_df.csv")     # <-- update path
# nlp_founder_df = pd.read_csv("your_path_to_nlp_founder_df.csv") # <-- update path

# --- Clean & Recombine alignment_df ---
# Drop every original row whose org_uuid was reprocessed, then append the reprocessed rows.
alignment_df_clean = alignment_df[~alignment_df['org_uuid'].isin(faulty_outlook_df['org_uuid'])]
alignment_df_final = pd.concat([alignment_df_clean, faulty_outlook_df], ignore_index=True)

# --- Clean & Recombine nlp_founder_df ---
# Same replace-by-org_uuid step for the founder scores.
nlp_founder_df_clean = nlp_founder_df[~nlp_founder_df['org_uuid'].isin(faulty_founder_df['org_uuid'])]
nlp_founder_df_final = pd.concat([nlp_founder_df_clean, faulty_founder_df], ignore_index=True)

# --- Sanity Checks ---
# "Bad raw_output format" counts values whose string form is not a single digit 1-5.
# NOTE(review): the outlook faulty-row filter in this notebook tests for an
# "Answer:" prefix instead -- confirm which raw_output format is expected.
print("Remaining null scores in alignment_df_final:", alignment_df_final['score'].isna().sum())
print("Bad raw_output format in alignment_df_final:", (~alignment_df_final['raw_output'].astype(str).str.match(r"^[1-5]$")).sum())

print("Remaining null scores in nlp_founder_df_final:", nlp_founder_df_final['score'].isna().sum())
print("Bad raw_output format in nlp_founder_df_final:", (~nlp_founder_df_final['raw_output'].astype(str).str.match(r"^[1-5]$")).sum())

In [None]:
# Write the cleaned score tables as CSV (without the index).
alignment_df_final.to_csv(nlp_scores_clean_path, index=False)
nlp_founder_df_final.to_csv(nlp_founder_clean_path, index=False)