In [27]:
import pandas as pd

# Load the raw firm list. Later cells read its 'country', 'website' and 'name' columns.
# NOTE(review): absolute, machine-specific path — consider a configurable data directory.
df = pd.read_csv('/Users/wdgstl/GCPE/pefirms.csv')

In [28]:
# Keep only firms whose country is "united states" (case-insensitive) and that list a website.
is_us = df['country'].str.lower().eq('united states')
has_website = df['website'].notna()
us_firms = df[is_us & has_website]

In [39]:
# Spot check: take the row at position 1 of the filtered frame.
record = us_firms.iloc[1]

# Pull out just that firm's website value.
website = record['website']

# Bare last expression so the notebook echoes the value.
website

'joingardencity.com'

In [44]:
# Display the full record at position 4 of the filtered frame.
us_firms.iloc[4]

country                             united states
founded                                      2020
id                                   sugarcapital
industry         venture capital & private equity
linkedin_url    linkedin.com/company/sugarcapital
locality                            san francisco
name                                sugar capital
region                                 california
size                                         1-10
website                              sugarcap.com
Name: 16, dtype: object

In [81]:
import os
def read_txt(folder_path, filename):
    """Return the UTF-8 text of ``filename`` inside ``folder_path``, stripped of
    leading and trailing whitespace."""
    full_path = os.path.join(folder_path, filename)
    with open(full_path, mode="r", encoding="utf-8") as handle:
        return handle.read().strip()

# Load one firm's saved "*_relevant.txt" file into the global `text`.
# NOTE(review): absolute, machine-specific path — consider a configurable data directory.
text = read_txt('/Users/wdgstl/GCPE/scraped_pages', 'joingardencity_com_relevant.txt')

# Character count of the loaded text (echoed as the cell output).
len(text)

3716

In [82]:
import requests

def call_mixtral(prompt, model_name="mixtral", timeout=600):
    """Send ``prompt`` to the local generate endpoint and return the completion text.

    The request goes to ``http://localhost:11434/api/generate`` with streaming
    disabled, so the whole answer arrives in one JSON body.
    NOTE(review): the port and path look like an Ollama server — confirm.

    Args:
        prompt: Full prompt text to send.
        model_name: Model identifier placed in the request's ``model`` field.
        timeout: Seconds to wait for the server before giving up; ``None``
            waits indefinitely (the previous behaviour).

    Returns:
        The value of the ``"response"`` field of the JSON reply.

    Raises:
        requests.HTTPError: If the server answers with a 4xx/5xx status.
        requests.Timeout: If no reply arrives within ``timeout`` seconds.
    """
    response = requests.post(
        "http://localhost:11434/api/generate",
        json={
            "model": model_name,
            "prompt": prompt,
            "stream": False,
        },
        # Without a timeout, requests blocks forever if the server stalls.
        timeout=timeout,
    )
    # Fail with the HTTP status instead of an opaque KeyError on error bodies.
    response.raise_for_status()
    return response.json()["response"]

# Example use: build the extraction prompt for one firm and print the model's answer.
# The f-string interpolates the global `text`, so a cell that assigns `text`
# (e.g. the read_txt cell) must have run first.
prompt = f"""
You are reviewing a private equity firm's website description to identify its explicitly stated investment areas.

Input text:
{text}

Your task is to extract only the information that is clearly and explicitly stated. Do **not** infer or assume investment focus based on context, vague language, employee backgrounds, or general descriptions. If the industry or thesis is not **directly stated** in the text, you must omit it.

Instructions:
1. Identify and list only the **explicitly stated industry focus areas**—industries the firm directly says it invests in.
2. For each industry, extract any **specific investment thesis areas** (e.g., types of businesses, value creation strategies, market conditions, subsectors) that are **directly mentioned**.

Important:
- Ignore implications, suggestions, or inferred meanings.
- Do not include areas just because they appear in example deals or portfolios unless the firm explicitly states them as a focus.
- Do not include boilerplate descriptions or general language unless tied to a named industry.

Format your response exactly like this:
Industry Focus Areas and Specific Theses:
- [Industry 1]
  - Thesis Area(s): 
    - [Bullet point]
    - [Bullet point]
- [Industry 2]
  - Thesis Area(s): 
    - [Bullet point]
    - [Bullet point]
(...continue for all explicitly stated industries)
"""



print(call_mixtral(prompt))


industry Focus Areas and Specific Theses:
- HVAC:
  - No specific thesis areas mentioned.
- Restaurant remodeling:
  - No specific thesis areas mentioned.
- Pool construction:
  - No specific thesis areas mentioned.
- Businesses with thriving cultures:
  - Long-term investment focus
  - People and purpose prioritized over short-term returns
  - Little to no debt utilized in investments
  - Partnering with existing leaders rather than replacing them
  - Preservation and honoring of founders' legacies
  - Protection and further investment in thriving cultures.


In [76]:
import os
import time
from urllib.parse import urlparse, urljoin
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.by import By
from sentence_transformers import SentenceTransformer
import numpy as np

# Markers for links the crawler should skip. crawl_site applies these in a
# case-insensitive substring test, so an entry also matches inside longer
# words (e.g. "team" inside "steam") and file extensions match by suffix text.
EXCLUDE_KEYWORDS = {
    "careers", "career", "jobs", "team", "people",
    "privacy", "terms", "legal",
    "contact", "support", "help", "faq",
    "blog", "news", "press", "media",
    "events", "webinar", "culture",
    "login", "signup", "register", "subscribe",
    "cookie", "rss", "sitemap",
    "leadership", ".pdf", ".jpg", ".jpeg",
    "branch", ".xlsx", "email", "article", "report"
}

# Directory for scraped text files. The path is relative, so it resolves
# against the current working directory; it is created on cell run if missing.
OUTPUT_DIR = "scraped_pages"
os.makedirs(OUTPUT_DIR, exist_ok=True)

# Local sentence-embedding model (free), loaded once and used by
# embed_and_rank_paragraphs through this module-level name.
model = SentenceTransformer('all-MiniLM-L6-v2')

def crawl_site(start_url, max_pages=1000):
    """
    Crawl one site breadth-first and append its paragraph text to a file.

    Each page is rendered in headless Chrome. The text of every ``<p>`` on the
    page is appended to ``OUTPUT_DIR/<domain with dots as underscores>.txt``.
    When the URL carries a ``#fragment``, only ``<p>`` elements inside the
    element with that id are taken. Links on the same domain (``www.`` is
    ignored) are queued unless their path contains an EXCLUDE_KEYWORDS entry.

    Args:
        start_url: Absolute URL to start from; its host defines "same site"
            and the output file name.
        max_pages: Maximum number of URLs to visit. URLs differing only by
            fragment count separately.

    Returns:
        Path of the text file the paragraphs were appended to. The file is
        opened in append mode, so content from earlier runs is kept.
    """
    base_domain = urlparse(start_url).netloc.replace("www.", "")
    firm_name = base_domain.replace('.', '_')
    output_file = os.path.join(OUTPUT_DIR, f"{firm_name}.txt")

    visited = set()
    to_visit = [start_url]
    # Every URL ever queued; gives O(1) duplicate checks instead of list scans.
    queued = {start_url}

    # Headless browser setup (disable images, CSS, fonts for speed)
    opts = Options()
    opts.add_argument("--headless")
    opts.add_argument("--disable-gpu")
    opts.add_experimental_option("prefs", {
        "profile.managed_default_content_settings.images": 2,
        "profile.managed_default_content_settings.stylesheets": 2,
        "profile.managed_default_content_settings.fonts": 2,
    })
    driver = webdriver.Chrome(options=opts)

    # try/finally so the browser process is closed even if a page load or
    # parse raises part-way through the crawl.
    try:
        while to_visit and len(visited) < max_pages:
            url = to_visit.pop(0)
            if url in visited:
                continue
            visited.add(url)
            print("Visiting:", url)

            # The fragment is never sent to the server; load the page without it.
            root = url.split("#")[0]
            driver.get(root)
            try:
                WebDriverWait(driver, 3).until(
                    EC.presence_of_element_located((By.TAG_NAME, "a"))
                )
            except Exception:
                # Best effort: a page with no links (or a slow one) is still
                # parsed as-is. Exception, not a bare except, so Ctrl-C works.
                pass

            soup = BeautifulSoup(driver.page_source, "html.parser")
            fragment = urlparse(url).fragment

            # Extract paragraphs from full page or in-page fragment
            if fragment:
                section = soup.find(id=fragment)
                paras = section.find_all("p") if section else []
            else:
                paras = soup.find_all("p")

            # Append paragraphs to output file (opened even when there are
            # none, so the file always exists once a page has been visited).
            with open(output_file, "a", encoding="utf-8") as f:
                for p in paras:
                    text = p.get_text(separator=" ", strip=True)
                    if text:
                        f.write(text + "\n\n")

            # Enqueue new links found in the page
            for tag in soup.find_all("a", href=True):
                href = urljoin(root, tag["href"])
                p2 = urlparse(href)
                root_link = p2._replace(query="", fragment="").geturl()
                frag = p2.fragment

                # Same-site check (allow www and non-www)
                if p2.netloc.replace("www.", "") != base_domain:
                    continue
                # Skip noisy or irrelevant paths. Only the path is tested:
                # matching the whole URL would reject every link on a site
                # whose domain itself contains a keyword (e.g. "media").
                link_path = p2.path.lower()
                if any(kw in link_path for kw in EXCLUDE_KEYWORDS):
                    continue

                next_url = root_link + (f"#{frag}" if frag else "")
                if next_url not in queued:
                    queued.add(next_url)
                    to_visit.append(next_url)
    finally:
        driver.quit()

    return output_file


def embed_and_rank_paragraphs(paragraphs, query, top_k=10, min_words=5, min_chars=60, boost_weight=0.2):
    """
    Rank paragraphs by relevance to ``query`` and return the best ``top_k``.

    A paragraph's score is the cosine similarity between its embedding and the
    query embedding, plus ``boost_weight`` if it contains any investment
    keyword (case-insensitive substring match). Paragraphs that look like
    noise are dropped first: fewer than ``min_words`` words, fewer than
    ``min_chars`` characters, or more than 60% upper-case letters. If that
    filter removes everything, the unfiltered paragraphs are ranked instead.

    Args:
        paragraphs: Sequence of paragraph strings.
        query: Text the paragraphs are compared against.
        top_k: Maximum number of results to return.
        min_words: Minimum word count for a paragraph to be kept.
        min_chars: Minimum character count for a paragraph to be kept.
        boost_weight: Amount added to the score of keyword-bearing paragraphs.

    Returns:
        List of ``(paragraph, score)`` tuples, highest score first; an empty
        list when ``paragraphs`` is empty.
    """
    # Nothing to rank: return early instead of sending an empty batch to the
    # encoder and the axis=1 norm below.
    if len(paragraphs) == 0:
        return []

    # Expanded investment-related keywords to boost
    KEYWORDS = {
        "focus", "invest", "strategy", "portfolio", "sector", "thesis",
        "acquire", "acquires", "grows", "grow", "business", "company", "holding", "model", "mission", "goal"
    }

    def is_noise(p):
        """True for very short or mostly upper-case (heading-like) text."""
        if len(p.split()) < min_words:
            return True
        if len(p) < min_chars:
            return True
        letters = [c for c in p if c.isalpha()]
        if letters:
            upper_ratio = sum(1 for c in letters if c.isupper()) / len(letters)
            if upper_ratio > 0.6:
                return True
        return False

    clean = [p for p in paragraphs if not is_noise(p)]
    if not clean:
        clean = paragraphs  # fallback: rank everything rather than nothing

    # compute embeddings (uses the module-level `model`)
    query_vec = model.encode(query, convert_to_numpy=True)
    para_embs = model.encode(clean, convert_to_numpy=True, batch_size=100)

    # cosine similarities
    sims = np.dot(para_embs, query_vec) / (
        np.linalg.norm(para_embs, axis=1) * np.linalg.norm(query_vec)
    )

    # keyword flags and combined scores
    keyword_flags = np.array([1 if any(kw in p.lower() for kw in KEYWORDS) else 0 for p in clean])
    combined = sims + boost_weight * keyword_flags

    # sort by combined score and return only text and score
    idxs = np.argsort(combined)[::-1][:top_k]
    return [(clean[i], float(combined[i])) for i in idxs]


if __name__ == "__main__":
    # Demo run: crawl one site, rank its paragraphs against an investment-focus
    # query, and keep only the top-ranked ones in "<domain>_relevant.txt".
    # NOTE: this block is at module level, so every name assigned below
    # (url, query, paragraphs, text, ...) is a global, not a local.
    url = "https://blackstone.com"
    txt_file = crawl_site(url, max_pages=30)

    # load paragraphs and deduplicate (order-preserving; first occurrence wins)
    with open(txt_file, "r", encoding="utf-8") as f:
        raw_paras = [p.strip() for p in f.read().split("\n\n") if p.strip()]
    seen = set()
    paragraphs = []
    for p in raw_paras:
        if p not in seen:
            seen.add(p)
            paragraphs.append(p)

    # derive display name from URL: first label of the host, title-cased
    parsed_url = urlparse(url)
    domain = parsed_url.netloc.replace("www.", "")
    display_name = domain.split('.')[0].replace('-', ' ').replace('_', ' ').title()

    query = f"{display_name} private equity industry focus areas, investment model, and corresponding investment thesis statements."
    top = embed_and_rank_paragraphs(paragraphs, query, top_k=60)

    # Delete raw scraped paragraphs file (ignore failure if it is already gone)
    try:
        os.remove(txt_file)
    except OSError:
        pass

    # Write top relevant paragraphs to new file (without scores)
    firm_name = domain.replace('.', '_')
    relevant_file = os.path.join(OUTPUT_DIR, f"{firm_name}_relevant.txt")
    with open(relevant_file, "w", encoding="utf-8") as rf:
        # NOTE(review): `text` is a global here, so this loop rebinds any
        # existing global `text` to the last paragraph written.
        for text, _ in top:
            rf.write(text + "\n\n")

    print(f"Relevant content written to: {relevant_file}")


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


Visiting: https://blackstone.com
Visiting: https://blackstone.com#primary
Visiting: https://www.blackstone.com/
Visiting: https://blackstone.com/the-firm/
Visiting: https://blackstone.com/our-clients/overview/
Visiting: https://blackstone.com/financial-advisors/
Visiting: https://blackstone.com/family-offices-endowments-foundations/
Visiting: https://www.blackstone.com/our-businesses/credit-and-insurance-bxci/#Insurance
Visiting: https://blackstone.com/our-impact/building-sustainable-businesses/
Visiting: https://blackstone.com/our-impact/blackstone-charitable-foundation/
Visiting: https://www.blackstone.com/our-impact/blackstone-launchpad/
Visiting: https://blackstone.com/our-businesses/private-equity/
Visiting: https://blackstone.com/our-businesses/real-estate/
Visiting: https://blackstone.com/our-businesses/credit-and-insurance-bxci/
Visiting: https://blackstone.com/our-businesses/blackstone-multi-asset-investing-bxma/
Visiting: https://blackstone.com/our-businesses/strategic-partne

In [111]:
# Print every row of the `firms` table to check what has been stored.
# `cursor` and `conn` are not created in this cell; they must already exist.
query = "SELECT * FROM firms;"

cursor.execute(query)

rows = cursor.fetchall()

for row in rows:
    print(row)

# Release the cursor and the connection.
# NOTE(review): cells that reuse `cursor`/`conn` after this will presumably
# need to reconnect first — confirm against the DB driver in use.
cursor.close()
conn.close()

(1, 'bansk group', 'banskgroup.com', 'test')
(2, 'garden city companies', 'joingardencity.com', 'test')
(3, 'sileo capital', 'sileocapital.com', 'test')
(4, 'intonation ventures', 'intonationventures.com', 'test')
(5, 'sugar capital', 'sugarcap.com', 'test')
(6, 'fuul capital', 'fuulcapital.com', 'test')


In [110]:
# TODO: run this per-firm work in parallel.
# Trial run: only the first six firms (positions 0-5) are saved, tagged 'test'.
for i in range(min(len(us_firms), 6)):
    firm = us_firms.iloc[i]
    name = firm['name']
    website = firm['website']
    save_firm_to_db(name, website, 'test')
    # crawl_website(name, f'https://www.{website}')
    

Saved: bansk group
Saved: garden city companies
Saved: sileo capital
Saved: intonation ventures
Saved: sugar capital
Saved: fuul capital


In [90]:
# Read back all rows of the `firms` table and print them one per line.
# Relies on a `cursor` created in another cell.
cursor.execute("SELECT * FROM firms")
rows = cursor.fetchall()

for row in rows:
    print(row)

(1, 'bansk group', 'banskgroup.com', 'test')
(2, 'garden city companies', 'joingardencity.com', 'test')
(3, 'sileo capital', 'sileocapital.com', 'test')
(4, 'intonation ventures', 'intonationventures.com', 'test')
(5, 'sugar capital', 'sugarcap.com', 'test')
(6, 'fuul capital', 'fuulcapital.com', 'test')
