# Unstructured Sentiment Data Exploratory Data Analysis

## Setup

In [None]:
# imports
import os
import time
import re
import csv
import random
import pandas as pd
import numpy as np
import requests
import pprint
from bs4 import BeautifulSoup
from tqdm import tqdm
from concurrent.futures import ThreadPoolExecutor, as_completed

# NLP libraries
# import nltk
# nltk.download('punkt', download_dir='../.venv/nltk_data')
# from nltk.tokenize import sent_tokenize
# run command in terminal: /Applications/Python\ 3.11/Install\ Certificates.command

In [None]:
# FOLDERS
# Root data directories, relative to the notebook's working directory.
INPUT_FOLDER = "../Data/Input"
OUTPUT_FOLDER = "../Data/Output"

# NLP-specific sub-folders and the CSV that collects Perplexity responses.
OUTPUT_NLP_FOLDER = os.path.join(OUTPUT_FOLDER, "NLP")
INPUT_NLP_FOLDER = os.path.join(INPUT_FOLDER, "NLP")
perplexity_output_path = os.path.join(OUTPUT_NLP_FOLDER, "perplexity.csv")

# Show what is already present in the NLP output folder.
output_contents = os.listdir(OUTPUT_NLP_FOLDER)
print(output_contents)

## Perplexity API
https://docs.perplexity.ai/guides/getting-started

In [None]:
# === SETTINGS ===
# API key is read from the environment; it is None when the variable is unset.
api_key = os.environ.get("PERPLEXITY_API_KEY")
model = "sonar-pro"
temperature = 0.3
# True when the output CSV already exists at the time this cell runs.
resume_file = os.path.exists(perplexity_output_path)

# Every calendar year from 2004 through 2025, inclusive.
years = list(range(2004, 2026))

# Industry labels that are crossed with `years` to form the query grid.
industries = [
    'Cleantech',
    'Consumer Goods',
    'Fintech',
    'Life Sciences',
    'Media, Entertainment and Gaming',
    'Real Estate',
    'Technology',
    'Telecom',
    'Transportation',
]


In [None]:
# === PROMPT BUILDER ===
def build_prompt(industry, year):
    """Return the research prompt for one industry/year combination.

    The template asks for five labeled markdown sections (Summary, Trends,
    Infrastructure, Outlook, Timing Signal). Leading and trailing whitespace
    is stripped from the whole text; interior lines keep their indentation.
    """
    prompt_text = f"""
    You are a business and industry researcher evaluating how the external environment and timing in {year} influenced the viability of launching a startup in the {industry} sector.

    Your task is to review reputable sources (such as government forecasts, analyst reports or outlooks, and industry coverage) and provide insight into the state and outlook of the industry in {year}. Focus on macroeconomic, technological, regulatory, and consumer demand trends that would influence whether a new company could succeed in this space.

    If there was little momentum or coverage for the industry in {year}, briefly state that. Otherwise, structure your findings into the following labeled sections:

    ## Summary  
    What was the overall state of the {industry} industry in {year}?

    ## Trends  
    What key trends were emerging, either in technology, demand, or behavior?

    ## Infrastructure  
    Was the market supported by enabling technologies, regulations, or platforms (e.g., mobile adoption, cloud computing, data infrastructure, regulatory clarity)?

    ## Outlook  
    Were analysts or institutions projecting strong growth, stagnation, or uncertainty for the industry?

    ## Timing Signal  
    Based on the above, would {year} be a promising time to launch a startup in this industry? Summarize with reasoning.
    """
    return prompt_text.strip()


In [None]:
# === PERPLEXITY API CALL ===
def query_perplexity(prompt, timeout=120):
    """Send one prompt to the Perplexity chat-completions endpoint.

    Args:
        prompt: User message text to send.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout, ``requests`` waits indefinitely and a single stalled
            connection would block the whole sweep.

    Returns:
        The ``content`` string of the first choice in the JSON response.

    Raises:
        requests.exceptions.Timeout: If the server does not respond in time.
        requests.exceptions.HTTPError: If the response status is an error
            (raised by ``raise_for_status``).
        KeyError, IndexError: If the JSON body lacks the expected
            ``choices[0].message.content`` path.
    """
    url = "https://api.perplexity.ai/chat/completions"
    headers = {
        "Authorization": f"Bearer {api_key}",
        "Content-Type": "application/json"
    }
    payload = {
        "model": model,
        "messages": [{"role": "user", "content": prompt}],
        "temperature": temperature,
        "top_p": 1,
        "top_k": 0,
        "stream": False
    }
    response = requests.post(url, headers=headers, json=payload, timeout=timeout)
    response.raise_for_status()
    return response.json()["choices"][0]["message"]["content"]

In [None]:
# === SECTION PARSER ===
def extract_section(response, tag):
    """Return the text that follows a ``## <tag>`` heading in *response*.

    The capture runs from the heading up to the next ``##`` marker or the end
    of the text, whichever comes first, and is returned stripped. Matching is
    case-insensitive.

    Args:
        response: Full response text to search.
        tag: Heading label to look for. It is matched literally, so labels
            containing regex metacharacters (``+``, ``(``, ``.`` ...) are safe.

    Returns:
        The section body, or ``""`` when the heading is absent or *response*
        is empty.
    """
    if not response:
        return ""
    # re.escape keeps characters such as "+" or "(" in the tag from being
    # interpreted as regex syntax.
    pattern = fr"##\s*{re.escape(tag)}\s*(.*?)\s*(?=(?:##|$))"
    match = re.search(pattern, response, re.DOTALL | re.IGNORECASE)
    return match.group(1).strip() if match else ""

def parse_response(response):
    """Split a response into its five labeled sections.

    Returns a dict keyed by column name; each value is whatever
    ``extract_section`` yields for the corresponding heading.
    """
    heading_by_field = (
        ("summary", "Summary"),
        ("trends", "Trends"),
        ("infrastructure", "Infrastructure"),
        ("outlook", "Outlook"),
        ("timing_signal", "Timing Signal"),
    )
    return {
        field: extract_section(response, heading)
        for field, heading in heading_by_field
    }

In [None]:
# === INTERMEDIATE LOGGER ===
def log_row_to_csv(row):
    """Append *row* to the Perplexity output CSV.

    The header line (taken from the row's keys) is written only when the
    file did not exist before this call.
    """
    header_needed = not os.path.exists(perplexity_output_path)
    with open(perplexity_output_path, mode='a', newline='', encoding='utf-8') as handle:
        csv_writer = csv.DictWriter(handle, fieldnames=row.keys())
        if header_needed:
            csv_writer.writeheader()
        csv_writer.writerow(row)

In [None]:
# === MAIN LOOP ===
def run_perplexity():
    """Query Perplexity for every industry/year pair and log each result.

    For each pair the prompt is built, sent, and parsed into sections. One
    row per pair is appended to the in-memory results and to the output CSV
    as soon as it is available, so partial progress survives an interruption.
    When the query or parsing fails, the row carries empty sections, an
    ``ERROR: ...`` message in ``timing_signal`` and a token estimate of 0.

    Only the API call, parsing and token estimate are guarded by the
    ``try``; a failure while writing the CSV propagates to the caller
    instead of being reported as a failed query.

    Returns:
        list[dict]: One row per industry/year pair, in query order.
    """
    results = []
    combos = [(industry, year) for industry in industries for year in years]
    total_queries = len(combos)
    token_total = 0

    for i, (industry, year) in enumerate(combos, start=1):
        prompt = build_prompt(industry, year)
        print(f"[{i}/{total_queries}] Querying: {industry} {year}...")

        try:
            response = query_perplexity(prompt)
            parsed = parse_response(response)
            # Rough heuristic: about 1.33 tokens per whitespace-separated word.
            estimated_tokens = int(1.33 * (len(prompt.split()) + len(response.split())))
        except Exception as e:
            print(f"Failed: {industry} {year} – {e}")
            parsed = {
                "summary": "",
                "trends": "",
                "infrastructure": "",
                "outlook": "",
                "timing_signal": f"ERROR: {e}",
            }
            estimated_tokens = 0

        token_total += estimated_tokens

        # Build the row once so success and failure rows share one column order.
        row = {
            "industry": industry,
            "year": year,
            **parsed,
            "tokens_estimate": estimated_tokens
        }
        results.append(row)
        log_row_to_csv(row)

        # Randomized pause between requests; not needed after the final one.
        if i < total_queries:
            time.sleep(random.uniform(1, 3))

    # === SUMMARY ===
    print(f"\nComplete! {len(results)} queries processed.")
    print(f"Estimated total tokens used: {token_total:,}")

    return results

In [None]:
# Run the full industry x year sweep and keep the returned rows in memory.
perplexity_results = run_perplexity()

In [None]:
# Pretty-print the collected rows for a quick visual check.
pprint.pprint(perplexity_results)