In [None]:
!pip install -U langchain-community

Collecting langchain-community
  Downloading langchain_community-0.3.7-py3-none-any.whl.metadata (2.9 kB)
Collecting SQLAlchemy<2.0.36,>=1.4 (from langchain-community)
  Downloading SQLAlchemy-2.0.35-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (9.6 kB)
Collecting dataclasses-json<0.7,>=0.5.7 (from langchain-community)
  Downloading dataclasses_json-0.6.7-py3-none-any.whl.metadata (25 kB)
Collecting httpx-sse<0.5.0,>=0.4.0 (from langchain-community)
  Downloading httpx_sse-0.4.0-py3-none-any.whl.metadata (9.0 kB)
Collecting langchain-core<0.4.0,>=0.3.17 (from langchain-community)
  Downloading langchain_core-0.3.17-py3-none-any.whl.metadata (6.3 kB)
Collecting pydantic-settings<3.0.0,>=2.4.0 (from langchain-community)
  Downloading pydantic_settings-2.6.1-py3-none-any.whl.metadata (3.5 kB)
Collecting marshmallow<4.0.0,>=3.18.0 (from dataclasses-json<0.7,>=0.5.7->langchain-community)
  Downloading marshmallow-3.23.1-py3-none-any.whl.metadata (7.5 kB)
Collecting ty

In [None]:
!pip install unstructured

Collecting unstructured
  Downloading unstructured-0.16.5-py3-none-any.whl.metadata (24 kB)
Collecting filetype (from unstructured)
  Downloading filetype-1.2.0-py2.py3-none-any.whl.metadata (6.5 kB)
Collecting python-magic (from unstructured)
  Downloading python_magic-0.4.27-py2.py3-none-any.whl.metadata (5.8 kB)
Collecting emoji (from unstructured)
  Downloading emoji-2.14.0-py3-none-any.whl.metadata (5.7 kB)
Collecting python-iso639 (from unstructured)
  Downloading python_iso639-2024.10.22-py3-none-any.whl.metadata (13 kB)
Collecting langdetect (from unstructured)
  Downloading langdetect-1.0.9.tar.gz (981 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m981.5/981.5 kB[0m [31m18.9 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting rapidfuzz (from unstructured)
  Downloading rapidfuzz-3.10.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (11 kB)
Collecting backoff (from unstructured)
  Downlo

In [None]:
from langchain.document_loaders import UnstructuredURLLoader
from bs4 import BeautifulSoup
import requests
import pandas as pd
import os

def _write_text(path, text):
    """Write ``text`` to ``path`` as UTF-8 so the result does not depend on the platform default encoding."""
    with open(path, "w", encoding="utf-8") as f:
        f.write(text)


def _table_to_dataframe(table):
    """Turn one parsed ``<table>`` tag into a DataFrame, or return ``None`` if it holds no cells.

    The first non-empty row is used as the header. Rows without any ``td``/``th``
    cell are dropped. Ragged tables are squared off: short rows are padded with
    ``None`` and a short header is extended with ``column_<n>`` names, so that
    DataFrame construction cannot fail on a width mismatch.
    """
    rows = []
    for row in table.find_all("tr"):
        cells = row.find_all(["td", "th"])
        if cells:
            rows.append([cell.get_text(strip=True) for cell in cells])
    if not rows:
        return None

    width = max(len(r) for r in rows)
    header = rows[0] + [f"column_{k + 1}" for k in range(len(rows[0]), width)]
    body = [r + [None] * (width - len(r)) for r in rows[1:]]
    return pd.DataFrame(body, columns=header)


def extract_investment_content(url, output_folder="extracted_data", timeout=30):
    """Download investment content from one or several URLs and save it under ``output_folder``.

    Args:
        url: Either a list/tuple of URLs or a single URL string.
            * list/tuple: every URL is loaded with ``UnstructuredURLLoader`` and the
              page texts are joined (blank line between documents) into
              ``blog_content.txt``.
            * string: the page is fetched with ``requests`` and scanned for
              ``<table>`` elements. Each non-empty table is written to
              ``table_<n>.csv`` (``n`` is the 1-based position of the table in the
              page). If the page has no ``<table>`` at all, it is loaded with
              ``UnstructuredURLLoader`` and saved as ``blog_content_single.txt``.
        output_folder: Directory for the generated files; created if missing.
        timeout: Seconds to wait for the HTTP request on the single-URL path.

    Returns:
        ``str`` with the extracted text for blog content, or a ``list`` of
        ``pandas.DataFrame`` (one per non-empty table) when tables were found.
    """
    os.makedirs(output_folder, exist_ok=True)

    if isinstance(url, (list, tuple)):
        print("Detected multiple URLs, treating as blog content...")

        documents = UnstructuredURLLoader(urls=list(url)).load()
        full_text = "\n\n".join(doc.page_content for doc in documents)

        blog_filename = os.path.join(output_folder, "blog_content.txt")
        _write_text(blog_filename, full_text)
        print(f"Blog content from multiple URLs saved to {blog_filename}")
        return full_text

    print("Detected single URL, checking for table data...")

    # A timeout keeps the cell from hanging forever on an unresponsive server.
    response = requests.get(url, timeout=timeout)
    soup = BeautifulSoup(response.content, "html.parser")
    tables = soup.find_all("table")

    if tables:
        print("Detected tables, extracting investment data only...")
        extracted_tables = []
        for number, table in enumerate(tables, start=1):
            df = _table_to_dataframe(table)
            if df is None:
                # A <table> without rows has no header to build a frame from.
                print(f"Skipping table {number}: it has no rows.")
                continue
            extracted_tables.append(df)

            csv_filename = os.path.join(output_folder, f"table_{number}.csv")
            df.to_csv(csv_filename, index=False)
            print(f"Saved table {number} to {csv_filename}")
        return extracted_tables

    print("No tables found and single URL detected. Saving as blog content.")

    documents = UnstructuredURLLoader(urls=[url]).load()
    # The loader may return nothing for a page it could not fetch or parse.
    full_text = documents[0].page_content if documents else ""

    blog_filename = os.path.join(output_folder, "blog_content_single.txt")
    _write_text(blog_filename, full_text)
    print(f"Blog content saved to {blog_filename}")
    return full_text

# Example usage
# A list of article URLs is treated as blog content and merged into one text file.
urls = [
    "https://www.investopedia.com/articles/investing/010416/angel-investing-vs-crowdfunding-how-raise-money-your-startup.asp",
    "https://www.investopedia.com/articles/investing/030615/investing-social-media-startups-read-first.asp",
    "https://www.investopedia.com/articles/investing/011916/5-questions-ask-investing-startup.asp",
    "https://www.investopedia.com/terms/s/startup-capital.asp",
    "https://www.investopedia.com/how-technology-is-changing-financial-advice-4774011",
    "https://www.investopedia.com/terms/e/entrepreneur.asp",
    "https://www.investopedia.com/articles/basics/11/how-to-pick-a-stock.asp",
    "https://www.investopedia.com/articles/pf/12/private-foundation-start-up.asp"
]
blog_text = extract_investment_content(urls)  # Multiple URLs (for blogs)

# A single URL string is scanned for HTML tables first; the result is a list of
# DataFrames when tables are found, otherwise the page text.
single_url = "https://www.openvc.app/search"
investor_tables = extract_investment_content(single_url)  # Single URL (for a table)

# Each result now has its own name so the blog text is no longer overwritten.
# `content` is still bound to the second call's result, exactly as before.
content = investor_tables

Detected multiple URLs, treating as blog content...
Blog content from multiple URLs saved to extracted_data/blog_content.txt
Detected single URL, checking for table data...
Detected tables, extracting investment data only...
Saved table 1 to extracted_data/table_1.csv


In [None]:
import pandas as pd
import json

import os

# Input/output locations, gathered in one place so they are easy to adjust.
BLOG_TEXT_PATH = '/content/extracted_data/blog_content (2).txt'
INVESTOR_CSV_PATH = '/content/Investor Details.csv'
output_file = '/content/combined_Investment_Data/combined_investment_data.json'

# Load the blog content text file
with open(BLOG_TEXT_PATH, 'r', encoding='utf-8') as file:
    blog_content = file.read()

# Load the CSV file as a DataFrame
table_df = pd.read_csv(INVESTOR_CSV_PATH)

# Convert the DataFrame to a list of dictionaries for JSON compatibility.
# Empty CSV cells are read as NaN, which json.dump would write as a bare `NaN`
# token (not valid JSON); map them to None so they are serialised as `null`.
table_data = (
    table_df.astype(object)
    .where(table_df.notna(), None)
    .to_dict(orient="records")
)

# Create a combined structure
combined_data = {
    "blog_content": blog_content,
    "table_data": table_data
}

# Save the combined structure to a JSON file.
# The destination folder is created first so a fresh runtime does not fail on open().
os.makedirs(os.path.dirname(output_file), exist_ok=True)
with open(output_file, 'w', encoding='utf-8') as json_file:
    json.dump(combined_data, json_file, indent=4)

print(f"Combined data saved to {output_file}")

Combined data saved to /content/combined_Investment_Data/combined_investment_data.json
