In [19]:
import requests
from bs4 import BeautifulSoup as bs
import pandas as pd
import time
import re
import dotenv
import google.generativeai as genai

In [None]:
urls = [
    "https://www.gm.com",
    "https://www.mobility.siemens.com",
    "https://www.morganstanley.com",
    "https://www.basf.com",
    "https://www.linkedin.com",
    "https://www.salesforce.com"
]

def scrape_website(url):
    headers = {"User-Agent": "Mozilla/5.0"}
   
    try:
        response = requests.get(url, headers=headers, timeout=10)
        response.raise_for_status()
    except requests.RequestException as e:
        print(f"Error fetching {url}: {e}")
        return None, None
 
    soup = bs(response.text, 'html.parser')
    js_links = re.findall(r'https?://[^\s"\'<>]+', response.text)
    return soup,js_links

def clean_data(soup):
    for script in soup(['script', 'style']):
        script.decompose()
 
    for a in soup.find_all('a', href=True):
        a.insert_after(f" ({a['href']})")
 
    return soup.get_text(separator=' ', strip=True)

In [74]:

all_scraped_data ={}
for url in urls:
    scraped_data = ''
    soup, js_links = scrape_website(url)
    if soup:
        soup = clean_data(soup)
        scraped_data += soup
    
    print(scraped_data)
    
    for link in js_links:
        data, links = scrape_website(url)
        if data:
            data = clean_data(data)
            scraped_data += data

    all_scraped_data[url] = scraped_data


General Motors: Pushing the Limits of Transportation & Technology TWG Motorsports and GM receive formal approval for Cadillac Formula 1 ™ team Read More (https://news.gm.com/home.detail.html/Pages/news/us/en/2025/mar/0307-f1.html) Super Cruise ® driver assistance technology wins MotorTrend Best Hands-Free Driving Tech Read More (https://news.gm.com/home.detail.html/Pages/topic/us/en/2025/feb/0225-supercruise.html) Record set: Largest hands-free caravan with Super Cruise Read More (https://news.gm.com/home.detail.html/Pages/news/us/en/2024/nov/1118-supercruise.html) Corvette ZR1 hits 233 mph with GM President Mark Reuss driving 1 Read More (https://news.gm.com/home.detail.html/Pages/news/us/en/2024/oct/1015-zr1.html) 1 View important information. We pioneer the innovations that move and connect people to what matters. Simulated, preproduction or concept products shown and subject to change. Certain products not currently available or subject to limited availability. See vehicle websites

In [78]:
GENAI_API_KEY = "AIzaSyDmaEkNRd6GdcmXMF_hMIty0xOfImZwQqQ"

genai.configure(api_key=GENAI_API_KEY)

def chunk_text(text, max_length=1000000):
    """ Splits text into smaller chunks within API token limits """
    print(len(text))
    return [text[i:i+max_length] for i in range(0, len(text), max_length)]

def extract_info_with_gemini(text):
    prompt = f"""
    Extract the following details from the given text:
    1. Company's mission statement or core values.
    2. Products or services offered.
    3. Founding year and founders.
    4. Headquarters location.
    5. Key executives or leadership team.
    6. Notable awards or recognitions.

    Text:
    {text}
    """

    model = genai.GenerativeModel("gemini-2.0-flash")
    time.sleep(12)
    response = model.generate_content(prompt)
    return response.text

structured_data = {}
for url, scraped_data in all_scraped_data.items():
    print(url)
    time.sleep(6)
    for chunk in chunk_text(scraped_data):
        print('chunk')
        structured_data[url] = extract_info_with_gemini(chunk)

print(structured_data)


https://www.gm.com
91170
chunk
https://www.mobility.siemens.com
319800
chunk
https://www.morganstanley.com
897620
chunk
https://www.basf.com
878020
chunk
https://www.linkedin.com
4841760
chunk
chunk
chunk
chunk
chunk
https://www.salesforce.com
2297664
chunk
chunk
chunk
{'https://www.gm.com': 'Okay, here\'s the information extracted from the provided text:\n\n1.  **Company\'s mission statement or core values:** "We pioneer the innovations that move and connect people to what matters."\n\n2.  **Products or services offered:**\n    *   Electric vehicles (EVs) (performance vehicles, work trucks, daily drivers, commercial delivery vehicles)\n    *   Chevrolet Equinox EV\n    *   Super Cruise driver assistance technology\n    *  Cadillac Formula 1 ™ team\n\n3.  **Founding year and founders:** Not found in the provided text.\n\n4.  **Headquarters location:** Not found in the provided text.\n\n5.  **Key executives or leadership team:**\n    *   Mark Reuss (GM President)\n\n6.  **Notable awards

In [79]:
df = pd.DataFrame(structured_data.items(), columns=["Website", "Extracted Details"])
df.to_csv("extracted_company_info.csv", index=False)

print("Extraction complete! Data saved to extracted_company_info.csv")


Extraction complete! Data saved to extracted_company_info.csv
