In [26]:
import time
from pathlib import Path

import pandas as pd
import requests
from bs4 import BeautifulSoup

# Function to format market cap
def format_market_cap(value):
    """
    Convert a market cap string such as '1.5T', '$500B' or '1,234' into a float.

    Surrounding whitespace, a leading '$' and thousands separators (',') are
    ignored. A trailing magnitude suffix (T, B, M or K, case-insensitive)
    scales the number; a string without a suffix is parsed as a plain number.

    Parameters
    ----------
    value : str
        The raw market cap text.

    Returns
    -------
    float
        The market cap as a plain number.

    Raises
    ------
    ValueError
        If the text is empty or the numeric part cannot be parsed.
    """
    multipliers = {'T': 1e12, 'B': 1e9, 'M': 1e6, 'K': 1e3}

    # Normalise first so that commas are removed whether or not a suffix is
    # present (e.g. '1,500.5B'), not only on the suffix-less path.
    cleaned = value.strip().lstrip('$').replace(',', '')
    if not cleaned:
        # Indexing cleaned[-1] on an empty string would raise an unhelpful IndexError.
        raise ValueError(f"Cannot parse market cap from {value!r}")

    suffix = cleaned[-1].upper()
    if suffix in multipliers:
        return float(cleaned[:-1]) * multipliers[suffix]
    return float(cleaned)

# Function to scrape and clean market cap data
def scrape_market_cap(company_symbol, timeout=10):
    """
    Scrape the market cap history table for one ticker into a DataFrame.

    Parameters
    ----------
    company_symbol : str
        Ticker symbol. It is lower-cased to build the page URL and
        upper-cased in the "Company" column of the result.
    timeout : float, optional
        Seconds passed to ``requests.get`` as its timeout so a stalled
        server cannot hang the notebook. Defaults to 10.

    Returns
    -------
    pandas.DataFrame
        Columns "Company", "Date" (a 'YYYY-MM-DD' string) and "Market Cap"
        (float, via ``format_market_cap``). If anything goes wrong the error
        is printed and an empty DataFrame with the same columns is returned,
        so callers can always test ``.empty``.
    """
    columns = ["Company", "Date", "Market Cap"]
    base_url = "https://www.marketcaphistory.com/"
    search_url = base_url + company_symbol.lower()

    try:
        # Fetch the webpage
        response = requests.get(search_url, timeout=timeout)
        response.raise_for_status()

        # Parse the webpage
        soup = BeautifulSoup(response.text, "html.parser")

        # The data lives in a table nested inside the 'infotable' table.
        # Check both lookups explicitly so a layout change produces a clear message.
        outer_table = soup.find('table', class_='infotable')
        inner_table = outer_table.find('table') if outer_table is not None else None
        if inner_table is None:
            raise ValueError("market cap table not found on page")

        market_cap_data = []

        # The first row is the header, so skip it.
        for row in inner_table.find_all('tr')[1:]:
            cells = row.find_all('td')
            if len(cells) < 2:
                # Not a date/value data row (e.g. spacer or header-only row).
                continue

            date = pd.to_datetime(cells[0].get_text(strip=True)).strftime('%Y-%m-%d')  # Convert to YYYY-MM-DD
            market_cap = format_market_cap(cells[1].get_text(strip=True))  # Convert to number

            market_cap_data.append([company_symbol.upper(), date, market_cap])

    except Exception as e:
        # Best-effort scraping: report the problem and hand back an empty
        # frame instead of implicitly returning None.
        print(f"Error scraping data for {company_symbol}: {e}")
        return pd.DataFrame(columns=columns)

    print(f"Got {company_symbol} data")

    # Create a DataFrame
    return pd.DataFrame(market_cap_data, columns=columns)

In [27]:
# Example usage:
df = scrape_market_cap("CMAX")
print(df)

# Save to CSV only when the scrape produced rows. After printing an error,
# scrape_market_cap may come back with nothing (None or an empty frame),
# and calling .to_csv on that would either crash or write a useless file.
if df is not None and not df.empty:
    df.to_csv("CMAX.csv", index=False)

Got CMAX data
   Company        Date   Market Cap
0     CMAX  2021-05-17  181560000.0
1     CMAX  2021-08-12  679890000.0
2     CMAX  2022-03-11  680600000.0
3     CMAX  2022-04-29  575750000.0
4     CMAX  2022-08-05  631880000.0
5     CMAX  2022-11-07  514770000.0
6     CMAX  2023-03-11  262980000.0
7     CMAX  2023-03-22  344100000.0
8     CMAX  2023-08-04  303730000.0
9     CMAX  2023-11-06  210740000.0
10    CMAX  2024-05-06   11750000.0
11    CMAX  2024-08-06   17630000.0


In [32]:
# List of company symbols to scrape
company_symbols = ['AAPL', 'GOOG', 'MSFT', 'AMZN', 'TSLA']

# Create the output directory up front so to_csv does not fail on a fresh checkout.
Path("data").mkdir(parents=True, exist_ok=True)

# Loop through each company symbol and scrape data
for company_symbol in company_symbols:
    print(f"Scraping data for {company_symbol}...")
    company_df = scrape_market_cap(company_symbol)

    # A failed scrape may yield None or an empty frame; skip saving in both cases.
    # The "Company" column is already filled in by scrape_market_cap, so it is
    # not overwritten here.
    if company_df is not None and not company_df.empty:
        filename = f"data/{company_symbol}_market_cap.csv"
        company_df.to_csv(filename, index=False)
        print(f"Data saved to {filename}")

    time.sleep(1)  # Delay to prevent overwhelming the server

Scraping data for AAPL...
Got AAPL data
Data saved to data/AAPL_market_cap.csv
Scraping data for GOOG...
Got GOOG data
Data saved to data/GOOG_market_cap.csv
Scraping data for MSFT...
Got MSFT data
Data saved to data/MSFT_market_cap.csv
Scraping data for AMZN...
Got AMZN data
Data saved to data/AMZN_market_cap.csv
Scraping data for TSLA...
Got TSLA data
Data saved to data/TSLA_market_cap.csv
