# Data Acquisition: Stock Betas and Sector Information
We will pull stock beta for each holding using the yfinance API and scrape S&P 500 sector data from Wikipedia. [cite: 117]

In [5]:
import pandas as pd
import yfinance as yf
import requests
from bs4 import BeautifulSoup
from datetime import datetime

# --- 1. API Pull (yfinance) --- [cite: 119]
holdings_df = pd.read_csv('../../../project/data/raw/holdings.csv')
tickers = holdings_df['Ticker'].tolist()
stock_data = []

for ticker in tickers:
    stock = yf.Ticker(ticker)
    info = stock.info
    stock_data.append({
        'Ticker': ticker,
        'Beta': info.get('beta'),
        'Sector': info.get('sector'),
        'Market Cap': info.get('marketCap'),
        'Forward P/E': info.get('forwardPE')
    })

api_df = pd.DataFrame(stock_data)

# Validate the data [cite: 124]
print("API Data Validation:")
print(f"Shape: {api_df.shape}")
print(f"Missing Betas: {api_df['Beta'].isna().sum()}")

# Save the raw data [cite: 125]
timestamp = datetime.now().strftime("%Y%m%d-%H%M")
api_filename = f'../data/raw/api_BRK-Holdings_{timestamp}.csv'
api_df.to_csv(api_filename, index=False)
print(f"\nAPI data saved to {api_filename}")


# --- 2. Scrape a Small Table (Wikipedia) --- [cite: 126]
URL = "https://en.wikipedia.org/wiki/List_of_S%26P_500_companies"
response = requests.get(URL)
soup = BeautifulSoup(response.text, 'html.parser')
table = soup.find('table', {'id': 'constituents'})

scraped_df = pd.read_html(str(table))[0]
sectors_df = scraped_df[['Symbol', 'GICS Sector']]
sectors_df = sectors_df.rename(columns={'Symbol': 'Ticker'})

# Validate [cite: 129]
print("\nScraped Data Validation:")
print(f"Shape: {sectors_df.shape}")
print(f"Sectors found: {sectors_df['GICS Sector'].nunique()}")

# Save the raw data [cite: 130]
scrape_filename = f'../data/raw/scrape_Wikipedia_SP500-Sectors_{timestamp}.csv'
sectors_df.to_csv(scrape_filename, index=False)
print(f"\nScraped data saved to {scrape_filename}")

DNSError: Failed to perform, curl: (6) Could not resolve host: query1.finance.yahoo.com. See https://curl.se/libcurl/c/libcurl-errors.html first for more details.