# HW04 — Data Acquisition & Ingestion

In [None]:
import os, datetime as dt
import pandas as pd
from bs4 import BeautifulSoup
import requests
import yfinance as yf
from src.config import load_env
# `load_env` comes from src.config (not shown here). The object it returns is
# used by the cells below via `paths.raw`, the directory the CSV files are
# written to.
paths = load_env()


# 1) API Pull (AAPL via yfinance)

In [None]:
# Pull six months of daily OHLCV bars for one ticker from yfinance, validate
# the result, and persist it as a timestamped CSV under `paths.raw`.
ticker = 'AAPL'

# Minute-resolution run stamp; the scrape cell reuses it for its own filename.
_ts = dt.datetime.now().strftime('%Y%m%d-%H%M')

# auto_adjust=False because the column check below expects 'Adj Close'
# alongside the raw 'Close'.
df_api = yf.download(ticker, period='6mo', interval='1d', auto_adjust=False)

# A failed download can come back as an empty frame rather than an exception;
# stop here instead of writing an empty file.
if df_api.empty:
    raise RuntimeError(f"yfinance returned no rows for {ticker}")

# Newer yfinance releases may return MultiIndex columns (field, ticker) even
# for a single ticker. Keep only the field level so the plain string column
# names used below match.
if isinstance(df_api.columns, pd.MultiIndex):
    df_api.columns = df_api.columns.get_level_values(0)

# Move the date index into a regular 'Date' column.
df_api = df_api.reset_index()

# Explicit validation (not `assert`, which is stripped under `python -O`).
required_cols = {'Date', 'Open', 'High', 'Low', 'Close', 'Adj Close', 'Volume'}
missing_cols = required_cols.difference(df_api.columns)
if missing_cols:
    raise ValueError(
        f"yfinance result for {ticker} is missing columns: {sorted(missing_cols)}"
    )

df_api['Date'] = pd.to_datetime(df_api['Date'])
print('API shape:', df_api.shape)

# Make sure the target directory exists before writing.
os.makedirs(paths.raw, exist_ok=True)
api_path = os.path.join(paths.raw, f"api_yfinance_{ticker}_{_ts}.csv")
df_api.to_csv(api_path, index=False)
print('Saved:', api_path)

# 2) Scrape S&P 500 table (Wikipedia)

In [None]:
# Scrape the S&P 500 constituents table from Wikipedia into a DataFrame,
# validate it, and persist it as a timestamped CSV under `paths.raw`.
url = 'https://en.wikipedia.org/wiki/List_of_S%26P_500_companies'

# Wikimedia asks clients to identify themselves with a descriptive
# User-Agent; requests using a generic library default may be rejected.
response = requests.get(
    url,
    headers={'User-Agent': 'hw04-data-ingestion/1.0 (educational notebook; python-requests)'},
    timeout=30,
)
# Fail loudly on 4xx/5xx instead of parsing an error page as if it were the article.
response.raise_for_status()
html = response.text

soup = BeautifulSoup(html, 'lxml')
table = soup.find('table', {'id': 'constituents'})
if table is None:
    raise ValueError(f"No table with id='constituents' found at {url}")

# Take the header from the first <tr> of the table and treat every following
# <tr> as data. This works whether the header row lives in <thead> or is the
# first row of <tbody>, so it neither depends on a <thead> element existing
# nor drops the first data row when one does.
table_rows = table.find_all('tr')
if not table_rows:
    raise ValueError(f"Table 'constituents' at {url} contains no rows")

cols = [th.get_text(strip=True) for th in table_rows[0].find_all('th')]

rows = []
for tr in table_rows[1:]:
    tds = [cell.get_text(strip=True) for cell in tr.find_all(['td', 'th'])]
    if tds:  # skip rows that carry no cells at all
        rows.append(tds)

df_spx = pd.DataFrame(rows, columns=cols)

# Explicit validation (not `assert`, which is stripped under `python -O`).
missing_cols = {'Symbol', 'Security'}.difference(df_spx.columns)
if missing_cols:
    raise ValueError(
        f"Scraped table is missing expected columns: {sorted(missing_cols)}"
    )
print('Scrape shape:', df_spx.shape)

# `_ts` is the run stamp set in the API cell, so both files share a suffix.
os.makedirs(paths.raw, exist_ok=True)
scrape_path = os.path.join(paths.raw, f"scrape_wikipedia_sp500_{_ts}.csv")
df_spx.to_csv(scrape_path, index=False)
print('Saved:', scrape_path)

# 3) Document sources & simple validation summary

In [None]:
# Report where the data came from and the shape of each validated frame.
# Each tuple is one output line; its items are printed space-separated.
summary_lines = [
    ('Sources:',),
    ('- API: yfinance for', ticker),
    ('- Scrape:', url),
    ('Validation: required columns present, shapes:', df_api.shape, df_spx.shape),
]
for parts in summary_lines:
    print(*parts)