In [1]:
pip install yfinance

Collecting yfinance
  Downloading yfinance-0.2.65-py2.py3-none-any.whl.metadata (5.8 kB)
Collecting pandas>=1.3.0 (from yfinance)
  Downloading pandas-2.3.1-cp311-cp311-win_amd64.whl.metadata (19 kB)
Collecting multitasking>=0.0.7 (from yfinance)
  Downloading multitasking-0.0.12.tar.gz (19 kB)
  Preparing metadata (setup.py): started
  Preparing metadata (setup.py): finished with status 'done'
Collecting pytz>=2022.5 (from yfinance)
  Downloading pytz-2025.2-py2.py3-none-any.whl.metadata (22 kB)
Collecting frozendict>=2.3.4 (from yfinance)
  Downloading frozendict-2.4.6-py311-none-any.whl.metadata (23 kB)
Collecting peewee>=3.16.2 (from yfinance)
  Downloading peewee-3.18.2.tar.gz (949 kB)
     ---------------------------------------- 0.0/949.2 kB ? eta -:--:--
     ------------------------------------- 949.2/949.2 kB 14.9 MB/s eta 0:00:00
  Installing build dependencies: started
  Installing build dependencies: finished with status 'done'
  Getting requirements to build wheel: starte

  DEPRECATION: Building 'multitasking' using the legacy setup.py bdist_wheel mechanism, which will be removed in a future version. pip 25.3 will enforce this behaviour change. A possible replacement is to use the standardized build interface by setting the `--use-pep517` option, (possibly combined with `--no-build-isolation`), or adding a `pyproject.toml` file to the source tree of 'multitasking'. Discussion can be found at https://github.com/pypa/pip/issues/6334


In [3]:
import yfinance as yf

# Create a "ticker" object for Apple
apple_ticker = yf.Ticker("AAPL")

# Get all of its historical price data
apple_df = apple_ticker.history(period="max")

print("Successfully downloaded Apple stock data!")
apple_df.head() # Shows the first 5 rows

Successfully downloaded Apple stock data!


Unnamed: 0_level_0,Open,High,Low,Close,Volume,Dividends,Stock Splits
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
1980-12-12 00:00:00-05:00,0.098485,0.098913,0.098485,0.098485,469033600,0.0,0.0
1980-12-15 00:00:00-05:00,0.093775,0.093775,0.093347,0.093347,175884800,0.0,0.0
1980-12-16 00:00:00-05:00,0.086924,0.086924,0.086495,0.086495,105728000,0.0,0.0
1980-12-17 00:00:00-05:00,0.088636,0.089064,0.088636,0.088636,86441600,0.0,0.0
1980-12-18 00:00:00-05:00,0.091206,0.091634,0.091206,0.091206,73449600,0.0,0.0


In [4]:
apple_df.info()

<class 'pandas.core.frame.DataFrame'>
DatetimeIndex: 11260 entries, 1980-12-12 00:00:00-05:00 to 2025-08-15 00:00:00-04:00
Data columns (total 7 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   Open          11260 non-null  float64
 1   High          11260 non-null  float64
 2   Low           11260 non-null  float64
 3   Close         11260 non-null  float64
 4   Volume        11260 non-null  int64  
 5   Dividends     11260 non-null  float64
 6   Stock Splits  11260 non-null  float64
dtypes: float64(6), int64(1)
memory usage: 703.8 KB


In [5]:
apple_df.isnull().sum()

Open            0
High            0
Low             0
Close           0
Volume          0
Dividends       0
Stock Splits    0
dtype: int64

In [10]:
import os

# Make sure folder exists
os.makedirs("data/raw", exist_ok=True)

# Now save
apple_df.to_csv("data/raw/apple_stock.csv", index=False)



In [12]:
import requests
from bs4 import BeautifulSoup

url = "https://en.wikipedia.org/wiki/List_of_S%26P_500_companies"
r = requests.get(url)
soup = BeautifulSoup(r.text, "html.parser")

table = soup.find("table", {"id": "constituents"})
rows = table.find_all("tr")

data = []
for row in rows[1:]:
    cols = [col.text.strip() for col in row.find_all(["td","th"])]
    data.append(cols)

df_sp500 = pd.DataFrame(data, columns=["Symbol","Security","SEC filings","GICS Sector","GICS Sub-Industry","Headquarters Location","Date First Added","CIK"])


In [13]:
print(df_sp500.head())
print(df_sp500.shape)
print(df_sp500.isna().sum())


  Symbol             Security             SEC filings  \
0    MMM                   3M             Industrials   
1    AOS          A. O. Smith             Industrials   
2    ABT  Abbott Laboratories             Health Care   
3   ABBV               AbbVie             Health Care   
4    ACN            Accenture  Information Technology   

                      GICS Sector        GICS Sub-Industry  \
0        Industrial Conglomerates    Saint Paul, Minnesota   
1               Building Products     Milwaukee, Wisconsin   
2           Health Care Equipment  North Chicago, Illinois   
3                   Biotechnology  North Chicago, Illinois   
4  IT Consulting & Other Services          Dublin, Ireland   

  Headquarters Location Date First Added          CIK  
0            1957-03-04       0000066740         1902  
1            2017-07-26       0000091142         1916  
2            1957-03-04       0000001800         1888  
3            2012-12-31       0001551152  2013 (1888)  
4   

In [14]:
df_sp500.to_csv("data/raw/sp500_companies.csv", index=False)


Data Sources/URLs

- API: Yahoo Finance (yfinance) for Apple stock

- Scraping: Wikipedia S&P 500 list

Parameters used

- API: ticker = "AAPL", period = "1y"

- Scrape: table with id="constituents"

Validation logic

- Checked column names, dtypes, missing values, shape

Assumptions & Risks

- API may stop working if free tier limits exceeded

- Wikipedia page format may change â†’ scraper could break