# API Pull

In [15]:
import os
import yfinance as yf
import requests
import time
import sys
import pandas as pd
import numpy as np
sys.path.append("..")
from src.utils import * 
from src.config import *

# Pull daily prices for `ticker`, trying each source in turn:
#   1. Alpha Vantage TIME_SERIES_DAILY
#   2. yfinance, when Alpha Vantage answers with a "subscribe" notice
#   3. the local backup CSV, once `max_count` attempts have failed
# `source` records which of these produced `data`.
count = 0
source = "Local"  # stays "Local" unless a remote source succeeds
max_count = 5
api_key = get_key("API_KEY")
ticker = "AAPL"

# The request is identical on every attempt, so build it once.
url = "https://www.alphavantage.co/query"
params = {
    "function": "TIME_SERIES_DAILY",
    "symbol": ticker,
    "outputsize": "compact",
    "apikey": api_key,
    "datatype": "json",
}

# Alpha Vantage field labels -> column names used by the rest of the notebook.
column_names = {
    'date': 'Date',
    '4. close': 'Close',
    '2. high': 'High',
    '3. low': 'Low',
    '1. open': 'Open',
    '5. volume': 'Volume',
}
numeric_columns = ['Open', 'High', 'Low', 'Close', 'Volume']

while count < max_count:
    try:
        r = requests.get(url, params=params, timeout=30)
        # Check the status before decoding, so an HTTP error page is reported
        # as an HTTP error instead of a JSON decode failure.
        r.raise_for_status()
        js = r.json()  # decode the body once and reuse it

        info = js.get('Information')
        if isinstance(info, str) and "subscribe" in info:
            print("Need subscription for alphavantge, now trying yfinance")
            data = yf.download(ticker, start="2024-01-01", end="2024-02-01", interval="1d").reset_index()
            if data.empty:
                raise RuntimeError("Could not get data from yfinance, getting throttled")
            print(f"Got data for {ticker} from yfinance")
            source = "YFinance"
            break

        key = [k for k in js if "Time Series" in k]
        if not key:
            # Explicit raise rather than `assert`, which is removed under `python -O`.
            raise ValueError(f"Unexpected response keys: {list(js.keys())}")
        data = (
            pd.DataFrame(js[key[0]]).T
            .rename_axis('date')
            .reset_index()
            .rename(columns=column_names)
        )
        # The JSON payload carries every value as a string; coerce to real types.
        data['Date'] = pd.to_datetime(data['Date'])
        present = [c for c in numeric_columns if c in data.columns]
        data[present] = data[present].apply(pd.to_numeric, errors='coerce')
        print(f"Got data for {ticker} from alphavantage")
        source = "AlphaVantage"
        break
    except Exception as e:
        # Broad on purpose: network, HTTP, decoding and yfinance failures are
        # all treated as "retry, then fall back".
        print(e)
        count += 1
        if count < max_count:  # no point sleeping after the final attempt
            print("Sleeping for 5 secs")
            time.sleep(5)

# The loop only runs out of attempts when every one of them failed.
if count >= max_count:
    print(f"Retry count reached {max_count}. Could not get data. Falling back to locally stored backup data.")
    data = pd.read_csv("../data/sample.csv")
        

Need subscription for alphavantge, now trying yfinance


[*********************100%***********************]  1 of 1 completed

1 Failed download:
['AAPL']: YFRateLimitError('Too Many Requests. Rate limited. Try after a while.')


Could not get data from yfinance, getting throttled
Sleeping for 5 secs
Need subscription for alphavantge, now trying yfinance


[*********************100%***********************]  1 of 1 completed

1 Failed download:
['AAPL']: YFRateLimitError('Too Many Requests. Rate limited. Try after a while.')


Could not get data from yfinance, getting throttled
Sleeping for 5 secs
Need subscription for alphavantge, now trying yfinance


[*********************100%***********************]  1 of 1 completed

1 Failed download:
['AAPL']: YFRateLimitError('Too Many Requests. Rate limited. Try after a while.')


Could not get data from yfinance, getting throttled
Sleeping for 5 secs
Need subscription for alphavantge, now trying yfinance


[*********************100%***********************]  1 of 1 completed

1 Failed download:
['AAPL']: YFRateLimitError('Too Many Requests. Rate limited. Try after a while.')


Could not get data from yfinance, getting throttled
Sleeping for 5 secs
Need subscription for alphavantge, now trying yfinance


[*********************100%***********************]  1 of 1 completed

1 Failed download:
['AAPL']: YFRateLimitError('Too Many Requests. Rate limited. Try after a while.')


Could not get data from yfinance, getting throttled
Sleeping for 5 secs
Retry count reached 5. Could not get data. Falling back to locally stored backup data.


In [26]:

# Build the {column name: dtype} map that is handed to validate_df.
# Column labels may be tuples (presumably the MultiIndex labels produced by
# yf.download -- confirm) or plain strings (Alpha Vantage / local CSV).
# Indexing a plain string with [0] would keep only its first character,
# so only tuple labels are reduced to their first level.
dtypes_map = {
    (label[0] if isinstance(label, tuple) else label): dtype
    for label, dtype in data.dtypes.to_dict().items()
}
# NOTE(review): (21, 6) looks like the expected (rows, columns) shape for the
# January 2024 yfinance window; other sources may differ -- confirm against validate_df.
msgs = validate_df( data, ["Date", "Close", "High", "Low", "Open", "Volume"], dtypes_map, ticker, (21,6) )

# Save CSV
fname = get_filename("api",{'source':source, 'ticker':ticker},"csv")
write_df( data, False, "csv", fname )

Successfully saved file to:  ../data/raw


# Scrape a small table

In [27]:
import pandas as pd
from bs4 import BeautifulSoup
import pdb
# Scrape the first "wikitable" on the page into the DataFrame `df_scrape`.
url = "https://en.wikipedia.org/wiki/List_of_countries_and_dependencies_by_population"

try:
    # Step 1: Get HTML (with a timeout so a stalled connection cannot hang the cell)
    resp = requests.get(url, timeout=30)
    resp.raise_for_status()

    # Step 2: Parse HTML
    soup = BeautifulSoup(resp.text, "html.parser")

    # Step 3: Select the first wikitable
    table = soup.find("table", {"class": "wikitable"})
    if table is None:
        raise ValueError("No table with class 'wikitable' found on the page")

    # Step 4: Extract the text of every 'th'/'td' cell, skipping rows without cells
    rows = []
    for tr in table.find_all('tr'):
        cells = [cell.get_text(strip=True) for cell in tr.find_all(['th', 'td'])]
        if cells:
            rows.append(cells)
    if not rows:
        raise ValueError("The selected table contains no rows")

    # Use the first row as header
    header = rows[0]
    width = len(header)

    # Step 5: Normalize each data row to the header length: pad short rows
    # with empty strings, truncate long ones.
    # Kept in `table_rows` rather than `data`, so the stock-price DataFrame
    # named `data` from the API cell is not overwritten.
    table_rows = [(row + [""] * (width - len(row)))[:width] for row in rows[1:]]

    # Step 6: Create DataFrame
    df_scrape = pd.DataFrame(table_rows, columns=header)
except Exception as e:
    print("Error:", e)
    # Re-raise: continuing would leave `df_scrape` undefined (or stale from an
    # earlier run) for the cells that follow.
    raise

In [28]:
print(df_scrape.head())

# Work on a copy so the raw scrape stays available for re-runs.
df = df_scrape.copy()

# Location: plain string
if 'Location' in df.columns:
    df['Location'] = df['Location'].astype('str')

# Population: drop thousands separators, then convert to integer
if 'Population' in df.columns:
    df['Population'] = df['Population'].str.replace(',', '', regex=False).astype('int64')

# Share of world population: drop the percent sign, then convert to float
if '% ofworld' in df.columns:
    df['% ofworld'] = df['% ofworld'].str.replace('%', '', regex=False).astype('float64')

# Date: parse to datetime; unparseable values become NaT
if 'Date' in df.columns:
    df['Date'] = pd.to_datetime(df['Date'], errors='coerce')

# Drop the free-text columns. errors="ignore" keeps this step consistent with
# the guarded conversions above when the page layout changes.
df = df.drop(columns=["Source (official or fromtheUnited Nations)","Notes"], errors="ignore")
df.head()

        Location     Population % ofworld         Date  \
0          World  8,232,000,000      100%  13 Jun 2025   
1          India  1,417,492,000     17.3%   1 Jul 2025   
2          China  1,408,280,000     17.2%  31 Dec 2024   
3  United States    340,110,988      4.1%   1 Jul 2024   
4      Indonesia    284,438,782      3.5%  30 Jun 2025   

  Source (official or fromtheUnited Nations) Notes  
0                        UN projection[1][3]        
1                     Official projection[4]   [b]  
2                       Official estimate[5]   [c]  
3                       Official estimate[6]   [d]  
4              National annual projection[7]        


Unnamed: 0,Location,Population,% ofworld,Date
0,World,8232000000,100.0,2025-06-13
1,India,1417492000,17.3,2025-07-01
2,China,1408280000,17.2,2024-12-31
3,United States,340110988,4.1,2024-07-01
4,Indonesia,284438782,3.5,2025-06-30


In [29]:
# Persist the cleaned population table as CSV.
# The metadata dict is passed to get_filename together with the "scrape" prefix.
scrape_meta = {
    'site': url,
    'table': "List of countries and territories by total population",
}
fname = get_filename("scrape", scrape_meta, "csv")
write_df(df, False, "csv", fname)

Successfully saved file to:  ../data/raw


# Documentation
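 - Data source for AAPL market data - Alpha Vantage, falling back to YFinance and then to the locally stored backup file (`../data/sample.csv`) when both APIs fail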
 - Data source for country wise population data - Wikipedia(https://en.wikipedia.org/wiki/List_of_countries_and_dependencies_by_population)
 - Validation logic for stock price data is given in validate_df in utils.py; in addition, columns are checked for completeness and cast to the correct data types.
 - Assumption & risk
   1. YFinance often throttles the user, which may result in empty data from the api
   2. The validations are tightly coupled to the source data; any change in the API or the Wikipedia page may break the code