In [77]:
%load_ext dotenv
%dotenv ../../src/.env

The dotenv extension is already loaded. To reload it, use:
  %reload_ext dotenv


In [78]:
import sys

# Make modules under ../../src importable from this notebook.
# Guard the append so re-running the cell does not pile up duplicate entries.
if "../../src" not in sys.path:
    sys.path.append("../../src")

In [79]:
# Build a logger for this notebook with the utils.logger helper and emit a
# start-up message so it is visible that logging is configured.
from utils.logger import get_logger
_logs = get_logger(__name__)
_logs.info("Notebook started successfully")

2025-11-13 18:54:48,645, 2927690305.py, 3, INFO, Notebook started successfully


In [80]:
import pandas as pd
import os
from glob import glob

In [81]:
import random

# SRC_DIR is read from the environment. Fail with a clear message when it is
# missing instead of letting os.path.join raise an opaque TypeError on None.
src_dir = os.getenv('SRC_DIR')
if src_dir is None:
    raise EnvironmentError("SRC_DIR is not set; check that the .env file was loaded")

# glob() returns matches in arbitrary, filesystem-dependent order; sort them so
# the list is reproducible across machines and runs.
stock_files = sorted(glob(os.path.join(src_dir, "raw/*.csv")))


In [82]:
# Show the CSV paths that were discovered.
stock_files

['../../data/raw\\fundamentals.csv',
 '../../data/raw\\prices-split-adjusted.csv',
 '../../data/raw\\prices.csv',
 '../../data/raw\\securities.csv']

In [83]:
# Look each file up by its name rather than by its position in stock_files, so
# the right CSV is loaded regardless of glob ordering or extra files in the
# raw directory. A missing file surfaces as a KeyError naming that file.
raw_csv_paths = {os.path.basename(path): path for path in stock_files}

fundamentals = pd.read_csv(raw_csv_paths["fundamentals.csv"])
adj_prices = pd.read_csv(raw_csv_paths["prices-split-adjusted.csv"])
prices = pd.read_csv(raw_csv_paths["prices.csv"])
securities = pd.read_csv(raw_csv_paths["securities.csv"])

In [84]:
# Inspect the column names of the fundamentals table as loaded.
fundamentals.columns


Index(['Unnamed: 0', 'Ticker Symbol', 'Period Ending', 'Accounts Payable',
       'Accounts Receivable', 'Add'l income/expense items', 'After Tax ROE',
       'Capital Expenditures', 'Capital Surplus', 'Cash Ratio',
       'Cash and Cash Equivalents', 'Changes in Inventories', 'Common Stocks',
       'Cost of Revenue', 'Current Ratio', 'Deferred Asset Charges',
       'Deferred Liability Charges', 'Depreciation',
       'Earnings Before Interest and Tax', 'Earnings Before Tax',
       'Effect of Exchange Rate',
       'Equity Earnings/Loss Unconsolidated Subsidiary', 'Fixed Assets',
       'Goodwill', 'Gross Margin', 'Gross Profit', 'Income Tax',
       'Intangible Assets', 'Interest Expense', 'Inventory', 'Investments',
       'Liabilities', 'Long-Term Debt', 'Long-Term Investments',
       'Minority Interest', 'Misc. Stocks', 'Net Borrowings', 'Net Cash Flow',
       'Net Cash Flow-Operating', 'Net Cash Flows-Financing',
       'Net Cash Flows-Investing', 'Net Income', 'Net Income Ad

In [85]:
# Show the column names of prices and adj_prices side by side.
prices.columns, adj_prices.columns

(Index(['date', 'symbol', 'open', 'close', 'low', 'high', 'volume'], dtype='object'),
 Index(['date', 'symbol', 'open', 'close', 'low', 'high', 'volume'], dtype='object'))

In [86]:
# Inspect the column names of the securities table as loaded.
securities.columns

Index(['Ticker symbol', 'Security', 'SEC filings', 'GICS Sector',
       'GICS Sub Industry', 'Address of Headquarters', 'Date first added',
       'CIK'],
      dtype='object')

In [87]:
# Normalise the headers of every table: lower-case, then trim surrounding whitespace.
# The assignment replaces the column index on each existing frame object.
for df in (fundamentals, prices, adj_prices, securities):
    df.columns = df.columns.map(lambda col: col.lower().strip())

In [88]:
# Check the fundamentals column names after standardization.
fundamentals.columns

Index(['unnamed: 0', 'ticker symbol', 'period ending', 'accounts payable',
       'accounts receivable', 'add'l income/expense items', 'after tax roe',
       'capital expenditures', 'capital surplus', 'cash ratio',
       'cash and cash equivalents', 'changes in inventories', 'common stocks',
       'cost of revenue', 'current ratio', 'deferred asset charges',
       'deferred liability charges', 'depreciation',
       'earnings before interest and tax', 'earnings before tax',
       'effect of exchange rate',
       'equity earnings/loss unconsolidated subsidiary', 'fixed assets',
       'goodwill', 'gross margin', 'gross profit', 'income tax',
       'intangible assets', 'interest expense', 'inventory', 'investments',
       'liabilities', 'long-term debt', 'long-term investments',
       'minority interest', 'misc. stocks', 'net borrowings', 'net cash flow',
       'net cash flow-operating', 'net cash flows-financing',
       'net cash flows-investing', 'net income', 'net income ad

In [89]:
# Give every table the same identifier column name ('ticker'); fundamentals
# additionally gets its period-ending column renamed to 'date'.
fundamentals = fundamentals.rename(
    {'ticker symbol': 'ticker', 'period ending': 'date'},
    axis='columns',
)
prices = prices.rename({'symbol': 'ticker'}, axis='columns')
adj_prices = adj_prices.rename({'symbol': 'ticker'}, axis='columns')
securities = securities.rename({'ticker symbol': 'ticker'}, axis='columns')

In [90]:
# Confirm the securities column names after the rename.
securities.columns

Index(['ticker', 'security', 'sec filings', 'gics sector', 'gics sub industry',
       'address of headquarters', 'date first added', 'cik'],
      dtype='object')

In [91]:
# (rows, columns) of adj_prices.
adj_prices.shape

(851264, 7)

In [92]:
# (rows, columns) of fundamentals.
fundamentals.shape

(1781, 79)

In [93]:
# (rows, columns) of securities.
securities.shape

(505, 8)

In [94]:
# Unique tickers in each dataset, as sets so they can be compared with set algebra.
tickers_fund = set(fundamentals['ticker'].unique())
tickers_adj_price = set(adj_prices['ticker'].unique())
# Built from the raw `prices` table. It was previously built from `adj_prices`,
# which made tickers_price identical to tickers_adj_price and turned every
# "prices" comparison into a check on adjusted prices.
tickers_price = set(prices['ticker'].unique())
tickers_sec = set(securities['ticker'].unique())

In [95]:
# --- Compare fundamentals vs prices (set differences against tickers_price) ---
print("Tickers in fundamentals but not in prices:", tickers_fund - tickers_price)
print("Tickers in prices but not in fundamentals:", tickers_price - tickers_fund)

Tickers in fundamentals but not in prices: {'UA'}
Tickers in prices but not in fundamentals: {'FTI', 'RAI', 'ICE', 'RIG', 'A', 'ACN', 'MDT', 'JCI', 'URI', 'WBA', 'DTE', 'GOOG', 'SLB', 'AGN', 'PLD', 'KHC', 'DOW', 'PSA', 'EVHC', 'NI', 'TROW', 'JNJ', 'FOX', 'MNK', 'GE', 'COH', 'FOXA', 'ADP', 'MSI', 'IR', 'CBS', 'GS', 'BLK', 'CMCSA', 'ORCL', 'AES', 'CA', 'LNC', 'PXD', 'RF', 'COP', 'FTV', 'BEN', 'FITB', 'RTN', 'ENDP', 'GOOGL', 'L', 'AET', 'TWX', 'NOC', 'NRG', 'PRGO', 'ESRX'}


In [96]:
# --- Fundamentals vs securities ---
# First line lists the tickers themselves; second line reports only a count.
print(f"Tickers in fundamentals but not in securities: {tickers_fund.difference(tickers_sec)}")
print(f"Tickers in securities but not in fundamentals: {len(tickers_sec.difference(tickers_fund))}")


Tickers in fundamentals but not in securities: set()
Tickers in securities but not in fundamentals: 57


In [97]:
# --- Prices vs securities: tickers present on one side only ---
print(f"Tickers in prices but not in securities: {tickers_price.difference(tickers_sec)}")
print(f"Tickers in securities but not in prices: {tickers_sec.difference(tickers_price)}")

Tickers in prices but not in securities: set()
Tickers in securities but not in prices: {'BRK.B', 'MS', 'UA', 'BF.B'}


In [98]:
# --- Prices vs adjusted prices: tickers present on one side only ---
print(f"Tickers in prices but not in adjusted prices: {tickers_price.difference(tickers_adj_price)}")
print(f"Tickers in adjusted prices but not in prices: {tickers_adj_price.difference(tickers_price)}")

Tickers in prices but not in adjusted prices: set()
Tickers in adjusted prices but not in prices: set()


In [99]:
# Tickers that appear in fundamentals, adjusted prices and securities alike.
common_tickers = set.intersection(tickers_fund, tickers_adj_price, tickers_sec)
print(f"Number of tickers present in all three datasets: {len(common_tickers)}")

Number of tickers present in all three datasets: 447


In [100]:
# Keep only rows whose ticker is present in all three datasets.
# .copy() makes each result an independent frame, so adding or modifying
# columns later does not trigger pandas' SettingWithCopyWarning.
# NOTE(review): `prices` is not filtered here -- confirm it is intentionally
# left out of the downstream analysis.
fundamentals = fundamentals[fundamentals['ticker'].isin(common_tickers)].copy()
adj_prices = adj_prices[adj_prices['ticker'].isin(common_tickers)].copy()
securities = securities[securities['ticker'].isin(common_tickers)].copy()