### Import relevant packages, establish connection to WRDS and set overall configurations for the notebook

WRDS Support - https://wrds-www.wharton.upenn.edu/pages/support/programming-wrds/programming-python/querying-wrds-data-python/

In [2]:
# Import packages
import os
import numpy as np
import pandas as pd
import wrds
import yfinance as yf 

# Build WRDS connection

db = wrds.Connection(wrds_username='tomasromeiro')
#db.close()

# Set option to display all columns and rows
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)

# Set option to force dataframes to display numbers as floats with thousands separators
pd.set_option('display.float_format', '{:,.2f}'.format)  # Adjust decimal places as needed

Loading library list...
Done


WRDS Quick commands

In [None]:
# List libraries available
sorted(db.list_libraries()) 

# List tables within a library
db.list_tables(library="cboe") 

# describe table metadata
db.describe_table(library="cboe", table="optprice_2024") 

# Execute a sql query against a table (join queries between tables in library can also be performed)
data = db.raw_sql('SELECT date, dji FROM djones.djdaily LIMIT 1', date_cols=['date']) 

# Pass parameters to a sql statement
params = {"tickers": ("0015B", "0030B", "0032A", "0033A", "0038A")}
data = db.raw_sql(
    "SELECT datadate, gvkey, cusip FROM comp.funda WHERE tic IN %(tickers)s LIMIT 1",
    params=params,
)

### 1. FINRA Short Interest Bimonthly Data  
- https://www.finra.org/finra-data/browse-catalog/equity-short-interest/files
- https://www.finra.org/finra-data/browse-catalog/equity-short-interest/glossary

#### a) Collate semi monthly datasets
If the data has already been collated do not run this and skip to b)

In [None]:
# Define the directory containing the CSV files
directory = 'data/finra_short_interest_data'
output_file = os.path.join(directory, 'collated_short_interest_data.csv')

# Check if the collated file already exists and delete it
if os.path.exists(output_file):
    os.remove(output_file)

# Get a list of all pipe-delimited CSV files in the directory
csv_files = [os.path.join(directory, file) for file in os.listdir(directory) if file.endswith('.csv')]

# Read and concatenate all CSV files with proper delimiter handling
df_list = []
for file in csv_files:
    try:
        df = pd.read_csv(file, sep='|')  # Read as pipe-delimited with specific dtype
        df_list.append(df)
    except Exception as e:
        print(f"Error reading {file}: {e}")

if df_list:
    short_interest_df = pd.concat(df_list, ignore_index=True)
    
    # Replace daysToCoverQuantity with blank where averageDailyVolumeQuantity is 0
    short_interest_df.loc[short_interest_df['averageDailyVolumeQuantity'] == 0, 'daysToCoverQuantity'] = None
    
    # Remove all entries where the ticker (symbolCode) is missing and daysToCoverQuantity is NaN or 999.99 
    short_interest_df = short_interest_df.dropna(subset=['symbolCode', 'daysToCoverQuantity'])
    short_interest_df = short_interest_df[short_interest_df['daysToCoverQuantity'] != 999.99]

    # Keep only stocks not traded Over the Counter
    short_interest_df = short_interest_df[short_interest_df['marketClassCode'] != 'OTC']

    # Drop unnecessary fields
    short_interest_df = short_interest_df.drop(columns=['accountingYearMonthNumber', 'issuerServicesGroupExchangeCode', 'stockSplitFlag', 'revisionFlag', 'changePercent', 'changePreviousNumber', 'previousShortPositionQuantity', 'issueName', 'marketClassCode'])
    
    # Move settlementDate to the first column
    columns = ['settlementDate'] + [col for col in short_interest_df.columns if col != 'settlementDate']
    short_interest_df = short_interest_df[columns]

    # Sort by settlementDate and symbolCode
    short_interest_df = short_interest_df.sort_values(by=['settlementDate', 'symbolCode'])

    # Renaming columns
    short_interest_df.rename(columns={"currentShortPositionQuantity": "short_volume"}, inplace=True)
    short_interest_df.rename(columns={"averageDailyVolumeQuantity": "avg_daily_volume"}, inplace=True)
    short_interest_df.rename(columns={"daysToCoverQuantity": "days_to_cover"}, inplace=True)

    # Chaging fields to appropriate data type
    short_interest_df = short_interest_df.astype({"short_volume": "int32", "avg_daily_volume": "int32"})

    # Save the collated DataFrame to the same directory
    short_interest_df.to_csv(output_file, index=False)
    
    print(f"Collated data saved to {output_file}")
else:
    print("No valid CSV files found.")

#### b) Open .csv file to memory

In [3]:
# Define the directory containing the CSV files
directory = 'data/finra_short_interest_data'

# Open the collated file in a DataFrame for viewing
short_interest_file = os.path.join(directory, 'collated_short_interest_data.csv')
short_interest_df = pd.read_csv(short_interest_file)   

Extract tickers and date ranges to use as parameters for remaining data extracts

In [5]:
# Extract unique tickers from the short interest file. Will be used as the main variable to pass through to subsquent queries where tickers are required
tickers = short_interest_df['symbolCode'].unique().tolist()
print(f"{len(tickers)} unique tickers in the short interest file")

# Extract earliest and latest date in the short interest file
earliest_date = '2020-01-01'
latest_date = short_interest_df['settlementDate'].max()

short_interest_start_date = short_interest_df['settlementDate'].min()

print(f"Short interest file date range is {short_interest_start_date} to {latest_date}")

16857 unique tickers in the short interest file
Short interest file date range is 2021-06-15 to 2025-01-15


### 2. WRDS (Wharton) Data

#### i) Company Data (Quarterly) - Fundamentals
https://wrds-www.wharton.upenn.edu/pages/get-data/compustat-capital-iq-standard-poors/compustat/north-america-daily/fundamentals-quarterly/

We'll extract quarterly financial statement data and derive commonly used metrics if not available directly.

Variable references (for the quarterly reporting period, in USD):
- conm: company name
- tic: company ticker symbol
- rdq: report date of quarterly earnings
- revtq: total revenue 
- cogsq: cost of goods sold
- oiadpq: operating income after depreciation and amortisation
- dlcq: short-term (current) debt
- dlttq: long-term debt
- cheq: cash and cash equivalents at reporting point in time

The variables above will be used to calculated the following metrics:

- Gross Margin = (revtq – cogsq) / revtq 
    - "revtq" represents total revenues and "cogsq" represents the Cost of Goods Sold both at quarter level. The difference equals gross profit.
- EBITDA = oiadpq + dpq
    - Earnings Before Interest, Tax, Depreciation and Amortization. Since oiadpq already deducts depreciation and amortisation, adding dpq back returns EBITDA.
- Net Debt = (dlcq + dlttq) – che
    - Net Debt measures a company’s overall debt situation by offsetting its total debt with its liquid assets.


#### a) Download data and save it as a .csv so we avoiding repeated long queries to WRDS in case we clear memory. 
If the data has already been downloaded do not run this and skip to b)

In [None]:
# Define the directory to download into
directory = 'data/wrds_company_fundamentals_data'
output_file = os.path.join(directory, 'wrds_company_fundamentals_data.csv')

# Check if the collated file already exists and delete it
if os.path.exists(output_file):
    os.remove(output_file)

# Pass parameters to a sql statement
params = {
    "tickers": tuple(tickers),
    "start_date": earliest_date,
    "end_date": latest_date
}

# Query WRDS to fetch data
quartely_company_fundamentals_df = db.raw_sql(
    "SELECT rdq as date, tic as ticker, revtq as revenue, ceqq as book_value, niq as net_income, oiadpq as op_income, dlcq as st_debt, dlttq as lt_debt, cheq as cash_eq, atq as total_assets, "
    "(revtq - cogsq) / NULLIF(revtq, 0) as gross_margin, (revtq - cogsq) / NULLIF(atq, 0) as gross_profitability, "
    "oiadpq / NULLIF(atq, 0) as operating_profitability, (dlcq + dlttq) / NULLIF(atq, 0) as leverage, dlcq + dlttq - cheq as net_debt, oiadpq + dpq as ebitda, "
    "(dlcq + dlttq - cheq) / NULLIF((oiadpq + dpq), 0) as netdebt_to_ebitda " 
    "FROM comp_na_daily_current.fundq " 
    "WHERE tic in %(tickers)s and rdq BETWEEN %(start_date)s AND %(end_date)s ",
    params=params
)

# Save the DataFrame to the directory
quartely_company_fundamentals_df.to_csv(output_file, index=False)

#### b) Open .csv file to memory

In [None]:
# Define the directory containing the CSV files
directory = 'data/wrds_company_fundamentals_data'

# Open the file in a DataFrame for viewing
quartely_company_fundamentals_file = os.path.join(directory, 'wrds_company_fundamentals_data.csv')
quartely_company_fundamentals_df = pd.read_csv(quartely_company_fundamentals_file)

Extract unique downloaded tickers and compare to unique ticker list derived so far. Delete unmatched tickers from previous datasets.

In [7]:
# Extract unique tickers from WRDS download
quartely_company_fundamentals_tickers = set(quartely_company_fundamentals_df['ticker'].unique().tolist())
print(f"{len(quartely_company_fundamentals_tickers)} unique tickers found in WRDS quarterly company fundamentals data")

# Find missing tickers (tickers in short_interest_df but NOT in quartely_company_fundamentals_df)
print(f"{len(tickers)} unique tickers found in FINRA short interest data")
missing_tickers = set(tickers) - quartely_company_fundamentals_tickers

print(f"{len(missing_tickers)} tickers missing from the WRDS quarterly company fundamentals compared to the short interest file")

# Remove records with missing tickers from short_interest_df and daily stock data
short_interest_df = short_interest_df[~short_interest_df['symbolCode'].isin(missing_tickers)]

# Update ticker variable
tickers = short_interest_df['symbolCode'].unique().tolist()
print(f"Updated ticker list. New unique ticker count is: {len(tickers)}")

6138 unique tickers found in WRDS quarterly company fundamentals data
16857 unique tickers found in FINRA short interest data
10719 tickers missing from the WRDS quarterly company fundamentals compared to the short interest file
Updated ticker list. New unique ticker count is: 6138


#### ii) Stock Data (Daily Level) - Prices and Volume
https://wrds-www.wharton.upenn.edu/pages/get-data/compustat-capital-iq-standard-poors/compustat/north-america-daily/security-daily/

Variable Reference:
- conm: company name
- datadate: record date
- tic: ticker symbol
- cshoc: shares outstanding
- cshtrd: trading Volume - daily
- prccd: price - close - daily

Filtering the data to fetch only USA stocks in order to be able to inspect the dataset sensibly without the need for currency conversions and consistency of financial statement data.

#### a) Download data and save it as a .csv so we avoiding repeated long queries to WRDS in case we clear memory. 
If the data has already been downloaded do not run this and skip to b)

In [8]:
# Define the directory to download into
directory = 'data/wrds_stock_daily_data'
output_file = os.path.join(directory, 'wrds_stock_daily_data.csv')

# Check if the collated file already exists and delete it
if os.path.exists(output_file):
    os.remove(output_file)

# Pass parameters to a sql statement
params = {
    "tickers": tuple(tickers),
    "start_date": earliest_date,
    "end_date": latest_date
}

# Query WRDS to fetch data
daily_stock_data_df = db.raw_sql(
    "SELECT datadate, conm as company_name, tic as ticker, prccd as price_close, cshtrd as volume, cshoc as shares_outstanding, prccd * cshoc as market_cap, eps " 
    "FROM comp_na_daily_all.secd " 
    "WHERE tic in %(tickers)s and datadate BETWEEN %(start_date)s AND %(end_date)s AND fic = 'USA'",
    params=params
)

# Save the DataFrame to the directory
daily_stock_data_df.to_csv(output_file, index=False)

#### b) Open .csv file to memory

In [28]:
# Define the directory containing the CSV files
directory = 'data/wrds_stock_daily_data'

# Open the file in a DataFrame for viewing
daily_stock_data_file = os.path.join(directory, 'wrds_stock_daily_data.csv')
daily_stock_data_df = pd.read_csv(daily_stock_data_file)

Extract unique downloaded tickers and compare to short interest file dataset. Delete unmatched tickers from previous datasets.

In [9]:
# Extract unique tickers from WRDS download
daily_stock_data_tickers = set(daily_stock_data_df['ticker'].unique().tolist())
print(f"{len(daily_stock_data_tickers)} unique tickers found in WRDS daily stock data")

# Find missing tickers (tickers in ticker list but NOT in daily_stock_data_df)
print(f"{len(tickers)} unique tickers found in FINRA short interest data")
missing_tickers = set(tickers) - daily_stock_data_tickers

print(f"{len(missing_tickers)} tickers missing from the WRDS daily stock data compared to the short interest file")

# Remove records with missing tickers from short_interest_df and quartely_company_fundamentals_df
short_interest_df = short_interest_df[~short_interest_df['symbolCode'].isin(missing_tickers)]
quartely_company_fundamentals_df = quartely_company_fundamentals_df[~quartely_company_fundamentals_df['ticker'].isin(missing_tickers)]

# Update ticker variable
tickers = short_interest_df['symbolCode'].unique().tolist()
print(f"Updated ticker list. New unique ticker count is: {len(tickers)}")

4740 unique tickers found in WRDS daily stock data
6138 unique tickers found in FINRA short interest data
1398 tickers missing from the WRDS daily stock data compared to the short interest file
Updated ticker list. New unique ticker count is: 4740


### 3) Yahoo Finance Data

##### a) Market Data - S&P500 and VIX

In [10]:
# Define the directory containing the CSV files
directory = 'data/yahoo_finance_sp500_vix_data'
output_file = os.path.join(directory, 'yahoo_finance_sp500_vix_data.csv')

# Check if the collated file already exists and delete it
if os.path.exists(output_file):
    os.remove(output_file)

# Define ticker symbols
tickers = ["^GSPC", "^VIX"]  # S&P 500 and VIX

# Fetch data (Closing Prices and Volume)
sp500_vix_data_df = yf.download(tickers, start=earliest_date, end=latest_date, progress=False)[['Close']]

# Flatten the MultiIndex to standard column names
sp500_vix_data_df.columns = [f"{col[0]}_{col[1]}" for col in sp500_vix_data_df.columns]

# Rename columns to match the requested format
sp500_vix_data_df = sp500_vix_data_df.rename(columns={
    'Close_^GSPC': 'sp500_price_close',
    'Close_^VIX': 'vix_price_close'
})

# Reorder columns
sp500_vix_data_df = sp500_vix_data_df[['sp500_price_close', 'vix_price_close']]

# Format date index to YYYY-MM-DD and reset index
sp500_vix_data_df.index = sp500_vix_data_df.index.strftime('%Y-%m-%d')
sp500_vix_data_df = sp500_vix_data_df.reset_index()

# Rename the date column to 'date'
sp500_vix_data_df = sp500_vix_data_df.rename(columns={'index': 'date'})

# Save the collated DataFrame to the same directory
sp500_vix_data_df.to_csv(output_file, index=False)


#### b) Open .csv file to memory

In [30]:
# Define the directory containing the CSV files
directory = 'data/yahoo_finance_sp500_vix_data'

# Open the collated file in a DataFrame for viewing
sp500_vix_data_file = os.path.join(directory, 'yahoo_finance_sp500_vix_data.csv')
sp500_vix_data_df = pd.read_csv(sp500_vix_data_file)

### 4) Join datasets, perform data cleansing and compute additional metrics

a) Join datasets and foward fill datapoints where relevant

In [13]:
# Renaming columns used as join criteria between datasets
daily_stock_data_df.rename(columns={"datadate": "date"}, inplace=True)
sp500_vix_data_df.rename(columns={"Date": "date"}, inplace=True)
short_interest_df.rename(columns={"settlementDate": "date"}, inplace=True)
short_interest_df.rename(columns={"symbolCode": "ticker"}, inplace=True)

# Merge stock data and index data on date
merged_df = pd.merge(daily_stock_data_df, sp500_vix_data_df, on="date", how="left")

# Merge ticker-level data (on date + ticker)
merged_df = pd.merge(merged_df, quartely_company_fundamentals_df, on=["date", "ticker"], how="left")
merged_df = pd.merge(merged_df, short_interest_df, on=["date", "ticker"], how="left")

# Sort by date and ticker
merged_df = merged_df.sort_values(by=["date", "ticker"]).reset_index(drop=True)

# Group by 'ticker' and 'date' and count the number of occurrences.
dup_counts = merged_df.groupby(['ticker', 'date']).size().reset_index(name='count')

# Identify (ticker, date) pairs with more than 1 occurrence.
dup_pairs = dup_counts[dup_counts['count'] > 1]

# Extract the unique tickers that have duplicate (ticker, date) pairs.
tickers_with_duplicates = dup_pairs['ticker'].unique()
print("Count of tickers with duplicate (ticker, date) pairs:", len(tickers_with_duplicates))

# Filter out any rows where the 'ticker' is in the tickers_with_duplicates list.
merged_df = merged_df[~merged_df['ticker'].isin(tickers_with_duplicates)].copy()

# Forward fill ticker-specific, point-in-time values (e.g. short interest, quarterly gross profit, etc.)
ticker_cols = list(merged_df.columns)
ticker_cols.remove("date")  # Exclude date column from filling
ticker_cols.remove("ticker")  # Exclude ticker column from filling
merged_df[ticker_cols] = merged_df.groupby("ticker")[ticker_cols].ffill()

# Scale relevant fields from short interest file and daily stock data to millions in order to match the fundamentals file
divisor = 1_000_000
cols_to_divide = ['volume', 'shares_outstanding', 'market_cap', 'short_volume', 'avg_daily_volume']

merged_df[cols_to_divide] = merged_df[cols_to_divide] / divisor

# Ticker count after joining
tickers_post_joins = merged_df['ticker'].unique().tolist()
print(f"{len(tickers_post_joins)} unique tickers after joining datasets")

Count of tickers with duplicate (ticker, date) pairs: 507
4233 unique tickers after joining datasets


b) Adjust for possible stock splits

In [14]:
# Make sure the DataFrame is sorted by ticker and date.
merged_df = merged_df.sort_values(['ticker', 'date'])

# Step 1: Compute the previous day's shares outstanding within each ticker.
merged_df['prev_shares'] = merged_df.groupby('ticker')['shares_outstanding'].shift(1)
# For the first observation of each ticker, fill missing value with the current day's shares.
merged_df['prev_shares'] = merged_df['prev_shares'].fillna(merged_df['shares_outstanding'])

# Step 2: Compute the day-to-day ratio of shares outstanding.
merged_df['shares_ratio'] = merged_df['shares_outstanding'] / merged_df['prev_shares']

# Step 3: Define a threshold to detect a split.
# If the shares_ratio is greater than or equal to the threshold (e.g., 1.5), assume a split occurred.
split_threshold = 1.5

# For days when a split is detected, compute a multiplier as 1 / shares_ratio.
# Otherwise, the multiplier is 1.
merged_df['split_multiplier'] = np.where(
    (merged_df['shares_ratio'] >= split_threshold).fillna(False),
    1 / merged_df['shares_ratio'],
    1.0
)

# Step 4: Shift the split multiplier backward (i.e. forward in time)
# so that the multiplier detected on the split day is applied to the previous day.
# This means that the day on which the split is reported is treated as the new baseline.
merged_df['split_multiplier_shifted'] = merged_df.groupby('ticker')['split_multiplier'].shift(-1).fillna(1.0)

# Step 5: Compute the reverse cumulative product of the shifted multiplier within each ticker.
def reverse_cumprod(series):
    return series.iloc[::-1].cumprod().iloc[::-1]

merged_df['adjustment_factor'] = merged_df.groupby('ticker')['split_multiplier_shifted'].transform(reverse_cumprod)

# Step 6: Compute the split-adjusted price.
merged_df['price_close_adj'] = merged_df['price_close'] * merged_df['adjustment_factor']

# Optional: Clean up the temporary columns.
# merged_df.drop(columns=['prev_shares', 'shares_ratio', 'split_multiplier', 'split_multiplier_shifted', 'adjustment_factor'], inplace=True)



c) Compute additional metrics

In [15]:
# ==========================================
# 1. Fundamental Ratios
# ==========================================

# Price-to-Earnings (P/E) Ratio: Only compute if EPS != 0; otherwise, set to NaN.
merged_df['pe_ratio'] = np.where((merged_df['eps'] != 0).fillna(False),
                                 merged_df['price_close'] / merged_df['eps'], # can use non-adjusted price_close as eps is as reported at that point in time
                                 np.nan)

# Price-to-Book (P/B) Ratio: Only compute if book_value != 0; otherwise, set to NaN.
merged_df['pb_ratio'] = np.where((merged_df['book_value'] != 0).fillna(False),
                                 merged_df['market_cap'] / merged_df['book_value'],
                                 np.nan)

# Price-to-Sales (P/S) Ratio: Only compute if revenue != 0; otherwise, set to NaN.
merged_df['ps_ratio'] = np.where((merged_df['revenue'] != 0).fillna(False),
                                 merged_df['market_cap'] / merged_df['revenue'],
                                 np.nan)

# Price-to-EBITDA (P/EBITDA) Ratio: Only compute if EBITDA != 0; otherwise, set to NaN.
merged_df['pebitda_ratio'] = np.where((merged_df['ebitda'] != 0).fillna(False),
                                      merged_df['market_cap'] / merged_df['ebitda'],
                                      np.nan)


# Enterprise Value (EV) and EV/EBITDA:

# Compute EV as market_cap + net_debt.
merged_df['enterprise_value'] = merged_df['market_cap'] + merged_df['net_debt']

# Only compute EV/EBITDA if EBITDA != 0.
merged_df['ev_ebitda'] = np.where((merged_df['ebitda'] != 0).fillna(False),
                                  merged_df['enterprise_value'] / merged_df['ebitda'],
                                  np.nan)


# Return on Equity (ROE): net_income / book_value
merged_df['roe'] = merged_df['net_income'] / merged_df['book_value'].replace({0: np.nan})

# Net Margin: net_income / revenue
merged_df['net_margin'] = merged_df['net_income'] / merged_df['revenue'].replace({0: np.nan})

# ==========================================
# 2. Growth Metrics
# ==========================================

def compute_quarterly_growth(series):
    """
    Computes quarterly growth for a forward-filled series.
    The function calculates the percentage change only on days when the value changes
    (i.e., a new quarterly report is available), and then forward fills that value
    until the next change.
    """
    # Create a boolean mask: True when the reported value is different from the previous day.
    mask = series != series.shift(1)
    
    # Compute the percentage change (growth) on a day-by-day basis.
    growth = series.pct_change(fill_method=None)
    
    # Only keep the computed growth on the change days; elsewhere, set to NaN.
    growth = growth.where(mask)
    
    # Forward fill the computed growth so that every day between reports has the same growth value.
    growth = growth.ffill()
    
    return growth

# Compute quarterly growth for EPS and revenue by grouping on ticker.
merged_df['eps_growth'] = merged_df.groupby('ticker')['eps'].transform(compute_quarterly_growth)
merged_df['revenue_growth'] = merged_df.groupby('ticker')['revenue'].transform(compute_quarterly_growth)

# ==========================================
# 3. Volatility Metrics
# ==========================================

# Compute daily returns from price_close with fill_method=None to avoid the warning.
merged_df['daily_return'] = merged_df['price_close_adj'].pct_change(fill_method=None)

# 30-day rolling volatility (standard deviation of daily returns)
merged_df['volatility_30d'] = merged_df['daily_return'].rolling(window=30, min_periods=15).std()

# 90-day rolling volatility
merged_df['volatility_90d'] = merged_df['daily_return'].rolling(window=90, min_periods=45).std()

# ==========================================
# 4. Momentum Metrics
# ==========================================

# 10 day momentum: cumulative return over the past 10 trading days.
merged_df['momentum_10d'] = merged_df.groupby('ticker')['price_close_adj'].pct_change(periods=10, fill_method=None)

# 20 day momentum: cumulative return over the past 20 trading days.
merged_df['momentum_20d'] = merged_df.groupby('ticker')['price_close_adj'].pct_change(periods=20, fill_method=None)

# 50 day momentum: cumulative return over the past 50 trading days.
merged_df['momentum_50d'] = merged_df.groupby('ticker')['price_close_adj'].pct_change(periods=50, fill_method=None)

# 100 day momentum: cumulative return over the past 100 trading days.
merged_df['momentum_100d'] = merged_df.groupby('ticker')['price_close_adj'].pct_change(periods=100, fill_method=None)

# 200 day momentum: cumulative return over the past 200 trading days.
merged_df['momentum_200d'] = merged_df.groupby('ticker')['price_close_adj'].pct_change(periods=200, fill_method=None)

d) Remove rows before cutoff date (short interest file start date)

In [16]:
# Delete rows
merged_df = merged_df[merged_df['date'] >= short_interest_start_date].copy()
tickers_final = merged_df['ticker'].unique().tolist()
print(f"Final count of tickers is: {len(tickers_final)}")

Final count of tickers is: 4233


e) Cleanse dataset and improve readability

In [None]:
# Identify tickers with shares_outstanding == 0 or NaN.
tickers_shares_zero_or_missing = merged_df.loc[(merged_df['shares_outstanding'] == 0) | (merged_df['shares_outstanding'].isna()),'ticker'].unique()
print("Count of tickers with 0 or missing shares outstanding:", len(tickers_shares_zero_or_missing))

merged_df = merged_df[~merged_df['ticker'].isin(tickers_shares_zero_or_missing)].copy()
print("DataFrame shape after removing tickers with 0 or missing shares outstanding:", merged_df.shape)

# Identify and remove tickers where market capitalisation has been below $10M.
tickers_low_market_cap = merged_df.loc[merged_df['market_cap'] < 10, 'ticker'].unique()
print("Count of tickers where market cap has been below $100M:", len(tickers_low_market_cap))

merged_df = merged_df[~merged_df['ticker'].isin(tickers_low_market_cap)].copy()
print("DataFrame shape after removing tickers with low market cap:", merged_df.shape)

# Identify tickers with either revenue / book_value / eps == NaN.
tickers_missing_rev_bk_eps = merged_df.loc[(merged_df['revenue'].isna()) | (merged_df['book_value'].isna()) | (merged_df['eps'].isna()),'ticker'].unique()
print("Count of tickers missing revenue, book_value or eps:", len(tickers_missing_rev_bk_eps))

merged_df = merged_df[~merged_df['ticker'].isin(tickers_missing_rev_bk_eps)].copy()
print("DataFrame shape after removing tickers with missing revenue, book_value or eps:", merged_df.shape)

# Identify tickers with either lt_debt / st_debt / total_assets == NaN.
tickers_missing_debt_assets = merged_df.loc[(merged_df['lt_debt'].isna()) | (merged_df['st_debt'].isna()) | (merged_df['total_assets'].isna()),'ticker'].unique()
print("Count of tickers missing lt_debt, st_debt or total assets:", len(tickers_missing_debt_assets))

merged_df = merged_df[~merged_df['ticker'].isin(tickers_missing_debt_assets)].copy()
print("DataFrame shape after removing tickers with missing lt_debt, st_debt or total_assets:", merged_df.shape)

# Count of tickers post cleanup
tickers_cleanup = merged_df['ticker'].unique().tolist()
print(f"Count of tickers post cleanup is: {len(tickers_cleanup)}")


Count of tickers with 0 or missing shares outstanding: 26
DataFrame shape after removing tickers with 0 or missing shares outstanding: (3440258, 52)
Count of tickers where market cap has been below $100M: 485
DataFrame shape after removing tickers with low market cap: (3031766, 52)
Count of tickers missing revenue, book_value or eps: 778
DataFrame shape after removing tickers with missing revenue, book_value or eps: (2431749, 52)
Count of tickers missing lt_debt or st_debt: 80
DataFrame shape after removing tickers with missing lt_debt or st_debt: (2368027, 52)
Count of tickers post cleanup is: 2864


In [21]:
# ==========================================
# Display a sample of the resulting DataFrame with new factors and check split adjustment
# ==========================================

selected_columns = ['date', 'ticker', 'price_close', 'price_close_adj', 'eps', 'pe_ratio', 'pb_ratio', 'ps_ratio', 'pebitda_ratio', 
                    'ev_ebitda', 'roe', 'net_margin', 'eps_growth', 'revenue_growth', 
                    'volatility_30d', 'volatility_90d', 'momentum_10d', 'momentum_50d', 'momentum_200d', 'shares_outstanding', 'prev_shares', 'shares_ratio', 'split_multiplier', 'split_multiplier_shifted', 'adjustment_factor']

merged_df.loc[merged_df['ticker'] == 'NVDA'].sort_values(by=["date"]).tail(1) #use SMCI or NVIDIA as they've had recent splits

# # Switch column order for better readability
# new_order = ['date', 'ticker', 'company_name', 'price_close', 'shares_outstanding', 'market_cap', 'volume', 'short_volume', 'avg_daily_volume', 'days_to_cover',
#              'sp500_price_close', 'vix_price_close', 'eps', 'book_value', 'revenue', 'gross_margin', 'gross_profitability', 'ebitda', 'operating_profitability', 
#              'net_income', 'net_debt', 'netdebt_to_ebitda', 'leverage'
#             ]
																							
# merged_df = merged_df[new_order]


Unnamed: 0,date,company_name,ticker,price_close,volume,shares_outstanding,market_cap,eps,sp500_price_close,vix_price_close,revenue,book_value,net_income,op_income,st_debt,lt_debt,cash_eq,total_assets,gross_margin,gross_profitability,operating_profitability,leverage,net_debt,ebitda,netdebt_to_ebitda,short_volume,avg_daily_volume,days_to_cover,prev_shares,shares_ratio,split_multiplier,split_multiplier_shifted,adjustment_factor,price_close_adj,pe_ratio,pb_ratio,ps_ratio,pebitda_ratio,enterprise_value,ev_ebitda,roe,net_margin,eps_growth,revenue_growth,daily_return,volatility_30d,volatility_90d,momentum_10d,momentum_20d,momentum_50d,momentum_100d,momentum_200d
5174153,2025-01-15,NVIDIA CORP,NVDA,136.24,183.75,24490.0,3336517.6,2.56,5842.91,18.71,35082.0,65899.0,19309.0,21868.0,273.0,9952.0,38487.0,96013.0,0.76,0.28,0.23,0.11,-28262.0,22346.0,-1.26,287.06,228.36,1.26,24490.0,1.0,1.0,1.0,1.0,136.24,53.12,50.63,95.11,149.31,3308255.6,148.05,0.29,0.55,0.19,0.17,0.03,0.0,0.17,-0.01,0.01,0.03,0.06,0.51


In [22]:
merged_df.isna().sum()

date                             0
company_name                     0
ticker                           0
price_close                      0
volume                           0
shares_outstanding               0
market_cap                       0
eps                              0
sp500_price_close                0
vix_price_close                  0
revenue                          0
book_value                       0
net_income                       0
op_income                        0
st_debt                          0
lt_debt                          0
cash_eq                          0
total_assets                     0
gross_margin                 68923
gross_profitability              0
operating_profitability          0
leverage                         0
net_debt                         0
ebitda                      131400
netdebt_to_ebitda           132004
short_volume                 59996
avg_daily_volume             59996
days_to_cover                59996
prev_shares         