### Import relevant packages, establish connection to WRDS and set overall configurations for the notebook

WRDS Support - https://wrds-www.wharton.upenn.edu/pages/support/programming-wrds/programming-python/querying-wrds-data-python/

In [24]:
# Import packages
import os
import pandas as pd
import wrds
import yfinance as yf 

# Build WRDS connection.
# The account name is taken from the WRDS_USERNAME environment variable so the
# notebook can be run by other users without editing this cell; when the
# variable is not set it falls back to the original account name.
db = wrds.Connection(wrds_username=os.environ.get('WRDS_USERNAME', 'tomasromeiro'))
# Call db.close() when finished with the session.

# Set option to display all columns and rows
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)

# Set option to force dataframes to display numbers as floats (instead of scientific notation for example)
pd.set_option('display.float_format', '{:.2f}'.format)  # Adjust decimal places as needed


Loading library list...
Done


WRDS Quick commands

In [None]:
# --- Discovering what is available ---------------------------------------

# Every library visible to this connection, in alphabetical order
sorted(db.list_libraries())

# Tables that belong to a single library
db.list_tables(library="cboe")

# Metadata for one table of that library
db.describe_table(library="cboe", table="optprice_2024")

# --- Running SQL -----------------------------------------------------------

# Plain query (joins between tables of a library can be written the same way);
# `date` is passed through date_cols
data = db.raw_sql(
    'SELECT date, dji FROM djones.djdaily LIMIT 1',
    date_cols=['date'],
)

# Parameterised query: values are supplied separately through `params`
# and referenced in the statement as %(name)s placeholders
params = dict(tickers=("0015B", "0030B", "0032A", "0033A", "0038A"))
data = db.raw_sql(
    "SELECT datadate, gvkey, cusip "
    "FROM comp.funda "
    "WHERE tic IN %(tickers)s "
    "LIMIT 1",
    params=params,
)

### FINRA Short Interest Bimonthly Data - https://www.finra.org/finra-data/browse-catalog/equity-short-interest/files

#### a) Glossary

The glossary for this dataset can be found at https://www.finra.org/finra-data/browse-catalog/equity-short-interest/glossary

#### b) Collate semi monthly datasets

In [22]:
# Folder holding the raw FINRA extracts, and the path of the combined output
directory = 'data/finra_short_interest_data'
output_file = os.path.join(directory, 'collated_short_interest_data.csv')

# Remove any previous combined file first; it lives in the same folder and
# would otherwise be picked up as an input by the listing below
if os.path.exists(output_file):
    os.remove(output_file)

# Every .csv in the folder is treated as a pipe-delimited input file
csv_files = [
    os.path.join(directory, name)
    for name in os.listdir(directory)
    if name.endswith('.csv')
]

# Load the files one by one; a file that cannot be read is reported and skipped.
# NOTE(review): pandas' default NA parsing reads a literal "NA" field as missing,
# so a ticker spelled NA would be removed by the dropna below - confirm whether
# that matters for this dataset.
df_list = []
for csv_path in csv_files:
    try:
        df_list.append(pd.read_csv(csv_path, sep='|'))
    except Exception as err:
        print(f"Error reading {csv_path}: {err}")

if not df_list:
    print("No valid CSV files found.")
else:
    collated_df = pd.concat(df_list, ignore_index=True)

    # Blank out daysToCoverQuantity wherever averageDailyVolumeQuantity is 0
    # (those rows are then removed by the dropna below)
    zero_volume = collated_df['averageDailyVolumeQuantity'] == 0
    collated_df.loc[zero_volume, 'daysToCoverQuantity'] = None

    unused_columns = [
        'accountingYearMonthNumber',
        'issuerServicesGroupExchangeCode',
        'stockSplitFlag',
        'revisionFlag',
        'changePercent',
        'changePreviousNumber',
    ]
    collated_df = (
        collated_df
        # rows missing either the ticker or the days-to-cover value are removed
        .dropna(subset=['symbolCode', 'daysToCoverQuantity'])
        # rows whose days-to-cover equals 999.99 are removed
        .loc[lambda frame: frame['daysToCoverQuantity'] != 999.99]
        # only stocks not traded Over the Counter are kept
        .loc[lambda frame: frame['marketClassCode'] != 'OTC']
        # fields that are not needed downstream
        .drop(columns=unused_columns)
    )

    # settlementDate goes first; the other columns keep their existing order
    other_columns = [col for col in collated_df.columns if col != 'settlementDate']
    collated_df = collated_df[['settlementDate'] + other_columns]

    # Order rows by date, then ticker
    collated_df = collated_df.sort_values(by=['settlementDate', 'symbolCode'])

    # Write the combined file next to the inputs
    collated_df.to_csv(output_file, index=False)

    print(f"Collated data saved to {output_file}")

Collated data saved to data/finra_short_interest_data/collated_short_interest_data.csv


#### c) Open .csv file to memory

In [49]:
# Folder that holds the collated FINRA short interest file
directory = 'data/finra_short_interest_data'

# Full path of the collated file
short_interest_file = os.path.join(directory, 'collated_short_interest_data.csv')

# Load it into memory
short_interest_df = pd.read_csv(filepath_or_buffer=short_interest_file)

#### d) Check output, summary statistics, etc.

Investigate DataFrame

In [None]:
# Order the records by settlement date, then ticker (both ascending)
sort_keys = ['settlementDate', 'symbolCode']
short_interest_df = short_interest_df.sort_values(by=sort_keys, ascending=True)

# Preview the first ten rows
short_interest_df.head(n=10)

Unnamed: 0,settlementDate,symbolCode,issueName,marketClassCode,currentShortPositionQuantity,previousShortPositionQuantity,averageDailyVolumeQuantity,daysToCoverQuantity
0,2021-06-15,A,Agilent Technologies Inc.,NYSE,2556898,2532112,1198057,2.13
1,2021-06-15,AA,Alcoa Corporation,NYSE,10000722,9520100,5371918,1.86
2,2021-06-15,AAA,AAF First Priority CLO Bond ET,ARCA,2403,2944,1080,2.23
3,2021-06-15,AAAU,Goldman Sachs Physical Gold ET,ARCA,461802,237672,360237,1.28
4,2021-06-15,AAC,Ares Acquisition Corporation,NYSE,99741,71715,456777,1.0
5,2021-06-15,AACG,ATA Creativity Global American,NNM,14408,12350,134429,1.0
6,2021-06-15,AACQ,Artius Acquisition Inc. Class,SC,1301840,1047416,2014856,1.0
7,2021-06-15,AACQU,Artius Acquisition Inc. Unit,SC,65,0,6564,1.0
8,2021-06-15,AACQW,Artius Acquisition Inc Warrant,SC,18103,70508,161698,1.0
9,2021-06-15,AACU,Ares Acquisition Corporation U,NYSE,196400,4306,167021,1.18


Summary Statistics

In [27]:
# Descriptive statistics over every column (numeric and non-numeric alike)
print("\nSummary Statistics:")
summary_statistics = short_interest_df.describe(include='all')
summary_statistics


Summary Statistics:


Unnamed: 0,settlementDate,symbolCode,issueName,marketClassCode,currentShortPositionQuantity,previousShortPositionQuantity,averageDailyVolumeQuantity,daysToCoverQuantity
count,965955,965955,965955,965955,965955.0,965955.0,965955.0,965955.0
unique,87,16857,15197,6,,,,
top,2022-08-15,A,First Trust Exchange-Traded Fu,NYSE,,,,
freq,11816,87,4863,280255,,,,
mean,,,,,2552180.41,2540719.9,1023594.19,4.52
std,,,,,8915924.98,8882077.36,5592089.18,19.02
min,,,,,0.0,0.0,1.0,0.0
25%,,,,,6691.0,6434.0,12095.0,1.0
50%,,,,,80690.0,79336.0,78291.0,1.63
75%,,,,,1505510.5,1496211.0,499822.5,4.23


Extract tickers and date ranges to use as parameters for remaining data extracts

In [None]:
# Distinct tickers in the short interest file. This list is the main parameter
# passed to the subsequent queries that need tickers.
symbol_column = short_interest_df['symbolCode']
tickers = symbol_column.unique().tolist()
print(len(tickers))

# First and last settlement dates covered by the short interest file
settlement_dates = short_interest_df['settlementDate']
earliest_date, latest_date = settlement_dates.min(), settlement_dates.max()

print(earliest_date, latest_date, sep='\n')

16857
2021-06-15
2025-01-15


### 2. WRDS (Wharton) Data

#### a) Stock Data (Daily Level) - Prices and Volume
https://wrds-www.wharton.upenn.edu/pages/get-data/compustat-capital-iq-standard-poors/compustat/north-america-daily/security-daily/

Variable Reference:
- conm: company name 
- gvkey: global company key
- datadate: record date
- tic: ticker symbol
- cshoc: shares outstanding
- cshtrd: trading Volume - daily
- eps: current EPS
- epsmo: current EPS month
- prccd: price - close - daily


#### a) Download data and save it as a .csv so we avoid repeating long queries to WRDS if we clear memory

In [None]:
# Folder and file used to cache the WRDS daily stock extract
directory = 'data/wrds_stock_daily_data'
output_file = os.path.join(directory, 'wrds_stock_daily_data.csv')

# Make sure the cache folder exists so the save at the end cannot fail on a
# fresh checkout after the (long) query has already completed
os.makedirs(directory, exist_ok=True)

# Parameters passed to the SQL statement: the tickers and date range taken
# from the short interest data
params = {
    "tickers": tuple(tickers),
    "start_date": earliest_date,
    "end_date": latest_date
}

# Query WRDS to fetch daily price, volume and shares outstanding
daily_stock_data_df = db.raw_sql(
    "SELECT a.datadate, a.conm, a.tic, a.gvkey, a.prccd, a.cshtrd, a.cshoc "
    "FROM comp_na_daily_all.secd a "
    "WHERE a.tic in %(tickers)s "
    "and a.datadate BETWEEN %(start_date)s AND %(end_date)s",
    params=params
)

# Save the extract. The previous file is deliberately not deleted up front:
# to_csv overwrites it, so the old cache is only replaced once the query has
# succeeded.
daily_stock_data_df.to_csv(output_file, index=False)

#### b) Open .csv file to memory

In [None]:
# Folder that holds the cached WRDS daily stock extract
directory = 'data/wrds_stock_daily_data'

# Keep the file path and the DataFrame under separate names, so that
# daily_stock_data_df only ever refers to a DataFrame
daily_stock_data_file = os.path.join(directory, 'wrds_stock_daily_data.csv')
daily_stock_data_df = pd.read_csv(daily_stock_data_file)

Unnamed: 0,datadate,conm,tic,gvkey,prccd,cshtrd,cshoc
0,2021-06-15,AAR CORP,AIR,1004,41.72,517758.0,35319000.0
1,2021-06-15,AMERICAN AIRLINES GROUP INC,AAL,1045,22.79,18842570.0,641383000.0
2,2021-06-15,CECO ENVIRONMENTAL CORP,CECO,1050,8.29,108270.0,35605000.0
3,2021-06-15,ASA GOLD AND PRECIOUS METALS,ASA,1062,23.78,34458.0,19290000.0
4,2021-06-15,PINNACLE WEST CAPITAL CORP,PNW,1075,87.88,837068.0,112751000.0
5,2021-06-15,PROG HOLDINGS INC,PRG,1076,52.08,321118.0,67350000.0
6,2021-06-15,ABBOTT LABORATORIES,ABT,1078,110.41,5370039.0,1776820000.0
7,2021-06-15,ACME UNITED CORP,ACU,1104,45.14,22579.0,3484000.0
8,2021-06-15,BK TECHNOLOGIES CORP,BKTI,1117,3.14,136675.0,16786000.0
9,2021-06-15,ADAMS DIVERSIFIED EQUITY FD,ADX,1119,19.51,90697.0,110985000.0


#### c) Check output, summary statistics, etc.

Investigate DataFrame

In [None]:
# Preview the first ten rows of the WRDS daily stock data
daily_stock_data_df.head(n=10)

Extract the unique tickers downloaded from WRDS and compare them with the short interest dataset. Tickers that could not be found in WRDS are removed from both the `tickers` parameter variable and the short interest DataFrame.

In [None]:
# Distinct tickers present in the WRDS download
daily_stock_data_tickers = set(daily_stock_data_df['tic'].unique())
print(f"{len(daily_stock_data_tickers)} unique tickers found in WRDS daily stock data")

# Tickers that appear in the short interest data but not in the WRDS download
print(f"{len(tickers)} unique tickers found in FINRA short interest data")
missing_tickers = set(tickers).difference(daily_stock_data_tickers)

print(f"{len(missing_tickers)} tickers missing from the WRDS daily stock data compared to the short interest file")  # Debugging output

# Keep only the short interest records whose ticker was found in WRDS
ticker_is_missing = short_interest_df['symbolCode'].isin(missing_tickers)
short_interest_df = short_interest_df[~ticker_is_missing]

# Refresh the main tickers variable used by subsequent queries
tickers = short_interest_df['symbolCode'].unique().tolist()
print(f"Updated short_interest_df. New unique ticker count is: {len(tickers)}")  # Check new size


13811 unique tickers found in WRDS daily stock data
16857 unique tickers found in FINRA short interest data
3046 tickers missing from the WRDS daily stock data compared to the short interest file
Updated short_interest_df. New unique ticker count is: 13811


#### b) Company Data (Quarterly) - Fundamentals
https://wrds-www.wharton.upenn.edu/pages/get-data/compustat-capital-iq-standard-poors/compustat/north-america-daily/fundamentals-quarterly/

We'll extract quarterly financial statement data and derive commonly used metrics if not available directly.

Variable references (for the quarterly reporting period, in USD):
- conm: company name
- tic: company ticker symbol
- gvkey: global company key
- rdq: report date of quarterly earnings
- revtq: total revenue 
- cogsq: cost of goods sold
- oiadpq: operating income after depreciation and amortisation
- dlcq: short-term (current) debt
- dlttq: long-term debt
- che: cash and cash equivalents at reporting point in time

The variables above will be used to calculate the following metrics:

- Gross Profit = revtq – cogsq 
    - "revtq" represents total revenues and "cogsq" represents the Cost of Goods Sold both at quarter level. The difference equals gross profit.
- EBITDA = oiadpq + dpq
    - Earnings Before Interest, Tax, Depreciation and Amortization. "dpq" is the quarterly depreciation and amortisation expense; since oiadpq already deducts depreciation and amortisation, adding dpq back returns EBITDA.
- Net Debt = (dlcq + dlttq) – che
    - Net Debt measures a company’s overall debt situation by offsetting its total debt with its liquid assets.


In [None]:
# Extract unique tickers from the short interest file
tickers = short_interest_df['symbolCode'].unique().tolist()

# The tickers are passed as a tuple, so the statement must test membership
# with IN (as the other ticker queries in this notebook do); comparing the
# column to the tuple with "=" does not match individual tickers.
params = {"tickers": tuple(tickers)}
data = db.raw_sql(
    "SELECT a.conm, a.tic, a.gvkey, a.rdq, a.revtq, a.oiadpq "
    "FROM comp_na_daily_all.fundq a "
    "WHERE a.tic IN %(tickers)s and extract(year from a.rdq) = 2025 LIMIT 1000",
    params=params
)
data

#### c) Company Data - Financial Ratios
https://wrds-www.wharton.upenn.edu/pages/get-data/financial-ratios-suite-wrds/financial-ratios/financial-ratios-firm-level-by-wrds-beta/

In [None]:
# Fetch one row of WRDS financial ratios for a single gvkey, ordered by public_date
ratios_sql = "SELECT * FROM comp_na_daily_all.wrds_ratios a where a.gvkey = '035627' order by public_date LIMIT 1"
data = db.raw_sql(ratios_sql)
data

### 3) Yahoo Finance Data

##### a) Market Data - S&P500

In [None]:
# Define the ticker symbol for the S&P 500 index
ticker = "^GSPC"

# Set the date range from January 1, 2021 to today
start_date = "2021-01-01"
end_date = pd.to_datetime("today").strftime("%Y-%m-%d")

# Fetch historical daily prices for the S&P 500 index
sp500_data = yf.download(ticker, start=start_date, end=end_date)

# Display only the first few rows: display.max_rows is set to None in this
# notebook, so showing the whole frame would print every trading day
sp500_data.head()

# Optionally, save the data to a CSV file
#sp500_data.to_csv("sp500_index_prices.csv")