In [2]:
import pandas as pd
import wrds
from settings import config

# Read the WRDS username through the `config` helper imported from `settings`,
# so the credential is not hard-coded in the notebook.
WRDS_USERNAME = config("WRDS_USERNAME")

# Open the WRDS connection. `db` is the notebook-level handle that
# get_column_names() queries through; it is closed at the end of this cell.
db = wrds.Connection(wrds_username=WRDS_USERNAME)

# Function to pull column names for a given year
def get_column_names(year):
    """Return the column names of the Markit CDS table for ``year``.

    Issues a small ``SELECT *`` (capped at 5 rows) against
    ``markit.cds{year}`` through the notebook-level ``db`` connection and
    returns the column labels of the resulting DataFrame as a list.
    """
    # Only the header is of interest; the row cap just keeps the query cheap.
    preview = db.raw_sql(f"SELECT * FROM markit.cds{year} LIMIT 5")
    return list(preview.columns)

# Year ranges for the two periods being compared. range() excludes its stop
# value, so the second period ends with the 2024 table.
years_pre_2009 = list(range(2001, 2009))   # 2001 - 2008
years_post_2009 = list(range(2009, 2025))  # 2009 - 2024

# Build the printed period labels from the ranges themselves so a heading can
# never disagree with the tables that were actually queried.
label_pre_2009 = f"{years_pre_2009[0]}-{years_pre_2009[-1]}"
label_post_2009 = f"{years_post_2009[0]}-{years_post_2009[-1]}"

# Union of column names over every yearly table in each period.
names_pre_2009 = set()
names_post_2009 = set()

try:
    print(f"Fetching column names for Markit CDS data ({label_pre_2009})...")
    for year in years_pre_2009:
        names_pre_2009.update(get_column_names(year))
    print(" Done!")

    print(f"Fetching column names for Markit CDS data ({label_post_2009})...")
    for year in years_post_2009:
        names_post_2009.update(get_column_names(year))
    print(" Done!")
finally:
    # Nothing below needs the database, so release the WRDS connection here;
    # the `finally` guarantees it is closed even when a query fails.
    db.close()

# Sorted lists for stable, readable output
columns_pre_2009 = sorted(names_pre_2009)
columns_post_2009 = sorted(names_post_2009)

# Display results
print(f"\n **Columns in Markit CDS Data ({label_pre_2009}):**")
print(columns_pre_2009)

print(f"\n **Columns in Markit CDS Data ({label_post_2009}):**")
print(columns_post_2009)

# Compare differences between the two periods
added_columns = names_post_2009 - names_pre_2009
removed_columns = names_pre_2009 - names_post_2009

# Sets of strings print in hash order, which varies between kernel sessions;
# sorting keeps the saved notebook output reproducible.
print("\n **Columns Added After 2009:**")
print(sorted(added_columns) if added_columns else "None")

print("\n **Columns Removed After 2009:**")
print(sorted(removed_columns) if removed_columns else "None")

print("\n Analysis Complete!")


Loading library list...
Done
Fetching column names for Markit CDS data (2001-2008)...
 Done!
Fetching column names for Markit CDS data (2009-2025)...
 Done!

 **Columns in Markit CDS Data (2001-2008):**
['_52wkhigh5y', '_52wklow5y', 'avrating', 'bankcontributionsdepthcontr', 'bankcontributionsdepthpassed', 'baspreaddepthpassed', 'batch', 'bidasktype5y', 'carriedforward', 'ccybasis', 'ccybasis10thpctlpassed', 'ccybasis90thpctlpassed', 'ccybasiscarriedforward', 'ccybasisdepthpassed', 'ccybasisstddevpassed', 'cdsassumedrecovery', 'cdsrealrecovery', 'clearingsubmissionsdepthcontr', 'clearingsubmissionsdepthpassed', 'compositecurverating', 'compositedepth5y', 'compositepricerating', 'contr10pctlpassedrecovery', 'contr10thpctlpassed', 'contr90thpctlpassed', 'contr90thpctlpassedrecovery', 'contrlqrangepassed', 'contrlqrangepassedrecovery', 'contrmaxminrangepassed', 'contrmaxminrangepassedrecovery', 'contrstddevpassedrecovery', 'convbaspread10thpctlpassed', 'convbaspread90thpctlpassed', 'convb

In [3]:
import pandas as pd
import wrds
from settings import config

# WRDS connection: the username is read through the `config` helper imported
# from `settings`, and `db` is the handle the queries in this cell run through.
WRDS_USERNAME = config("WRDS_USERNAME")
db = wrds.Connection(wrds_username=WRDS_USERNAME)

# Define the two comparison periods. range() excludes its stop value, so the
# second list ends at 2024, not 2025.
years_pre_2009 = list(range(2001, 2009))   # 2001 - 2008
years_post_2009 = list(range(2009, 2025))  # 2009 - 2024

# Function to pull sample data
def get_sample_data(year, limit=5000, randomize=False):
    """Fetch up to ``limit`` US 5Y rows from the Markit CDS table for ``year``.

    Parameters
    ----------
    year : int
        Year suffix of the table to query (``markit.cds{year}``).
    limit : int, default 5000
        Maximum number of rows to return; must not be negative.
    randomize : bool, default False
        When True, ``ORDER BY random()`` is added so the limit is applied to
        shuffled rows, at the cost of a slower query. When False (the
        default) the query is unchanged from the original behaviour.
        NOTE(review): assumes the WRDS backend is PostgreSQL, where
        ``random()`` is available -- confirm before relying on it.

    Returns
    -------
    pandas.DataFrame
        Whatever ``db.raw_sql`` returns for the query.

    Raises
    ------
    ValueError
        If ``year`` or ``limit`` cannot be interpreted as an integer, or if
        ``limit`` is negative.

    Notes
    -----
    Without an ``ORDER BY``, ``LIMIT`` returns whichever rows the database
    produces first. That is an arbitrary slice, not a representative sample:
    with the default settings the tickers returned can all come from one
    end of the alphabet, which skews any "unique values" comparison built
    on top of this function.
    """
    # Both values are spliced into the SQL text (a table name cannot be sent
    # as a bound parameter), so force them to plain integers first. Anything
    # that is not a number fails here instead of reaching the database.
    year = int(year)
    limit = int(limit)
    if limit < 0:
        raise ValueError(f"limit must not be negative, got {limit}")

    # Optional shuffle; off by default so existing callers run the same query.
    order_clause = "ORDER BY random()" if randomize else ""
    query = f"""
    SELECT * FROM markit.cds{year}
    WHERE tenor = '5Y' AND country = 'United States'
    {order_clause}
    LIMIT {limit}
    """
    return db.raw_sql(query)

# Pull one capped sample per year and stack the yearly frames into a single
# DataFrame per period (fresh 0..n-1 index).
print("Fetching Pre-2009 Sample Data...")
df_pre_2009 = pd.concat(list(map(get_sample_data, years_pre_2009)), ignore_index=True)
print(" Done!")

print("Fetching Post-2009 Sample Data...")
df_post_2009 = pd.concat(list(map(get_sample_data, years_post_2009)), ignore_index=True)
print(" Done!")

# Function to analyze missing values
def analyze_missing_values(df, period):
    """Display per-column missing-value counts and percentages for ``df``.

    Only columns with at least one missing value are shown, ordered from the
    highest to the lowest missing percentage. ``period`` is used solely in
    the printed heading.
    """
    n_rows = len(df)
    na_counts = df.isna().sum()
    report = pd.DataFrame({
        'Missing Count': na_counts,
        'Missing %': (na_counts / n_rows) * 100,
    })
    # Keep only the columns that actually have gaps.
    has_gaps = report['Missing Count'] > 0
    print(f"\n Missing Data Analysis ({period}):")
    display(report.loc[has_gaps].sort_values(by='Missing %', ascending=False))

# Compare missing values. The period labels are built from the year lists so
# each heading names the tables actually queried: range() excludes its stop
# value, so the second period ends at 2024, not 2025.
analyze_missing_values(df_pre_2009, f"{years_pre_2009[0]}-{years_pre_2009[-1]}")
analyze_missing_values(df_post_2009, f"{years_post_2009[0]}-{years_post_2009[-1]}")

# Function to compute summary statistics
def compute_summary_stats(df, period):
    """Print a heading for ``period`` and display ``df.describe()`` below it."""
    heading = f"\n Summary Statistics for {period}:"
    print(heading)
    stats = df.describe()
    display(stats)

# Compare summary statistics. The period labels are built from the year lists
# so each heading names the tables actually queried: range() excludes its
# stop value, so the second period ends at 2024, not 2025.
compute_summary_stats(df_pre_2009, f"{years_pre_2009[0]}-{years_pre_2009[-1]}")
compute_summary_stats(df_post_2009, f"{years_post_2009[0]}-{years_post_2009[-1]}")

# Function to compare unique values across key fields
def compare_unique_values(col_name):
    """Print which distinct values of ``col_name`` occur in only one period.

    Reads the notebook-level ``df_pre_2009`` and ``df_post_2009`` frames,
    ignores missing values, and prints the values found only in the later
    frame ("added") and only in the earlier frame ("removed"). An empty
    difference is printed as ``None``.
    """
    def distinct(frame):
        # Non-null distinct values of the column, as a set.
        return set(frame[col_name].dropna().unique())

    before = distinct(df_pre_2009)
    after = distinct(df_post_2009)
    only_after = after - before
    only_before = before - after

    print(f"\n Unique Values Analysis for {col_name}:")
    print(f"Added after 2009: {only_after or 'None'}")
    print(f"Removed after 2009: {only_before or 'None'}")

# Run the unique-value comparison for each key categorical field
for key_field in ("ticker", "sector", "region"):
    compare_unique_values(key_field)

# Function to compare distributions of key financial fields
def compare_distributions(col_name):
    """Display ``describe()`` statistics for ``col_name`` in both periods.

    Reads the notebook-level ``df_pre_2009`` and ``df_post_2009`` frames and
    shows one table with a "Pre-2009" and a "Post-2009" column.
    """
    print(f"\n Distribution Comparison for {col_name}:")
    stats_by_period = {
        label: frame[col_name].describe()
        for label, frame in (("Pre-2009", df_pre_2009), ("Post-2009", df_post_2009))
    }
    display(pd.DataFrame(stats_by_period))

# Side-by-side distribution tables for each key financial metric
for metric in ("parspread", "creditdv01", "riskypv01", "upfront"):
    compare_distributions(metric)

# All queries are finished; release the WRDS connection
db.close()
print("\n Analysis Complete!")


Loading library list...
Done
Fetching Pre-2009 Sample Data...


  df_pre_2009 = pd.concat([get_sample_data(year) for year in years_pre_2009], ignore_index=True)


 Done!
Fetching Post-2009 Sample Data...


  df_post_2009 = pd.concat([get_sample_data(year) for year in years_post_2009], ignore_index=True)


 Done!

 Missing Data Analysis (2001-2008):


Unnamed: 0,Missing Count,Missing %
dealersquotescountcurve2mma,40000,100.0000
upfrontbaspreadhigh,40000,100.0000
dealersborcountcurve1mma,40000,100.0000
dealersborcountcurve2mma,40000,100.0000
upfrontbaspreadaverage,40000,100.0000
...,...,...
runningcoupon,39571,98.9275
avrating,5350,13.3750
cdsrealrecovery,2085,5.2125
primarycoupon,1578,3.9450



 Missing Data Analysis (2009-2025):


Unnamed: 0,Missing Count,Missing %
isdabasis,80000,100.00000
tierbasis,80000,100.00000
_52wkhigh5y,80000,100.00000
_52wklow5y,80000,100.00000
ccybasis90thpctlpassed,80000,100.00000
...,...,...
avrating,4799,5.99875
cdsrealrecovery,55,0.06875
compositecurverating,1,0.00125
compositepricerating,1,0.00125



 Summary Statistics for 2001-2008:


Unnamed: 0,runningcoupon,parspread,convspreard,upfront,cdsrealrecovery,cdsassumedrecovery,compositedepth5y,creditdv01,riskypv01,irdv01,rec01,dp,jtd,dtz
count,429.0,40000.0,399.0,399.0,37915.0,429.0,40000.0,378.0,378.0,378.0,378.0,378.0,378.0,378.0
mean,0.020163,0.009344,0.008132,-0.052124,0.396974,0.4,6.471025,4845.569974,4.626614,136.283148,6657.717302,0.070348,6564507.0,10532420.0
std,0.017434,0.009233,0.001848,0.080214,0.033448,5.557596000000001e-17,4.471891,359.24049,0.08797,207.915757,1324.213262,0.014376,799121.9,812484.0
min,0.01,0.001436,0.004575,-0.208388,0.2,0.4,2.0,4360.58,4.45,-58.93,5144.29,0.056416,5766689.0,9766689.0
25%,0.01,0.004033,0.007441,-0.015494,0.385714,0.4,4.0,4632.35,4.56,25.775,6039.56,0.064648,6100286.0,10100290.0
50%,0.01,0.007903,0.007639,-0.011077,0.4,0.4,5.0,4697.515,4.64,28.47,6226.695,0.066345,6122897.0,10110570.0
75%,0.05,0.009396,0.007863,-0.010147,0.403125,0.4,8.0,4949.3125,4.67,39.105,6987.75,0.069269,6319585.0,10153720.0
max,0.05,0.065807,0.015232,0.023337,0.5,0.4,26.0,5794.5,4.9,553.07,11384.23,0.124697,8083787.0,12083790.0



 Summary Statistics for 2009-2025:


Unnamed: 0,runningcoupon,parspread,convspreard,upfront,cdsrealrecovery,cdsassumedrecovery,carriedforward,compositedepth5y,primarydepthcontributed,primarydepthpassed,...,wklycontr,wklycontr1m,wklycontr1y,creditdv01,riskypv01,irdv01,rec01,dp,jtd,dtz
count,63841.0,80000.0,63781.0,63781.0,79945.0,63841.0,40060.0,79999.0,7212.0,7212.0,...,3598.0,3598.0,3984.0,19060.0,19060.0,19060.0,19060.0,19060.0,19060.0,19060.0
mean,0.030007,0.014935,0.012854,-0.080691,0.397649,0.4,0.0,4.869986,6.408625,3.488491,...,803.902724,824.962757,961.660141,5107.429285,4.711312,247.702312,7681.538122,0.076314,7019084.0,10971000.0
std,0.02,0.012383,0.010984,0.105233,0.010106,0.0,0.0,2.767964,2.7959,2.05209,...,1091.067898,1108.953161,1253.698056,455.771973,0.192836,249.247782,3781.059072,0.038914,984674.8,974386.3
min,0.01,0.002349,0.002216,-0.238516,0.35,0.4,0.0,2.0,3.0,2.0,...,14.0,16.0,14.0,3227.71,4.05,-508.09,1906.53,0.019459,3888497.0,7888497.0
25%,0.01,0.007721,0.007048,-0.190474,0.4,0.4,0.0,2.0,4.0,2.0,...,87.0,93.0,119.0,4740.565,4.57,14.09,5587.0775,0.055405,6072184.0,10056620.0
50%,0.05,0.009902,0.008999,-0.03467,0.4,0.4,0.0,4.0,6.0,3.0,...,192.0,192.0,332.0,5036.28,4.73,91.405,7212.7,0.071824,6575461.0,10356000.0
75%,0.05,0.018159,0.015405,-0.003312,0.4,0.4,0.0,7.0,9.0,5.0,...,1807.0,1881.0,2245.0,5515.6775,4.86,496.55,8756.9925,0.085204,7968889.0,11944780.0
max,0.05,0.118127,0.114672,0.320047,0.41,0.4,0.0,15.0,14.0,10.0,...,3654.0,3717.0,3925.0,6066.41,5.13,639.76,41121.27,0.434919,8612198.0,12384820.0



 Unique Values Analysis for ticker:
Added after 2009: {'AAHI', 'ABCLL', 'ABBVINC', 'ABBINS'}
Removed after 2009: {'ABF', 'ADI', 'ABK-AssurCorp', 'AIG', 'AES', 'ADM', 'AGN', 'AEP', 'ABK', 'ABS', 'AHC', 'AET'}

 Unique Values Analysis for sector:
Added after 2009: None
Removed after 2009: {'Utilities', 'Financials', 'Technology'}

 Unique Values Analysis for region:
Added after 2009: None
Removed after 2009: None

 Distribution Comparison for parspread:


Unnamed: 0,Pre-2009,Post-2009
count,40000.0,80000.0
mean,0.009344,0.014935
std,0.009233,0.012383
min,0.001436,0.002349
25%,0.004033,0.007721
50%,0.007903,0.009902
75%,0.009396,0.018159
max,0.065807,0.118127



 Distribution Comparison for creditdv01:


Unnamed: 0,Pre-2009,Post-2009
count,378.0,19060.0
mean,4845.569974,5107.429285
std,359.24049,455.771973
min,4360.58,3227.71
25%,4632.35,4740.565
50%,4697.515,5036.28
75%,4949.3125,5515.6775
max,5794.5,6066.41



 Distribution Comparison for riskypv01:


Unnamed: 0,Pre-2009,Post-2009
count,378.0,19060.0
mean,4.626614,4.711312
std,0.08797,0.192836
min,4.45,4.05
25%,4.56,4.57
50%,4.64,4.73
75%,4.67,4.86
max,4.9,5.13



 Distribution Comparison for upfront:


Unnamed: 0,Pre-2009,Post-2009
count,399.0,63781.0
mean,-0.052124,-0.080691
std,0.080214,0.105233
min,-0.208388,-0.238516
25%,-0.015494,-0.190474
50%,-0.011077,-0.03467
75%,-0.010147,-0.003312
max,0.023337,0.320047



 Analysis Complete!
