In [None]:
import wrds
import pandas as pd
import numpy as np

In [None]:
db = wrds.Connection()

In [155]:
np.linspace(1972, 2020, 6, dtype=int)

array([1972, 1981, 1991, 2000, 2010, 2020])

In [159]:


start_years = [1972, 1982, 1992, 2002, 2012]

missing_threshold = 10

for start_year in start_years:
    print(f"Processing data for years {start_year}")

    
    final_year = start_year+3
    end_year = final_year+9
        

    all_monthly_data = []
    unique_permnos = set()

    # Step 1: Load CRSP monthly stock file data for each year
    for year in range(start_year, end_year+1):
        start_date = f'{year}-01-01'
        end_date = f'{year}-12-31'
        
        crsp_query = f"""
            SELECT a.permno, a.date, a.ret, a.prc, a.shrout, 
                a.vol, a.cfacshr, a.altprc
            FROM crsp.msf AS a
            WHERE a.date BETWEEN '{start_date}' AND '{end_date}'
            AND a.permno IN (
                SELECT permno FROM crsp.msenames 
                WHERE exchcd BETWEEN 1 AND 3  
                    AND shrcd IN (10, 11)
            )
            """
        crsp_data = db.raw_sql(crsp_query)
        
        # Basic processing for identifying top stocks
        crsp_data['date'] = pd.to_datetime(crsp_data['date'])
        crsp_data['mktcap'] = (crsp_data['prc'].abs() * crsp_data['shrout'] * 1000) / 1e6
        
        # Get top stocks by market cap for this year
        crsp_data_filtered = crsp_data.dropna(subset=['mktcap'])
        if len(crsp_data_filtered) > 0:
            top_permnos = crsp_data_filtered.groupby('permno')['mktcap'].max().nlargest(400).index.tolist()
            yearly_permnos = []
            for permno in top_permnos:
                if len(crsp_data_filtered[crsp_data_filtered['permno'] == permno]) == 12:
                    yearly_permnos.append(permno)
            
            unique_permnos.update(yearly_permnos)
            
            # Only keep data for top stocks
            year_data = crsp_data[crsp_data['permno'].isin(yearly_permnos)]
            all_monthly_data.append(year_data)
        
    combined_monthly_data = pd.concat(all_monthly_data, axis=0)

    # Convert unique_permnos to a list for SQL
    permnos_list = list(unique_permnos)
    permnos_str = ", ".join(str(p) for p in permnos_list)

    # Step 2: Get ticker information for all relevant PERMNOs
    query_ticker = f"""
        SELECT permno, ticker, MAX(nameenddt) as max_nameenddt
        FROM crsp.stocknames
        WHERE permno IN ({permnos_str})
        GROUP BY permno, ticker
    """
    stocknames = db.raw_sql(query_ticker)

    # Merge ticker info
    combined_monthly_data = combined_monthly_data.merge(stocknames, on='permno', how='left')
    combined_monthly_data = combined_monthly_data.dropna(subset=['ticker'])
    combined_monthly_data['year'] = pd.to_datetime(combined_monthly_data['date']).dt.year
    combined_monthly_data['turnover'] = combined_monthly_data['vol'] / combined_monthly_data['shrout']

    start_date = f'{start_year}-01-01'
    end_date = f'{end_year}-12-31'

    # Step 3: Link table to connect CRSP and Compustat
    link_query = """
        SELECT lpermno AS permno, gvkey, linkdt, linkenddt
        FROM crsp.ccmxpf_linktable
        WHERE linktype IN ('LU', 'LC') AND linkprim IN ('P', 'C')
    """
    link = db.raw_sql(link_query)

    # Filter for relevant permnos
    link_filtered = link[link['permno'].isin(permnos_list)]
    gvkeys = list(link_filtered['gvkey'].unique())
    gvkeys_str = "', '".join(gvkeys)
    # Step 4: Load Compustat quarterly data
    fund_query = f"""
        SELECT gvkey, datadate, rdq, atq, cheq, lctq, ceqq, lseq, niq, saleq,
                oiadpq, saleq, dlcq, dlttq, ceqq, ppegtq, epsfiy,
                invtq, revtq, cogsq, icaptq, piq, pstkq, rectq, req
        FROM comp.fundq WHERE indfmt = 'INDL' AND datafmt = 'STD' AND popsrc = 'D' AND consol = 'C' 
        AND datadate BETWEEN '{start_date}' AND '{end_date}' AND rdq IS NOT NULL AND gvkey IN ('{gvkeys_str}')
        """
    fund = db.raw_sql(fund_query)
    fund['rdq'] = pd.to_datetime(fund['rdq'])
    fund['datadate'] = pd.to_datetime(fund['datadate'])
    fund = pd.merge(fund, link_filtered, on='gvkey', how='left')

    # Step 5: Load Financial Ratios data
    ratio_query = f"""
        SELECT gvkey, public_date, permno,
            bm, evm, pe_inc, pcf, npm, opmad, roa, roe, roce, debt_invcap, int_totdebt, cash_lt, invt_act,
            debt_at, debt_ebitda, fcf_ocf, intcov_ratio, curr_ratio, cash_conversion, inv_turn, ptb, sale_invcap, cash_ratio,
            quick_ratio, at_turn, gprof  
        FROM wrdsapps_finratio.firm_ratio
        WHERE public_date BETWEEN '{start_date}' AND '{end_date}'
        AND gvkey IN ('{gvkeys_str}')
        """
    ratios = db.raw_sql(ratio_query)
    ratios['public_date'] = pd.to_datetime(ratios['public_date'])
    ratios['permno'] = ratios['permno'].astype(int)

    # Sort ratios by date for merge_asof
    sorted_ratios = ratios.sort_values(by='public_date')

    # Merge CRSP data with financial ratios
    data_and_ratios = pd.merge_asof(
        combined_monthly_data,
        sorted_ratios,
        left_on='date', 
        right_on='public_date', 
        by='permno', 
        direction='backward',
        suffixes=('', '_ratio')
    )

    # Merge with fundamentals data
    fund = fund.dropna(subset=['permno'])
    combined_data_sorted = data_and_ratios.sort_values('date')
    fund_sorted = fund.sort_values('rdq')
    fund_sorted['permno'] = fund_sorted['permno'].astype(int)

    merged = pd.merge_asof(
        combined_data_sorted,
        fund_sorted,
        left_on='date',
        right_on='rdq',
        by='permno',
        direction='backward',
        suffixes=('', '_fund')
    )
    merged = merged.sort_values(by='date')
    merged['linkdt'] = pd.to_datetime(merged['linkdt'])
    merged = merged[merged['linkdt'] < (merged['date'] - pd.Timedelta(days=900))]

    # Clean up the merged data
    merged = merged.drop(columns=['linkdt', 'linkenddt', 'cfacshr', 'ticker', 'max_nameenddt', 'public_date', 'gvkey_fund', 'datadate'])
    merged = merged.dropna(subset=['permno'])
    merged.drop_duplicates(subset=['permno', 'date'], inplace=True)


    year_in_data = merged.groupby('permno')['year'].agg(['max', 'min'])
    year_in_data['diff'] = year_in_data['max'] - year_in_data['min']
    filtered_by_year_in_data_perms = year_in_data[year_in_data['diff'] >= 2].index
    merged_sub1 = merged[merged['permno'].isin(filtered_by_year_in_data_perms)]


    missing_by_permno = merged_sub1.sort_values(by='date').groupby(['permno']).apply(lambda x: x.isna().mean() * 100)
    missing_by_permno_descp = missing_by_permno.describe()

    mask = (missing_by_permno.mean(axis=1) > missing_threshold).astype(int)
    permnos_with_misisng_greaterthan_thresh = mask[mask==1].index

    filtered_permnos_after_missing = missing_by_permno.loc[~missing_by_permno.index.isin(permnos_with_misisng_greaterthan_thresh)].index

    merged_sub2 = merged_sub1[merged_sub1['permno'].isin(filtered_permnos_after_missing)]
    merged_sub2 = merged_sub2[merged_sub2['date'].dt.year >= final_year]

    merged_sub2['date_copy'] = merged_sub2['date']
    merged_sub2['permno_copy'] = merged_sub2['permno']

    final_data = merged_sub2.groupby(['permno', 'date']).ffill().fillna(0).reset_index()
    final_data['date'] = final_data['date_copy']
    final_data['permno'] = final_data['permno_copy']

    final_data = final_data.drop(columns=['index', 'date_copy', 'permno_copy'])

    final_data.to_csv(f"cleanedFinalData_{final_year}-{end_year}.csv")

Processing data for years 1972
Processing data for years 1982
Processing data for years 1992
Processing data for years 2002
Processing data for years 2012


In [158]:
final_data

Unnamed: 0,index,ret,prc,shrout,vol,altprc,mktcap,year,turnover,gvkey,bm,evm,pe_inc,pcf,npm,opmad,roa,roe,roce,debt_invcap,int_totdebt,cash_lt,invt_act,debt_at,debt_ebitda,fcf_ocf,intcov_ratio,curr_ratio,cash_conversion,inv_turn,ptb,sale_invcap,cash_ratio,quick_ratio,at_turn,gprof,rdq,atq,cheq,lctq,ceqq,lseq,niq,saleq,oiadpq,saleq.1,dlcq,dlttq,ceqq.1,ppegtq,epsfiy,invtq,revtq,cogsq,icaptq,piq,pstkq,rectq,req
0,20990,0.333333,17.000,27292.0,3645.0,17.000,463.964000,1975,0.133556,001279,1.571662,8.232906,8.713480,4.514366,0.138346,0.340911,0.116225,0.117513,0.105156,0.523526,0.065927,0.035040,0.278393,0.485958,4.471429,-0.051068,2.578963,0.908802,0.000000,0.000000,0.816345,0.314350,0.255230,0.655798,0.274685,0.112696,1974-10-24,1702.955,0.000,0.000,510.259,1702.955,9.950,129.839,0.000,129.839,81.700,755.954,510.259,2018.490,0.00,0.000,129.839,0.000,1459.189,0.000,192.976,0.000,0.000
1,20991,0.136564,32.250,27556.0,5523.0,32.250,888.681000,1975,0.200428,001300,1.330664,4.210632,6.039322,3.464319,0.071376,0.119150,0.204222,0.157642,0.184992,0.309059,0.065161,0.145501,0.355192,0.246821,1.208593,0.174966,8.726038,2.462307,58.508230,6.912781,0.888187,1.511577,0.484367,1.587716,1.177853,0.288113,1975-01-16,1968.449,72.790,329.105,1011.135,1968.449,25.936,574.808,34.295,574.808,10.957,438.194,1011.135,2562.657,0.00,287.919,574.808,468.938,1449.329,30.171,0.000,269.838,656.000
2,20994,0.215447,37.375,23853.0,6229.0,37.375,891.505875,1975,0.261141,001365,1.094498,9.700099,7.271398,5.859234,0.127508,0.108719,0.123750,0.171134,0.118944,0.323774,0.086110,0.330935,0.264055,0.256585,2.414107,-0.572494,3.551564,2.618838,69.178570,5.953415,0.966005,0.890089,1.121936,1.927322,0.679109,0.185186,1974-10-15,0.000,0.000,0.000,922.879,0.000,36.490,290.650,0.000,290.650,11.090,405.900,922.879,0.000,3.97,0.000,290.650,0.000,1330.549,49.870,1.770,0.000,633.770
3,20995,0.210744,36.625,26304.0,2403.0,36.625,963.384000,1975,0.091355,001408,1.321524,4.416657,7.598548,20.024610,0.063948,0.176957,0.181298,0.141616,0.213618,0.294533,0.089190,0.029262,0.742268,0.344108,1.810279,0.522050,7.029912,2.143637,314.537838,1.270013,0.994380,1.412981,0.054823,0.552485,0.962592,0.342929,1974-10-23,0.000,0.000,0.000,968.829,0.000,33.591,578.729,0.000,578.729,303.100,478.564,968.829,0.000,0.00,0.000,578.729,0.000,1522.293,74.259,74.900,0.000,808.758
4,20999,0.424528,18.875,17137.0,5248.0,18.875,323.460875,1975,0.306238,001409,1.191198,4.263680,6.229372,6.461206,0.054122,0.115149,0.210949,0.172510,0.262708,0.254692,0.065462,0.260627,0.379059,0.192236,0.943024,-0.323871,14.571572,2.599412,36.127231,27.242506,1.003216,2.294404,0.479307,1.614082,1.669893,0.476182,1974-10-21,0.000,0.000,0.000,322.424,0.000,11.380,225.796,0.000,225.796,4.560,105.205,322.424,0.000,0.00,0.000,225.796,0.000,427.629,23.619,0.000,0.000,200.875
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
31161,91154,0.044444,47.000,40030.0,11070.0,47.000,1881.410000,1984,0.276543,008431,1.156878,4.899311,0.000000,8.016268,0.023591,0.106022,0.134315,0.038842,0.149518,0.141518,0.118485,0.160461,0.433666,0.114650,0.831656,0.654963,7.045985,1.848432,92.921009,4.888733,1.381765,1.247719,0.335812,1.046830,0.902781,0.193376,1984-10-19,2701.900,14.100,502.599,1290.899,2701.900,35.600,614.500,56.400,614.500,39.600,320.700,1290.899,1475.099,2.85,443.899,614.500,487.500,1913.698,66.300,277.399,328.799,434.000
31162,91159,0.020548,37.250,28621.0,23257.0,37.250,1066.132250,1984,0.812585,003352,0.198287,15.934055,28.875969,45.847263,0.085780,0.128247,0.209937,0.191981,0.239672,0.144787,0.101020,0.107155,0.236955,0.141557,0.718790,0.335396,11.763646,1.766110,104.823738,4.832312,4.312397,1.834043,0.147770,1.347621,1.311692,0.742184,1984-10-25,428.245,1.920,116.413,247.225,428.245,4.692,137.133,17.486,137.133,18.635,63.522,247.225,232.878,0.92,55.716,137.133,62.403,310.747,15.992,0.000,148.468,168.856
31163,91161,-0.195313,38.625,53827.0,73991.0,38.625,2079.067875,1984,1.374608,010639,0.189505,11.085462,20.878378,25.590417,0.067234,0.123249,0.252150,0.227748,0.377219,0.121037,0.044048,0.299597,0.738169,0.088931,0.386083,0.342968,54.392223,1.604983,59.102693,2.522775,4.076412,2.881086,0.366141,0.420234,1.728767,0.572985,1984-11-26,1077.844,43.839,493.355,494.838,1077.844,10.279,321.726,18.761,321.726,47.168,74.464,494.838,0.000,0.55,605.478,321.726,219.651,569.302,19.395,0.000,19.640,298.111
31164,91208,0.085714,57.000,21477.0,6180.0,57.000,1224.189000,1984,0.287750,003863,0.287165,5.541814,14.921466,12.270603,0.127738,0.224374,0.438502,0.270605,0.508547,0.027380,0.061051,0.865977,0.072707,0.020960,0.048879,0.682992,290.567515,2.892762,39.198036,21.497866,3.770343,2.229155,1.343296,2.682437,1.657143,0.849842,1984-10-22,409.202,94.441,75.822,290.504,409.202,23.657,174.765,41.386,174.765,0.242,8.691,290.504,302.854,2.89,14.339,174.765,82.878,299.195,44.203,0.000,85.544,268.916


Unnamed: 0,index,ret,prc,shrout,vol,altprc,mktcap,year,turnover,gvkey,bm,evm,pe_inc,pcf,npm,opmad,roa,roe,roce,debt_invcap,int_totdebt,cash_lt,invt_act,debt_at,debt_ebitda,fcf_ocf,intcov_ratio,curr_ratio,cash_conversion,inv_turn,ptb,sale_invcap,cash_ratio,quick_ratio,at_turn,gprof,rdq,atq,cheq,lctq,dpq,ceqq,lseq,niq,saleq,oiadpq,saleq.1,dlcq,dlttq,ceqq.1,ppegtq,epsfiy,invtq,revtq,xsgaq,cogsq,icaptq,piq,pstkq,rectq,req,permno_copy,date_copy
0,20991,0.333333,17.000,27292.0,3645.0,17.000,463.964000,1975,0.133556,001279,1.571662,8.232906,8.713480,4.514366,0.138346,0.340911,0.116225,0.117513,0.105156,0.523526,0.065927,0.035040,0.278393,0.485958,4.471429,-0.051068,2.578963,0.908802,0.000000,0.000000,0.816345,0.314350,0.255230,0.655798,0.274685,0.112696,1974-10-24,1702.955,0.000,0.000,13.836,510.259,1702.955,9.950,129.839,0.000,129.839,81.700,755.954,510.259,2018.490,0.00,0.000,129.839,0.000,0.000,1459.189,0.000,192.976,0.000,0.000,10137,1975-01-31
1,20993,0.136564,32.250,27556.0,5523.0,32.250,888.681000,1975,0.200428,001300,1.330664,4.210632,6.039322,3.464319,0.071376,0.119150,0.204222,0.157642,0.184992,0.309059,0.065161,0.145501,0.355192,0.246821,1.208593,0.174966,8.726038,2.462307,58.508230,6.912781,0.888187,1.511577,0.484367,1.587716,1.177853,0.288113,1975-01-16,1968.449,72.790,329.105,30.602,1011.135,1968.449,25.936,574.808,34.295,574.808,10.957,438.194,1011.135,2562.657,0.00,287.919,574.808,40.973,468.938,1449.329,30.171,0.000,269.838,656.000,10145,1975-01-31
2,20996,0.215447,37.375,23853.0,6229.0,37.375,891.505875,1975,0.261141,001365,1.094498,9.700099,7.271398,5.859234,0.127508,0.108719,0.123750,0.171134,0.118944,0.323774,0.086110,0.330935,0.264055,0.256585,2.414107,-0.572494,3.551564,2.618838,69.178570,5.953415,0.966005,0.890089,1.121936,1.927322,0.679109,0.185186,1974-10-15,0.000,0.000,0.000,0.000,922.879,0.000,36.490,290.650,0.000,290.650,11.090,405.900,922.879,0.000,3.97,0.000,290.650,0.000,0.000,1330.549,49.870,1.770,0.000,633.770,10161,1975-01-31
3,21002,0.424528,18.875,17137.0,5248.0,18.875,323.460875,1975,0.306238,001409,1.191198,4.263680,6.229372,6.461206,0.054122,0.115149,0.210949,0.172510,0.262708,0.254692,0.065462,0.260627,0.379059,0.192236,0.943024,-0.323871,14.571572,2.599412,36.127231,27.242506,1.003216,2.294404,0.479307,1.614082,1.669893,0.476182,1974-10-21,0.000,0.000,0.000,0.000,322.424,0.000,11.380,225.796,0.000,225.796,4.560,105.205,322.424,0.000,0.00,0.000,225.796,0.000,0.000,427.629,23.619,0.000,0.000,200.875,10233,1975-01-31
4,20998,0.210744,36.625,26304.0,2403.0,36.625,963.384000,1975,0.091355,001408,1.321524,4.416657,7.598548,20.024610,0.063948,0.176957,0.181298,0.141616,0.213618,0.294533,0.089190,0.029262,0.742268,0.344108,1.810279,0.522050,7.029912,2.143637,314.537838,1.270013,0.994380,1.412981,0.054823,0.552485,0.962592,0.342929,1974-10-23,0.000,0.000,0.000,11.154,968.829,0.000,33.591,578.729,0.000,578.729,303.100,478.564,968.829,0.000,0.00,0.000,578.729,0.000,0.000,1522.293,74.259,74.900,0.000,808.758,10225,1975-01-31
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
49522,136712,0.061177,56.375,85693.0,139954.0,56.375,4830.942875,1990,1.633202,003282,0.497969,5.899031,12.403731,8.536290,0.117899,0.169958,0.328890,0.297773,0.389804,0.108483,0.152805,0.339468,0.367359,0.133308,0.434712,0.386744,12.368667,2.352594,110.184732,3.592496,2.704246,2.108779,0.484307,1.488348,1.482434,0.670080,1990-10-24,2524.770,396.976,664.218,34.783,1705.683,2524.770,123.635,862.960,157.800,862.960,0.000,74.123,1705.683,0.000,3.62,557.408,862.960,218.240,452.137,1779.806,182.619,0.000,524.648,1127.353,68347,1990-12-31
49523,136762,0.180556,42.500,104875.0,11968.0,42.500,4457.187500,1990,0.114117,012342,0.131202,17.698118,28.523490,27.432391,0.242610,0.310322,0.414389,0.293377,0.350672,0.000000,0.085796,0.015561,0.844726,0.070689,0.174846,0.663107,48.841913,1.123412,74.886195,3.448722,7.138890,1.201148,0.021405,0.174437,0.954548,0.466069,1990-10-30,685.463,1.312,56.193,18.743,593.810,685.463,36.662,160.889,51.267,160.889,0.000,0.000,593.810,0.000,1.11,102.086,160.889,12.104,78.775,593.810,50.912,0.000,0.803,389.535,69585,1990-12-31
49524,136761,0.028571,18.000,138411.0,27436.0,18.000,2491.398000,1990,0.198221,003108,2.092832,8.932648,11.538462,4.145711,0.119496,0.234704,0.074356,0.058804,0.077312,0.542518,0.084821,0.007829,0.203502,0.377956,5.111029,0.368748,1.466736,0.618886,6.326070,10.555881,0.516945,0.311764,0.058140,0.492941,0.200344,0.073949,1990-10-23,11772.598,66.584,1123.069,76.724,2752.781,11772.598,115.022,673.015,230.888,673.015,369.283,4138.367,2752.781,10282.418,1.48,138.224,673.015,0.000,365.403,7566.773,185.241,675.624,285.441,595.560,69243,1990-12-31
49525,136774,-0.060606,11.625,105374.0,60801.0,11.625,1224.972750,1990,0.577002,010329,1.139912,4.357329,10.287611,12.255365,0.065295,0.101125,0.176522,0.103340,0.146662,0.073713,0.102801,0.135129,0.218009,0.114740,0.698085,-2.014407,9.644058,1.744723,124.896896,3.298691,0.951701,1.436304,0.184201,1.364357,1.067373,0.701949,1990-10-25,1877.409,91.089,494.509,34.705,1203.320,1877.409,31.834,506.069,48.123,506.069,119.654,95.759,1203.320,1026.459,1.13,188.094,506.069,274.800,148.441,1299.079,46.610,0.000,494.958,650.622,74617,1990-12-31
