Setup

In [151]:
%matplotlib inline 

#%% Packages 
import wrds
import datetime
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
plt.rcParams['figure.figsize'] = [10, 6]

import time
from time import strptime, strftime

# Setups
pd.set_option("display.max_rows", 200) # max number or rows to be displayed 
#%% Set WRDS Connection
db = wrds.Connection(wrds_username='zrsong') # make sure to configure wrds connector before hand.


Loading library list...
Done


Load in Compustat Data

In [152]:
compm_tables = db.list_tables(library="comp") # compd: Compustat daily update 
pd.DataFrame({'tables':compm_tables}).head # Transform libs to a Pandas data frame to have a better display.

<bound method NDFrame.head of                tables
0            aco_amda
1            aco_imda
2         aco_indfnta
3         aco_indfntq
4       aco_indfntytd
..                ...
282      wrds_seg_geo
283  wrds_seg_product
284    wrds_segmerged
285        xfl_column
286         xfl_table

[287 rows x 1 columns]>

In [153]:
# import thomson reuters 13f data from WRDS
# tr_13f: Thomson Reuters 13F Holdings Data
# tr_13f_tables = db.list_tables(library="tr_13f")


In [154]:
fund_table = 'fundq'

varlist = ['conm', 'tic', 'cusip','fyearq', 'fqtr', 'fyr', 'atq','capxy', 'ceqq', 'cogsq', 
           'cshoq', 'dlcq', 'dlcchy','dlttq', 'dpq', 'ibq', 'itccy', 'fic',
           'ltq', 'mibq', 'niq', 'prstkccy', 'pstkq', 'req', 'revtq', 'saleq',
           'seqq', 'txdbq', 'txdiq', 'txditcq', 'wcapchy', 'xinty', 'xrdq', 'xsgaq',
           'mkvaltq', 'epspxq', 'epsfxq', 'ajexq', 'prccq', 'oancfy', 'ivncfy', 'rdq', 'prstkcy', 'sstky', 'tstkq', 'dvpy', 'dvy']


query = """SELECT gvkey, datadate, {}
           FROM comp.{}
           WHERE datafmt = 'STD'
           AND popsrc = 'D'
           AND indfmt = 'INDL'
           AND consol = 'C'
           AND fyearq <= 2023
           AND fyearq >= 1988;""".format(", ".join(varlist), fund_table)

compq = db.raw_sql(query, date_cols=['datadate'])

del(fund_table, varlist, query)
# output csv. format
# compq.to_csv("compustat_quarterly.csv", index=False)

  full_df = pd.concat([full_df, chunk])
  full_df = pd.concat([full_df, chunk])
  full_df = pd.concat([full_df, chunk])


In [155]:
# check whether gvkey datadate is a unique key
compq.duplicated(['gvkey', 'datadate']).sum()
compq[compq.duplicated(['gvkey', 'fyearq', 'fqtr'])]
# How to deal with duplicates? Keep last available entry (datadate)
compq.dropna(subset=['fyearq','fqtr'], inplace=True)
compq.sort_values(['gvkey','fyearq','fqtr','atq'], inplace=True)
compq = compq[~compq.duplicated(['gvkey', 'fyearq','fqtr'], keep='first')]
compq.duplicated(['gvkey', 'fyearq','fqtr']).sum()

# output pickle format
compq.to_pickle("compustat_quarterly.pkl")

In [158]:
# keep only naics code 52 (merge with compa annual data to get naics code) using compa_annual.pkl in the same folder
compa = pd.read_pickle("compa_annual.pkl")
compa = compa[['gvkey', 'fyear', 'naicsh', 'prstkc', 'sstk','dvp','tstk']]
# merge compq and compa
# rename fyear fyearq to merge
compa.rename(columns={'fyear':'fyearq'}, inplace=True)
compq1 = pd.merge(compq, compa, how='left', on=['gvkey', 'fyearq'])
# keep only those with six-digit naics code that starts with 52 (change to string first)
compq1['naicsh'] = compq1['naicsh'].astype(str)
compq1 = compq1[compq1['naicsh'].str.startswith('52')]


In [159]:
compq = pd.read_pickle("compustat_quarterly.pkl")

In [160]:
# number of observations in compq
compq1.shape[0]
compq1.duplicated(['gvkey', 'fyearq','fqtr']).sum()

0

In [161]:
# save compq to csv file
compq1.to_csv("compustat_quarterly.csv", index=False)