In [92]:
import os
import datetime

import numpy as np
from scipy.special import gammaln
import pandas as pd
from numpy import log, exp
import wrds
from pandas import DataFrame

try:
    from itertools import imap
except ImportError:
    imap = map

# optimization
from numba import jit


def lfact(x):
    """Compute the log factorial using the scipy gammaln function.

    This is commonly referred to as Stirlings approximation/formula for factorials."""
    return gammaln(x + 1)


def nanexp(x):
    """Computes the exponential of x, and replaces nan and inf with finite numbers.

    Returns an array or scalar replacing Not a Number (NaN) with zero, (positive) infinity with a very large number and negative infinity with a very small (or negative) number."""
    return np.nan_to_num(np.exp(x))

In [68]:
# GPIN method
def _lf_gpin(th_b, th_s, r, p, n_buys, n_sells, pdenom=1):
    res = log(th_b) * n_buys + log(1 - th_s) * n_sells - lfact(n_buys) - lfact(n_sells) - gammaln(r) + log(
        1 - p) * r + log(p) * (n_buys + n_sells) + gammaln(r + n_buys + n_sells) - log(pdenom) * r - log(pdenom) * (
                  n_buys + n_sells)
    return res


def _ll_gpin(a, r, p, eta, d, th, n_buys, n_sells):
    return np.array([log(1 - a) + _lf_gpin(th, th, r, p, n_buys, n_sells),
                     log(a * d) + _lf_gpin(th + eta, th, r, p, n_buys, n_sells, 1 + eta * p),
                     log(a * (1 - d)) + _lf_gpin(th, th - eta, r, p, n_buys, n_sells, 1 + eta * p)])


def compute_alpha_gpin(a, r, p, eta, d, th, n_buys, n_sells):
    """Compute the conditional alpha given parameters, buys, and sells.

    """
    ys = _ll_gpin(a, r, p, eta, d, th, n_buys, n_sells)

    ymax = ys.max(axis=0)
    lik = exp(ys - ymax)
    alpha = lik[1:].sum(axis=0) / lik.sum(axis=0)

    return alpha

In [126]:
# OWR calculation
def det3_owr(a):
    return (a[0][0] * (a[1][1] * a[2][2] - a[2][1] * a[1][2])
            - a[1][0] * (a[0][1] * a[2][2] - a[2][1] * a[0][2])
            + a[2][0] * (a[0][1] * a[1][2] - a[1][1] * a[0][2]))


def inv3_owr(a):
    invdet = 1 / det3_owr(a)
    m = np.zeros((3, 3))
    m[0, 0] = a[1, 1] * a[2, 2] - a[2, 1] * a[1, 2]
    m[0, 1] = a[0, 2] * a[2, 1] - a[0, 1] * a[2, 2]
    m[0, 2] = a[0, 1] * a[1, 2] - a[0, 2] * a[1, 1]
    m[1, 0] = a[1, 2] * a[2, 0] - a[1, 0] * a[2, 2]
    m[1, 1] = a[0, 0] * a[2, 2] - a[0, 2] * a[2, 0]
    m[1, 2] = a[1, 0] * a[0, 2] - a[0, 0] * a[1, 2]
    m[2, 0] = a[1, 0] * a[2, 1] - a[2, 0] * a[1, 1]
    m[2, 1] = a[2, 0] * a[0, 1] - a[0, 0] * a[2, 1]
    m[2, 2] = a[0, 0] * a[1, 1] - a[1, 0] * a[0, 1]
    return m * invdet


def _compute_cov_owr(a, su, sz, si, spd, spo):
    # compute covariance matrices
    s_n = np.array([[su ** 2 + sz ** 2, a ** 0.5 * si * su / 2, -a ** 0.5 * si * su / 2],
                    [a ** 0.5 * si * su / 2, spd ** 2 + a * si ** 2 / 4, -a * si ** 2 / 4],
                    [-a ** 0.5 * si * su / 2, -a * si ** 2 / 4, spo ** 2 + (1 + a) * si ** 2 / 4]])

    s_e = np.array([[(1 + 1 / a) * su ** 2 + sz ** 2, a ** (-0.5) * si * su / 2 + a ** 0.5 * si * su / 2,
                     a ** (-0.5) * si * su / 2 - a ** 0.5 * si * su / 2],
                    [a ** (-0.5) * si * su / 2 + a ** 0.5 * si * su / 2, spd ** 2 + (1 + a) * si ** 2 / 4,
                     (1 - a) * si ** 2 / 4],
                    [a ** (-0.5) * si * su / 2 - a ** 0.5 * si * su / 2, (1 - a) * si ** 2 / 4,
                     spo ** 2 + (1 + a) * si ** 2 / 4]])

    return s_n, s_e


def _compute_alpha_owr(x, a, dsd, isd):
    alphas = np.zeros(x.shape[1])
    for i in range(x.shape[1]):
        alphas[i] = 1 / (1 + (1 - a) / a * exp(_lf_owr(x[:, i], dsd, isd)))
    return alphas


def _lf_owr(x, det, inv):
    return -0.5 * log(det) - 0.5 * _qvmv_owr(x, inv)


@jit
def _qvmv_owr(x, A):
    """Computes x'Ax.
    """
    m, n = A.shape
    qsum = 0

    for i in range(m):
        for j in range(n):
            qsum += A[i, j] * x[i] * x[j]

    return qsum


def compute_alpha_owr(oib, ret_d, ret_o, a, su, sz, si, spd, spo):
    """Computes conditional alpha.
    
    Params
    ------
    oib : pandas.Series or numpy.ndarray
        Order imbalance, representing the net difference between buy and sell orders.
    ret_d : pandas.Series or numpy.ndarray
        Returns calculated during the day.
    ret_o : pandas.Series or numpy.ndarray
        Overnight returns.
    a : float
        Unconditional probability of an information event.
    su : float
        Standard deviation of uninformed trades.
    sz : float
        Standard deviation of noise trades.
    si : float
        Standard deviation of informed trades.
    spd : float
        Standard deviation of daily price changes.
    spo : float
        Standard deviation of overnight price changes.
    
    Returns
    -------
    pandas.Series
        Conditional probability (`alpha`) of an information event occurring for each time period.
    
    """
    # if len(a) > 1:
    #     a = a.tolist().pop()
    #     su = su.tolist().pop()
    #     sz = sz.tolist().pop()
    #     si = si.tolist().pop()
    #     spd = spd.tolist().pop()
    #     spo = spo.tolist().pop()
    # s_n, s_e = _compute_cov_owr(a, su, sz, si, spd, spo)
    # dsn, isn = det3_owr(s_n), inv3_owr(s_n)
    # dse, ise = det3_owr(s_e), inv3_owr(s_e)
    # dsd = dsn / dse
    # isd = isn - ise
    # 
    # x = np.array([oib, ret_d, ret_o])
    # cpie = pd.Series(_compute_alpha_owr(x, a, dsd, isd), index=oib.index)
    # return cpie
    s_n, s_e = _compute_cov_owr(a, su, sz, si, spd, spo)
    dsn, isn = det3_owr(s_n), inv3_owr(s_n)
    dse, ise = det3_owr(s_e), inv3_owr(s_e)
    dsd = dsn/dse
    isd = isn-ise
    
    x = np.array([oib, ret_d, ret_o])
    cpie = pd.Series(_compute_alpha_owr(x, a, dsd, isd), index=oib.index)
    return cpie[0]


In [129]:
a, su, sz, si, spd, spo = 0.780775, 0.188613492, 0.47896027, 0.022607443, 0.005858138, 0.01009005
ro, rd, o = -0.00662909, -0.013827281, -0.352941176

alpha_result = compute_alpha_owr(pd.Series([o]), pd.Series([rd]), pd.Series([ro]), a, su, sz, si, spd, spo)
print("Computed alpha:", alpha_result)

Computed alpha: 0    0.848057
dtype: float64


In [45]:
# Example usage to download data from WRDS TAQ database using WRDS package
def download_taq_data(start_year, end_year):
    """Downloads required data from WRDS TAQ database for calculating r_d, r_o, and y_e.
    
    Params
    ------
    start_year : int
        The starting year for the data download.
    end_year : int
        The ending year for the data download.
    
    Returns
    -------
    DataFrame
        A DataFrame containing the data needed to calculate r_d, r_o, and y_e.
    """
    db = wrds.Connection(wrds_username='aheitz')
    data_list = []

    for year in range(start_year, end_year + 1):
        query = f"""
        SELECT date, sym_root, sym_suffix, buynumtrades_lr, sellnumtrades_lr, oprc, cprc, ret_mkt_m,
               vw_price_m, mid_after_open, total_vol_m, total_vol_b, total_vol_a
        FROM taqmsec.wrds_iid_{year}
        WHERE sym_root IS NOT NULL
        """
        yearly_data = db.raw_sql(query)
        data_list.append(yearly_data)

    db.close()
    taq_data = pd.concat(data_list, ignore_index=True)
    return taq_data

In [79]:
db = wrds.Connection(wrds_username='aheitz')

Loading library list...
Done


In [55]:
describe_table = db.describe_table('taqmsec', 'wrds_iid_2003')

Approximately 602729 rows in taqmsec.wrds_iid_2003.


In [66]:
db.close()

AttributeError: 'NoneType' object has no attribute 'dispose'

In [64]:
db.list_libraries()

['aha_sample',
 'ahasamp',
 'audit',
 'audit_audit_comp',
 'audit_common',
 'audit_corp_legal',
 'audit_oia',
 'auditsmp',
 'auditsmp_all',
 'bank',
 'bank_all',
 'bank_premium_samp',
 'banksamp',
 'block',
 'block_all',
 'boardex',
 'boardex_na',
 'boardex_trial',
 'boardsmp',
 'bvd_amadeus_trial',
 'bvd_bvdbankf_trial',
 'bvd_orbis_trial',
 'bvdsamp',
 'calcbench_trial',
 'calcbnch',
 'cboe',
 'cboe_all',
 'cboe_sample',
 'cboesamp',
 'ciq',
 'ciq_common',
 'ciqsamp',
 'ciqsamp_capstrct',
 'ciqsamp_common',
 'ciqsamp_keydev',
 'ciqsamp_pplintel',
 'ciqsamp_ratings',
 'ciqsamp_transactions',
 'ciqsamp_transcripts',
 'cisdmsmp',
 'columnar',
 'comp',
 'comp_bank',
 'comp_bank_daily',
 'comp_execucomp',
 'comp_global',
 'comp_global_daily',
 'comp_na_annual_all',
 'comp_na_daily_all',
 'comp_na_monthly_all',
 'comp_segments_hist',
 'comp_segments_hist_daily',
 'compa',
 'compb',
 'compg',
 'compm',
 'compsamp',
 'compsamp_all',
 'compsamp_snapshot',
 'compseg',
 'contrib',
 'contrib_as_

In [56]:
help(db.list_tables)

Help on method list_tables in module wrds.sql:

list_tables(library) method of wrds.sql.Connection instance
    Returns a list of all the views/tables/foreign tables within a schema.
    
    :param library: Postgres schema name.
    
    :rtype: list
    
    Usage::
    >>> db.list_tables('wrdssec')
    ['wciklink_gvkey', 'dforms', 'wciklink_cusip', 'wrds_forms', ...]



In [52]:
from Constants import Constants as const
import os

table_list = db.list_tables('taqmsec')
with open(os.path.join(const.TEMP_PATH, 'taqmsec_tabls.txt'), 'w') as f:
    for table in table_list:
        f.write(f"{table}\n")

In [60]:
data_list = list()

In [62]:
for year in range(2006, 2014):
    query = f"""
    SELECT date, sym_root, sym_suffix, buynumtrades_lr, sellnumtrades_lr, oprc, cprc, ret_mkt_m,
           vw_price_m, mid_after_open, total_vol_m, total_vol_b, total_vol_a
    FROM taqmsec.wrds_iid_{year}
    WHERE sym_root IS NOT NULL
    """
    yearly_data = db.raw_sql(query)
    data_list.append(yearly_data)

In [63]:
wrds_iid_df = pd.concat(data_list, ignore_index=True)
wrds_iid_df.to_pickle(os.path.join(const.TEMP_PATH, 'wrds_iid_2014_2016.pkl'))

In [67]:
wrds_iid_2013 = pd.read_pickle(os.path.join(const.TEMP_PATH, 'wrds_iid_2013.pkl'))
taq_data = pd.concat([wrds_iid_2013, wrds_iid_df], ignore_index=True)

# Calculating y_e, r_d, and r_o
taq_data['y_e'] = (taq_data['buynumtrades_lr'] - taq_data['sellnumtrades_lr']) / (taq_data['buynumtrades_lr'] + taq_data['sellnumtrades_lr'])
taq_data['r_d'] = (taq_data['vw_price_m'] - taq_data['mid_after_open'] + taq_data.get('divamt', 0)) / taq_data['mid_after_open']
taq_data['r_o'] = (taq_data['mid_after_open'] - taq_data['vw_price_m']) / taq_data['mid_after_open']

print(taq_data.head())

         date sym_root sym_suffix  buynumtrades_lr  sellnumtrades_lr   oprc  \
0  2013-01-02        A       None          13885.0           15996.0  41.88   
1  2013-01-02       AA       None          25249.0           24626.0   8.88   
2  2013-01-02       AA         PR              1.0               1.0  84.79   
3  2013-01-02     AACC       None            327.0             367.0   4.69   
4  2013-01-02     AADR       None              3.0               2.0  31.65   

    cprc  ret_mkt_m  vw_price_m  mid_after_open  total_vol_m  total_vol_b  \
0  41.88   0.000000   41.575585          41.880    6264024.0      14270.0   
1   8.99   0.012311    8.915031           8.875   20558520.0     357248.0   
2  84.00  -0.009361   84.263333          84.420        300.0          NaN   
3   4.84   0.031482    4.845709           4.915     113543.0          NaN   
4  31.50  -0.004751   31.527453          31.485       2159.0          NaN   

   total_vol_a       y_e       r_d       r_o  
0      12480.0 

In [107]:
gpin_parameter_df: DataFrame = pd.read_csv(os.path.join(const.DATA_PATH, 'gpin-1319.csv')).drop(['f', 'rc'], axis=1).rename(
    columns={'yyyy': 'year'})
owr_parameter_df: DataFrame = pd.read_csv(os.path.join(const.DATA_PATH, 'owr-1319.csv')).drop(['f', 'rc'], axis=1).rename(
    columns={'yyyy': 'year'})

In [81]:
query = f"""
SELECT date, permno, sym_suffix, sym_root
FROM wrdsapps_link_crsp_taqm.taqmclink
WHERE sym_root IS NOT NULL AND date between '2013-01-01' and '2016-12-31' 
"""
taq_crsp_link = db.raw_sql(query)

In [82]:
db.close()

In [103]:
taq_crsp_link.memory_usage(True).sum() /(1024 * 1024)

603.9910926818848

In [88]:
taq_data.shape

(7917668, 16)

In [102]:
taq_data_crsp = taq_data.drop(['sym_suffix'], axis=1).drop_duplicates(
    subset=['date', 'sym_root'], keep='first').merge(
    taq_crsp_link.dropna(subset=['permno']).drop(['sym_suffix'], axis=1).drop_duplicates(
    subset=['date', 'sym_root'], keep='first'), on=['date', 'sym_root'], how='inner')
taq_data_crsp.shape

(4947894, 16)

In [112]:
taq_data_crsp['date'] = pd.to_datetime(taq_data['date'])
taq_data_crsp['year'] = taq_data_crsp['date'].dt.year
taq_data_crsp['permno'] = pd.to_numeric(taq_data_crsp['permno'], errors='coerce')
taq_data_crsp_model = taq_data_crsp.merge(gpin_parameter_df, on=['permno', 'year'], how='left').merge(
    owr_parameter_df, on=['permno', 'year'], how='left', suffixes=("", '_owr'))

In [114]:
taq_data_crsp_model.keys()

Index(['date', 'sym_root', 'buynumtrades_lr', 'sellnumtrades_lr', 'oprc',
       'cprc', 'ret_mkt_m', 'vw_price_m', 'mid_after_open', 'total_vol_m',
       'total_vol_b', 'total_vol_a', 'y_e', 'r_d', 'r_o', 'permno', 'year',
       'a', 'd', 'eta', 'p', 'r', 'th', 'a_owr', 'si', 'spd', 'spo', 'su',
       'sz'],
      dtype='object')

In [116]:
taq_data_crsp_model.dropna(subset=['a', 'd', 'eta', 'p', 'r', 'th', 'a_owr', 'si', 'spd', 'spo', 'su', 'sz'], how='all', inplace=True)

In [120]:
taq_data_crsp_model.loc[:, 'gpin'] = taq_data_crsp_model.apply(
    lambda x: compute_alpha_gpin(x['a'], x['r'], x['p'], x['eta'], x['d'], x['th'], x['buynumtrades_lr'], x['sellnumtrades_lr']), axis=1)

In [130]:
taq_data_crsp_model.loc[:, 'owr'] = taq_data_crsp_model.apply(
    lambda x: compute_alpha_owr(pd.Series([x['y_e']]), pd.Series([x['r_d']]), pd.Series([x['r_o']]), x['a_owr'], x['su'], x['sz'], x['si'], x['spd'], x['spo']), axis=1)

  alphas[i] = 1 / (1 + (1 - a) / a * exp(_lf_owr(x[:, i], dsd, isd)))
  alphas[i] = 1 / (1 + (1 - a) / a * exp(_lf_owr(x[:, i], dsd, isd)))


In [132]:
taq_data_crsp_model.to_pickle(os.path.join(const.TEMP_PATH, '2013_2016_gpin_owr.pkl'))