In [1]:
# imports
%reload_ext autoreload
%autoreload 2
%matplotlib inline 

import pandas as pd
import numpy as np

from utils.basic_utils import config, read_dates, load_csvs, csv_load, excl, csv_store
from utils.pricing import roll_vol, load_px_close, discret_rets, get_ind_index
from utils.fundamental import filter_cols, filter_cols
from scipy.stats import linregress

import matplotlib as mpl
import matplotlib.pyplot as plt

pd.options.display.float_format = '{:,.2f}'.format

Loading utils/config.json


In [14]:
# utility functions
def top_recomm_by_class(pred_df, labels, top_items):
    mask = pred_df.hard_pred_label == pred_df.soft_pred_label
    agree_df = pred_df.loc[mask].drop_duplicates()
    # should add sector and industries, group for allocation insights
    # should add marketcap, beta, etc, group for risk exposure insights
    label_mask = agree_df.soft_pred_label.isin(['bear', 'short', 'long', 'bull'])
    return agree_df.loc[label_mask]\
        .sort_values(by='soft_confidence', ascending=False)\
        .groupby(by='soft_pred_label').head(top_items)\
        .sort_values(by='soft_pred_class')

def pred_distrib(pred_df, count_col):
    dist = pd.value_counts(pred_df[count_col]).to_frame()
    dist['weights'] = dist / dist.sum()
    return dist

def add_desc_stats(df, descriptive_cols):
    
    tickers = df.index
    for k in descriptive_cols.keys():
        desc_df, cols = descriptive_cols[k]['df'], descriptive_cols[k]['columns']
        for c in cols: df.loc[:, c] = tickers.map(desc_df[c].to_dict()).values 

    df = clean_df(df, large_vals, div_cols)

    df.loc[:, 'pegRatio'] = df.forwardPE / (val_df.growthRate * 100)
    df.loc[:, 'size'] = discret_rets(df.marketCap, mkt_cap_cuts, mkt_cap_labels)
        
    return df

def clean_df(df, large_vals, div_cols):
    # convert large values to billions
    df.loc[:, large_vals] = df.loc[:, large_vals] / 10**9
    df.loc[:, div_cols] = df[div_cols].div(df.regularMarketPrice, axis=0)
    df.replace([np.inf, -np.inf], np.nan, inplace=True)
    
    return df

def ml_votes_by_class(fndm_dfs, group_mask):
    super_list = []
    for key in fndm_dfs.keys():
        df = fndm_dfs[key]
        super_list.append(
            fndm_dfs[key].loc[:, group_mask].set_index(re_index_col)\
            .rename(columns={group_mask[0]: key}))
    df = pd.concat(super_list, axis=1, sort=False)
    # df.apply(pd.value_counts).loc[fwd_ret_labels].style.background_gradient(cmap='RdYlGn')
    return df.T.apply(pd.value_counts).T

q_group = lambda x, q: x.quantile(q)

ticker_across_mls = lambda ticker, df: df.set_index('symbol').loc[ticker]

# equal weight ml_confidence
equal_wgt_conf = lambda x: ticker_across_mls(x[0]).loc[:, x[1]].mean()

# take index and class, and calculate a dot product, weighted confidence
score_wgt_conf = lambda x: ticker_across_mls(x[0], lf_df).loc[
    :, [x[1], 'model']].set_index('model').T.mul(model_weights).sum(1).iloc[0]

def load_s3_preds(s3_path, tgt_date, key_list, eqty_symbols, verbose=True):
    """ Read ML prediction files """
    print(f'Loading files for {tgt_date}')
    fndm_dfs = {k: pd.read_csv(csv_load(f'{s3_path[k]}{tgt_date}'), 
            index_col='pred_date', parse_dates=True) for k in key_list}

    super_list = []
    for key in fndm_dfs.keys(): 
        fndm_dfs[key].loc[:, 'model'] = key
        fndm_dfs[key] = fndm_dfs[key].loc[fndm_dfs[key].symbol.isin(eqty_symbols), :]
        df = fndm_dfs[key]
        if verbose: print(f'{key.upper()}, {df.symbol.shape[0]} unique symbols')
        super_list.append(df)
        
    return pd.concat(super_list, axis=0)

def get_wtd_ML_results(lf_df, model_weights):
    """ calculate model weighted class and confidence level for ranking / sorting """
    spc_df = lf_df.pivot_table(
        index=['symbol'], columns=['model'], 
        values=['pred_class'], aggfunc='mean')\
        ['pred_class'][list(model_weights.index)]
    wgt_class_df = (spc_df * model_weights.T).sum(axis=1)
    sc_df = lf_df.loc[:, ['symbol', 'confidence', 'model']]\
        .pivot(index='symbol', columns='model', values='confidence')\
                   .loc[:, model_weights.index]
    wgt_conf_df = (sc_df * model_weights.T).sum(axis=1)
    wtg_df = pd.concat([wgt_class_df, wgt_conf_df], axis=1, sort=False)
    wtg_df.columns = ['wtg_class', 'wtg_ML_prob']
    wtg_df[lf_df.index.name] = lf_df.index.unique()[0]
    
    return wtg_df

def visualize_MLs(lf_df, key_list):
    pvt_df = lf_df.pivot_table(
        index=['model'], columns=['soft_pred_label'], 
        values=['soft_confidence'], aggfunc='count')
    class_distrib = pvt_df.div(pvt_df.sum(axis=1), axis=0)['soft_confidence'][fwd_ret_labels]
    # plot mean class across models    
    class_distrib.mean(axis=0).plot.barh(title='Mean prediction by class',);
    plt.savefig('../images/ML_mean_weight.png', dpi=300, rot=0, bbox_inches='tight')
    # plot class distribution by model    
    class_distrib.plot(title='Prediction distribution by model', kind='barh', stacked=True, cmap='RdYlGn');
    plt.savefig('../images/ML_class_distrib.png', dpi=300, rot=0, bbox_inches='tight')
    return class_distrib.T[key_list]

conf_slope = lambda y: linregress(range(len(y)), y).slope

In [3]:
# s3 paths
s3_path = {
    'macro': 'recommend/macro_ML/',
    'px_mom': 'recommend/micro_ML/',
    'bottom_up': 'recommend/bottomup_ML/',
    'fin_data': 'recommend/fdmn_ML-fin_data/',
    'key_statistics': 'recommend/fdmn_ML-key_statistics/',
    'eps_trend': 'recommend/fdmn_ML-eps_trend/',
    'eps_estimates': 'recommend/fdmn_ML-eps_estimates/',
    'day_quote': 'recommend/fdmn_ML-day_quote/',
    'iv_value': 'valuation/waterfall/'
}

In [4]:
# environment variables
bench = '^GSPC'
fwd_ret_labels = ["bear", "short", "neutral", "long", "bull"]
show_classes = ['bear', 'short', 'long', 'bull']
min_confidence = 0.5
max_rows = 20

key = 'fin_data'
agg_funcs = ['count', 'median', 'max']
mkt_cap_cuts = [0, 0.3, 2, 10, 300, 5000]
mkt_cap_labels = ['micro', 'small', 'mid', 'large', 'mega']

re_index_col = 'symbol'
value_col = 'soft_confidence'
show = ['symbol', 'soft_confidence']
mask_col, class_value = 'soft_pred_label', 'bull'
hard_vote, soft_vote = 'hard_pred_label', 'soft_pred_label'

large_vals = ['marketCap']
div_cols = ['targetMeanPrice', 'targetMedianPrice']

In [5]:
# context / descriptive data
dates = read_dates('quote')
tgt_date = dates[-1] # last date saved in S3

quotes = load_csvs('quote_consol', [tgt_date])

dollar_vol = ((quotes.averageDailyVolume10Day * quotes.regularMarketPrice) / 10**6)
quotes.loc[(dollar_vol > dollar_vol.quantile(0.2)) & (quotes['quoteType'] == 'EQUITY')] # the delta
mask = (dollar_vol > dollar_vol.quantile(0.1)) & (quotes['quoteType'] == 'EQUITY') & (quotes['regularMarketPrice'] > 5)
eqty_symbols = excl(quotes.loc[mask].symbol, ['GOOGL'])
# quotes.loc[quotes['quoteType'] == 'EQUITY'].index.difference(eqty_symbols) # the delta

profile = load_csvs('summary_detail', ['assetProfile'])
keystats = load_csvs('summary_detail', ['defaultKeyStatistics/' + str(tgt_date)])
finstats = load_csvs('summary_detail', ['financialData/' + str(tgt_date)])

for df in (quotes, profile, keystats, finstats):
    df.set_index('symbol', drop=False, inplace=True)
    
path = 'valuation/waterfall/'
print(f'Loading file {path}{tgt_date}')
val_df = pd.read_csv(csv_load(path+tgt_date), parse_dates=True)
val_df.set_index('symbol', inplace=True)
val_df.dropna(subset=['premDisc'], inplace=True)
val_df = val_df.loc[(np.abs(val_df.premDisc) < val_df.premDisc.median() * 3).values, :]

descriptive_cols = {
    'quotes': { 'df': quotes, 'columns': ['shortName', 'forwardPE', 'trailingPE', 'marketCap', 'regularMarketPrice'],},
    'profile': { 'df': profile, 'columns': ['sector', 'industry', 'country'],},
    'keystats': { 'df': keystats, 'columns': ['pegRatio', 'shortPercentOfFloat'],},
    'finstats': { 'df': finstats, 'columns': ['earningsGrowth', 'recommendationMean', 'targetMeanPrice', 'targetMedianPrice', 'numberOfAnalystOpinions'],},
    'valuation': { 'df': val_df, 'columns': ['premDisc', 'growthRate'],},
}

Loading file quote/csv/2019-05-03
Loading file summary-categories/assetProfile
Loading file summary-categories/defaultKeyStatistics/2019-05-03
Loading file summary-categories/financialData/2019-05-03
Loading file valuation/waterfall/2019-05-03


In [10]:
# for notebook only
ec2_IP = config['ec2_IP']
remote_path = config['remote_path']
px_close_ds = 'universe-px-ds'
temp_path = '../tmp/'
!scp -i ~vveiga/.ssh/qc_infra.pem ubuntu@{ec2_IP}:{remote_path}/tmp/{px_close_ds} {temp_path}{px_close_ds}
px_close = load_px_close(temp_path, px_close_ds, True).drop_duplicates().dropna(subset=['^GSPC'])
px_close.info()

universe-px-ds                                100%   18MB   9.1MB/s   00:01    


  labels = getattr(columns, 'labels', None) or [
  return pd.MultiIndex(levels=new_levels, labels=labels, names=columns.names)
  labels, = index.labels


<class 'pandas.core.frame.DataFrame'>
Index: 3776 entries, 2004-05-04 to 2019-05-03
Columns: 1285 entries, 0700.HK to KRW=X
dtypes: float32(1285)
memory usage: 18.5+ MB


### Bottom-up MLs

In [11]:
#
key_list = ['px_mom', 'bottom_up']
lf_df = load_s3_preds(s3_path, tgt_date, key_list, eqty_symbols)

# Model score weights 
ml_score_dict = {
    'px_mom': 0.76, 'bottom_up': 0.80,
}
cut_off_confidence = 0.6
model_scores = np.array(list(map(ml_score_dict.get, list(key_list))))
adj_weights = (model_scores - cut_off_confidence) / (model_scores - cut_off_confidence).sum()
model_weights = pd.Series({x:y for x,y in zip(list(key_list), adj_weights)})
model_weights

Loading files for 2019-05-03
PX_MOM, 19 unique symbols
BOTTOM_UP, 209 unique symbols


px_mom      0.44
bottom_up   0.56
dtype: float64

### Recomendations

In [15]:
# weighted appraoch of two models
filter_lf_df = lf_df.loc[(lf_df.confidence > 0.5)]
wtg_df = get_wtd_ML_results(filter_lf_df, model_weights)
show = ['wtg_class','wtg_ML_prob','forwardPE', 'industry', 'targetMedianPrice', 'size']

In [16]:
L_df = wtg_df.loc[wtg_df['wtg_class'] > 3].sort_values(by='wtg_ML_prob', ascending=False).head(max_rows)
print(f'Long recommendations for {tgt_date}')
L_df
S_df = wtg_df.loc[wtg_df['wtg_class'] < 1].sort_values(by='wtg_ML_prob', ascending=False).head(max_rows)
print(f'Short recommendations for {tgt_date}')
S_df

Long recommendations for 2019-05-03
Short recommendations for 2019-05-03


Unnamed: 0_level_0,wtg_class,wtg_ML_prob,pred_date
symbol,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
SBAC,0.89,0.97,2019-05-01
IQ,0.0,0.56,2019-05-01
MPWR,0.0,0.56,2019-05-01
BOX,0.0,0.56,2019-05-01
SWKS,0.0,0.56,2019-05-01
SE,0.0,0.56,2019-05-01
PTC,0.0,0.55,2019-05-01
PEGA,0.0,0.55,2019-05-01
CSOD,0.0,0.55,2019-05-01
MANH,0.0,0.55,2019-05-01


In [17]:
# single model
one_model = lf_df.loc[(lf_df.model == 'px_mom')]

In [19]:
L_df = one_model.loc[one_model['pred_class'] > 3].sort_values(by='confidence', ascending=False).head(max_rows)
print(f'Long recommendations for {tgt_date}')
L_df
# S_df = one_model.loc[one_model['soft_pred_class'] < 1].sort_values(by='soft_confidence', ascending=False).head(max_rows)
# print(f'Short recommendations for {tgt_date}')
# S_df

Long recommendations for 2019-05-03


Unnamed: 0_level_0,symbol,pred_class,pred_label,confidence,bear,short,neutral,long,bull,model
pred_date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
2019-05-01,CHL,4,bull,0.98,0.0,0.0,0.01,0.0,0.98,px_mom
2019-05-01,AMX,4,bull,0.98,0.0,0.0,0.0,0.02,0.98,px_mom
2019-05-01,DISH,4,bull,0.93,0.0,0.0,0.06,0.01,0.93,px_mom


In [20]:
# lf_df.loc[lf_df.model.isin(['bottom_up']) & lf_df.symbol.isin(list(L_df.index))]
# lf_df.loc[lf_df.model.isin(['px_mom']) & lf_df.symbol.isin(list(L_df.index))]

Unnamed: 0_level_0,symbol,pred_class,pred_label,confidence,bear,short,neutral,long,bull,model
pred_date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1


#### Daily recommendations

In [21]:
lf_df = load_s3_preds(s3_path, tgt_date, key_list, eqty_symbols)
wtg_df = get_wtd_ML_results(lf_df, model_weights)
wtg_df = add_desc_stats(wtg_df, descriptive_cols)

Loading files for 2019-05-03
PX_MOM, 19 unique symbols
BOTTOM_UP, 209 unique symbols


In [22]:
# todays long positions
show = ['wtg_class','wtg_ML_prob','forwardPE', 'industry', 'targetMedianPrice', 'size']
L_df = wtg_df.loc[wtg_df['wtg_class'] > 3].sort_values(by='wtg_ML_prob', ascending=False).head(max_rows)
print(f'Long recommendations for {tgt_date}')
L_df.index

Long recommendations for 2019-05-03


Index(['DISH'], dtype='object', name='symbol')

In [91]:
# todays short positions
S_df = wtg_df.loc[wtg_df['wtg_class'] < 1].sort_values(by='wtg_ML_prob', ascending=False).head(max_rows)
print(f'Short recommendations for {tgt_date}')
S_df

Short recommendations for 2019-04-26


Unnamed: 0_level_0,wtg_class,wtg_ML_prob,pred_date,shortName,forwardPE,trailingPE,marketCap,regularMarketPrice,sector,industry,...,pegRatio,shortPercentOfFloat,earningsGrowth,recommendationMean,targetMeanPrice,targetMedianPrice,numberOfAnalystOpinions,premDisc,growthRate,size
symbol,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
NTGR,0.56,0.76,2019-04-26,"NETGEAR, Inc.",10.77,,0.97,30.92,Technology,Communication Equipment,...,,0.11,1.29,2.3,1.61,1.42,3.0,,,small
LGND,0.56,0.71,2019-04-26,Ligand Pharmaceuticals Incorpor,33.01,21.32,2.6,127.09,Healthcare,Biotechnology,...,,0.39,,1.7,1.67,1.81,5.0,,,mid
ANET,0.56,0.71,2019-04-26,"Arista Networks, Inc.",29.67,78.99,24.51,320.4,Technology,Computer Systems,...,1.03,0.03,0.63,2.2,0.96,0.98,27.0,2.7,0.29,large
PLCE,0.0,0.7,2019-04-26,"Children's Place, Inc. (The)",13.73,18.36,1.74,110.37,Consumer Cyclical,Apparel Stores,...,,0.39,,1.9,1.03,1.04,9.0,,,small
GRUB,0.0,0.68,2019-04-26,GrubHub Inc.,29.95,78.56,6.08,66.78,Technology,Internet Content & Information,...,0.36,0.2,,2.0,1.5,1.5,21.0,2.7,0.83,mid
TIF,0.0,0.67,2019-04-26,Tiffany & Co.,19.84,22.68,13.08,107.74,Consumer Cyclical,Luxury Goods,...,inf,,2.36,2.2,1.04,1.07,23.0,3.04,0.0,large
RGEN,0.56,0.67,2019-04-26,Repligen Corporation,70.3,186.19,3.04,68.89,Healthcare,Biotechnology,...,,0.1,-0.53,1.9,0.96,1.0,7.0,,,mid
PRGO,0.56,0.66,2019-04-26,Perrigo Company plc,11.2,53.65,6.93,50.97,Healthcare,Drug Manufacturers - Specialty & Generic,...,,0.09,0.16,2.9,1.16,0.99,10.0,,,mid
KHC,0.56,0.66,2019-04-26,The Kraft Heinz Company,11.36,,40.31,33.06,Consumer Defensive,Packaged Foods,...,inf,,,3.1,1.09,1.09,20.0,-1.44,0.0,large
VC,0.0,0.65,2019-04-26,Visteon Corporation,9.18,16.46,1.8,63.8,Consumer Cyclical,Auto Parts,...,,,-0.77,2.9,1.4,1.39,14.0,,,small


#### Company details

In [65]:
ticker = 'TSLA'
show = [
    'hard_pred_label','soft_pred_label','soft_confidence',
    'bear', 'short', 'neutral', 'long', 'bull', 'model']
wgt_input = (ticker, 'soft_confidence')
f'{quotes.loc[ticker].shortName}, {profile.loc[ticker].sector}, {profile.loc[ticker].industry}'

'Tesla, Inc., Consumer Cyclical, Auto Manufacturers'

In [66]:
# Company view across models
print(f'Model predictions for {ticker} on {tgt_date}')
print(f'Weighted ML confidence level: {score_wgt_conf(wgt_input)}')
lf_df.loc[lf_df['symbol'] == ticker, show]

Model predictions for TSLA on 2019-04-22
Weighted ML confidence level: 0.37099567131861505


Unnamed: 0_level_0,hard_pred_label,soft_pred_label,soft_confidence,bear,short,neutral,long,bull,model
pred_date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
2019-04-10,bull,bull,0.43,0.07,0.15,0.08,0.27,0.43,px_mom
2019-04-09,long,short,0.3,0.22,0.3,0.2,0.26,0.01,fin_data
2019-04-10,bear,short,0.4,0.32,0.4,0.09,0.16,0.03,key_statistics
2019-04-10,long,long,0.27,0.1,0.27,0.12,0.27,0.24,eps_trend
2019-04-10,neutral,long,0.26,0.21,0.19,0.12,0.26,0.22,eps_estimates
2019-04-10,bull,bull,0.39,0.12,0.16,0.11,0.23,0.39,day_quote


In [None]:
quotes.loc[ticker]