In [13]:
# imports
%reload_ext autoreload
%autoreload 2
%matplotlib inline 

import sys, os
import pandas as pd
from utils.basic_utils import excl, config
from utils.pricing import get_mults_pricing

pd.options.display.float_format = '{:,.2f}'.format

In [14]:
from sklearn import preprocessing
from sklearn.impute import SimpleImputer
from sklearn.model_selection import cross_val_score, cross_validate, train_test_split
from sklearn.model_selection import GridSearchCV
from sklearn.feature_selection import SelectFromModel
from sklearn.utils.validation import column_or_1d
from sklearn.utils.multiclass import unique_labels
from sklearn.metrics import confusion_matrix, classification_report
from sklearn.metrics import accuracy_score, log_loss, precision_recall_fscore_support
from sklearn.metrics import precision_score, roc_auc_score

from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.neural_network import MLPClassifier

In [15]:
# Symbol used as the market benchmark
bench = '^GSPC'

# Windows and std setting handed to px_mom_feats
sec_windows = [5, 20, 60]
stds = 1

# Forward window(s) handed to px_fwd_rets when building the target
pred_fwd_windows = [60]

# px_mom_feats flags, all switched off
inv = False
incl_px = False
incl_name = False

# Name of the target column
y_col = 'fwdReturn'

### Get universe pricing

In [16]:
# Download close prices for every symbol in the universe. This is slow, so run it
# once and persist the result (original timing note: ~8m on local drive, ~3m on AWS
# for 1230 companies; the run recorded below took ~18 min wall time).
excl_list = [] # symbols to leave out of the universe, e.g. ['BHF', 'ERI']
symbols_list = excl(config['companies'], excl_list)
# Drop fully duplicated rows, then keep only rows where the AAPL column has a value.
# NOTE(review): the recorded output shows per-symbol read timeouts being reported
# rather than raised -- confirm how get_mults_pricing fills those symbols.
%time px_close = get_mults_pricing(symbols_list).drop_duplicates().dropna(subset=['AAPL'])

54it [01:48, 22.92s/it]

Exception, get_mults_pricing: MYL
Read timeout on endpoint URL: "None"


104it [03:35, 21.10s/it]

Exception, get_mults_pricing: TSS
Read timeout on endpoint URL: "None"


464it [07:58, 19.90s/it]

Exception, get_mults_pricing: SBS
Read timeout on endpoint URL: "None"


679it [11:24, 20.37s/it]

Exception, get_mults_pricing: SBNY
Read timeout on endpoint URL: "None"


1230it [16:23,  1.68it/s]


CPU times: user 2min 48s, sys: 4.84 s, total: 2min 53s
Wall time: 18min 19s


In [5]:
# Persist the freshly downloaded prices to disk; only needed after a pricing refresh.
# The 'tmp' directory is created if it does not exist yet.
os.makedirs('tmp', exist_ok=True)
px_close.to_parquet('tmp/mult-co-px-ds')

In [10]:
# Reload the persisted close-price frame and print a summary of its index,
# columns, dtypes and memory use.
px_close = pd.read_parquet('tmp/mult-co-px-ds')
px_close.info()

<class 'pandas.core.frame.DataFrame'>
DatetimeIndex: 3776 entries, 2004-03-29 to 2019-03-27
Columns: 1230 entries, FR to TM
dtypes: float64(1230)
memory usage: 35.5 MB


In [10]:
# Quote and profile info come from the most recent saved snapshot
dates = read_dates('quote')
tgt_date = [dates[-1]]  # last date saved in S3

# Both frames are indexed by symbol while keeping 'symbol' as a regular column
quotes = load_csvs('quote_consol', tgt_date).set_index('symbol', drop=False)
profile = load_csvs('summary_detail', ['assetProfile']).set_index('symbol', drop=False)

# Remove profiles of explicitly excluded symbols
profile = profile.drop(profile.index[profile['symbol'].isin(excl_list)])

# Report which requested symbols are lost at each filtering step
symbols_set = set(symbols_list)

all_equities = quotes.loc[quotes['quoteType'] == 'EQUITY', 'symbol'].unique()
print('Delta quote: ', symbols_set - set(all_equities))

# reduced subset, if any: equities that also have a price column
sub_equities = set(px_close.columns) & set(all_equities)
print('Delta reduced set: ', symbols_set - sub_equities)

eqty_symbols = profile.loc[profile['symbol'].isin(sub_equities), 'symbol'].unique().tolist()
delta_symb = symbols_set - set(eqty_symbols)
print('Delta profile: ', len(delta_symb), delta_symb)

# Sector and industry labels used to build the market / sector / industry
# index frame (once) for relative performance calculations
sel_profiles = profile[profile['symbol'].isin(all_equities)]
# Not the last expression of the cell, so this count table is computed but not displayed
sel_profiles.groupby(['sector', 'industry'])[['industry']].count()
sectors = sel_profiles['sector'].unique()
industries = sel_profiles['industry'].unique()

print(f'Sectors: {sectors.shape[0]}, Industries: {industries.shape[0]}')

%%time
indices_df = pd.concat([
    eq_wgt_indices(profile, px_close, 'sector', sectors, subset=eqty_symbols),
    eq_wgt_indices(profile, px_close, 'industry', industries, subset=eqty_symbols),
    to_index_form(get_symbol_pricing(bench)['close'], bench)
], axis=1).drop_duplicates()

Loading file quote/csv/2019-03-26
Loading file summary-categories/assetProfile


'Sectors: 11, Industries: 136'

CPU times: user 11.1 s, sys: 2.15 s, total: 13.3 s
Wall time: 4.24 s


### Dataset creation

In [127]:
# Single-ticker walkthrough of the feature pipeline.
# `ticker` is set explicitly here: previously this cell relied on a value left over
# from the batch loop further down, so it failed with NameError on a fresh kernel
# and could label AAPL prices with a different symbol on a stale one.
ticker = 'AAPL'
df = get_symbol_pricing(ticker)
ft_df = px_mom_feats(
    df['close'], ticker, stds, inv, incl_px,
    sec_windows, incl_name)

In [128]:
# Target column: row-wise mean of whatever px_fwd_rets returns for pred_fwd_windows
fwd_rets = px_fwd_rets(df['close'], ticker, pred_fwd_windows)
ft_df[y_col] = fwd_rets.mean(axis=1)

# Features computed against the benchmark plus the ticker's own sector and industry
rel_keys = [bench, *profile.loc[ticker, ['sector', 'industry']]]
co = px_mom_co_feats(df, indices_df, rel_keys)

In [129]:
# Shape of `co` after dropping every row that contains a missing value
co.dropna().shape

(3656, 16)

In [130]:
# Shape of `ft_df` after dropping every row that contains a missing value
ft_df.dropna().shape

(3463, 11)

In [131]:
# Shapes of both frames before any rows are dropped, for comparison with the dropna() shapes
co.shape, ft_df.shape

((3916, 16), (3774, 11))

In [None]:
# Preview one ticker's close series with missing prices and repeated dates removed.
# Series.drop_duplicates() compares *values*, so it dropped every trading day whose
# close matched an earlier close (and all but the first NaN); de-duplicate on the
# index instead and drop missing prices explicitly.
close_preview = px_close[ticker].dropna()
close_preview[~close_preview.index.duplicated(keep='first')]

In [None]:
# Build one feature frame per ticker and stack them into a single dataset,
# which is then persisted to 'tmp/company-px_mom-large'.
# Takes a while: ~40 min on 1200 companies. TODO: can this be made faster?
super_list = []
for ticker in tqdm(eqty_symbols):
    try:
        # De-duplicate on the date index, not on price values.
        # Series.drop_duplicates() removes every day whose close repeats an earlier
        # close, which silently punches holes in the series fed to the rolling
        # features and forward returns. It also removed all but the first NaN as a
        # side effect, so missing prices are now dropped explicitly.
        close = px_close[ticker].dropna()
        close = close[~close.index.duplicated(keep='first')]

        ft_df = px_mom_feats(
            close, ticker, stds, inv, incl_px,
            sec_windows, incl_name)
        # Target: row-wise mean of the px_fwd_rets output for pred_fwd_windows
        ft_df[y_col] = px_fwd_rets(
            close, ticker, pred_fwd_windows).mean(axis=1)
        df = get_symbol_pricing(ticker) #full retrieve
        co = px_mom_co_feats(
            df, indices_df,
            [bench] + list(profile.loc[ticker, ['sector', 'industry']]))

        # Constant per-ticker columns taken from the profile and quote frames
        ft_df.loc[:, 'country'] = profile.loc[ticker,:].country
        ft_df.loc[:, 'currency'] = quotes.loc[ticker,:].currency

        # Column-wise concat aligns on the index; dates present in only one of
        # the two frames end up with NaN in the other frame's columns.
        ft_df = pd.concat([ft_df.dropna(), co.dropna()], axis=1)
        super_list.append(ft_df)
    except Exception as e:
        # Best effort: report the failing ticker and continue with the rest
        print("Exception: {0}\n{1}".format(ticker, e))
df_large = pd.concat(super_list, axis=0)
os.makedirs('tmp', exist_ok=True)
df_large.to_parquet('tmp/company-px_mom-large')
df_large.shape

In [134]:
# (rows, columns) of the stacked dataset covering all processed tickers
df_large.shape

(3788726, 29)

In [None]:
# df_large.drop_duplicates().describe().T
# df_large.sort_index().groupby(by=df_large.index).count().mean().sort_values()
# df_large.dropna().describe().T