In [None]:
# imports
%reload_ext autoreload
%autoreload 2
%matplotlib inline 

from matplotlib import pyplot as plt

from utils.basic_utils import *
from utils.pricing import *
from utils import ml_utils as mu

pd.options.display.float_format = '{:,.2f}'.format

In [None]:
import time, os, sys
from tqdm import tqdm
from sklearn import preprocessing
from sklearn.preprocessing import StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.model_selection import cross_val_score, cross_validate, train_test_split
from sklearn.model_selection import GridSearchCV
from sklearn.utils.multiclass import unique_labels
from sklearn.metrics import confusion_matrix, classification_report
from sklearn.metrics import accuracy_score, log_loss, precision_recall_fscore_support
from sklearn.metrics import precision_score, roc_auc_score

from sklearn.ensemble import VotingClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.externals import joblib

In [None]:
# Notebook-wide feature / label configuration. These names are read as
# globals by create_ds and pre_process_ds further down.

# Benchmark symbol: priced via get_symbol_pricing(bench) for the index frame
# and used as the first entry of each ticker's comparison groups.
bench = '^GSPC'
# Look-back windows and std setting forwarded to px_mom_feats.
# presumably expressed in trading days -- TODO confirm against utils.pricing
sec_windows, stds = [5, 20, 60], 1
# Forward windows forwarded to px_fwd_rets; their row-wise mean becomes y_col.
pred_fwd_windows = [60]
# Boolean switches forwarded positionally to px_mom_feats.
inv = incl_px = incl_name = False
# Name of the target column holding the forward return / class label.
y_col = 'fwdReturn'
# Bin edges (6 edges -> 5 bins) used to discretize forward returns.
cuts = { '1d': [-1, -0.1, -.02, .02, .1, 1.] }
cut_range = cuts['1d']
# Class labels, one per bin, ordered from most negative to most positive bin.
fwd_ret_labels = ["bear", "short", "neutral", "long", "bull"]

### Get pricing / context data

In [None]:
# TAKES ~8m on local drive, 3m on AWS for 1230 companies, do once and persist
excl_list = [] # ['BHF', 'ERI']
symbols_list = excl(config['companies'], excl_list)
px_close = get_mults_pricing(symbols_list).drop_duplicates().dropna(subset=['AAPL'])
# save down to drive if refresh pricing
os.makedirs('tmp', exist_ok=True)
px_close.to_parquet('tmp/mult-co-px-ds')

In [None]:
px_close = pd.read_parquet('tmp/mult-co-px-ds')
px_close.info()

In [None]:
# px_close.tail().isna().any(0).sort_values()

In [None]:
# latest quotes, profile, and industries
dates = read_dates('quote')
tgt_date = [dates[-1]] # last date saved in S3

In [None]:
# Load the latest consolidated quotes and the asset profiles, both indexed by
# symbol (the 'symbol' column is kept as well, since it is filtered on below).
quotes = load_csvs('quote_consol', tgt_date)
quotes.set_index('symbol', drop=False, inplace=True)

profile = load_csvs('summary_detail', ['assetProfile'])
profile.set_index('symbol', drop=False, inplace=True)

# Remove excluded symbols from the profile frame (in place).
profile.drop(profile[profile.symbol.isin(excl_list)].index, inplace=True)

# Symbols that are quoted as equities; report configured symbols missing from it.
all_equities = quotes[quotes.quoteType == 'EQUITY'].symbol.unique()
print('Delta quote: ', set(symbols_list) - set(all_equities))
# reduced subset, if any
# (equities for which a pricing column actually exists in px_close)
sub_equities = set(px_close.columns.tolist()).intersection(all_equities)
print('Delta reduced set: ', set(symbols_list) - set(sub_equities))

# Final universe: equities with pricing AND a profile row.
eqty_symbols = profile[profile.symbol.isin(sub_equities)].symbol.unique().tolist()
delta_symb = set(symbols_list) - set(eqty_symbols)
print('Delta profile: ', len(delta_symb), delta_symb)

# Create a frame of market, sector and industry index (once)
# for relative performance calculations
sel_profiles = profile[profile.symbol.isin(all_equities)]
# NOTE(review): the result of this groupby is neither assigned nor the last
# expression of the cell, so it is computed and discarded.
sel_profiles.groupby(['sector', 'industry'])[['industry']].count()
sectors = sel_profiles.sector.unique()
industries = sel_profiles.industry.unique()

print(f'Sectors: {sectors.shape[0]}, Industries: {industries.shape[0]}')

# Column-wise concat of sector indices, industry indices and the benchmark
# in index form; duplicate rows are dropped afterwards.
indices_df = pd.concat([
    eq_wgt_indices(profile, px_close, 'sector', sectors, subset=eqty_symbols),
    eq_wgt_indices(profile, px_close, 'industry', industries, subset=eqty_symbols),
    to_index_form(get_symbol_pricing(bench)['close'], bench)
], axis=1).drop_duplicates()

### Final

In [None]:
def create_ds(context):
    """Build the per-ticker price-momentum dataset, or load it from the cache.

    Keys read from ``context``:
        train_model:  truthy -> keep up to 10**4 trailing rows per ticker and
                      persist the result; falsy -> keep the last 252*2 rows
                      and do not write anything to disk.
        ds_path_name: ``(directory, file_name)`` of the parquet cache.
        tickers:      iterable of symbols to build features for.
        load_ds:      truthy -> return the cached parquet file if it exists.

    Also reads the notebook globals ``px_close``, ``profile``, ``quotes``,
    ``indices_df``, ``bench`` and the feature settings (``stds``, ``inv``,
    ``incl_px``, ``sec_windows``, ``incl_name``, ``pred_fwd_windows``,
    ``y_col``).

    Returns:
        DataFrame with the features of all tickers stacked row-wise.

    Raises:
        ValueError: if no ticker could be processed (nothing to concatenate).
    """
    print('create_ds')
    train_model = context['train_model']
    (path, ds_name) = context['ds_path_name']
    tickers = context['tickers']
    load_ds = context['load_ds']
    tail = 10**4 if train_model else 252*2
    ds_file = path + '/' + ds_name

    # Logical `and` (not bitwise `&`): a truthy non-bool flag must still work.
    if load_ds and os.path.isfile(ds_file):
        return pd.read_parquet(ds_file)

    super_list = []
    # Iterate the tickers directly so tqdm can show a total when they are sized.
    for ticker in tqdm(tickers):
        try:
            close = px_close[ticker].dropna().tail(tail)
            ft_df = px_mom_feats(close, ticker, stds, inv, incl_px, sec_windows, incl_name)
            ft_df[y_col] = px_fwd_rets(close, ticker, pred_fwd_windows).mean(axis=1)

            df = get_symbol_pricing(ticker).tail(tail) #full retrieve
            top_groups = tuple([bench] + list(profile.loc[ticker, ['sector', 'industry']]))
            co = px_mom_co_feats(df, indices_df, top_groups)

            ft_df.loc[:, 'country'] = profile.loc[ticker,:].country
            ft_df.loc[:, 'currency'] = quotes.loc[ticker,:].currency
            # Align the relative-performance features on the feature index.
            ft_df = pd.concat([ft_df, co.loc[ft_df.index, :]], axis=1)
            super_list.append(ft_df)
        except Exception as e:
            # Best effort: a failing ticker is reported and skipped.
            print("Exception: {0}\n{1}".format(ticker, e))

    if not super_list:
        raise ValueError(
            'create_ds: no features were built; check context["tickers"] '
            'and the exceptions printed above')
    df_large = pd.concat(super_list, axis=0)

    if train_model:
        os.makedirs(path, exist_ok=True)
        df_large.to_parquet(ds_file)
    print('df_large.shape {}'.format(df_large.shape))

    return df_large

def pre_process_ds(raw_df, context):
    """Turn the raw feature frame into model-ready train/test or predict sets.

    Keys read from ``context``:
        train_model:  truthy -> build a train/test split; falsy -> build the
                      prediction frame for the last index value only.
        fill:         fill method name passed to ``fillna`` (falsy to skip).
        impute:       truthy -> median-impute features, else drop NaN rows.
        scale:        truthy -> standardize the feature columns.
        categoricals: columns to one-hot encode through ``dummy_col``.
        exclude:      columns to drop; the LAST entry (the symbol column) is
                      kept in prediction mode and dropped in training mode.
        trained_cols: ``(directory, file_name)`` where the training feature
                      order is saved.
        test_size:    test fraction for ``train_test_split``.

    Note: ``raw_df`` is modified in place by several steps below.

    Returns:
        ``(pred_X, X_train, X_test, y_train, y_test)``; the members that do
        not apply to the selected mode are ``None``.
    """
    print('pre_process_ds')
    train_model = context['train_model']
    fill_on, imputer_on, scaler_on = context['fill'], context['impute'], context['scale']
    categoricals, exclude = context['categoricals'], context['exclude']
    (path, train_cols) = context['trained_cols']
    test_sz = context['test_size']

    # convert categorical columns
    for col in categoricals: raw_df = dummy_col(raw_df, col, shorten=True)
    raw_df.drop(columns=exclude[:-1], inplace=True) # remove all except symbol

    scaler = StandardScaler()
    imputer = SimpleImputer(
        missing_values=np.nan,
        strategy='median', copy=False)
    # Feature columns: everything except the symbol column and the target.
    X_cols = excl(raw_df.columns, [exclude[-1], y_col])

    raw_df.replace([np.inf, -np.inf], np.nan, inplace=True)
    # NOTE(review): the scaler is re-fit on whatever frame is passed in, before
    # the train/test split and again at prediction time, so prediction features
    # are not standardized with the training statistics. Consider persisting it.
    if scaler_on: raw_df.loc[:, X_cols] = scaler.fit_transform(raw_df[X_cols])

    pred_X = X_train = X_test = y_train = y_test = None
    if train_model:
        raw_df.drop(columns=exclude[-1], inplace=True) # remove symbol
        # Assign the filled values back: calling fillna(inplace=True) on the
        # result of .loc[:, X_cols] only fills a temporary copy.
        if fill_on: raw_df.loc[:, X_cols] = raw_df[X_cols].fillna(method=fill_on)

        # discretize forward returns into classes
        raw_df.dropna(subset=[y_col], inplace=True)
        raw_df.loc[:, y_col] = discret_rets(raw_df[y_col], cut_range, fwd_ret_labels)
        raw_df.dropna(subset=[y_col], inplace=True) # no nas in y_col
        print(sample_wgts(raw_df[y_col]))
        raw_df.loc[:, y_col] = raw_df[y_col].astype(str) # class as string

        if imputer_on: raw_df.loc[:, X_cols] = imputer.fit_transform(raw_df[X_cols])
        else: raw_df = raw_df.dropna()

        X, y = raw_df.drop(columns=y_col), raw_df[y_col]
        X_train, X_test, y_train, y_test = train_test_split(
            X, y, test_size=test_sz, random_state=42)
        np.save(path + train_cols, X_train.columns) # save feature order
    else:
        # feature for last date, pending to implement more flexibility
        pred_X = raw_df.loc[raw_df.index[-1], :].drop(columns=y_col).dropna(axis=0)

    for part in (pred_X, X_train, X_test, y_train, y_test):
        if part is not None:
            print(part.shape)
    return pred_X, X_train, X_test, y_train, y_test

def train_ds(context):
    """Train the RandomForest / ExtraTrees voting ensembles and persist them.

    Forces ``context['load_ds']`` and ``context['train_model']`` to True (the
    caller's dict is modified), builds or loads the dataset, fits both base
    classifiers and then one hard-voting and one soft-voting ensemble, each
    dumped with joblib to ``ml_path[0] + ml_path[1].format(vote)``.

    Keys read from ``context``: ``grid_search``, ``verbose``, ``ml_path``
    plus everything ``create_ds`` and ``pre_process_ds`` need.

    Returns:
        None. Scores and saved file names are printed.
    """
    context['load_ds'] = True
    context['train_model'] = True
    grid_search = context['grid_search']
    verbose = context['verbose']
    (path, model_name) = context['ml_path']

    ds_df = create_ds(context)
    # Summarize the dataset that was just built (not an unrelated global `df`).
    ds_df.info(verbose=False)
    _, X_train, X_test, y_train, y_test = pre_process_ds(ds_df, context)

    features = X_train.shape[1]
    best_params = { # best from GridSearch
        'n_estimators': 25,
        'max_features': features,
        'max_depth': 30,
        'min_samples_split': 2,
        'min_samples_leaf': 2,
        'random_state': 0,
        'n_jobs': -1}
    if grid_search:
        print('GridSearchCV for RandomForestClassifier')
        param_grid = {
            'n_estimators': [50],
            'max_features': ['sqrt', 'log2', features // 2, features // 3,],
            'max_depth': [30],
            'min_samples_split': [2, 5, 10],
            'min_samples_leaf': [2, 5, 10],
            'random_state': np.arange(0, 3, 1),}
        # NOTE(review): `iid` only exists in older scikit-learn releases, the
        # same ones that still ship sklearn.externals.joblib -- confirm the
        # pinned version before upgrading.
        clf = GridSearchCV(RandomForestClassifier(random_state=42),
                           param_grid, n_jobs=-1,
                           cv=5, iid=True, verbose=verbose)
        clf.fit(X_train, y_train)
        if verbose:
            mu.print_cv_results(
                clf, (X_train, X_test, y_train, y_test),
                feat_imp=True, top=20)
        best_params = clf.best_params_
    clf1 = RandomForestClassifier(**best_params)
    clf1.fit(X_train, y_train)
    print('RandomForestClassifier scores: Train {}, Test {}'.format(
    clf1.score(X_train, y_train), clf1.score(X_test, y_test)))

    # ExtraTreesClassifier
    clf2 = ExtraTreesClassifier(
        n_estimators=50,
        max_depth=30,
        min_samples_split=2,
        min_samples_leaf=1,
        random_state=5,
        n_jobs=-1)
    clf2.fit(X_train, y_train)
    print('ExtraTreesClassifier scores: Train {}, Test {}'.format(
    clf2.score(X_train, y_train), clf2.score(X_test, y_test)))

    # The output directory does not depend on the vote type: create it once.
    os.makedirs(path, exist_ok=True)
    for vote in ['hard', 'soft']:
        eclf = VotingClassifier(
            estimators=[('rf', clf1), ('et', clf2)],
            voting=vote)
        clf = eclf.fit(X_train, y_train)
        print('VotingClassifier scores Train {}, Test {}'.format(
                clf.score(X_train, y_train), clf.score(X_test, y_test)))
        fname = path + model_name.format(vote)
        joblib.dump(clf, fname)
        print('Saved ', fname)
        
def predict_ds(context):
    """Predict return classes for ``context['tickers']`` with the saved models.

    Forces ``context['load_ds']`` and ``context['train_model']`` to False (the
    caller's dict is modified), rebuilds the features, aligns them to the
    column order saved at training time, runs the hard- and soft-voting models
    and stores the predictions as CSV under ``context['s3_path']``.

    Keys read from ``context``: ``ml_path`` ``(dir, file template)``,
    ``trained_cols`` ``(dir, file)``, ``s3_path`` plus everything
    ``create_ds`` and ``pre_process_ds`` need. Also reads the globals
    ``fwd_ret_labels``, ``dates`` and ``csv_ext``.

    Returns:
        DataFrame with one row per predicted symbol: predicted class/label
        per vote type, the soft-vote confidence and per-class probabilities.
    """
    context['load_ds'] = False
    context['train_model'] = False
    # Keep the two directories apart: they are separate settings and reusing a
    # single `path` name silently loaded the models from the columns directory.
    (ml_dir, model_name) = context['ml_path']
    (cols_dir, train_cols) = context['trained_cols']

    df_large = create_ds(context)
    pred_X, _, _, _, _ = pre_process_ds(df_large, context)
    print('predict_ds')
    print('pred_X.shape', pred_X.shape)

    # ensure prediction dataset is consistent with trained model
    # The column names were saved as an object array, which newer NumPy
    # versions only load when pickling is explicitly allowed. The file is
    # written by pre_process_ds itself, not taken from an external source.
    trained_cols = np.load(cols_dir + train_cols, allow_pickle=True)
    missing_cols = [x for x in trained_cols if x not in pred_X.columns]
    # Columns seen in training but absent now (e.g. a sector dummy) become 0.
    pred_X = pd.concat([pred_X, pd.DataFrame(columns=missing_cols)], axis=1)
    pred_X[missing_cols] = 0
    # Training feature order first, symbol last (sliced off before predicting).
    pred_X = pred_X[list(trained_cols) + ['symbol']]
    features = pred_X.iloc[:, :-1]

    pred_df = pd.DataFrame()
    pred_df['symbol'] = pred_X.symbol
    for vote in ['hard', 'soft']:
        fname = ml_dir + model_name.format(vote)
        clf = joblib.load(fname) # load latest models
        print('Loaded', fname)
        preds = clf.predict(features)
        # preds = np.where(preds == 'nan', 'neutral', preds) #replace nan
        pred_class = np.array([fwd_ret_labels.index(x) for x in preds])
        pred_df[f'{vote}_pred_class'] = pred_class
        pred_df[f'{vote}_pred_label'] = preds
        if vote == 'soft':
            # Only soft voting exposes class probabilities.
            probs = clf.predict_proba(features)
            pred_df[f'{vote}_confidence'] = probs.max(axis=1) # highest prob
            prob_df = pd.DataFrame(probs, index=pred_df.index, columns=clf.classes_)
            pred_df = pd.concat([pred_df, prob_df[fwd_ret_labels]], axis=1)

    # store in S3
    s3_path = context['s3_path']
    s3_df = pred_df.reset_index(drop=False)
    rename_col(s3_df, 'index', 'pred_date')
    csv_store(s3_df, s3_path, csv_ext.format(dates[-1]))

    return pred_df

In [None]:
# Shared settings dict passed to create_ds / pre_process_ds / train_ds /
# predict_ds. train_ds and predict_ds overwrite 'load_ds' and add
# 'train_model' on the dict they receive.
# pending cleanup: use ml_path and tmp_path separate
context = {
    # symbols to build features for (set before calling predict_ds)
    'tickers': [],
    # (directory, file-name template); the template is formatted with the vote type
    'ml_path': ('../ML/', 'co_pxmom_ML_{}.pkl'),
    # (directory, file name) of the parquet dataset cache
    'ds_path_name': ('tmp', 'co-pxmom-large'),
    # (directory, file name) where the training feature order is saved
    'trained_cols': ('../ML/', 'co_pxmom_train_cols.npy'),
    # reuse the cached dataset when the file exists
    'load_ds': True,
    # fraction handed to trim_df in the step-by-step section
    'portion': 100e-2,
    # columns one-hot encoded through dummy_col
    'categoricals': ['sector'],
    # columns dropped before training; the last one must be the symbol column
    'exclude': ['industry', 'country', 'currency', 'symbol'],
    # fill method name passed to fillna
    'fill': 'bfill',
    # median-impute features instead of dropping rows with NaNs
    'impute': False,
    # standardize features with StandardScaler
    'scale': True,
    # test fraction for train_test_split
    'test_size': .20,
    # run GridSearchCV before fitting the random forest
    'grid_search': False,
    # verbosity forwarded to GridSearchCV; also gates the CV report
    'verbose': 2,
    # destination prefix for the stored predictions
    's3_path': 'recommend/co-pxmom/'
}

#### Train

In [None]:
!rm ./tmp/{context['ds_path_name'][1]}

In [None]:
%time train_ds(context)

#### Predict

In [None]:
(path, _) = context['ml_path']
!ls -lh ./{path}/

In [None]:
# predict for all
context['tickers'] = eqty_symbols[:50]
%time pred_df = predict_ds(context)

#### Store / Read S3

In [None]:
s3_path = context['s3_path']

In [None]:
storeDate = dates[-1]
# storeDate = '2019-03-27'

In [None]:
# read from S3
pred_df = pd.read_csv(
    csv_load(f'{s3_path}{storeDate}'), 
    index_col='pred_date', parse_dates=True)

In [None]:
# store in S3
s3_df = pred_df.reset_index(drop=False)
rename_col(s3_df, 'index', 'pred_date')
csv_store(s3_df, s3_path, csv_ext.format(storeDate))

#### Visualize

In [None]:
# recommendation distribution
pd.value_counts(pred_df.loc[pred_df.hard_pred_label == pred_df.soft_pred_label].soft_pred_label)

In [None]:
# top 3 picks by label
mask = pred_df.hard_pred_label == pred_df.soft_pred_label
agree_df = pred_df.loc[mask].drop_duplicates()
# should add sector and industries, group for allocation insights
# should add marketcap, beta, etc, group for risk exposure insights
label_mask = agree_df.soft_pred_label.isin(['bear', 'short', 'long', 'bull'])
agree_df.loc[label_mask]\
    .sort_values(by='soft_confidence', ascending=False)\
    .groupby(by='soft_pred_label').head(5)\
    .sort_values(by='soft_pred_label')

In [None]:
pred_df.loc[pred_df.symbol == 'BIIB',:]

### Step by Step

#### Dataset creation

In [None]:
# Step-by-step version of the dataset build (no row limit per ticker); the
# result is written to 'tmp/company-px_mom-large'.
# NOTE(review): in the visible notebook `tickers` is first assigned in the
# "Old code" section further down, so this cell depends on out-of-order
# execution -- define `tickers` before running it.
# Take a while ~40min to run on 1200 companies
# Can we make faster?
super_list = []
for i, ticker in enumerate(tickers):
    try:
        close = px_close[ticker].drop_duplicates()
        ft_df = px_mom_feats(
            close, ticker, stds, inv, incl_px, 
            sec_windows, incl_name)
        # target: row-wise mean of the forward returns over pred_fwd_windows
        ft_df[y_col] = px_fwd_rets(
            close, ticker, pred_fwd_windows).mean(axis=1)
        df = get_symbol_pricing(ticker) #full retrieve
        co = px_mom_co_feats(
            df, indices_df, 
            [bench] + list(profile.loc[ticker, ['sector', 'industry']]))

        ft_df.loc[:, 'country'] = profile.loc[ticker,:].country
        ft_df.loc[:, 'currency'] = quotes.loc[ticker,:].currency

        # Unlike create_ds, both frames are NaN-filtered before the column-wise
        # concat, so rows present in only one of them come back with NaNs.
        ft_df = pd.concat([ft_df.dropna(), co.dropna()], axis=1)
        super_list.append(ft_df)
        print(i, ticker)
    except Exception as e:
        # a failing ticker is reported and skipped
        print("Exception: {0}\n{1}".format(ticker, e))
df_large = pd.concat(super_list, axis=0)
os.makedirs('tmp', exist_ok=True)
df_large.to_parquet('tmp/company-px_mom-large')
df_large.shape

#### Pre-processing

In [None]:
df_large = pd.read_parquet('tmp/company-px_mom-large')
print(df_large.info(verbose=False))

In [None]:
df_large = trim_df(df_large, context['portion'])
df_large.shape

In [None]:
categoricals = context['categoricals']
exclude = context['exclude']
for col in categoricals: df_large = dummy_col(df_large, col, shorten=True)
df_large.drop(columns=exclude, inplace=True)

In [None]:
%time pred_X, X_train, X_test, y_train, y_test = pre_process_ds(df_large, context)

In [None]:
[x.shape for x in (X_train, X_test, y_train, y_test)]

In [None]:
%%time
# Trims dataset in case it's too large for experimentation
# Reduce dataset for experimentation
# Note that experiment dataset is not stratified
exp_perc = 20e-2
_, df_raw = train_test_split(df_large, test_size=exp_perc, shuffle=True, )

df_raw.dropna(subset=[y_col],  inplace=True)
df_raw[y_col] = discret_rets(df_raw[y_col], cut_range, fwd_ret_labels)

# df_raw.loc[:, y_col] = df_raw[y_col].astype(str)

y_col_dist = sample_wgts(df_raw[y_col], fwd_ret_labels)
(y_col_dist[fwd_ret_labels]).round(3)

In [None]:
categoricals = ['sector', 'industry', 'country', 'currency']
%time for col in categoricals: df_raw = dummy_col(df_raw, col, shorten=True)

In [None]:
excl_list = ['symbol', ] # drop unneeded columns
%time df_raw.drop(columns=excl_list, inplace=True, errors='ignore')
df_raw.shape

In [None]:
# Mean based imputer
imputer_on, scaler_on = True, False
imputer = SimpleImputer(missing_values=np.nan, strategy='mean', copy=False)
scaler = StandardScaler()

pre_ml_df = df_raw.copy()
pre_ml_df.dropna(subset=[y_col], inplace=True)
pre_ml_df.loc[:, y_col] = pre_ml_df[y_col].astype(str)
X_cols = excl(pre_ml_df.columns, [y_col])

if imputer_on: pre_ml_df.loc[:, X_cols] = imputer.fit_transform(pre_ml_df[X_cols])
else: pre_ml_df.dropna(inplace=True)
if scaler_on: pre_ml_df.loc[:, X_cols] = scaler.fit_transform(pre_ml_df[X_cols])

X, y = pre_ml_df.drop(columns=y_col), pre_ml_df[y_col]

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=.20, random_state=42)
X_train.shape, X_test.shape, y_train.shape, y_test.shape

#### Train models

In [None]:
X_train.shape

In [None]:
from sklearn.calibration import CalibratedClassifierCV
from sklearn.metrics import brier_score_loss

grid_search = context['grid_search']
verbose = context['verbose']
best_params = {
    'n_estimators': 100, 
    'max_features': X_train.shape[1] // 2, 
    'random_state': 0,
    'max_depth': None, 
    'min_samples_split': 2, 
    'n_jobs': -1}
if grid_search:
    print('GridSearchCV for RandomForestClassifier')
    param_grid = {
        'n_estimators': [100], 
        'max_features': ['sqrt', X_train.shape[1] // 2, X_train.shape[1] // 3,],
        'random_state': np.arange(0, 5, 1),}
    clf = GridSearchCV(RandomForestClassifier(random_state=42),
                       param_grid, n_jobs=-1,
                       cv=5, iid=True, verbose=verbose)
    clf.fit(X_train, y_train)
    if verbose: print_cv_results(clf, X_train, X_test, y_train, y_test, feat_imp=True, top=20)
    best_params = clf.best_params_
clf1 = RandomForestClassifier(**best_params)
%time clf1.fit(X_train, y_train)
print('RandomForestClassifier scores: Train {}, Test {}'.format(
clf1.score(X_train, y_train), clf1.score(X_test, y_test)))

In [None]:
# ExtraTreesClassifier
clf2 = ExtraTreesClassifier(
    n_estimators=100, max_depth=None, 
    min_samples_split=2, random_state=0, n_jobs=-1)

%time clf2.fit(X_train, y_train)
print('ExtraTreesClassifier scores: Train {}, Test {}'.format(
clf2.score(X_train, y_train), clf2.score(X_test, y_test)))

In [None]:
%%time
# MLPClassifier
params = {
    'activation': 'relu', 
    'alpha': 0.001, 
    'hidden_layer_sizes': (50,), 
    'learning_rate': 'adaptive', 
    'max_iter': 200, 
    'random_state': 3, 
    'solver': 'adam'}

clf3 = MLPClassifier(**params)

clf3.fit(X_train, y_train)
print('MLPClassifier scores: Train {}, Test {}'.format(
clf3.score(X_train, y_train), clf3.score(X_test, y_test)))

In [None]:
%%time
ml_path = context['ml_path']
for vote in ['hard', 'soft']:
    eclf = VotingClassifier(
        estimators=[('rf', clf1), ('et', clf2)],
        voting=vote)
    clf = eclf.fit(X_train, y_train)
    print('VotingClassifier scores Train {}, Test {}'.format(
            clf.score(X_train, y_train), clf.score(X_test, y_test)))
    os.makedirs(ml_path, exist_ok=True)
    fname = ml_path + f'co_pxmom_ML_{vote}.pkl'
    joblib.dump(clf, fname)
    print('Saved ', fname)

In [None]:
def predict_ds(context):
    """Benchmark-level prediction variant; loads 'macro_ML_{vote}.pkl' models.

    NOTE(review): this definition looks stale and should be confirmed or
    removed before use:
      * it re-binds the name ``predict_ds``, shadowing the definition in the
        "Final" section once this cell is run;
      * ``include`` is not defined anywhere in the visible notebook and
        ``freq`` is only assigned in later cells;
      * ``create_ds`` is called with two arguments and unpacked into two
        values, while the visible ``create_ds(context)`` takes one argument
        and returns a single frame;
      * ``context['ml_path']`` is a tuple in the visible ``context``, so
        ``ml_path + f'...'`` would raise a TypeError.

    Returns:
        ``bench_df``: the benchmark column of ``px_close`` for the predicted
        rows plus predicted class/label per vote type and, for soft voting,
        the confidence and per-class probabilities.
    """
    ml_path = context['ml_path']
    verbose = context['verbose']
    
    px_close = get_mults_pricing(include, freq, verbose=verbose);
    px_close.drop_duplicates(inplace=True)
    
    ds_idx, df_large = create_ds(px_close, context)
    pred_X, _, _, _, _ = pre_process_ds(df_large, context)    

    print('pred_X.shape', pred_X.shape)

    # benchmark prices for the rows being predicted
    bench_df = px_close.loc[pred_X.index, bench].to_frame()
    for vote in ['hard', 'soft']:
        fname = ml_path + f'macro_ML_{vote}.pkl'
        clf = joblib.load(fname) # load latest models
        print('Loaded', fname)
        preds = clf.predict(pred_X)
        # position of each predicted label within fwd_ret_labels
        pred_class = np.array([fwd_ret_labels.index(x) for x in preds])        
        bench_df[f'{vote}_pred_class'] = pred_class
        bench_df[f'{vote}_pred_label'] = preds
        if vote == 'soft':
            probs = clf.predict_proba(pred_X)
            # NOTE(review): pred_prob is assigned but never used
            pred_prob = np.argmax(probs, axis=1)
            bench_df[f'{vote}_confidence'] = [x[np.argmax(x)] for x in probs] # highest prob
            prob_df = pd.DataFrame(probs, index=bench_df.index, columns=clf.classes_)
            bench_df = pd.concat([bench_df, prob_df[fwd_ret_labels]], axis=1)
        # drop rows without a benchmark price
        bench_df.dropna(subset=[bench], inplace=True)

    return bench_df

#### Metrics

In [None]:
show_fi(clf1, X_train, 25)

In [None]:
# scikit-learn expects (y_true, y_pred): rows are the true classes and columns
# the predicted ones, both ordered as in fwd_ret_labels.
print('Confussion Matrix\n', confusion_matrix(y_test, clf.predict(X_test), labels=fwd_ret_labels))

In [None]:
# (y_true, y_pred) order keeps precision and recall the right way round.
# `labels` must accompany `target_names`: without it the names are attached to
# the alphabetically sorted classes rather than to their own class.
print('Classificaton report\n', classification_report(
    y_test, clf.predict(X_test), labels=fwd_ret_labels, target_names=fwd_ret_labels))

In [None]:
log_loss(y_test, clf.predict_proba(X_test))

#### Gridsearches

In [None]:
np.arange(10, X.shape[1], int(X.shape[1]*.25))

In [None]:
%%time
# GridSearchCV for RandomForestClassifier
parameters = {
    'n_estimators': [10, 20, 40], 
    'max_features': np.arange(10, X.shape[1], int(X.shape[1]*.25)), 
    'random_state': np.arange(1, 10, 3)}

# parameters = {
#     'n_estimators': [80], 
#     'max_features': [16], 
#     'random_state': [4]}

clf = GridSearchCV(RandomForestClassifier(), parameters, n_jobs=-1, cv=5)
clf.fit(X_train, y_train)

print(clf.score(X_train, y_train), clf.score(X_test, y_test))
print(clf.best_params_)

#### Feature construction

In [None]:
%%time
df = get_symbol_pricing(ticker)
ft_df = px_mom_feats(df, ticker, stds, inv, incl_px, sec_windows, incl_name)
ft_df[y_col] = px_fwd_rets(
    df.close, ticker, pred_fwd_windows).mean(axis=1, skipna=False)

In [None]:
display_all(ft_df.describe(include='all'))

In [None]:
Y = px_fwd_rets(df.close, ticker, pred_fwd_windows)
Y.mean(axis=1).plot.hist(bins=50)

In [None]:
px_close[ticker].shape, px.close.shape

In [None]:
px_close[ticker].tail()

In [None]:
df = px_close[[ticker]].copy()
# df.set_index(df.index.astype(np.datetime64), inplace=True)
df.loc[:, 'weekday'] = df.index.weekday
df.weekday.unique()
df.tail(60)
df.groupby('weekday').describe()

In [None]:
%%time
df = get_symbol_pricing(ticker)
co = px_mom_co_feats(
    df, indices_df, 
    [bench] + list(profile.loc[ticker, ['sector', 'industry']]))

In [None]:
co.info()

#### Date and minute based time analysis

In [None]:
freq = '1d'

In [None]:
# Side-by-side close prices of two symbols at the selected frequency.
# The pair is defined here so the cell does not rely on leftover kernel state.
s1, s2 = '1810.HK', 'AAPL'
df1 = get_symbol_pricing(s1, freq, ['close'])
df2 = get_symbol_pricing(s2, freq, ['close'])
# Align both series on the union of their dates.
group_pricing = pd.concat([pd.DataFrame(df1), df2], axis=1)
group_pricing.describe()

In [None]:
path = config['pricing_path'].format(freq)
data_dict = json_load(path + json_ext.format(ticker))

tz = data_dict['meta']['exchangeTimezoneName']
df = build_px_struct(data_dict, freq)

adjClose = data_dict['indicators']['adjclose'][0] if 'adjclose' in  data_dict['indicators'] else 0
close = data_dict['indicators']['quote'][0]
data_dict.keys(), data_dict['indicators'].keys()

In [None]:
df.index.date

In [None]:
# Convert the raw epoch-second timestamps of the pricing payload.
# Stored under its own name: `dates` already holds the list of stored quote
# dates that predict_ds and the S3 cells index with dates[-1].
px_dates = pd.to_datetime(
        data_dict['timestamp'], 
        unit='s', infer_datetime_format=True)
# px_dates = px_dates.astype(f'datetime64[ns, {tz}]')
# px_dates.tz_convert('America/New_York')
# px_dates = px_dates.tz_localize('America/New_York')
# Truncate to the day for daily data, to the minute otherwise.
px_dates.floor('d' if freq == '1d' else 'min')

In [None]:
df.tail(5)

In [None]:
freq = '1d'

In [None]:
%time px_close = get_mults_pricing(symbols_list[:10], freq);

In [None]:
# [px_close[x].dropna().tail() for x in px_close.columns]
px_close.describe()
# px_close.tail()

In [None]:
f'Ticker: {ticker}'

In [None]:
px = get_symbol_pricing(ticker, freq)
px.close.tail()

#### Old code

In [None]:
# test distribution of Y variable
tickers = list(mu.sample_sector_tickers(eqty_symbols, profile, sectors, 50).index)
context['grid_search'] = False
context['tickers'] = tickers
context['train_model'] = True

df_large = create_ds(context)

# Class weights computed by hand on a copy of the dataset ...
df = df_large.copy()
df.dropna(subset=[y_col], inplace=True)
df[y_col] = discret_rets(df[y_col], cut_range, fwd_ret_labels)
df.dropna(subset=[y_col], inplace=True) # no nas in y_col
df[y_col] = df[y_col].astype(str) # class as string
sample_wgts(df[y_col])

# ... compared with what the pipeline produces. The first training output is
# bound to X_train (it was misspelled), so X_train and y_train come from the
# same split.
pred_X, X_train, X_test, y_train, y_test = pre_process_ds(df_large, context)
pd.value_counts(discret_rets(df_large.fwdReturn, cut_range, fwd_ret_labels)).sum()
pd.value_counts(pd.concat([y_train, y_test], axis=0)).sum()

In [None]:
# Test cumulative drawdowns and pulls
n = 100
r_w = np.random.randn(n).cumsum() + 100
l_dd, h_dd, l_p, h_p = max_draw_pull(r_w)

plt.plot(r_w)
plt.plot(
    [l_dd, h_dd], 
    [r_w[l_dd], r_w[h_dd],], 
    'o', color='Red', markersize=10)
plt.plot(
    [l_p, h_p], 
    [r_w[l_p], r_w[h_p]], 
    'o', color='Green', markersize=10)

In [None]:
# Retrieves historical pricing
secpx = get_symbol_pricing(symbol, freq)
secpx.set_index(secpx.index.astype(np.datetime64), inplace=True)

In [None]:
def fwd_ss_ret(x, df, arr):
    """Mean of the entries of ``df`` selected by the ``x``-th group of ``arr``.

    ``x`` is 1-based: the labels are taken from ``arr[x-1]``. Labels that are
    not present in ``df.index`` are skipped; the mean of the remaining rows
    is returned.
    """
    known_labels = df.index.tolist()
    wanted = [label for label in arr[x - 1] if label in known_labels]
    return df.loc[wanted].mean()

In [None]:
# seasonality analysis
ss_df = closepx.pct_change().resample('M').sum().to_frame()
ss_df['year'], ss_df['month'] = ss_df.index.year, ss_df.index.month
ss_df = ss_df.pivot_table(index='year', columns='month').mean()
# For each calendar month m (1..12), stored at position m-1: the month itself
# and the next two, wrapping around the year end. The values are 1-based
# because they are looked up in an index of calendar month numbers; with
# 0-based values the window was previous/current/next month instead.
ss_pos = [tuple((x + k) % 12 + 1 for k in range(3)) for x in range(12)]

# [fwd_ss_ret(x+1, ss_df['close'], ss_pos) for x in range(12)] # test

In [None]:
# apply seasonality, mean return of curr month plus next two
secpx['month'] = secpx.index.month
secpx['fwdSSRet'] = secpx.loc[:].month.apply(
    fwd_ss_ret, args=(ss_df['close'], ss_pos,))

In [None]:
secpx.columns # all columns

In [None]:
# normalized columns for ML training, still has outliers
ml_ds_cols = secpx.describe().loc['50%'][secpx.describe().loc['50%'] < 5].index.tolist()
ml_ds_cols

In [None]:
# prepare ML dataset
# Copies the selected columns and replaces each forward-change column with a
# categorical class obtained by binning it with pd.cut.
ml_ds = secpx[ml_ds_cols].copy()

class_cols = ['fwdChg1w', 'fwdChg1m', 'fwdChg3m']
# NOTE(review): these two assignments re-bind the notebook-level names that
# pre_process_ds and predict_ds read as globals; the bin edges here differ
# from the ones set in the configuration cell, so running this cell changes
# how later training runs discretize returns.
cut_range = [-1, -0.05, .0, .02, .09, 1.]
fwd_ret_labels = ["bear", "short", "neutral", "long", "bull"]

for c in class_cols: ml_ds[c] = pd.cut(secpx[c], cut_range, labels=fwd_ret_labels)
ml_ds.info()

In [None]:
# drop the predicting class with most nas
ml_ds.dropna(inplace=True)
ml_ds.info()

In [None]:
ml_ds.hist(figsize=(15,15));

In [None]:
ml_ds.to_csv(csv_ext.format('co_price_mom_ds'), index=False)