In [1]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA, KernelPCA
from lightgbm import LGBMClassifier
from xgboost import XGBClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
import matplotlib.pyplot as plt
import seaborn as sns
from joblib import Parallel, delayed

In [2]:
df = pd.read_csv('historical_bar_daily.csv')
econstats = pd.read_csv('econstats.csv')

In [3]:
for s in df.symbol.unique():
    globals()[s] = df[df['symbol'] == s].copy()
    globals()[s].drop(['ab', 'bb', 'expiry', 'right', 'strike', 'instrument'], axis=1, inplace=True)
    for f in globals()[s].columns:
        if f not in ['symbol', 't']:
            globals()[s][f"{f}_lag1"] = globals()[s][f].shift(1)
            
    globals()[s]['date'] = globals()[s]['t'].apply(lambda x: str(x)[:10])
    globals()[s]['return'] = globals()[s]['c'].pct_change(1)
    globals()[s].drop(['a', 'b', 'c', 'h', 'l', 'm', 'o', 'symbol', 't', 'v'], axis=1, inplace=True)

In [4]:
for e in econstats.series_id.unique():
    globals()[e] = econstats[econstats['series_id'] == e].copy()
    globals()[e].rename(columns={'value':f'{e}_value'}, inplace=True)

In [5]:
econstats_series_id2 = []
for e in econstats.series_id.unique():
    if globals()[e].shape[0] == 47:
        if globals()[e][f'{e}_value'].isnull().sum() == 0:
            econstats_series_id2.append(e)

In [6]:
for e in econstats_series_id2:
    BTCUSD = pd.merge(
        BTCUSD, 
        globals()[e].drop(['series_id', 'date'], axis=1).rename(columns={'release_date':'date'}), 
        on='date',
        how='left'
    )
    
BTCUSD.fillna(method='ffill', inplace=True)
BTCUSD[BTCUSD.columns[10:]] = BTCUSD[BTCUSD.columns[10:]].shift(1)
BTCUSD.dropna(inplace=True)

In [7]:
BTCUSD.head()

Unnamed: 0,a_lag1,b_lag1,c_lag1,h_lag1,l_lag1,m_lag1,o_lag1,v_lag1,date,return,...,BALT524BP1FHSA_value,BALT524BPPRIV_value,BALT524BPPRIVSA_value,BALT524FIRE_value,BALT524INFON_value,BALT524LF_value,BALT524MFG_value,BALT524NAN_value,BALT524TRADN_value,BALT524UR_value
60,1193.7,1181.9,1187.8,1192.1,1181.5,1187.8,1181.9,2491.0,2017-03-01,0.0,...,423.719798,475.0,523.328065,80.270569,17.3,1494399.0,55.928772,1366.8,244.8,4.3
61,1193.7,1181.9,1187.8,1192.1,1181.5,1187.8,1181.9,2491.0,2017-03-03,0.035023,...,423.719798,475.0,523.328065,80.270569,17.3,1494399.0,55.928772,1366.8,244.8,4.3
62,1235.5,1223.3,1229.4,1229.5,1190.7,1229.4,1187.8,10079.0,2017-03-04,0.02489,...,423.719798,475.0,523.328065,80.270569,17.3,1494399.0,55.928772,1366.8,244.8,4.3
63,1266.3,1253.7,1260.0,1285.4,1215.7,1260.0,1229.4,13981.0,2017-03-05,0.021508,...,423.719798,475.0,523.328065,80.270569,17.3,1494399.0,55.928772,1366.8,244.8,4.3
64,1293.5,1280.7,1287.1,1279.3,1259.5,1287.1,1260.0,2539.0,2017-03-06,-0.01313,...,423.719798,475.0,523.328065,80.270569,17.3,1494399.0,55.928772,1366.8,244.8,4.3


In [8]:
def bt(instrument, clf, pca, min_obs):
    predictions = []
    for i in range(len(instrument)):
        if i + min_obs + 1 < len(instrument):
            X_train = instrument[i:i + min_obs].drop(['date', 'return'], axis=1)
            X_test = instrument[i + min_obs:i + min_obs + 1].drop(['date', 'return'], axis=1)

            y_train =  instrument[i:i + min_obs]['return']
            y_test =  instrument[i + min_obs:i + min_obs + 1]['return']

            y_train = np.where(y_train >= 0, 1, 0)
            y_test = np.where(y_test >= 0, 1, 0)
            
            pca.fit(X_train)
            X_train = pca.transform(X_train)
            X_test = pca.transform(X_test)

            clf.fit(X_train, y_train)
            y_pred = clf.predict(X_test)[0]
            predictions.append(y_pred)
            
    return predictions

In [13]:
symbols = []
classifiers = []
window_len = []
pcas = []
accuracies = []

estimators = [
    LGBMClassifier(n_jobs=-1), 
    XGBClassifier(use_label_encoder=False), 
    LogisticRegression(n_jobs=-1), 
    DecisionTreeClassifier(), 
    RandomForestClassifier(n_jobs=-1)
]

estimators_string = [
    "LGBMClassifier", 
    "XGBClassifier", 
    "LogisticRegression", 
    "DecisionTreeClassifier", 
    "RandomForestClassifier"
]

decomposer = [
    PCA(.95), PCA(.90), PCA(.85), PCA(.80), PCA(.75), PCA(.70), PCA(.65), PCA(.60), PCA(.55), PCA(.50), PCA(.45), PCA(.40),
    KernelPCA(.95), KernelPCA(.90), KernelPCA(.85), KernelPCA(.80), KernelPCA(.75), KernelPCA(.70), KernelPCA(.65), 
    KernelPCA(.60), KernelPCA(.55), KernelPCA(.50), KernelPCA(.45), KernelPCA(.40)
]


for e, s in zip(estimators, estimators_string):
    print(s)
    for p in np.arange(5, 96, 5):
        for l in range(30, 90 + 1):
            predictions = bt(instrument=BTCUSD, clf=e, pca=PCA(p), min_obs=l)
            diff = len(BTCUSD) - len(predictions)
            acc = (np.where(BTCUSD[diff:]['return'] >= 0, 1, 0) == predictions).mean()
            sharpe = (predictions * BTCUSD[diff:]['return']).mean() / (predictions * BTCUSD[diff:]['return']).std()

            classifiers.append(s)
            window_len.append(l)
            pcas.append(p)
            accuracies.append(acc)

LGBMClassifier
XGBClassifier
LogisticRegression


KeyboardInterrupt: 

In [9]:
predictions = bt(instrument=BTCUSD, clf=LGBMClassifier(n_jobs=-1), pca=PCA(10), min_obs=30)
diff = len(BTCUSD) - len(predictions)
acc = (np.where(BTCUSD[diff:]['return'] >= 0, 1, 0) == predictions).mean()
gain = ((np.where(BTCUSD[diff:]['return'] >= 0, 1, 0) == predictions) * BTCUSD[diff:]['return']).sum()
sharpe = (predictions * BTCUSD[diff:]['return']).mean() / (predictions * BTCUSD[diff:]['return']).std()
print(sharpe)

0.07575777431928114


In [10]:
acc

0.549963530269876

In [15]:
BTCUSD[diff:]['return'].mean() / BTCUSD[diff:]['return'].std()

0.07871721830608487

In [None]:
# 7.516982474041943

In [87]:
from sklearn.linear_model import Lasso

In [95]:
def bt2(instrument, clf, min_obs):
    predictions = []
    for i in range(len(instrument)):
        if i + min_obs + 1 < len(instrument):
            X_train = instrument[i:i + min_obs].drop(['date', 'return'], axis=1)
            X_test = instrument[i + min_obs:i + min_obs + 1].drop(['date', 'return'], axis=1)

            y_train =  instrument[i:i + min_obs]['return']
            y_test =  instrument[i + min_obs:i + min_obs + 1]['return']

            y_train = np.where(y_train >= 0, 1, 0)
            y_test = np.where(y_test >= 0, 1, 0)

            clf.fit(X_train, y_train)
            y_pred = clf.predict(X_test)[0]
            predictions.append(y_pred)
            
    return predictions

In [101]:
for p in range(1, 11):
    predictions = bt(instrument=BTCUSD, clf=LogisticRegression(n_jobs=-1), pca=PCA(p), min_obs=30)
    diff = len(BTCUSD) - len(predictions)
    acc = (np.where(BTCUSD[diff:]['return'] >= 0, 1, 0) == predictions).mean()
    gain = ((np.where(BTCUSD[diff:]['return'] >= 0, 1, 0) == predictions) * BTCUSD[diff:]['return']).sum()
    print(acc)

0.5025528811086798
0.5200583515681984
0.5229759299781181
0.5251641137855579
0.5215171407731582
0.524434719183078
0.524434719183078
0.524434719183078
0.5229759299781181
0.5200583515681984
