# description
**purpose**: forecast the prices of cryptocurrencies

**idea**: 
- virtual agents that compete for returns based on price time-series. 
- genetic algorithm for self-coding agent

**current capability**:
- baseline: Lasso 

**desired capability**:
- submit to Kaggle
- competition between multiple agents for best results

# setup

In [9]:
## dependencies
# common
import os
import sys
import time
from datetime import datetime
import traceback

# data
import pandas as pd
import numpy as np
import plotly.graph_objects as go
import matplotlib.pyplot as plt
import scipy.stats as stats
from scipy.stats import pearsonr
import gc
import datatable as dt
# model
from sklearn import set_config
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression, Lasso, LinearRegression
from sklearn.multioutput import MultiOutputRegressor
from sklearn.pipeline import Pipeline
from sklearn.metrics import r2_score, roc_auc_score, mean_absolute_error, mean_squared_error
from sklearn.model_selection import GridSearchCV, KFold
import tensorflow as tf
import tensorflow.keras.backend as K
set_config(display='diagram') 
# special
sys.path.append('/kaggle/input/g-research-crypto-forecasting/')
import gresearch_crypto


[CV] linear__alpha=1.0 ...............................................
[CV]  linear__alpha=1.0, MAE=(train=-0.003, test=-0.004), total=  16.9s
[CV] linear__alpha=1.4677992676220695 ................................
[CV]  linear__alpha=1.4677992676220695, MAE=(train=-0.003, test=-0.004), total=  24.5s
[CV] linear__alpha=2.154434690031884 .................................
[CV]  linear__alpha=2.154434690031884, MAE=(train=-0.003, test=-0.004), total=  27.1s
[CV] linear__alpha=3.1622776601683795 ................................
[CV]  linear__alpha=3.1622776601683795, MAE=(train=-0.003, test=-0.004), total=  19.7s
[CV] linear__alpha=4.641588833612778 .................................
[CV]  linear__alpha=4.641588833612778, MAE=(train=-0.003, test=-0.004), total=  20.0s
[CV] linear__alpha=6.812920690579611 .................................
[CV]  linear__alpha=6.812920690579611, MAE=(train=-0.003, test=-0.004), total=  18.7s
[CV] linear__alpha=10.0 ..............................................

In [None]:
# config
# Notebook-wide constants. Per-fold lists are all derived from FOLDS so they
# cannot drift out of sync when the number of folds is changed.

DEVICE = "TPU"  # or "GPU"

SEED = 42

# CV params
FOLDS = 5
GROUP_GAP = 130
MAX_TEST_GROUP_SIZE = 180
MAX_TRAIN_GROUP_SIZE = 280

# Load strict? (boolean flag) | see: https://www.kaggle.com/julian3833/proposal-for-a-meaningful-lb-strict-lgbm
LOAD_STRICT = True

# Which years / data sources to include? yes=1, no=0
INC2021 = 0
INC2020 = 0
INC2019 = 0
INC2018 = 0
INC2017 = 0
INCCOMP = 1
INCSUPP = 0

# Batch size and epochs, one entry per fold
BATCH_SIZES = [1024] * FOLDS
EPOCHS = [1] * FOLDS

# Network architecture (depth and width), one entry per fold
DEPTH_NETS = [3] * FOLDS
WIDTH_NETS = [16] * FOLDS

# abstract

In [2]:
# Run the shared crypto_utils notebook.
# NOTE(review): presumably this is where the helpers used below but not defined
# in this notebook come from (reduce_mem_usage, get_features, fill_nan_inf,
# PurgedGroupTimeSeriesSplit, log_return, totimestamp) — confirm.
%run /kaggle/usr/lib/crypto_utils/crypto_utils.ipynb

# data

In [3]:
# read data
# Every table is stored as a datatable .jay file under basePath and is
# converted to a pandas DataFrame right after loading.

basePath = '/kaggle/input/cryptocurrency-extra-data-binance-coin'
orig_df_train = dt.fread(f'{basePath}/orig_train.jay').to_pandas()
df_asset_details = dt.fread(f'{basePath}/orig_asset_details.jay').to_pandas()
supp_df_train = dt.fread(f'{basePath}/orig_supplemental_train.jay').to_pandas()
# Same table as df_asset_details: take an independent copy instead of parsing
# the file a second time.
assets_details = df_asset_details.copy()
# Asset_ID -> Weight and Asset_ID -> Asset_Name lookups. Each column is turned
# into a list once and the lists are paired up, rather than re-materialising
# the lists for every row index.
asset_ids = assets_details['Asset_ID'].tolist()
asset_weight_dict = dict(zip(asset_ids, assets_details['Weight'].tolist()))
asset_name_dict = dict(zip(asset_ids, assets_details['Asset_Name'].tolist()))
test = dt.fread(f'{basePath}/orig_example_test.jay').to_pandas()
sample_prediction_df = dt.fread(f'{basePath}/orig_example_sample_submission.jay').to_pandas()
# Build the modelling frame: add a datetime column, derive per-day CV groups,
# run feature engineering, then re-attach the bookkeeping columns.
# NOTE: this is an alias, not a copy — the 'date' column added on the next line
# is therefore also added to orig_df_train.
train_data = orig_df_train
train_data['date'] = pd.to_datetime(train_data['timestamp'], unit = 's') # timestamp is interpreted as seconds
# sort_values returns a new frame, so from here on train_data is a separate object.
train_data = train_data.sort_values('date')
# One integer code per distinct calendar day ("day_month_year" string), numbered
# in order of first appearance; rows are date-sorted, so codes grow with time.
groups = pd.factorize(train_data['date'].dt.day.astype(str) + '_' + train_data['date'].dt.month.astype(str) + '_' + train_data['date'].dt.year.astype(str))[0]
# train_data = train_data.drop(columns = 'date')
# train_data.drop(columns = 'timestamp', inplace = True)
# Keep copies of the columns that are re-attached after feature engineering.
dates = train_data['date'].copy()
target = train_data['Target'].copy()
timestamp = train_data['timestamp'].copy()
# Target is removed before the helpers below run and restored afterwards.
train_data.drop(columns = 'Target', inplace = True)
# reduce_mem_usage is not defined in this notebook; judging by the printed
# output it reports memory usage before/after an optimisation step.
train_data = reduce_mem_usage(train_data)
assets_idx = train_data['Asset_ID']
# get_features is not defined in this notebook.
# NOTE(review): Asset_ID, groups and date are written back right after this
# call, so presumably get_features drops or changes them — confirm.
train_data = get_features(train_data)
# Series assignments align on the index; `groups` is a plain array and is
# assigned positionally. Both rely on get_features keeping the same rows in the
# same order — TODO confirm.
train_data['Asset_ID'] = assets_idx
train_data['groups'] = groups
train_data['date'] = dates
train_data = reduce_mem_usage(train_data)
# Target and timestamp are restored only after the second reduce_mem_usage call.
# NOTE(review): presumably to keep their original dtype — confirm.
train_data['Target'] = target
train_data['timestamp'] = timestamp
# fill_nan_inf is not defined in this notebook; it is applied to both the
# training frame and the example test frame.
train_data = fill_nan_inf(train_data)
test = fill_nan_inf(test)
# Model inputs: every column except the target and the bookkeeping columns listed here.
feature_names = [i for i in train_data.columns if i not in ['Target', 'date', 'timestamp', 'VWAP', 'Asset_ID', 'groups']]

Memory usage of dataframe is 2034.03 MB
Memory usage after optimization is: 1063.24 MB
Decreased by 47.7%
Memory usage of dataframe is 1340.61 MB
Memory usage after optimization is: 1201.93 MB
Decreased by 10.3%


# model

In [4]:
# Baseline regression pipeline: mean-impute -> standardise -> Lasso.
imp_mean = SimpleImputer(strategy='mean', missing_values=np.nan)
scaler = StandardScaler()
# alpha is not set here; it is the parameter explored by the grid search below.
linear = Lasso(max_iter=1000, tol=0.1, random_state=42)

# Steps run in list order. The step name 'linear' is the prefix used to address
# the Lasso parameters from the grid (e.g. 'linear__alpha').
pipeline_steps = [
    ('imputer', imp_mean),
    ('scaler', scaler),
    ('linear', linear),
]
pipe = Pipeline(pipeline_steps)

# Display the pipeline as the cell output.
pipe

In [5]:
gc.collect() # run the garbage collector; the cell output is the value returned by gc.collect()

46

In [6]:
# Grid-search setup: candidate Lasso alphas, scored with negated MAE, using the
# PurgedGroupTimeSeriesSplit splitter (defined outside this notebook).
alpha_candidates = np.logspace(0.0, 1.0, 7)  # 7 log-spaced values from 1 to 10
param_grid = {'linear__alpha': alpha_candidates}

# Key 'MAE' becomes the suffix of the result columns (e.g. 'mean_test_MAE').
scoring = dict(MAE='neg_mean_absolute_error')

cv = PurgedGroupTimeSeriesSplit(
    n_splits=3,
    max_train_group_size=150,
    max_test_group_size=60,
    group_gap=20,
)

search = GridSearchCV(
    estimator=pipe,
    param_grid=param_grid,
    scoring=scoring,
    cv=cv,
    refit=False,  # 'MAE',   # <-- do we want to refit on the entire dataset?
    return_train_score=True,
    n_jobs=3,
    verbose=10,
)

In [7]:
%%time
# Run the grid search for the best Lasso parameters.

FIT = True  # set to False to skip the search (it took several minutes in the recorded run)

if FIT:
    # Features, target and per-day group labels are passed as NumPy arrays.
    search.fit(
        X=train_data[feature_names].to_numpy(),
        y=train_data['Target'].to_numpy(),
        groups=train_data['groups'].to_numpy(),
    )

Fitting 3 folds for each of 7 candidates, totalling 21 fits


[Parallel(n_jobs=3)]: Using backend LokyBackend with 3 concurrent workers.
[Parallel(n_jobs=3)]: Done   2 tasks      | elapsed:   46.7s
[Parallel(n_jobs=3)]: Done   7 tasks      | elapsed:  2.0min
[Parallel(n_jobs=3)]: Done  12 tasks      | elapsed:  2.9min
[Parallel(n_jobs=3)]: Done  19 out of  21 | elapsed:  4.5min remaining:   28.5s


CPU times: user 5min 47s, sys: 4.04 s, total: 5min 51s
Wall time: 6min 10s


[Parallel(n_jobs=3)]: Done  21 out of  21 | elapsed:  4.9min finished


In [8]:
# results
# Pick the alpha with the best mean cross-validated test score.
results = search.cv_results_
# Name of the cv_results_ column holding the values of the (single) tuned parameter.
param = 'param_' + next(iter(param_grid))

# The 'MAE' scorer is 'neg_mean_absolute_error': scores are negated errors, so
# higher (closer to zero) is better and the best candidate is the argmax.
# Using argmin here would select the worst-performing alpha.
results_idx = np.argmax(results['mean_test_MAE'])
best_param = results[param][results_idx]

print(f'The best setting for alpha is {best_param}')

The best setting for alpha is 1.0


## entire data

In [None]:
# Final model: the same impute -> scale -> Lasso pipeline, to be trained on the
# entire data set.
# NOTE(review): alpha is fixed at 0.1 here, which lies outside the 1..10 range
# explored by the grid search above, and no random_state is passed — confirm
# that this is intended.
linear = Lasso(alpha=0.1, max_iter=1000, tol=0.1)

# Reuses the imputer and scaler objects currently bound to imp_mean / scaler.
pipe_lr = Pipeline([
    ('imputer', imp_mean),
    ('scaler', scaler),
    ('linear', linear),
])

In [None]:
# Run the garbage collector before fitting on the full data set.
gc.collect()

In [None]:
%%time

# Fit the final pipeline on all training rows, then run the garbage collector.
pipe_lr.fit(
    X=train_data[feature_names].to_numpy(),
    y=train_data['Target'].to_numpy(),
)

gc.collect()

In [None]:
# Display the pipeline as the cell output (set_config(display='diagram') is active).
pipe_lr

## todo

In [None]:
# Input features built from the trading data, per asset:
# 5-period log return of VWAP, absolute 1-period log return of VWAP,
# upper shadow, and lower shadow.
# NOTE(review): the absolute return uses periods=1, not 5 — confirm which is intended.
# `btc`, `eth` and `log_return` are not defined in this notebook.
def upper_shadow(asset):
    """Return High minus the larger of Close and Open."""
    return asset.High - np.maximum(asset.Close, asset.Open)


def lower_shadow(asset):
    """Return the smaller of Close and Open, minus Low."""
    return np.minimum(asset.Close, asset.Open) - asset.Low


def _feature_frame(asset):
    """Concatenate the four feature series of one asset column-wise."""
    columns = [
        log_return(asset.VWAP, periods=5),
        log_return(asset.VWAP, periods=1).abs(),
        upper_shadow(asset),
        lower_shadow(asset),
    ]
    return pd.concat(columns, axis=1)


X_btc = _feature_frame(btc)
y_btc = btc.Target

X_eth = _feature_frame(eth)
y_eth = eth.Target

In [None]:
# Select training and test periods (`totimestamp` is not defined in this notebook).
train_window = [totimestamp("01/05/2021"), totimestamp("30/05/2021")]
test_window = [totimestamp("01/06/2021"), totimestamp("30/06/2021")]


def _window_array(data, window):
    """Slice `data` by index label over `window`, fill NaNs with zeros, return a NumPy array."""
    first, last = window
    return data.loc[first:last].fillna(0).to_numpy()


# Divide data into train and test, compute X and y.
# We aim to build simple regression models using a window_size of 1.
X_btc_train = _window_array(X_btc, train_window)
y_btc_train = _window_array(y_btc, train_window)

X_btc_test = _window_array(X_btc, test_window)
y_btc_test = _window_array(y_btc, test_window)

X_eth_train = _window_array(X_eth, train_window)
y_eth_train = _window_array(y_eth, train_window)

X_eth_test = _window_array(X_eth, test_window)
y_eth_test = _window_array(y_eth, test_window)

In [None]:
# Simple preprocessing: standardise each asset's features. The scaler is fitted
# on the training window only and then applied to the matching test window.
# Note that this rebinds the notebook-level name `scaler` to a new object.
scaler = StandardScaler()

# BTC: fit on train, apply to train and test.
scaler.fit(X_btc_train)
X_btc_train_scaled = scaler.transform(X_btc_train)
X_btc_test_scaled = scaler.transform(X_btc_test)

# ETH: the same scaler object is re-fitted, replacing the BTC statistics.
scaler.fit(X_eth_train)
X_eth_train_scaled = scaler.transform(X_eth_train)
X_eth_test_scaled = scaler.transform(X_eth_test)

In [None]:
# Basic baseline: one linear regression per asset.
# The same estimator object is fitted twice (fit returns the estimator, so the
# calls are chained); after this cell `lr` holds the ETH fit.
lr = LinearRegression()

y_pred_lr_btc = lr.fit(X_btc_train_scaled, y_btc_train).predict(X_btc_test_scaled)
y_pred_lr_eth = lr.fit(X_eth_train_scaled, y_eth_train).predict(X_eth_test_scaled)

In [None]:
# More complex baseline: a single multi-output regression over both assets.
# The scaled features of both assets are concatenated column-wise and the two
# targets are stacked as columns (column 0 = BTC, column 1 = ETH).
# NOTE(review): this requires the BTC and ETH windows to contain the same
# number of rows — confirm.
X_both_train = np.concatenate((X_btc_train_scaled, X_eth_train_scaled), axis=1)
X_both_test = np.concatenate((X_btc_test_scaled, X_eth_test_scaled), axis=1)
y_both_train = np.column_stack((y_btc_train, y_eth_train))
y_both_test = np.column_stack((y_btc_test, y_eth_test))

# Define the direct multi-output model, then fit and predict with `mlr` itself
# (not `lr`), so the wrapper is actually trained and the per-asset `lr` model
# is left untouched.
mlr = MultiOutputRegressor(LinearRegression())
mlr.fit(X_both_train, y_both_train)
y_pred_lr_both = mlr.predict(X_both_test)

# evaluate

In [None]:
def _corr(pred, actual):
    """Return the correlation coefficient between predictions and targets."""
    return np.corrcoef(pred, actual)[0, 1]


# Correlations of the per-asset models and of the multi-output model
# (column 0 = BTC, column 1 = ETH), printed with two decimals.
corr_lr_btc = _corr(y_pred_lr_btc, y_btc_test)
corr_lr_eth = _corr(y_pred_lr_eth, y_eth_test)
corr_both_btc = _corr(y_pred_lr_both[:, 0], y_btc_test)
corr_both_eth = _corr(y_pred_lr_both[:, 1], y_eth_test)

print('Test score for LR baseline: BTC', f"{corr_lr_btc:.2f}",
      ', ETH', f"{corr_lr_eth:.2f}")
print('Test score for multiple output LR baseline: BTC', f"{corr_both_btc:.2f}",
      ', ETH', f"{corr_both_eth:.2f}")

In [None]:
# Submission loop using the competition's gresearch_crypto API: iterate over the
# test batches, fill in the 'Target' column and hand the frame back to the env.
# NOTE(review): this cell looks like an unfinished template and will not run as
# written:
#   - `train` and `tgt_1_model` are not defined in the visible part of this notebook;
#   - `lr.fit(train)` / `mlr.fit(train)` pass a single argument, while every other
#     fit call in this notebook passes features and targets separately.
# Presumably the fitted `pipe_lr` (with the same feature engineering applied to
# `test_df`) is what should produce the predictions — confirm before submitting.
env = gresearch_crypto.make_env()

# Training data is in the competition dataset as usual
lr.fit(train)
mlr.fit(train)
iter_test = env.iter_test()
for (test_df, sample_prediction_df) in iter_test:
    sample_prediction_df['Target'] = tgt_1_model.predict(test_df)
    env.predict(sample_prediction_df)

# conclusion

# todo