In [1]:
import os
import random
import pandas as pd
import numpy as np
from lightgbm import LGBMRegressor
import lightgbm as lgb
import gresearch_crypto
import datetime
from sklearn.preprocessing import RobustScaler
from sklearn.metrics import mean_absolute_error as MSE
import warnings
import talib as ta
warnings.filterwarnings('ignore')

TRAIN_CSV = 'train.csv'
ASSET_DETAILS_CSV = 'asset_details.csv'

SEED = 20

REMOVE_LB_TEST_OVERLAPPING_DATA = True

In [3]:
def fix_all_seeds(seed):
    np.random.seed(seed)
    random.seed(seed)
    os.environ['PYTHONHASHSEED'] = str(seed)

fix_all_seeds(SEED)

In [4]:
df_train = pd.read_csv(TRAIN_CSV)
df_train

Unnamed: 0,timestamp,Asset_ID,Count,Open,High,Low,Close,Volume,VWAP,Target
0,1514764860,2,40.0,2376.580000,2399.500000,2357.140000,2374.590000,1.923301e+01,2373.116392,-0.004218
1,1514764860,0,5.0,8.530000,8.530000,8.530000,8.530000,7.838000e+01,8.530000,-0.014399
2,1514764860,1,229.0,13835.194000,14013.800000,13666.110000,13850.176000,3.155006e+01,13827.062093,-0.014643
3,1514764860,5,32.0,7.659600,7.659600,7.656700,7.657600,6.626713e+03,7.657713,-0.013922
4,1514764860,7,5.0,25.920000,25.920000,25.874000,25.877000,1.210873e+02,25.891363,-0.008264
...,...,...,...,...,...,...,...,...,...,...
24236801,1632182400,9,775.0,157.181571,157.250000,156.700000,156.943857,4.663725e+03,156.994319,
24236802,1632182400,10,34.0,2437.065067,2438.000000,2430.226900,2432.907467,3.975460e+00,2434.818747,
24236803,1632182400,13,380.0,0.091390,0.091527,0.091260,0.091349,2.193732e+06,0.091388,
24236804,1632182400,12,177.0,0.282168,0.282438,0.281842,0.282051,1.828508e+05,0.282134,


In [5]:
df_test = pd.read_csv('example_test.csv')
df_test.head()

Unnamed: 0,timestamp,Asset_ID,Count,Open,High,Low,Close,Volume,VWAP,group_num,row_id
0,1623542400,3,1201.0,1.478556,1.48603,1.478,1.483681,654799.561103,1.481439,0,0
1,1623542400,2,1020.0,580.306667,583.89,579.91,582.276667,1227.988328,581.697038,0,1
2,1623542400,0,626.0,343.7895,345.108,343.64,344.598,1718.832569,344.441729,0,2
3,1623542400,1,2888.0,35554.289632,35652.46465,35502.67,35602.004286,163.811537,35583.469303,0,3
4,1623542400,4,433.0,0.312167,0.3126,0.31192,0.312208,585577.410442,0.312154,0,4


In [6]:
%%capture
'''
l =len(df_train)
l = l//2
df_train = df_train[l:].reset_index(drop=True)
df_train
'''

In [7]:
# Remove the future
if REMOVE_LB_TEST_OVERLAPPING_DATA:
    df_train['datetime'] = pd.to_datetime(df_train['timestamp'], unit='s')
    df_valid = df_train[(df_train['datetime'] > '2021-06-13 00:00:00')].reset_index(drop=True)
    df_train = df_train[(df_train['datetime'] < '2021-06-13 00:00:00')].reset_index(drop=True)
    #df_valid = df_train[(df_train['datetime'] > '2021-06-13 00:00:00') | (df_train['datetime'] < '2019-01-01 00:00:00')]
    #df_train = df_train[(df_train['datetime'] < '2021-06-13 00:00:00') | (df_train['datetime'] > '2019-01-01 00:00:00')]
    #print('delete data  ','train=',len(df_train),'  valid=',len(df_valid))
    df_train  = df_train.drop(['datetime'],axis=1)
    df_valid  = df_valid.drop(['datetime'],axis=1)
else:
    df_train['datetime'] = pd.to_datetime(df_train['timestamp'], unit='s')
    df_valid = df_train[(df_train['datetime'] > '2021-06-13 00:00:00')].reset_index(drop=True)
    df_train  = df_train.drop(['datetime'],axis=1)
    df_valid  = df_valid.drop(['datetime'],axis=1)


In [8]:
df_valid = df_valid.dropna(subset=['Target']).reset_index(drop=True)
#bkup =df_valid.copy()

In [9]:
df_asset_details = pd.read_csv(ASSET_DETAILS_CSV).sort_values("Asset_ID")
df_asset_details

Unnamed: 0,Asset_ID,Weight,Asset_Name
1,0,4.304065,Binance Coin
2,1,6.779922,Bitcoin
0,2,2.397895,Bitcoin Cash
10,3,4.406719,Cardano
13,4,3.555348,Dogecoin
3,5,1.386294,EOS.IO
5,6,5.894403,Ethereum
4,7,2.079442,Ethereum Classic
11,8,1.098612,IOTA
6,9,2.397895,Litecoin


In [10]:
from sklearn.model_selection import train_test_split
import lightgbm as lgb
from sklearn.metrics import accuracy_score
#!pip install optuna
import optuna 
import optuna.integration.lightgbm as lgbo

In [11]:
def log_return(series, periods=5):
    return np.log(series).diff(periods=periods)

# Two features from the competition tutorial
def upper_shadow(df): return df['High'] - np.maximum(df['Close'], df['Open'])
def lower_shadow(df): return np.minimum(df['Close'], df['Open']) - df['Low']

In [12]:
def add_features(df):
    
    df["high_div_low"] = df["High"] / df["Low"]
    df["open_sub_close"] = df["Open"] - df["Close"]
    #df["open_sub_close2"] = df["Open"] / df["Close"] #
    df["open_sub_close_1"] = df["open_sub_close"].shift(-4)
    
    df['Open_shift-1'] = df['Open'].shift(-1)
    df['Open_shift-4'] = df['Open'].shift(-4)
    df['Open_shift-7'] = df['Open'].shift(-7)
    
    df['Open_shift1'] = df['Open'].shift(1)
    
    #df['Close_shift-1'] = df['Close'].shift(-1)
    
    df['close_log1'] = log_return(df['Close'],periods=1)
    df['close_log4'] = log_return(df['Close'],periods=4)
    #df['close_log7'] = log_return(df['Close'],periods=7)#
    
    #df['open_log1'] = log_return(df['Open'],periods=1)#
    
    #df['close_log4-1'] =df['close_log4'].shift(-1) #
    #df['avg5'] =  ta.SMA(df['Open'], timeperiod=5) #移動平均
    #df['avg25'] = ta.SMA(df['Open'], timeperiod=25) #df['Open'].rolling(25).mean()
    df['avg75'] = ta.SMA(df['Open'], timeperiod=75 )#df['Open'].rolling(75).mean()
    df['ema']   =ta.EMA(df['Open'], timeperiod=20) 
    df['willr'] = ta.WILLR(df['High'], df['Low'],np.array(df.loc[:, 'Close']), timeperiod=14)
    times = pd.to_datetime(df["timestamp"],unit="s",infer_datetime_format=True)
    df["hour"] = times.dt.hour  
    df["dayofweek"] = times.dt.dayofweek 
    df["day"] = times.dt.day 
    
    #df['RSI-9'] = ta.RSI(df['Close'], timeperiod=9)#
    df['RSI'] = ta.RSI(df['Close'], timeperiod=14)
    df['RSI1'] = df['RSI'].shift(-1)
    df['RSI4'] = df['RSI'].shift(-4)
    df['RSI7'] = df['RSI'].shift(-7)
    df['RSI10'] = df['RSI'].shift(-10)
    df['RSI13'] = df['RSI'].shift(-13) 
    df['RSI16'] = df['RSI'].shift(-16)  
    df['RSI~4'] = df['RSI'].shift(4)  #
    #df['RSI~7'] = df['RSI'].shift(7)  #
    #df['RSI+1'] = df['RSI'].shift(1)
    
    df['MACD'], df['macdsignal'], df['MACD_HIST'] = ta.MACD(df['Close'], fastperiod=12, slowperiod=26, signalperiod=9)
    #df['MACD1'] = df['MACD'].shift(-1)
    #df['MACD4'] = df['MACD'].shift(-4)
    #df['MACD7'] = df['MACD'].shift(-7)
    #df['MACD_HIST-4'] =df['MACD_HIST'].shift(4)# 
    df['MACD_HIST1'] =df['MACD_HIST'].shift(-1) 
    df['MACD_HIST4'] =df['MACD_HIST'].shift(-4) 
    df['MACD_HIST7'] =df['MACD_HIST'].shift(-7) 
    #df['MACD_HIST10'] =df['MACD_HIST'].shift(-10)#
    #df['u_bandx'], df['m_bandx'], df['l_bandx'] = ta.BBANDS(df['Close'], timeperiod=7, nbdevup=2, nbdevdn=2, matype=0)#
    
    df['u_band'], df['m_band'], df['l_band'] = ta.BBANDS(df['Close'], timeperiod=5, nbdevup=2, nbdevdn=2, matype=0) 
    df['adx'] = ta.ADX(df['High'], df['Low'],np.array(df.loc[:, 'Close']),timeperiod=14)
    df['adx1'] = df['adx'].shift(-1)
    df['adx4'] = df['adx'].shift(-4)
    df['adx+1'] = df['adx'].shift(1)
    #df['adx+2'] = df['adx'].shift(7)
    df['adx7'] = df['adx'].shift(-7)
    #df['adx10'] = df['adx'].shift(-10)#
    #df['adx+4'] = df['adx'].shift(4)
    #df['adx+1'] = df['adx'].shift(1)
    #df['adx7-12'] = ta.ADX(df['High'], df['Low'],np.array(df.loc[:, 'Close']),timeperiod=4) #
    
    df['macdsignal1'] = df['macdsignal'].shift(-1)
    df['macdsignal4'] = df['macdsignal'].shift(-4)
    df['macdsignal7'] = df['macdsignal'].shift(-7)
    #df['macdsignal10'] = df['macdsignal'].shift(-10) #
    
    df['DI_plus'] = ta.PLUS_DI(df['High'], df['Low'],np.array(df.loc[:, 'Close']), timeperiod=14)
    df['DI_minus'] = ta.MINUS_DI(df['High'], df['Low'],np.array(df.loc[:, 'Close']), timeperiod=14)
    
    #df = df.drop(['Close'],axis=1)
    #df['DI_plus1+1'] = df['DI_plus'].shift(1) #
    df['DI_plus1'] = df['DI_plus'].shift(-1)
    df['DI_plus4'] = df['DI_plus'].shift(-4)
    df['DI_plus7'] = df['DI_plus'].shift(-7) #いる
    df['DI_plus10'] = df['DI_plus'].shift(-10)
    
    df['DI_minus1'] = df['DI_minus'].shift(-1)
    df['DI_minus4'] = df['DI_minus'].shift(-4)
    df['DI_minus7'] = df['DI_minus'].shift(-7)
    #df['DI_minus10'] = df['DI_minus'].shift(-10) #
    
    df['ROCP'] =ta.ROCP(df['Open'])
    df['momentam'] =ta.MOM(df['Open'])
    
    df['APO'] =ta.APO(df['Open'])
    df['APO1'] = df['APO'].shift(-1)
    df['APO4'] = df['APO'].shift(-4)
    df['APO7'] = df['APO'].shift(-7) 
    
    df['PPO'] =ta.PPO(df['Open'])
    #df['PPO1'] = df['PPO'].shift(-1)
    #df['PPO4'] = df['PPO'].shift(-4)
    #df['PPO7'] = df['PPO'].shift(-7)
    #df['vwap-c'] =(1-np.exp(df['VWAP']))
    df = df.drop(['VWAP'],axis=1)

    df['CMO'] =ta.CMO(df['Open'])
    df['MIDPOINT'] =ta.MIDPOINT(df['Open'])
    #df['MIDPOINT1'] =df['MIDPOINT'].shift(-1)
    df['TRENDLINE'] =ta.HT_TRENDLINE(df['Open'])
    #df['TRENDLINE1']= df['TRENDLINE'].shift(-1)
    #df['timestamp1'] = df['timestamp'].shift(-1)
    return df 

In [13]:
# Two new features from the competition tutorial
def upper_shadow(df):
    return df['High'] - np.maximum(df['Close'], df['Open'])

def lower_shadow(df):
    return np.minimum(df['Close'], df['Open']) - df['Low']

# A utility function to build features from the original df
# It works for rows to, so we can reutilize it.
def get_features(df_feat):
    #df_feat = df[['Count', 'Open', 'High', 'Low', 'Close', 'Volume', 'VWAP']].copy()
    df_feat['Upper_Shadow'] = upper_shadow(df_feat)
    df_feat['Lower_Shadow'] = lower_shadow(df_feat)
    return df_feat

def get_Xy_and_model_for_asset(df_train,asset_id):
    df = df_train[df_train["Asset_ID"] == asset_id]
    #df_v = df_v[df_v["Asset_ID"] == asset_id]
    
    # TODO: Try different features here!
    #print( 'step1',datetime.datetime.now().strftime('%Y年%m月%d日 %H:%M:%S'))
    df = df.dropna(subset=['Target'])
    
    y = df['Target'] 
    df      = df.drop(['Target','Asset_ID'],axis=1)
    df_proc = get_features(df)
    df_proc = add_features(df_proc)
    df_proc = df_proc.fillna(-1)
    #df_proc = df_proc.drop(['Asset_ID','Target'],axis=1)
    X= df_proc #.drop("y", axis=1)
    #y = df_proc["y"]
        
    #x_train, x_test, y_train, y_test = train_test_split(X,y, test_size=0.2, random_state=42)
    #lgb_train = lgb.Dataset(x_train, y_train)
    #lgb_valid = lgb.Dataset(x_test, y_test)
    
    params = {'objective': 'binary',  'metric': 'binary_logloss'}# ,'device':'gpu'} 

    #model = lgbo.train(params, lgb_train, valid_sets=[lgb_valid], verbose_eval=False, num_boost_round=100, early_stopping_rounds=5) 
    #best_lgb_params = model.parms
    best_lgb_params ={'objective': 'regression',
    'metric': 'rmse',
    #'feature_pre_filter': False,
    #'lambda_l1': 0.12036539136796603,
    #'lambda_l2': 0.005617203356210593,
    #'num_leaves': 700,
    #'feature_fraction': 0.88,
    #'bagging_fraction': 1.0,
    #'bagging_freq': 0,
    #'min_child_samples': 10,
    #'num_iterations': 100,
    'early_stopping_round': 100}
    best_lgb_params["learning_rate"] = 0.5
    best_lgb_params["force_row_wise"] = True
    #best_lgb_params["device"] = 'gpu'
    best_lgb_params["early_stopping_round"] = 50
    best_lgb_params["num_iterations"] = 1500
    
    #model = lgb.train(best_lgb_params,lgb_train, valid_sets=[lgb_valid], verbose_eval=100)
    #print(' ')
    
    # TODO: Try different models here!
    #model = LGBMRegressor(n_estimators=10)
    #model.fit(X, y)
    #print( ' ',datetime.datetime.now().strftime('%Y年%m月%d日 %H:%M:%S'))
    model = LGBMRegressor(n_estimators=1500,num_leaves=700,learning_rate=0.1,silent=True)
    model.fit(X, y)
    
    #print( 'step3',datetime.datetime.now().strftime('%Y年%m月%d日 %H:%M:%S'))
    fi =model.feature_importances_
    fi_df = pd.DataFrame({'feature': list(X.columns),
         'feature importance': fi[:]}).sort_values('feature importance', ascending = False)
    if asset_id ==0:
        display(fi_df)
    return model

In [14]:
Xs = {}
ys = {}
models = {}

for asset_id, asset_name in zip(df_asset_details['Asset_ID'], df_asset_details['Asset_Name']):
    print(f"Training model for {asset_name:<16} (ID={asset_id:<2})   ",datetime.datetime.now().strftime('%Y/%m/%d %H:%M:%S'))
    
    model = get_Xy_and_model_for_asset(df_train, asset_id)    
    models[asset_id] =  model
    
    #validation
    x_pred = pd.DataFrame()
    x = asset_id
    record = df_valid[df_valid.Asset_ID == x]   
    target = record.Target 
    record = record.drop(['Target','Asset_ID'],axis=1)
    model = models[x]
    x_test = get_features(record)
    x_test = add_features(x_test) 
    #x_test = x_test.drop(['Asset_ID','Target'],axis=1)
        
    x_pred['x'] = model.predict(x_test)
    print('Test score for LR baseline: ', f"{np.corrcoef(x_pred.x, target)[0,1]:.5f}")
    #print('MSE=',MSE(x_pred.x, target))
    #print(x_pred.x[:10])
    #print(target[:10])
    del record
    del x_pred
    del x_test
    
    #break

Training model for Binance Coin     (ID=0 )    2021/12/08 09:04:05


Unnamed: 0,feature,feature importance
30,RSI16,45926
0,timestamp,44713
23,day,34916
21,hour,34609
29,RSI13,28645
...,...,...
38,u_band,2964
66,MIDPOINT,2722
15,Open_shift1,2449
5,Close,2005


Test score for LR baseline:  0.42426
Training model for Bitcoin          (ID=1 )    2021/12/08 09:13:26
Test score for LR baseline:  0.30609
Training model for Bitcoin Cash     (ID=2 )    2021/12/08 09:21:34
Test score for LR baseline:  0.37482
Training model for Cardano          (ID=3 )    2021/12/08 09:29:47
Test score for LR baseline:  0.47667
Training model for Dogecoin         (ID=4 )    2021/12/08 09:38:13
Test score for LR baseline:  0.43627
Training model for EOS.IO           (ID=5 )    2021/12/08 09:43:56
Test score for LR baseline:  0.40900
Training model for Ethereum         (ID=6 )    2021/12/08 09:52:07
Test score for LR baseline:  0.30954
Training model for Ethereum Classic (ID=7 )    2021/12/08 10:00:04
Test score for LR baseline:  0.45371
Training model for IOTA             (ID=8 )    2021/12/08 10:08:51
Test score for LR baseline:  0.84098
Training model for Litecoin         (ID=9 )    2021/12/08 10:16:25
Test score for LR baseline:  0.29954
Training model for Maker   

0.42429

In [15]:
import pickle
with open('models','wb') as web:
    pickle.dump(models,web)

In [16]:
    x_pred = pd.DataFrame()
    for x in range(len(df_valid.Asset_ID.unique())):
        print( 'Asset_ID=',x,' ',datetime.datetime.now().strftime('%Y年%m/%d/ %H:%M:%S'))
        record = df_valid[df_valid.Asset_ID == x]     
        record = record.drop(['Target','Asset_ID'],axis=1)
        model = models[x]
        x_test = get_features(record)
        x_test = add_features(pd.DataFrame(x_test)) #.values
        #x_test = x_test.drop(['Asset_ID','Target'],axis=1)

        x_test['y_pred'] = model.predict(x_test)
        x_pred = pd.concat([x_test,x_pred])    
    x_pred = x_pred.sort_index()    
    print(datetime.datetime.now().strftime('%Y/%m/%d %H:%M:%S'),'Test score for LR baseline: ', f"{np.corrcoef(x_pred.y_pred,df_valid.Target)[0,1]:.5f}")

Asset_ID= 0   2021年12/08/ 10:53:17
Asset_ID= 1   2021年12/08/ 10:53:29
Asset_ID= 2   2021年12/08/ 10:53:41
Asset_ID= 3   2021年12/08/ 10:53:53
Asset_ID= 4   2021年12/08/ 10:54:05
Asset_ID= 5   2021年12/08/ 10:54:19
Asset_ID= 6   2021年12/08/ 10:54:30
Asset_ID= 7   2021年12/08/ 10:54:42
Asset_ID= 8   2021年12/08/ 10:54:54
Asset_ID= 9   2021年12/08/ 10:55:07
Asset_ID= 10   2021年12/08/ 10:55:19
Asset_ID= 11   2021年12/08/ 10:55:31
Asset_ID= 12   2021年12/08/ 10:55:44
Asset_ID= 13   2021年12/08/ 10:55:58
2021/12/08 10:56:10 Test score for LR baseline:  0.67428


In [17]:
env = gresearch_crypto.make_env()
iter_test = env.iter_test()

for i, (df_test, df_pred) in enumerate(iter_test):
    #test = pd.DataFrame(index=(df_test['timestamp'].unique()))
    #df_test = get_features(df_test)
    
    #df_valid.set_index("timestamp",inplace=True) 
    #df_proc = df_proc.fillna(-1)
    #x_pred = pd.DataFrame()
    #timestamp= pd.DataFrame()
    for x in df_test.Asset_ID.unique():
        #print('Asset_Id=',x)
        model = models[x]
        x_test = df_test[df_test.Asset_ID == x]
        x_test = get_features(x_test)
        x_test = add_features(x_test)

        row_id = x_test.row_id
        x_test = x_test.drop(['row_id','Asset_ID'],axis=1)        
        #timestamp = data['timestamp']
        #row_id = data['row_id'].values
        
        #x_test = df_proc[df_proc.index.isin(data.timestamp.values)]
        
        #print('Asset_Id=',x,'  predict',len(x_test.columns),' ',x_test.columns)
        x_test['pred'] = model.predict(x_test)
        
        x_test['row_id'] =row_id
        #x_pred = pd.concat([x_pred,x_test])
      
        for j,row in x_test.iterrows():
            df_pred.loc[df_pred['row_id'] == row['row_id'], 'Target'] = row.pred
        
    #if x == 0:
    #    display(df_pred)
        
    env.predict(df_pred)

This version of the API is not optimized and should not be used to estimate the runtime of your code on the hidden test set.


In [18]:
df_test

Unnamed: 0,timestamp,Asset_ID,Count,Open,High,Low,Close,Volume,VWAP,row_id
0,1623542580,3,1023,1.479075,1.479399,1.4726,1.473527,328684.9,1.476372,42
1,1623542580,2,409,580.49,580.69,578.31,578.912,204.521,579.470144,43
2,1623542580,0,614,343.238,343.406,341.94,342.325,1295.199,342.525876,44
3,1623542580,1,2901,35478.867162,35503.460134,35381.01,35423.49,118.8025,35438.243466,45
4,1623542580,4,1094,0.310923,0.3114,0.3104,0.310676,2403980.0,0.310894,46
5,1623542580,5,465,4.819883,4.8221,4.8083,4.81345,34767.54,4.815294,47
6,1623542580,7,347,54.997567,55.0442,54.9168,54.94095,2735.096,54.974253,48
7,1623542580,6,2624,2365.769427,2367.5,2359.01,2360.505714,2253.663,2362.394059,49
8,1623542580,8,169,0.9966,1.0129,0.9751,0.991,62518.2,0.992974,50
9,1623542580,9,431,161.862333,161.93,161.27,161.485,1156.636,161.595349,51


In [19]:
df_pred

Unnamed: 0,row_id,Target
0,42,-0.012735
1,43,-0.00162
2,44,-0.008905
3,45,-0.001754
4,46,-0.015395
5,47,-0.000419
6,48,-0.001264
7,49,-0.003406
8,50,-0.047763
9,51,-0.005205
