## References: 

[G-Research- Starter LGBM Pipeline](https://www.kaggle.com/julian3833/g-research-starter-lgbm-pipeline)


# Environment Setup

In [1]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))


/kaggle/input/g-research-crypto-forecasting/example_sample_submission.csv
/kaggle/input/g-research-crypto-forecasting/asset_details.csv
/kaggle/input/g-research-crypto-forecasting/example_test.csv
/kaggle/input/g-research-crypto-forecasting/train.csv
/kaggle/input/g-research-crypto-forecasting/supplemental_train.csv
/kaggle/input/g-research-crypto-forecasting/gresearch_crypto/competition.cpython-37m-x86_64-linux-gnu.so
/kaggle/input/g-research-crypto-forecasting/gresearch_crypto/__init__.py


In [2]:
import gresearch_crypto
import time
from datetime import datetime

dir_in = '/kaggle/input/g-research-crypto-forecasting/'
file_train = 'train.csv'
file_asset_details = 'asset_details.csv'

df_train = pd.read_csv(os.path.join(dir_in, file_train))
df_asset_details = pd.read_csv(os.path.join(dir_in, file_asset_details))

In [3]:
# set seed

seed = 2021

def fix_all_seeds(seed):
    os.environ['PYTHONHASHSEED'] = str(seed)
    
fix_all_seeds(seed)    

In [4]:
df_train.head()

Unnamed: 0,timestamp,Asset_ID,Count,Open,High,Low,Close,Volume,VWAP,Target
0,1514764860,2,40.0,2376.58,2399.5,2357.14,2374.59,19.233005,2373.116392,-0.004218
1,1514764860,0,5.0,8.53,8.53,8.53,8.53,78.38,8.53,-0.014399
2,1514764860,1,229.0,13835.194,14013.8,13666.11,13850.176,31.550062,13827.062093,-0.014643
3,1514764860,5,32.0,7.6596,7.6596,7.6567,7.6576,6626.71337,7.657713,-0.013922
4,1514764860,7,5.0,25.92,25.92,25.874,25.877,121.08731,25.891363,-0.008264


In [5]:
df_asset_details

Unnamed: 0,Asset_ID,Weight,Asset_Name
0,2,2.397895,Bitcoin Cash
1,0,4.304065,Binance Coin
2,1,6.779922,Bitcoin
3,5,1.386294,EOS.IO
4,7,2.079442,Ethereum Classic
5,6,5.894403,Ethereum
6,9,2.397895,Litecoin
7,11,1.609438,Monero
8,13,1.791759,TRON
9,12,2.079442,Stellar


# Training

## Linear Regression - Test on BTC and ETH

In [6]:
def upper_shadow(df):
    return df['High'] - np.maximum(df['Close'], df['Open'])

def lower_shadow(df):
    return np.minimum(df['Close'], df['Open']) - df['Low']

def log_return(series, periods=1):
    return np.log(series).diff(periods=periods)

def totimestamp(ts):
    return np.int32(time.mktime(datetime.strptime(ts, "%d/%m/%Y").timetuple()))

In [7]:
def get_features(df):
    df_feat = df[['VWAP', 'Open', 'High', 'Low', 'Close']].copy()
    df_feat['Upper_Shadow'] = upper_shadow(df_feat)
    df_feat['Lower_Shadow'] = lower_shadow(df_feat)
#     df_feat['Log_Return_5min'] = log_return(df_feat['VWAP'], periods=5)
#     df_feat['Log_Return_1min_abs'] = log_return(df_feat['VWAP'], periods=1).abs()
    
    return df_feat

In [8]:
def fill_time_gap(df):
    return df.reindex(range(df.index[0], df.index[-1]+60, 60), method='ffill')

In [9]:
# mini modeling

train_window = [totimestamp("01/05/2021"), totimestamp("30/05/2021")]
test_window = [totimestamp("01/06/2021"), totimestamp("30/06/2021")]

In [10]:
btc = df_train[df_train["Asset_ID"]==1].copy().set_index("timestamp")
eth = df_train[df_train["Asset_ID"]==6].copy().set_index("timestamp")

In [11]:
btc.info(show_counts = True)

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1956282 entries, 1514764860 to 1632182400
Data columns (total 9 columns):
 #   Column    Non-Null Count    Dtype  
---  ------    --------------    -----  
 0   Asset_ID  1956282 non-null  int64  
 1   Count     1956282 non-null  float64
 2   Open      1956282 non-null  float64
 3   High      1956282 non-null  float64
 4   Low       1956282 non-null  float64
 5   Close     1956282 non-null  float64
 6   Volume    1956282 non-null  float64
 7   VWAP      1956282 non-null  float64
 8   Target    1955978 non-null  float64
dtypes: float64(8), int64(1)
memory usage: 149.3 MB


In [12]:
eth.info(show_counts = True)

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1956200 entries, 1514764860 to 1632182400
Data columns (total 9 columns):
 #   Column    Non-Null Count    Dtype  
---  ------    --------------    -----  
 0   Asset_ID  1956200 non-null  int64  
 1   Count     1956200 non-null  float64
 2   Open      1956200 non-null  float64
 3   High      1956200 non-null  float64
 4   Low       1956200 non-null  float64
 5   Close     1956200 non-null  float64
 6   Volume    1956200 non-null  float64
 7   VWAP      1956200 non-null  float64
 8   Target    1955860 non-null  float64
dtypes: float64(8), int64(1)
memory usage: 149.2 MB


In [13]:
# slice the dataframes for mini modeling

btc_mini = btc.copy().loc[train_window[0]:test_window[1]]
eth_mini = eth.copy().loc[train_window[0]:test_window[1]]

In [14]:
btc_mini.info(show_counts = True)

print("start timestamp: {}; end timestamp: {}".format(pd.to_datetime(btc_mini.index[0], unit='s'),
                                                      pd.to_datetime(btc_mini.index[-1], unit='s')))

print("start timestamp: {}; end timestamp: {}".format(pd.to_datetime(btc_mini.index[0], unit='s', utc=True),
                                                      pd.to_datetime(btc_mini.index[-1], unit='s', utc=True)))

print("start timestamp: {}; end timestamp: {}".format(btc_mini.index[0].astype('datetime64[s]'),
                                                      btc_mini.index[-1].astype('datetime64[s]')))

<class 'pandas.core.frame.DataFrame'>
Int64Index: 86399 entries, 1619827260 to 1625011200
Data columns (total 9 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   Asset_ID  86399 non-null  int64  
 1   Count     86399 non-null  float64
 2   Open      86399 non-null  float64
 3   High      86399 non-null  float64
 4   Low       86399 non-null  float64
 5   Close     86399 non-null  float64
 6   Volume    86399 non-null  float64
 7   VWAP      86399 non-null  float64
 8   Target    86399 non-null  float64
dtypes: float64(8), int64(1)
memory usage: 6.6 MB
start timestamp: 2021-05-01 00:01:00; end timestamp: 2021-06-30 00:00:00
start timestamp: 2021-05-01 00:01:00+00:00; end timestamp: 2021-06-30 00:00:00+00:00
start timestamp: 2021-05-01T00:01:00; end timestamp: 2021-06-30T00:00:00


In [15]:
eth_mini.info(show_counts = True)

print("start timestamp: {}; end timestamp: {}".format(eth_mini.index[0].astype('datetime64[s]'),
                                                      eth_mini.index[-1].astype('datetime64[s]')))

<class 'pandas.core.frame.DataFrame'>
Int64Index: 86396 entries, 1619827260 to 1625011200
Data columns (total 9 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   Asset_ID  86396 non-null  int64  
 1   Count     86396 non-null  float64
 2   Open      86396 non-null  float64
 3   High      86396 non-null  float64
 4   Low       86396 non-null  float64
 5   Close     86396 non-null  float64
 6   Volume    86396 non-null  float64
 7   VWAP      86396 non-null  float64
 8   Target    86392 non-null  float64
dtypes: float64(8), int64(1)
memory usage: 6.6 MB
start timestamp: 2021-05-01T00:01:00; end timestamp: 2021-06-30T00:00:00


In [16]:
# fill up missing timestamps

btc_mini = fill_time_gap(btc_mini)
eth_mini = fill_time_gap(eth_mini)


In [17]:
btc_mini.info(show_counts = True)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 86400 entries, 1619827260 to 1625011200
Data columns (total 9 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   Asset_ID  86400 non-null  int64  
 1   Count     86400 non-null  float64
 2   Open      86400 non-null  float64
 3   High      86400 non-null  float64
 4   Low       86400 non-null  float64
 5   Close     86400 non-null  float64
 6   Volume    86400 non-null  float64
 7   VWAP      86400 non-null  float64
 8   Target    86400 non-null  float64
dtypes: float64(8), int64(1)
memory usage: 5.9 MB


In [18]:
eth_mini.info(show_counts = True)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 86400 entries, 1619827260 to 1625011200
Data columns (total 9 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   Asset_ID  86400 non-null  int64  
 1   Count     86400 non-null  float64
 2   Open      86400 non-null  float64
 3   High      86400 non-null  float64
 4   Low       86400 non-null  float64
 5   Close     86400 non-null  float64
 6   Volume    86400 non-null  float64
 7   VWAP      86400 non-null  float64
 8   Target    86393 non-null  float64
dtypes: float64(8), int64(1)
memory usage: 5.9 MB


In [19]:
# # train-test split

# X_btc_train = get_features(
#     btc_mini.loc[train_window[0]:train_window[1]]
# ).fillna(0).to_numpy()
# y_btc_train = btc_mini['Target'].loc[train_window[0]:train_window[1]].fillna(0).to_numpy()

# X_eth_train = get_features(
#     eth_mini.loc[train_window[0]:train_window[1]]
# ).fillna(0).to_numpy()
# y_eth_train = eth_mini['Target'].loc[train_window[0]:train_window[1]].fillna(0).to_numpy()

# X_btc_test = get_features(
#     btc_mini.loc[test_window[0]:test_window[1]]
# ).fillna(0).to_numpy()
# y_btc_test = btc_mini['Target'].loc[test_window[0]:test_window[1]].fillna(0).to_numpy()

# X_eth_test = get_features(
#     eth_mini.loc[test_window[0]:test_window[1]]
# ).fillna(0).to_numpy()
# y_eth_train = eth_mini['Target'].loc[test_window[0]:test_window[1]].fillna(0).to_numpy()


Comment: Here I just fill with 0 now, but it's probably not a good method.

Reference: [Imputing the Time-Series Using Python](https://drnesr.medium.com/filling-gaps-of-a-time-series-using-python-d4bfddd8c460#:~:text=To%20apply%20machine%20learning%20models,or%20filled%20with%20appropriate%20values.&text=However%2C%20this%20is%20not%20applicable%20in%20the%20time%20series.)

In [20]:
# train-test split

X_btc_train = get_features(
    btc_mini.loc[train_window[0]:train_window[1]]
).fillna(0)
y_btc_train = btc_mini['Target'].copy().loc[train_window[0]:train_window[1]].fillna(0)

X_btc_test = get_features(
    btc_mini.loc[test_window[0]:test_window[1]]
).fillna(0)
y_btc_test = btc_mini['Target'].loc[test_window[0]:test_window[1]].fillna(0)


X_eth_train = get_features(
    eth_mini.loc[train_window[0]:train_window[1]]
).fillna(0)
y_eth_train = eth_mini['Target'].loc[train_window[0]:train_window[1]].fillna(0)

X_eth_test = get_features(
    eth_mini.loc[test_window[0]:test_window[1]]
).fillna(0)
y_eth_test = eth_mini['Target'].loc[test_window[0]:test_window[1]].fillna(0)


In [21]:
print(X_btc_train.shape)
print(y_btc_train.shape)
print(X_btc_test.shape)

(41760, 7)
(41760,)
(41761, 7)


In [22]:
print(X_eth_train.shape)
print(y_eth_train.shape)
print(X_eth_test.shape)

(41760, 7)
(41760,)
(41761, 7)


In [23]:
# train-test split

X_btc_train = X_btc_train.to_numpy()
y_btc_train = y_btc_train.to_numpy()

X_btc_test = X_btc_test.to_numpy()
y_btc_test = y_btc_test.to_numpy()


X_eth_train = X_eth_train.to_numpy()
y_eth_train = y_eth_train.to_numpy()

X_eth_test = X_eth_test.to_numpy()
y_eth_test = y_eth_test.to_numpy()


In [24]:
from sklearn.preprocessing import StandardScaler

standard_scaler = StandardScaler()

X_btc_train = standard_scaler.fit_transform(X_btc_train)
X_btc_test = standard_scaler.fit_transform(X_btc_test)

X_eth_train = standard_scaler.fit_transform(X_eth_train)
X_eth_test = standard_scaler.fit_transform(X_eth_test)

# note that it's not easy to describe each column's statistics since the object is array

In [25]:
from sklearn.linear_model import LinearRegression

lr = LinearRegression()

lr.fit(X_btc_train, y_btc_train)
y_btc_pred = lr.predict(X_btc_test)

lr.fit(X_eth_train, y_eth_train)
y_eth_pred = lr.predict(X_eth_test)

In [26]:
# evaluation

score_btc = np.corrcoef(y_btc_pred, y_btc_test)[0,1]
score_eth = np.corrcoef(y_eth_pred, y_eth_test)[0,1]

print("Test score for Linear Regression - BTC: {:.4f}".format(score_btc))
print("Test score for Linear Regression - ETH: {:.4f}".format(score_eth))

Test score for Linear Regression - BTC: 0.0024
Test score for Linear Regression - ETH: 0.0194


# Pipeline

In [27]:
# wrap the training-data-only preprocessing steps into a single function
# steps only applies to train set: fill up time gap 
# however, there is no point handling the time gap if I just fill all missing values with 0s

Reference: 
- [How to use sklearn fit_transform with pandas and return dataframe instead of numpy array?](https://stackoverflow.com/questions/35723472/how-to-use-sklearn-fit-transform-with-pandas-and-return-dataframe-instead-of-num)
- [Build your first Machine Learning pipeline using scikit-learn!](https://www.analyticsvidhya.com/blog/2020/01/build-your-first-machine-learning-pipeline-using-scikit-learn/)
- [Creating Custom Transformers Using Scikit-Learn](https://www.kaggle.com/ksvmuralidhar/creating-custom-transformers-using-scikit-learn)
- [Pipelines & Custom Transformers in scikit-learn: The step-by-step guide (with Python code)](https://towardsdatascience.com/pipelines-custom-transformers-in-scikit-learn-the-step-by-step-guide-with-python-code-4a7d9b068156)
- [ML Data Pipelines with Custom Transformers in Python](https://towardsdatascience.com/custom-transformers-and-ml-data-pipelines-with-python-20ea2a7adb65)

In [28]:
# customize class for feature transformation
from sklearn.base import BaseEstimator, TransformerMixin

class GetFeatureTransformer(BaseEstimator, TransformerMixin):
    def __init__(self):
        pass
    
    def fit(self, X, y=None):
        return self
    
    def transform(self, X, y=None):
        X_ = get_features(X)
        return X_
    
class FillNaTransformer(BaseEstimator, TransformerMixin):
    def __init__(self):
        pass
    
    def fit(self, X, y=None):
        return self
    
    def transform(self, X, y=None):
        X_ = X.fillna(0.0001)  # prevent log() error  
        X_ = X_.replace(0, 0.0001)  # prevent log() error
        X_ = X_.replace((np.inf, -np.inf), 0.0001)  # handle infinite values in original dataset
        return X_

In [29]:
# assamble the pipeline
from sklearn.pipeline import Pipeline

pipe_lr = Pipeline(steps=[
    ('impute_r1', FillNaTransformer()),
    ('get_feature', GetFeatureTransformer()),
    ('impute_r2', FillNaTransformer()),
    ('scale', StandardScaler()),
    ('model', LinearRegression())
])

# Loop training

In [30]:
def get_asset_data(df_train, asset_id):
    df_i = df_train[df_train['Asset_ID'] == asset_id].copy()
#     df_i = fill_time_gap(df_i)
    y = df_i['Target'].copy()
    y = y.fillna(0)
    X = df_i.drop('Target', axis=1)
    
    return X, y


def get_corr(y_pred, y):
    corr = np.corrcoef(y_pred, y)[0,1]
    return corr

## Training - test on Maker
Edge case with Inf values in the dataset

In [31]:
# use BTC to test interface
# X_tt, y_tt = get_asset_data(df_train, 1)

# troubleshooting Maker fitting
X_tt, y_tt = get_asset_data(df_train, 10)

In [32]:
X_tt.head()

Unnamed: 0,timestamp,Asset_ID,Count,Open,High,Low,Close,Volume,VWAP
1634867,1525965660,10,1.0,1182.0,1182.0,1182.0,1182.0,0.045492,1182.0
1635142,1525967220,10,1.0,1496.0,1496.0,1496.0,1496.0,0.195,1496.0
1635229,1525967700,10,2.0,1496.0,1496.0,1496.0,1496.0,0.07,1496.0
1635389,1525968600,10,3.0,1100.2,1100.2,1090.7,1100.2,0.06986,1091.692637
1636138,1525972740,10,1.0,1488.0,1488.0,1488.0,1488.0,0.001,1488.0


In [33]:
y_tt.head()

1634867    0.0
1635142    0.0
1635229    0.0
1635389    0.0
1636138    0.0
Name: Target, dtype: float64

In [34]:
X_tt.info(show_counts = True)
# BTC: 403948 non-null
# Maker: 376700 non-null
# without the fill_time_gap() process, 670488 non-null

<class 'pandas.core.frame.DataFrame'>
Int64Index: 670497 entries, 1634867 to 24236802
Data columns (total 9 columns):
 #   Column     Non-Null Count   Dtype  
---  ------     --------------   -----  
 0   timestamp  670497 non-null  int64  
 1   Asset_ID   670497 non-null  int64  
 2   Count      670497 non-null  float64
 3   Open       670497 non-null  float64
 4   High       670497 non-null  float64
 5   Low        670497 non-null  float64
 6   Close      670497 non-null  float64
 7   Volume     670497 non-null  float64
 8   VWAP       670488 non-null  float64
dtypes: float64(7), int64(2)
memory usage: 51.2 MB


In [35]:
X_tt.describe()

Unnamed: 0,timestamp,Asset_ID,Count,Open,High,Low,Close,Volume,VWAP
count,670497.0,670497.0,670497.0,670497.0,670497.0,670497.0,670497.0,670497.0,670488.0
mean,1611197000.0,10.0,49.94109,1820.200936,1823.634975,1817.007973,1820.190846,8.235375,
std,14043530.0,0.0,106.711867,1347.895142,1351.104027,1345.147613,1347.878172,20.960418,
min,1525966000.0,10.0,1.0,166.5,166.5,42.0,166.5,-0.366281,-inf
25%,1601981000.0,10.0,10.0,548.5316,549.36,547.57,548.5362,0.716103,548.5552
50%,1612039000.0,10.0,21.0,1544.528667,1549.44,1540.53,1544.512667,2.495282,1544.957
75%,1622097000.0,10.0,50.0,2790.5456,2793.8983,2787.0,2790.534025,7.47048,2790.477
max,1632182000.0,10.0,8885.0,6337.542817,10000.0,6308.59,6342.71278,1618.90478,inf


In [36]:
print(y_tt.size - np.isnan(y_tt).sum())
# BTC: 403948
# Maker: 376700
# Maker without reindexing time: 670497

670497


In [37]:
print(np.any(np.isnan(X_tt)))
#print(np.all(np.isfinite(X_tt)))
print(np.any(np.isinf(X_tt)))

print(np.any(np.isnan(y_tt)))
# print(np.all(np.isfinite(y_tt)))
print(np.any(np.isinf(y_tt)))

True
True
False
False


In [38]:
X_tt.loc[np.isinf(X_tt).any(1)]

Unnamed: 0,timestamp,Asset_ID,Count,Open,High,Low,Close,Volume,VWAP
15008467,1592193180,10,51.0,477.39785,480.91,471.9999,476.9179,61.64614,inf
15021332,1592251080,10,12.0,522.36745,526.6249,518.1,522.04785,2.320164,inf
15039285,1592333460,10,25.0,530.2927,530.9067,529.87,529.9846,7.682915,inf
15087923,1592560440,10,7.0,515.25575,517.1,513.4096,516.0128,0.197087,inf
15113365,1592680740,10,5.0,512.0046,512.9992,511.0,512.0046,0.5,inf
15143374,1592824560,10,6.0,503.6,503.6,502.0,503.6,0.0,-inf
15143593,1592825580,10,20.0,500.9,502.0,499.8,500.97995,8.106645,inf


In [39]:
X_tt.loc[np.isnan(X_tt).any(1)]

Unnamed: 0,timestamp,Asset_ID,Count,Open,High,Low,Close,Volume,VWAP
15004269,1592173560,10,2.0,501.0,501.0,501.0,501.0,0.0,
15004283,1592173620,10,4.0,501.0,501.0,501.0,501.0,0.0,
15059232,1592426160,10,2.0,529.77,529.77,529.77,529.77,0.0,
15143187,1592823720,10,2.0,503.6,503.6,503.6,503.6,0.0,
15183088,1593008940,10,2.0,484.16,484.16,484.16,484.16,0.0,
15184216,1593013920,10,2.0,480.0,480.0,480.0,480.0,0.0,
15184243,1593014040,10,2.0,480.0,480.0,480.0,480.0,0.0,
15184309,1593014340,10,6.0,479.07,479.07,479.07,479.07,0.0,
15184778,1593016440,10,4.0,478.0,478.0,475.0,478.0,0.0,


In [40]:
X_tt.loc[(X_tt==0).any(1)]

Unnamed: 0,timestamp,Asset_ID,Count,Open,High,Low,Close,Volume,VWAP
15004269,1592173560,10,2.0,501.0,501.0,501.0,501.0,0.0,
15004283,1592173620,10,4.0,501.0,501.0,501.0,501.0,0.0,
15059232,1592426160,10,2.0,529.77,529.77,529.77,529.77,0.0,
15143187,1592823720,10,2.0,503.6,503.6,503.6,503.6,0.0,
15143374,1592824560,10,6.0,503.6,503.6,502.0,503.6,0.0,-inf
15183088,1593008940,10,2.0,484.16,484.16,484.16,484.16,0.0,
15184216,1593013920,10,2.0,480.0,480.0,480.0,480.0,0.0,
15184243,1593014040,10,2.0,480.0,480.0,480.0,480.0,0.0,
15184309,1593014340,10,6.0,479.07,479.07,479.07,479.07,0.0,
15184778,1593016440,10,4.0,478.0,478.0,475.0,478.0,0.0,


In [41]:
# X_tt = X_tt.replace((np.inf, -np.inf), 0.0001)
# RuntimeWarning: divide by zero encountered in log
# assign to a very small number instead of 0

In [42]:
# X_tt = X_tt.fillna(0.0001)

In [43]:
# X_tt = get_features(X_tt)

In [44]:
# X_tt = X_tt.fillna(0.0001)

In [45]:
# X_tt = standard_scaler.fit_transform(X_tt)

In [46]:
# X_tt[:5]

In [47]:
# X_tt = X_tt.fillna(0.0001)

In [48]:
print(sum(np.isnan(y_tt)))
print(sum(np.isinf(y_tt)))

0
0


In [49]:
# lr.fit(X_tt, y_tt)

In [50]:
# model_tt = pipe_lr.fit(X_tt, y_tt)

In [51]:
# in-sample prediction

# y_tt_pred = model_tt.predict(X_tt)
# # print("In-sample test score for Linear Regression - BTC: {:.4f}".format(get_corr(y_tt_pred, y_tt)))
# print("In-sample test score for Linear Regression - Maker: {:.4f}".format(get_corr(y_tt_pred, y_tt)))   # 0.0309

In [52]:
dict_pred = {
    'row_id': np.arange(50),
    'Target': np.zeros(50)
}

# df_pred = pd.DataFrame(np.array([[0,0]]),
#                        columns=['row_id', 'Target'])
df_pred = pd.DataFrame(data=dict_pred)

df_test = X_tt[:50].copy()
df_test.insert(0, 'row_id', np.arange(len(df_test)))               

In [53]:
df_pred.head()

Unnamed: 0,row_id,Target
0,0,0.0
1,1,0.0
2,2,0.0
3,3,0.0
4,4,0.0


In [54]:
df_test.head()

Unnamed: 0,row_id,timestamp,Asset_ID,Count,Open,High,Low,Close,Volume,VWAP
1634867,0,1525965660,10,1.0,1182.0,1182.0,1182.0,1182.0,0.045492,1182.0
1635142,1,1525967220,10,1.0,1496.0,1496.0,1496.0,1496.0,0.195,1496.0
1635229,2,1525967700,10,2.0,1496.0,1496.0,1496.0,1496.0,0.07,1496.0
1635389,3,1525968600,10,3.0,1100.2,1100.2,1090.7,1100.2,0.06986,1091.692637
1636138,4,1525972740,10,1.0,1488.0,1488.0,1488.0,1488.0,0.001,1488.0


In [55]:
k = 0
for j, row in df_test.iterrows():
    if k == 1:
        break
    print(row)
    k += 1

row_id       0.000000e+00
timestamp    1.525966e+09
Asset_ID     1.000000e+01
Count        1.000000e+00
Open         1.182000e+03
High         1.182000e+03
Low          1.182000e+03
Close        1.182000e+03
Volume       4.549208e-02
VWAP         1.182000e+03
Name: 1634867, dtype: float64


A problem I encountered: not able to break at j==5. The loop just kept running until all the rows are returned.\
Solution: 
[While Loop does not break and runs infinitely (Python)](https://stackoverflow.com/questions/58974030/while-loop-does-not-break-and-runs-infinitely-python)

In [56]:
X_train = {}
y_train = {}
model_lr = {}
y_insmpl_pred = {}
score_insmpl = {}

for asset_id, asset_name in zip(df_asset_details['Asset_ID'], df_asset_details['Asset_Name']):
    print(f"Training model for {asset_name:<16} (ID={asset_id:<2})...")
    
    X_train[asset_id], y_train[asset_id] = get_asset_data(df_train, asset_id)
    model_lr[asset_id] = pipe_lr.fit(X_train[asset_id], y_train[asset_id])
    y_insmpl_pred[asset_id] = model_lr[asset_id].predict(X_train[asset_id])
    score_insmpl[asset_id] = get_corr(y_insmpl_pred[asset_id], y_train[asset_id])
    
    print(f"In-sample test score for {asset_name:<16} {score_insmpl[asset_id]:.4f}")

Training model for Bitcoin Cash     (ID=2 )...
In-sample test score for Bitcoin Cash     0.0150
Training model for Binance Coin     (ID=0 )...
In-sample test score for Binance Coin     0.0086
Training model for Bitcoin          (ID=1 )...
In-sample test score for Bitcoin          0.0159
Training model for EOS.IO           (ID=5 )...
In-sample test score for EOS.IO           0.0241
Training model for Ethereum Classic (ID=7 )...
In-sample test score for Ethereum Classic 0.0155
Training model for Ethereum         (ID=6 )...
In-sample test score for Ethereum         0.0189
Training model for Litecoin         (ID=9 )...
In-sample test score for Litecoin         0.0198
Training model for Monero           (ID=11)...
In-sample test score for Monero           0.0203
Training model for TRON             (ID=13)...
In-sample test score for TRON             0.0328
Training model for Stellar          (ID=12)...
In-sample test score for Stellar          0.0146
Training model for Cardano          (ID=

RuntimeWarning: invalid value encountered in reduce\
      9 \
     10     X_train[asset_id], y_train[asset_id] = get_asset_data(df_train, asset_id)\
---> 11     model_lr[asset_id] = pipe_lr.fit(X_train[asset_id], y_train[asset_id])\
     12     y_insmpl_pred[asset_id] = model_lr[asset_id].predict(X_train[asset_id])\
     13     score_insmpl[asset_id] = get_corr(y_insmpl_pred[asset_id], y_train[asset_id])

01/30/2022    changed preprocessing to handle Inf, NaN, and 0s
```
Training model for Bitcoin Cash     (ID=2 )...
In-sample test score for Bitcoin Cash     0.0311
Training model for Binance Coin     (ID=0 )...
In-sample test score for Binance Coin     0.0265
Training model for Bitcoin          (ID=1 )...
In-sample test score for Bitcoin          0.0333
Training model for EOS.IO           (ID=5 )...
In-sample test score for EOS.IO           0.0348
Training model for Ethereum Classic (ID=7 )...
In-sample test score for Ethereum Classic 0.0348
Training model for Ethereum         (ID=6 )...
In-sample test score for Ethereum         0.0433
Training model for Litecoin         (ID=9 )...
In-sample test score for Litecoin         0.0435
Training model for Monero           (ID=11)...
In-sample test score for Monero           0.0224
Training model for TRON             (ID=13)...
In-sample test score for TRON             0.0328
Training model for Stellar          (ID=12)...
In-sample test score for Stellar          0.0146
Training model for Cardano          (ID=3 )...
In-sample test score for Cardano          0.0392
Training model for IOTA             (ID=8 )...
In-sample test score for IOTA             0.0082
Training model for Maker            (ID=10)...
/opt/conda/lib/python3.7/site-packages/pandas/core/arraylike.py:364: RuntimeWarning: invalid value encountered in log
  result = getattr(ufunc, method)(*inputs, **kwargs)
/opt/conda/lib/python3.7/site-packages/pandas/core/arraylike.py:364: RuntimeWarning: invalid value encountered in log
  result = getattr(ufunc, method)(*inputs, **kwargs)
In-sample test score for Maker            0.0307
Training model for Dogecoin         (ID=4 )...
In-sample test score for Dogecoin         0.0388
```

01/30/2022    Abandoned log return as predictors because the test dataframes don't have continuous timeseries data
```
Training model for Bitcoin Cash     (ID=2 )...
In-sample test score for Bitcoin Cash     0.0150
Training model for Binance Coin     (ID=0 )...
In-sample test score for Binance Coin     0.0086
Training model for Bitcoin          (ID=1 )...
In-sample test score for Bitcoin          0.0159
Training model for EOS.IO           (ID=5 )...
In-sample test score for EOS.IO           0.0241
Training model for Ethereum Classic (ID=7 )...
In-sample test score for Ethereum Classic 0.0155
Training model for Ethereum         (ID=6 )...
In-sample test score for Ethereum         0.0189
Training model for Litecoin         (ID=9 )...
In-sample test score for Litecoin         0.0198
Training model for Monero           (ID=11)...
In-sample test score for Monero           0.0203
Training model for TRON             (ID=13)...
In-sample test score for TRON             0.0328
Training model for Stellar          (ID=12)...
In-sample test score for Stellar          0.0146
Training model for Cardano          (ID=3 )...
In-sample test score for Cardano          0.0344
Training model for IOTA             (ID=8 )...
In-sample test score for IOTA             0.0060
Training model for Maker            (ID=10)...
In-sample test score for Maker            0.0309
Training model for Dogecoin         (ID=4 )...
In-sample test score for Dogecoin         0.0315
```

In [57]:
# make predictions - TEST
for j, row in df_test.iterrows():
    asset_id = row['Asset_ID']
    y_pred = model_lr[asset_id].predict(row.to_frame().T)[0]
#     y_pred = model_lr[asset_id].predict([row])[0] # 'list' object has no attribute 'fillna'
    df_pred.loc[df_pred['row_id']==row['row_id'], 'Target'] = y_pred
        

In [58]:
import traceback

df_test_all = {}
df_pred_all = {}

env = gresearch_crypto.make_env()
iter_test = env.iter_test()


In [59]:
for i, (df_test, df_pred) in enumerate(iter_test):
    
    # make predictions
    for j, row in df_test.iterrows():
        asset_id = row['Asset_ID']
        try:
            y_pred = model_lr[asset_id].predict(row.to_frame().T)[0]
        except:
            y_pred = 0.0
            traceback.print_exc()
        df_pred.loc[df_pred['row_id']==row['row_id'], 'Target'] = y_pred
        
    # store test dataframes
    df_test_all[i] = df_test
    df_pred_all[i] = df_pred
    
    # submit predictions
    env.predict(df_pred)

This version of the API is not optimized and should not be used to estimate the runtime of your code on the hidden test set.


Ran into problem with row level prediction because 1 row cannot calculate lag(n) return.
```
AttributeError: 'numpy.float64' object has no attribute 'diff'
```

ValueError: Expected 2D array, got 1D array instead:
array=[1.48143939e+00 1.47855583e+00 1.48603000e+00 1.47800000e+00
 1.48368133e+00 2.34866667e-03 5.55833333e-04].
Reshape your data either using array.reshape(-1, 1) if your data has a single feature or array.reshape(1, -1) if it contains a single sample.

----> 1 for i, (df_test, df_pred) in enumerate(iter_test):
      2 
      3     # make predictions
      4     for j, row in df_test.iterrows():
      5         asset_id = row['Asset_ID']

TypeError: cannot unpack non-iterable NoneType object

In [60]:
df_test_all

{0:      timestamp  Asset_ID  Count          Open          High           Low  \
 0   1623542400         3   1201      1.478556      1.486030      1.478000   
 1   1623542400         2   1020    580.306667    583.890000    579.910000   
 2   1623542400         0    626    343.789500    345.108000    343.640000   
 3   1623542400         1   2888  35554.289632  35652.464650  35502.670000   
 4   1623542400         4    433      0.312167      0.312600      0.311920   
 5   1623542400         5    359      4.832550      4.845900      4.822900   
 6   1623542400         7    541     55.223080     55.494000     55.182000   
 7   1623542400         6   2186   2371.194286   2379.200000   2369.670000   
 8   1623542400         8     35      1.003150      1.019800      0.987300   
 9   1623542400         9    560    161.933429    162.480000    161.730000   
 10  1623542400        10     61   2939.862750   2952.160000   2936.230000   
 11  1623542400        13    229      0.068132      0.068240 

In [61]:
df_pred_all

{0:     row_id     Target
 0        0  -0.002162
 1        1  -1.005113
 2        2  -0.315059
 3        3 -21.113435
 4        4  -0.000229
 5        5  -0.001391
 6        6  -0.102449
 7        7  -2.620626
 8        8  -0.001866
 9        9  -0.043631
 10      10  -2.129429
 11      11   0.000047
 12      12  -0.000140
 13      13  -0.151552,
 1:     row_id    Target
 0       14 -0.000227
 1       15 -0.573593
 2       16 -0.064789
 3       17  7.772273
 4       18  0.000144
 5       19  0.000327
 6       20 -0.025276
 7       21  0.438052
 8       22 -0.000804
 9       23 -0.040779
 10      24 -1.735704
 11      25  0.000032
 12      26 -0.000030
 13      27  0.033135,
 2:     row_id     Target
 0       28   0.000669
 1       29   0.368379
 2       30   0.167458
 3       31  19.722343
 4       32   0.000245
 5       33   0.004326
 6       34   0.072100
 7       35   1.651156
 8       36   0.000285
 9       37  -0.024190
 10      38   2.949211
 11      39   0.000121
 12      40   0

In [62]:
file_smpl_subm = 'example_sample_submission.csv'

df_smpl_subm = pd.read_csv(os.path.join(dir_in, file_smpl_subm))

In [63]:
df_smpl_subm.head()

Unnamed: 0,group_num,row_id,Target
0,0,0,0
1,0,1,0
2,0,2,0
3,0,3,0
4,0,4,0


In [64]:
df_subm_wgid = pd.DataFrame(columns = df_smpl_subm.columns)

In [65]:
df_subm = pd.DataFrame(columns = ['row_id', 'Target'])

In [66]:
for group_num, df_pred in df_pred_all.items():
    df = df_pred.copy()
    
    # without group_num
    df_subm = df_subm.append(df)
    
    # with group_num
    df['group_num'] = group_num
    df_subm_wgid = df_subm_wgid.append(df)

In [67]:
df_subm.head()

Unnamed: 0,row_id,Target
0,0,-0.002162
1,1,-1.005113
2,2,-0.315059
3,3,-21.113435
4,4,-0.000229


In [68]:
df_subm_wgid.head()

Unnamed: 0,group_num,row_id,Target
0,0,0,-0.002162
1,0,1,-1.005113
2,0,2,-0.315059
3,0,3,-21.113435
4,0,4,-0.000229


In [69]:
df_subm.to_csv('submission.csv', index=False)
df_subm_wgid.to_csv('submission_with_group_num.csv', index=False)