# Imports

In [None]:
import os

import numpy as np
import pandas as pd

from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LinearRegression
from sklearn.multioutput import MultiOutputRegressor

# Loading Data

Update `data_folder` below and confirm that it points to the folder containing the CSV files.

In [None]:
# Folder holding the input CSV files. The trailing slash matters: the loading
# cells build file paths by plain string concatenation with this value.
data_folder = "../input/time-series-crypto-forecasting/"
# IPython shell escape: list the folder contents to confirm the path is valid.
!ls  $data_folder

In [None]:
# Asset metadata table; its Asset_ID and Asset_Name columns drive the
# per-asset split further down.
asset_details_path = f'{data_folder}asset_details.csv'
asset_details = pd.read_csv(asset_details_path)
asset_details.head()

In [None]:
# Training rows for all assets in a single frame.
train_path = f'{data_folder}train.csv'
df_train = pd.read_csv(train_path)
df_train.head()

In [None]:
# Test rows for all assets in a single frame.
test_path = f'{data_folder}test.csv'
df_test = pd.read_csv(test_path)
df_test.head()

# Splitting and Preprocessing

In [None]:
# Splitting: one {'name', 'df'} entry per asset, for train and test alike.
# Both lists are filled in the row order of asset_details, so position i in
# `assets` and in `test_assets` always refers to the same asset.
def _asset_frame(df, asset_id):
    """Return the rows of *df* for *asset_id* without the id/index columns."""
    return df[df["Asset_ID"] == asset_id].drop(['Asset_ID', 'Unnamed: 0'], axis=1)


assets = []
test_assets = []

for _, row in asset_details.iterrows():
    assets.append({'name': row['Asset_Name'],
                   'df': _asset_frame(df_train, row['Asset_ID'])})
    test_assets.append({'name': row['Asset_Name'],
                        'df': _asset_frame(df_test, row['Asset_ID'])})

In [None]:
# For every train and test frame:
#   1. replace every missing value (including missing targets) with 0,
#   2. index the rows by their timestamp,
#   3. reindex onto a regular grid with a step of 60, forward-filling gaps so
#      consecutive rows are always the same distance apart,
#   4. convert the index (interpreted as seconds) to datetimes.

for asset in assets:
    frame = asset['df'].fillna(0).set_index('timestamp')
    grid = range(frame.index[0], frame.index[-1] + 60, 60)
    frame = frame.reindex(grid, method='pad')
    frame.index = pd.to_datetime(frame.index, unit='s')
    asset['df'] = frame

for test_asset in test_assets:
    frame = test_asset['df'].fillna(0).set_index('timestamp')
    grid = range(frame.index[0], frame.index[-1] + 60, 60)
    frame = frame.reindex(grid, method='pad')
    frame.index = pd.to_datetime(frame.index, unit='s')
    test_asset['df'] = frame

In [None]:
# Print the first and last timestamp of every training frame ...
for asset in assets:
    span = asset['df'].index
    print(f"{asset['name']:<20} {span[0]} -> {span[-1]}")

# ... a visual gap ...
print(5 * '\n')

# ... and the same for every test frame.
for asset in test_assets:
    span = asset['df'].index
    print(f"{asset['name']:<20} {span[0]} -> {span[-1]}")

# Linear Regression

## Directly

In [None]:
# Standardise the features, then fit a plain linear regression on them.
direct_steps = [
    ('scaler', StandardScaler()),
    ('liner', LinearRegression()),
]
direct_pipeline = Pipeline(direct_steps)

In [None]:
# First asset only: every column except Target is a feature, Target is the
# value to predict.
train_frame = assets[0]['df']
test_frame = test_assets[0]['df']

X_train = train_frame.drop(columns=['Target'])
y_train = train_frame['Target']

X_test = test_frame.drop(columns=['Target'])
y_test = test_frame['Target']

In [None]:
# Fit on the training data, then predict the test targets in one chain
# (fit returns the fitted pipeline itself).
direct_predicted = direct_pipeline.fit(X_train, y_train).predict(X_test)

In [None]:
# Correlation between predictions and true test targets; [0,1] picks the
# off-diagonal entry of the 2x2 correlation matrix.
np.corrcoef(direct_predicted, y_test)[0,1]

## MultiOutputRegressor

In [None]:
# asset 0
# NOTE(review): X_train_0, y_train_0 and X_test_0 are rebound by the
# "asset 3" cell that follows, so only y_test from this cell is still in
# effect when the combined arrays are built. y_test is the same expression
# as in the direct-regression section.

X_train_0 = assets[0]['df'].drop(['Target'], axis=1)
y_train_0 = assets[0]['df'].Target

X_test_0 = test_assets[0]['df'].drop(['Target'], axis=1)
y_test = test_assets[0]['df'].Target

In [None]:
# asset 3
# NOTE(review): despite the _0 suffix, these names hold the frames of
# assets[3] / test_assets[3]. The names are left unchanged because the
# following cells refer to them.

X_train_0 = assets[3]['df'].drop(['Target'], axis=1)
y_train_0 = assets[3]['df'].Target

X_test_0 = test_assets[3]['df'].drop(['Target'], axis=1)
y_test_0 = test_assets[3]['df'].Target

In [None]:
# Put the two assets' features side by side (the *_0 features first) and
# stack their targets as two columns: column 0 is y_*_0, column 1 is y_*.
# NOTE(review): the concatenation is positional, not index-aligned, and needs
# both frames to have the same number of rows — TODO confirm both assets
# cover the same time span.
x_both_train = np.concatenate((X_train_0, X_train), axis=1)
x_both_test = np.concatenate((X_test_0, X_test), axis=1)

y_both_train = np.column_stack((y_train_0, y_train))
y_both_test = np.column_stack((y_test_0, y_test))

In [None]:
# Standardise the joint feature matrix, then fit one linear regression per
# target column through MultiOutputRegressor.
multiple_steps = [
    ('scaler', StandardScaler()),
    ('multiple', MultiOutputRegressor(LinearRegression())),
]
multiple_pipeline = Pipeline(multiple_steps)

In [None]:
# Fit on the combined training arrays and predict both targets for the test
# rows (fit returns the fitted pipeline itself).
y_both_predict = multiple_pipeline.fit(x_both_train, y_both_train).predict(x_both_test)

In [None]:
# Per-target correlation between prediction and truth. Prediction column 1
# pairs with y_test and column 0 with y_test_0, matching the column order
# used when the target arrays were stacked.
np.corrcoef(y_both_predict[:,1], y_test)[0,1], np.corrcoef(y_both_predict[:,0], y_test_0)[0,1]

## Feature Engineering

In [None]:
def upper_shadow(df: pd.DataFrame) -> pd.Series:
    """Return High minus the larger of Close and Open, row by row."""
    return df['High'] - np.maximum(df['Close'], df['Open'])


def lower_shadow(df: pd.DataFrame) -> pd.Series:
    """Return the smaller of Close and Open minus Low, row by row."""
    return np.minimum(df['Close'], df['Open']) - df['Low']


def get_features(df: pd.DataFrame, lag_count: int) -> pd.DataFrame:
    """Build the feature frame used by the lag-based regression.

    Args:
        df: Frame with at least the columns Count, Open, High, Low, Close,
            Volume and VWAP. It is not modified.
        lag_count: Number of lagged Open columns to add (lag_1 .. lag_N).

    Returns:
        A new frame with Count, Open, Volume, VWAP, Upper_Shadow,
        Lower_Shadow and the lag columns. High, Low and Close are only used
        to derive the shadows and are dropped from the result.
    """
    df_feat = df[['Count', 'Open', 'High', 'Low', 'Close', 'Volume', 'VWAP']].copy()
    df_feat['Upper_Shadow'] = upper_shadow(df_feat)
    df_feat['Lower_Shadow'] = lower_shadow(df_feat)

    for lag in range(1, lag_count + 1):
        df_feat[f'lag_{lag}'] = df['Open'].shift(lag)

    # Shifting leaves NaN in the first `lag` rows of each lag column; fill
    # them from the next valid value. DataFrame.bfill() is the supported
    # spelling of the deprecated fillna(method='bfill').
    df_feat = df_feat.bfill()

    return df_feat.drop(['High', 'Low', 'Close'], axis=1)

In [None]:
# Sweep the number of lag features for every asset.
# For each lag count the pipeline is fitted on the asset's training frame and
# scored on the matching test frame; the score is the correlation between the
# predictions and the true test targets. `assets` and `test_assets` are built
# in the same order, so zipping them pairs each asset with its own test data.
results = {}
for asset, test_asset in zip(assets, test_assets):

    temp = {}

    # The targets do not depend on the lag count, so read them once per asset.
    y_train = asset['df'].Target
    y_test = test_asset['df'].Target

    for lag_count in range(2, 35):
        X_train = get_features(asset['df'], lag_count)
        X_test = get_features(test_asset['df'], lag_count)

        direct_pipeline.fit(X_train, y_train)
        direct_predicted = direct_pipeline.predict(X_test)

        temp[lag_count] = np.corrcoef(direct_predicted, y_test)[0, 1]

    results[asset['name']] = temp
    print(asset['name'] + ' done ...')

In [None]:
# One column per asset name, one row per lag count; values are correlations.
pd.DataFrame(results)

In [None]:
# Mean correlation per asset (one value per column), averaged over all lag counts.
pd.DataFrame(results).mean(axis=0)


In [None]:
# Mean correlation per lag count (one value per row), averaged over all assets.
pd.DataFrame(results).mean(axis=1)