In [6]:
import random
import keras
import tensorflow as tf
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import MinMaxScaler
from sklearn.linear_model import LinearRegression 
from tensorflow.keras import Sequential
from tensorflow.keras.layers import Dense, LSTM, Dropout, Bidirectional, GRU, LayerNormalization

In [9]:
import os

from google.colab import drive
drive.mount('/content/drive')

os.chdir('/content/drive/MyDrive/Lab4')
os.listdir()

Mounted at /content/drive


['Code',
 'supplemental_train.csv',
 'train.csv',
 'example_sample_submission.csv',
 'example_test.csv',
 'asset_details.csv',
 'notebook98543790ec.ipynb',
 'ans.csv',
 'visualization.ipynb',
 'Lab4.ipynb',
 'bi-lstm-lstm-grus-multi-step-forecasting.ipynb']

In [10]:
tf.random.set_seed(202201)

In [None]:
assetDetailsData = pd.read_csv('asset_details.csv')
trainData = pd.read_csv('train.csv')

In [None]:
assetDetailsData

Unnamed: 0,Asset_ID,Weight,Asset_Name
0,2,2.397895,Bitcoin Cash
1,0,4.304065,Binance Coin
2,1,6.779922,Bitcoin
3,5,1.386294,EOS.IO
4,7,2.079442,Ethereum Classic
5,6,5.894403,Ethereum
6,9,2.397895,Litecoin
7,11,1.609438,Monero
8,13,1.791759,TRON
9,12,2.079442,Stellar


In [None]:
def extract_feature_data_train(train):
    trainData = train
    trainData['DateAndTime'] = pd.to_datetime(trainData['timestamp'], unit='s',utc = True,infer_datetime_format = True)
    trainData['Date'] = trainData.DateAndTime.dt.date
    trainData['Date'] = trainData['Date'].astype('datetime64[ns]')
    trainData.set_index(['DateAndTime'], inplace=True)
    # Data Range    
    startDate = '2021-7-01'
    endDate = '2021-09-21'
    mask = (trainData['Date'] > startDate) & (trainData['Date'] <= endDate)
    newTrainData = trainData.loc[mask]
    
    # Infinite Data
    newTrainData['VWAP'].interpolate(inplace=True)

    VMAX = np.max(newTrainData[np.isfinite(newTrainData.VWAP)].VWAP)
    VMIN = np.min(newTrainData[np.isfinite(newTrainData.VWAP)].VWAP)

    tmp_V = np.nan_to_num(newTrainData.VWAP, posinf=VMAX, neginf=VMIN)
    newTrainData['VWAP'] = tmp_V
    
    # Outlier detection
    upperFence = newTrainData['Target'].mean() + 2*newTrainData['Target'].std()
    lowwerFence = newTrainData['Target'].mean() - 2*newTrainData['Target'].std()

    # Replace outlier by interpolation for base consumption
    newTrainData.loc[newTrainData['Target'] > upperFence, 'Target'] = np.nan
    newTrainData.loc[newTrainData['Target'] < lowwerFence, 'Target'] = np.nan
    newTrainData['Target'].interpolate(inplace=True)
    
    #上影線
    newTrainData['Up_shadow'] = newTrainData['High'] - np.maximum(newTrainData['Close'],newTrainData['Open'])
    #下影線
    newTrainData['Down_shadow'] = np.minimum(newTrainData['Close'],newTrainData['Open']) - newTrainData['Low']
    
    return newTrainData

<a id="14"></a> <br>
# Train and Test Data Set Plot

In [2]:
# Create LSTM or GRU model
def create_model(X_train, n, m):
    model = Sequential()
    model.add(m(units = n ,return_sequences = True,input_shape = [X_train.shape[1], X_train.shape[2]]))
    model.add(Dropout(0.2))
    model.add(LayerNormalization())
    model.add(m(units = n))
    model.add(Dropout(0.2))
    model.add(LayerNormalization())
    model.add(Dense(units = 1,activation='relu',kernel_regularizer=keras.regularizers.l2(0.01)))
    #Compile model
    model.compile(loss='mse', optimizer='adam')
    return model

In [None]:
# Create a 3D input for Scikit-Learn
def create_dataset (X, y, time_steps = 1):
    Xs, ys = [], []
    for i in range(len(X)-time_steps):
        v = X[i:i+time_steps, :]
        Xs.append(v)
        ys.append(y[i+time_steps])
    return np.array(Xs), np.array(ys)

In [None]:
# Fit BiLSTM, LSTM and GRU
def fit_model(model,X,Y):
    early_stop = keras.callbacks.EarlyStopping(monitor = 'val_loss',patience = 5)
    history = model.fit(X, Y, epochs = 100, validation_split = 0.2, batch_size = 1024, shuffle = False, callbacks = [early_stop], verbose=0)
    return model, history

In [None]:
models = {}
historys = {}
Y_origin = {}
Y_pred = {}

In [None]:
trainData = extract_feature_data_train(trainData)

# Split train data and test data
train_size = int(len(trainData)*0.8)
trainData, testData = trainData.iloc[:train_size],trainData.iloc[train_size:]

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  isetter(loc, value)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a

In [None]:
import itertools as it

In [None]:
grid_search={
    'model':[LSTM,GRU],
    'n':range(50,200,50)
}

all_para = sorted(grid_search)
combination = it.product(*(grid_search[name] for name in all_para))
# print(list(combination))

for m,n in list(combination):
    print(m)
    print(n)

<class 'keras.layers.recurrent_v2.LSTM'>
50
<class 'keras.layers.recurrent_v2.LSTM'>
100
<class 'keras.layers.recurrent_v2.LSTM'>
150
<class 'keras.layers.recurrent_v2.GRU'>
50
<class 'keras.layers.recurrent_v2.GRU'>
100
<class 'keras.layers.recurrent_v2.GRU'>
150


In [None]:
for i in assetDetailsData['Asset_ID'].unique():
    grid_loss = 10000
    tmp_train_data = trainData[trainData['Asset_ID']==i]
    tmp_test_data = testData[testData['Asset_ID']==i]

    # train_data = extract_feature_data_train(tmp_train_data)
    
    ## Split train data to X and y
    X_train = tmp_train_data.drop(['timestamp','Asset_ID','Target','Date'], axis = 1)
    Y_train = tmp_train_data.loc[:,['Target']]

    # Split test data to X and y
    X_test = tmp_test_data.drop(['timestamp','Asset_ID','Target','Date'], axis = 1)
    Y_test = tmp_test_data.loc[:,['Target']]
    
    # print("X,Y Train Finish")
    
    # MinMaxScaler is used to normalize the data
    scaler = MinMaxScaler()

    # Apply the scaler to training data
    X_train = scaler.fit_transform(X_train)
    X_test = scaler.transform(X_test)
    Y_train = scaler.fit_transform(Y_train)
    Y_test = scaler.transform(Y_test)
    
    # print("Scaler Finish")
    
    # Create a 3D input for Scikit-Learn
    TIME_STEPS = 30
    X_train, Y_train = create_dataset(np.array(X_train), np.array(Y_train),TIME_STEPS)
    # print("Training Data TimeSteps Finish")
    X_test, Y_test = create_dataset(np.array(X_test), np.array(Y_test),TIME_STEPS)
    # print("Testing Data TimeSteps Finish")


    grid_search={
        'model':[GRU, LSTM],
        'n':range(32,65,16)
    }

    all_para = sorted(grid_search)
    combination = list(it.product(*(grid_search[name] for name in all_para)))
    
    best_m = 0
    best_n = 0

    for m,n in combination:
        model_lstm = create_model(X_train, n, m)
        tmp_model , tmp_loss = fit_model(model_lstm,X_train,Y_train)
        print(tmp_loss.history["val_loss"][-1])
        if tmp_loss.history["val_loss"][-1] < grid_loss:
            grid_loss = tmp_loss.history["val_loss"][-1]
            models[i] = tmp_model
            historys[i] = tmp_loss
            best_m = m
            best_n = n

    print("asset id:",i)
    print("Best_m",best_m)
    print("Best_n",best_n)

    # # model
    # model_lstm = create_model(X_train)
    # print("Create Model Finish")
    
    # models[i],historys[i] = fit_model(model_lstm,X_train,Y_train)
    print(i," finish")
    
    Y_origin[i] = Y_test
    Y_pred[i] = models[i].predict(X_test)


0.010637782514095306
0.010572550818324089
0.010573976673185825
0.010565757751464844
0.01056174747645855
0.01055835746228695
asset id: 2
Best_m <class 'keras.layers.recurrent_v2.LSTM'>
Best_n 64
2  finish
0.011632467620074749
0.011638560332357883
0.011621491983532906
0.01164538599550724
0.011648048646748066
0.01162766944617033
asset id: 0
Best_m <class 'keras.layers.recurrent_v2.GRU'>
Best_n 64
0  finish
0.0046346005983650684
0.005330951418727636
0.005393544211983681
0.004668569192290306
0.005342971067875624
0.0053751347586512566
asset id: 1
Best_m <class 'keras.layers.recurrent_v2.GRU'>
Best_n 32
1  finish
0.014574672095477581
0.01411112304776907
0.014096071012318134
0.014472437091171741
0.014167038723826408
0.014121194370090961
asset id: 5
Best_m <class 'keras.layers.recurrent_v2.GRU'>
Best_n 64
5  finish
0.015847720205783844
0.01589038409292698
0.015894364565610886
0.015950098633766174
0.01593170128762722
0.015943387523293495
asset id: 7
Best_m <class 'keras.layers.recurrent_v2.GRU'>

In [None]:
models[0].predict(X_train[:10])

In [None]:
Y_train[:10]

In [None]:
historys[0].history

In [None]:
fig,axes= plt.subplots(7, 2, figsize=(16,28), constrained_layout=True)
fig.suptitle('Crypto Target training loss', fontsize=20)
plt.legend(['Train loss', 'Validation loss'], loc='upper right',prop={'size': 15})
for i in range(14):
    now_crypto = historys[i]
    axes[i//2][i%2].plot(historys[i].history['loss'],color="#004C99")
    axes[i//2][i%2].plot(historys[i].history['val_loss'],color="#D96552")
    axes[i//2][i%2].set(title = assetDetailsData.Asset_Name[i] + " Training loss")
    xes[i//2][i%2].legend()

In [None]:
Y_origin[1]

In [None]:
fig,axes= plt.subplots(7, 2, figsize=(16,28), constrained_layout=True)
fig.suptitle('Crypto Target Target', fontsize=20)
for i in range(14):
    # plot_data = Y_origin[i][30:]
    # plot_data['Predict'] = Y_pred[i]
    # plot_data = plot_data[:200]
    # ymin = min(plot_data['Target'])
    # ymax = max(plot_data['Target'])

    # print("ymin:",ymin,"\nymax:",ymax)

    axes[i//2][i%2].plot(Y_origin[i][1000:1200], color="#004C99")
    axes[i//2][i%2].plot(Y_pred[i][1000:1200], color="#D96552")
    axes[i//2][i%2].set(title = assetDetailsData.Asset_Name[i] + " Target")
    # axes[i//2][i%2].set_ylim([ymin,ymax])

In [None]:
testData

In [None]:
Y_origin = {}
Y_pred = {}

In [None]:
for i in assetDetailsData['Asset_ID'].unique():
    tmp_test_data = testData[testData['Asset_ID']==i]
    # Split test data to X and y
    X_test = tmp_test_data.drop(['timestamp','Asset_ID','Count','VWAP','Target','Date'], axis = 1)
    Y_test = tmp_test_data.loc[:,['Target']]

    Y_origin[i] = Y_test
    X_test = scaler.fit_transform(X_test)
    Y_test = scaler.fit_transform(Y_test)

    TIME_STEPS = 30
    X_test, Y_test = create_dataset(np.array(X_test), np.array(Y_test),TIME_STEPS)
    print("TimeSteps Finish")

    Y_pred[i] = prediction(models[i],X_test)

    


In [None]:
fig,axes= plt.subplots(7, 2, figsize=(16,28), constrained_layout=True)
fig.suptitle('Crypto Target Target', fontsize=20)
for i in range(14):
    plot_data = Y_origin[i][30:]
    plot_data['Predict'] = Y_pred[i]
    plot_data = plot_data[:200]
    ymin = min(plot_data['Target'])
    ymax = max(plot_data['Target'])

    # print("ymin:",ymin,"\nymax:",ymax)

    axes[i//2][i%2].plot(plot_data.index, plot_data['Target'].values, color="#004C99")
    axes[i//2][i%2].plot(plot_data.index, plot_data['Predict'], color="#D96552")
    axes[i//2][i%2].set(title = assetDetailsData.Asset_Name[i] + " Target")
    axes[i//2][i%2].set_ylim([ymin,ymax])

In [None]:
# # Split test data to X and y
# X_test = testData.drop(['timestamp','Asset_ID','Count','VWAP','Target','Date'], axis = 1)
# Y_test = testData.loc[:,['Close']]

In [None]:
# # Apply the scaler to test data
# X_test = scaler.fit_transform(X_test)
# y_test = scaler.fit_transform(y_test)