In [2]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the read-only input directory and print the full path of every file in it.
for folder, _subdirs, names in os.walk('/kaggle/input'):
    for file_path in (os.path.join(folder, name) for name in names):
        print(file_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

1. (5 points) Step 1: Data collection - Data is already given by the Competition

In [3]:
import random
import keras
import tensorflow as tf
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import MinMaxScaler
from sklearn.linear_model import LinearRegression 
from tensorflow.keras import Sequential
from tensorflow.keras.layers import Dense, LSTM, Dropout
from sklearn.model_selection import KFold

Set the global seed. Because no operation-level seed is set here, each call to a random op returns a different result, but the sequence of results is the same on every re-run of the program.

In [4]:
# Seed every random-number source available to the notebook, not only TensorFlow:
# Python's `random` module, NumPy's global generator, and TensorFlow's global seed.
SEED = 1234
random.seed(SEED)
np.random.seed(SEED)
tf.random.set_seed(SEED)

For memory optimization and utilization.

In [5]:
def downcastMemoryUsage(dataFrame):
    """Reduce a DataFrame's memory footprint in place by narrowing column dtypes.

    * Integer columns are cast to the first type in ``subTypeInt`` (unsigned
      types first, then signed, each from narrow to wide) whose inclusive
      range contains the column's minimum and maximum.
    * Float columns are cast to the narrowest float type that covers the
      column's range AND reproduces its values within ``np.allclose``
      tolerance, so precision is not silently thrown away.
    * An object column named ``'date'`` is parsed as ``%Y-%m-%d`` datetimes.
    * Any other object column becomes ``category`` when fewer than half of
      its values are distinct.
    * Columns of any other dtype (datetime, category, bool, ...) are skipped,
      which also makes the function safe to call twice on the same frame.

    Memory usage before and after, and the percentage saved, are printed.

    Parameters
    ----------
    dataFrame : pandas.DataFrame
        Frame to optimise. It is modified in place.

    Returns
    -------
    pandas.DataFrame
        The same ``dataFrame`` object that was passed in.
    """
    startMemoryOptimization = dataFrame.memory_usage().sum() / 1024 ** 2
    print('Memory usage of dataframe is: {:.2f} MB'.format(startMemoryOptimization))
    subTypeInt = ['uint8','uint16','uint32','uint64','int8','int16','int32','int64']
    subTypeFloat = ['float16','float32','float64']
    for column in dataFrame.columns:
        columnType = str(dataFrame[column].dtypes)
        if 'int' in columnType:
            # min/max are computed only for numeric columns: calling them on an
            # unordered categorical column raises TypeError.
            minimumColumn = dataFrame[column].min()
            maximumColumn = dataFrame[column].max()
            for element in subTypeInt:
                limits = np.iinfo(element)
                # Inclusive bounds: a column starting at 0 fits an unsigned type,
                # and a value equal to the type's limit still fits.
                if minimumColumn >= limits.min and maximumColumn <= limits.max:
                    dataFrame[column] = dataFrame[column].astype(element)
                    break
        elif 'float' in columnType:
            minimumColumn = dataFrame[column].min()
            maximumColumn = dataFrame[column].max()
            originalValues = dataFrame[column].to_numpy(dtype='float64')
            for element in subTypeFloat:
                limits = np.finfo(element)
                if not (minimumColumn >= limits.min and maximumColumn <= limits.max):
                    continue
                candidate = dataFrame[column].astype(element)
                # Being in range is not enough: float16 keeps only ~3 significant
                # digits, so accept a narrower type only if it round-trips.
                if np.allclose(candidate.to_numpy(dtype='float64'), originalValues, equal_nan=True):
                    dataFrame[column] = candidate
                    break
        elif 'object' in columnType:
            if column =='date':
                dataFrame['date'] = pd.to_datetime(dataFrame['date'],format='%Y-%m-%d')
            else:
                numberOfUnique = len(dataFrame[column].unique())
                numberOfTotal = len(dataFrame[column])
                # Guard against an empty column (0 / 0).
                if numberOfTotal > 0 and numberOfUnique / numberOfTotal < 0.5:
                    dataFrame[column] = dataFrame[column].astype('category')
    endMemoryOptimization = dataFrame.memory_usage().sum() / 1024 ** 2
    print('Memory usage after optimization is: {:.2f} MB'.format(endMemoryOptimization))
    print('Compressed by: {:.2f} %'.format(100*(startMemoryOptimization - endMemoryOptimization) / startMemoryOptimization))

    return dataFrame

A. Loading data into python

In [6]:
# Load the asset-details table; the path is relative to the notebook's working directory.
asset_df = pd.read_csv('../input/g-research-crypto-forecasting/asset_details.csv')
# Narrows the column dtypes in place and returns the same frame, which the cell then displays.
downcastMemoryUsage(asset_df)

In [7]:
# Load only a small prefix of the training file; everything downstream works on these rows.
train_df = pd.read_csv('../input/g-research-crypto-forecasting/train.csv',nrows=5000) # start with the first 5K rows
# Display the loaded frame.
train_df

B. Report the number of data samples, the number of features.

In [8]:
# Number of rows loaded from train.csv (capped by nrows in the load cell).
len(train_df)

In [9]:
# Number of rows in the asset-details table.
len(asset_df)

In [10]:
# Column names, dtypes and non-null counts of the training frame.
train_df.info()

Number of Features = 9

Number of rows = 5000

Number of columns =10


Calculate Percentage Weight for Each Coin.

In [11]:
# Order the assets from heaviest to lightest weight (in place; the pie-chart cell
# below relies on this row order).
asset_df.sort_values(by=['Weight'],ascending=False,inplace=True)
# Each asset's weight as a percentage of the total weight: value / column sum * 100.
asset_df['coinWeightPercent'] = (asset_df['Weight'] / asset_df['Weight'].sum()) * 100
# Display the sorted table with the new percentage column.
asset_df

Plotting the weight each cryptocurrency receives in the metric.

In [12]:
# Pie chart of each asset's share of the total weight.
fig = plt.figure()
ax = plt.gca()
box = ax.get_position()
# Enlarge the axes well beyond the default figure area (5x wider, 3x taller).
ax.set_position([box.x0, box.y0, box.width * 5, box.height * 3])
# NOTE(review): the slice asks for up to 14 colours; if the 'colorblind' palette
# holds fewer, several wedges will share a colour — confirm.
colors = sns.color_palette('colorblind')[0:14]
# NOTE(review): labels are hard-coded and matched to wedges purely by position, so
# they are only correct if asset_df's rows (sorted by Weight, descending, in the
# previous cell) come out in exactly this order — verify against asset_df.
labels = ['Bitcoin', 'Ethereum', 'Cardano', 'Binance Coin', 'Dogecoin', 'Bitcoin Cash', 'Litecoin', 'Ethereum Classic',
          'Stellar', 'TRON', 'Monero', 'EOS.IO', 'IOTA', 'Maker']
# Per-wedge offset from the centre, also positional (one entry per label).
explode = (0.3, 0.3, 0.2, 0.2, 0.1, 0.1, 0.1, 0.0, 0.1, 0.0, 0.1, 0.0, 0.1, 0.0)
plt.pie(asset_df['coinWeightPercent'], colors=colors, autopct='%.0f%%', labels=labels, explode=explode,
        startangle=30, shadow=True, textprops={'fontweight': 'semibold', 'fontsize': 15},
        wedgeprops={'linewidth': 2, 'edgecolor': 'k'}, labeldistance=1.1)
plt.title("Amount of Weight of Each Crypto.", fontweight="bold", fontsize=22, pad=21)
# Equal aspect ratio so the pie is drawn as a circle.
plt.axis('equal')
plt.show()

Convert "timestamp" to DateTime[s]

In [13]:
# Parse the epoch-second timestamps as UTC, then convert to London local time.
# `infer_datetime_format` is dropped: it only affects string parsing (not
# `unit='s'`) and is deprecated since pandas 2.0. `.dt.tz_convert` performs the
# conversion in one vectorised call rather than once per element.
train_df['DateAndTime'] = pd.to_datetime(train_df['timestamp'], unit='s', utc=True).dt.tz_convert('Europe/London')
# Local calendar date of each row, stored as a timezone-naive datetime64 column.
train_df['Date'] = train_df.DateAndTime.dt.date
train_df['Date'] = train_df['Date'].astype('datetime64[ns]')
# Index the frame by the timezone-aware timestamp.
train_df.set_index(['DateAndTime'], inplace=True)
# Narrow the dtypes in place; the returned frame is displayed by the cell.
downcastMemoryUsage(train_df)

In [14]:
# Count of missing values per column.
train_df.isnull().sum()

Handling the Null Values

In [15]:
def replace_missing (attribute):
    """Fill the missing values of a Series by interpolation.

    The Series is interpolated in place with pandas' default interpolation
    settings and is also returned, so callers can assign the result back to
    its parent frame instead of relying on the in-place side effect.

    Parameters
    ----------
    attribute : pandas.Series
        Column whose gaps should be filled. It is modified in place.

    Returns
    -------
    pandas.Series
        The same ``attribute`` object, now interpolated.
    """
    attribute.interpolate(inplace=True)
    return attribute


# Interpolate the two columns that contain gaps and write the result back.
# Passing `train_df[col]` straight to an in-place operation only mutates the
# temporary Series returned by the lookup; with pandas Copy-on-Write the frame
# itself would stay unchanged. Holding the Series and assigning it back works
# in every pandas mode.
for column_name in ['VWAP', 'Target']:
    column_values = train_df[column_name]
    replace_missing(column_values)  # fills column_values in place
    train_df[column_name] = column_values

# Re-check the per-column missing-value counts after interpolation.
train_df.isnull().sum()

Data Preprocessing

In [16]:
# Inclusive date window and the asset to keep.
startDate = '2021-01-01'
endDate = '2021-12-31'
# `>=` (not `>`) so that rows dated exactly on startDate are kept; `Date` holds
# midnight timestamps, so a strict comparison would drop the whole first day.
mask = (train_df['Date'] >= startDate) & (train_df['Date'] <= endDate) & (train_df['Asset_ID'] == 1)
train_data1 = train_df.loc[mask]
train_data1

In [17]:
# Independent copy of the filtered rows dated on or after the start of the window.
on_or_after_start = train_data1['Date'] >= '2021-01-01'
data_training = train_data1.loc[on_or_after_start].copy()

In [18]:
from sklearn.model_selection import train_test_split
# Chronological 50/50 split: the rows are later cut into consecutive time
# windows and plotted against time, so they must keep their original order.
# The default shuffle=True would randomly interleave train and test rows.
train_set, test_set = train_test_split(train_df, test_size=0.5, shuffle=False)
print(train_set.shape)
print(test_set.shape)

In [19]:
# Split the hold-out half into a test set and a validation set.
# The result is bound to `test_set` (the original wrote to a stray `Test_set`,
# leaving `test_set` un-split and `val_set` a subset of it). shuffle=False keeps
# the rows in their existing order.
# Note: this cell rebinds `test_set`, so re-running it alone halves it again.
test_set, val_set = train_test_split(test_set, test_size=0.5, shuffle=False)
print(test_set.shape)
print(val_set.shape)

Plotting the dataset

In [20]:
# Close price of the train and test sets on a shared time axis.
fig, ax = plt.subplots(figsize = (20,10))
ax.plot(train_set.Close,color="#004C99")
ax.plot(test_set.Close,color="#D96552")
ax.set_facecolor("#D3D3D3")
# Horizontal grid lines only. The flag is passed positionally because its
# keyword was renamed from `b` to `visible`, and `b=` is rejected by current
# Matplotlib releases. One call suffices: plt.grid and ax.grid act on the same axes.
ax.grid(True, axis = 'y')
plt.ylabel('USD')
plt.xlabel('Time')
plt.legend(['Train set', 'Test set'], loc='upper right',prop={'size': 15})
print('Dimension of train data: ',train_set.shape)
print('Dimension of test data: ', test_set.shape)

Splitting into training and testing set

In [21]:
# Columns removed from the model inputs; whatever remains is used as features.
non_feature_columns = ['timestamp','Asset_ID','Count','VWAP','Target','Date']

# Training features and target (the target is the Close column, kept 2-D).
X_train = train_set.drop(columns=non_feature_columns)
y_train = train_set[['Close']]

# Test features and target, built the same way.
X_test = test_set.drop(columns=non_feature_columns)
y_test = test_set[['Close']]

The size of the training and testing set

In [22]:
# Report the shape of every feature/target array.
for shape_label, array in (
    ("X_train Dimensions:", X_train),
    ("y_train Dimensions:", y_train),
    ("X_test Dimensions:", X_test),
    ("y_test Dimensions:", y_test),
):
    print(shape_label, array.shape)

Normalization

In [23]:
# MinMaxScaler is used to normalize the data.
# Features and target get separate scalers, and both are fitted on the training
# data only. Re-fitting one scaler on every array gives the test set its own
# scaling (leaking test statistics) and leaves the scaler describing whichever
# array happened to be fitted last.
feature_scaler = MinMaxScaler()
scaler = MinMaxScaler()  # target scaler; prediction() uses this name to undo the scaling

# Fit on, and transform, the training data
X_train = feature_scaler.fit_transform(X_train)
y_train = scaler.fit_transform(y_train)

# Apply the training-set scaling to the test data (no re-fit)
X_test = feature_scaler.transform(X_test)
y_test = scaler.transform(y_test)

Create a 3D input dataset (samples, time steps, features) for the LSTM

In [24]:
# Build windowed (samples, time_steps, features) inputs and their next-step targets
def create_dataset (X, y, time_steps = 1):
    """Cut aligned arrays into overlapping windows with next-step targets.

    For every start index ``i`` in ``range(len(X) - time_steps)`` the window
    ``X[i:i + time_steps, :]`` is paired with the target ``y[i + time_steps]``,
    i.e. the value immediately after the window.

    Parameters
    ----------
    X : 2-D array supporting ``X[a:b, :]`` slicing
        Feature rows.
    y : indexable sequence
        Targets aligned row-for-row with ``X``.
    time_steps : int, default 1
        Number of consecutive rows in each window.

    Returns
    -------
    tuple of numpy.ndarray
        ``(windows, targets)`` stacked with ``np.array``.
    """
    window_starts = range(len(X) - time_steps)
    windows = [X[start:start + time_steps, :] for start in window_starts]
    targets = [y[start + time_steps] for start in window_starts]
    return np.array(windows), np.array(targets)
# Number of consecutive rows that make up one input window.
TIME_STEPS = 30
# Window the train and test arrays independently (each call only reads its own pair).
X_train, y_train = create_dataset(X_train, y_train, TIME_STEPS)
X_test, y_test = create_dataset(X_test, y_test, TIME_STEPS)

# Report the windowed shapes.
for shape_label, array in (
    ("X_train Dimensions:", X_train),
    ("y_train Dimensions:", y_train),
    ("X_test Dimensions:", X_test),
    ("y_test Dimensions:", y_test),
):
    print(shape_label, array.shape)

Creating an LSTM Model

In [25]:
def create_model(units, m):
    """Build and compile a two-layer recurrent regression model.

    Parameters
    ----------
    units : int
        Number of units passed to each of the two recurrent layers.
    m : callable
        Layer class used for both recurrent layers; it is called with
        ``units``, ``return_sequences`` and ``input_shape`` keyword arguments.

    Returns
    -------
    A compiled ``Sequential`` model (MSE loss, Adam optimizer).

    Notes
    -----
    The input shape is read from the module-level ``X_train``, so that array
    must already be in its windowed 3-D form when this function is called.
    """
    model = Sequential()
    # First recurrent layer returns the full sequence so the second one can consume it.
    model.add(m (units = units, return_sequences = True,input_shape = [X_train.shape[1], X_train.shape[2]]))
    model.add(Dropout(0.2))
    # Second recurrent layer returns only its final output.
    model.add(m (units = units))
    model.add(Dropout(0.2))
    # Single-value output with ReLU activation and an L2 penalty on the kernel weights.
    model.add(Dense(units = 1,activation='relu',kernel_regularizer=keras.regularizers.l2(0.01)))
    #Compile model
    model.compile(loss='mse', optimizer='adam')
    return model


Building an LSTM Model

In [35]:
# Two stacked LSTM layers with 32 units each (see create_model).
model_lstm = create_model(32, LSTM)
# Display the model object.
model_lstm

Fitting the Model

In [27]:
def fit_model(model):
    """Train ``model`` on the module-level ``X_train`` / ``y_train`` arrays.

    Training runs for at most 50 epochs with a batch size of 1024, holds out
    10% of the training arrays for validation, does not shuffle the samples,
    and stops early once ``val_loss`` has not improved for 10 epochs.

    Parameters
    ----------
    model
        A compiled Keras model.

    Returns
    -------
    The history object returned by ``model.fit``.
    """
    # The unused KFold splitter the original built here has been dropped:
    # it was never passed to anything.
    early_stop = keras.callbacks.EarlyStopping(monitor = 'val_loss',
                                               patience = 10)
    history = model.fit(X_train, y_train, epochs = 50,
                        validation_split = 0.1, batch_size = 1024,
                        shuffle = False, callbacks = [early_stop])
    return history

In [28]:
# Train the LSTM and keep the training history for the loss plot below.
history_lstm = fit_model(model_lstm)

In [29]:
# Plot train loss and validation loss
def plot_loss (history):
    """Plot training and validation loss per epoch.

    Parameters
    ----------
    history
        Object whose ``history`` attribute is a mapping containing the
        ``'loss'`` and ``'val_loss'`` sequences (as returned by ``fit_model``).
    """
    fig, ax = plt.subplots(figsize = (20,10))
    ax.plot(history.history['loss'],color="#004C99")
    ax.plot(history.history['val_loss'],color="#D96552")
    ax.set_facecolor("#D3D3D3")
    # Horizontal grid lines only. The flag is positional because its keyword was
    # renamed from `b` to `visible`, and `b=` is rejected by current Matplotlib.
    ax.grid(True, axis = 'y')
    plt.ylabel('Loss')
    plt.xlabel('epoch')
    plt.legend(['Train loss', 'Validation loss'], loc='upper right',prop={'size': 15})

In [30]:
# Training vs. validation loss curves for the LSTM.
plot_loss (history_lstm)

In [31]:
def prediction(model):
    """Predict on the module-level ``X_test`` and undo the scaling.

    The model output is passed through the module-level ``scaler``'s
    ``inverse_transform`` before being returned.
    """
    scaled_output = model.predict(X_test)
    return scaler.inverse_transform(scaled_output)

# Test-set predictions of the LSTM, already inverse-transformed by prediction().
prediction_lstm = prediction(model_lstm)

In [32]:
def plot_future(prediction, y_test):
    """Plot predicted values against actual values on a shared sample axis.

    Parameters
    ----------
    prediction : array-like
        Predicted values; its length defines the x-axis range.
    y_test : array-like
        Actual values, with the same length and in the same units as
        ``prediction``.
    """
    fig, ax = plt.subplots(figsize = (20,10))
    range_future = len(prediction)
    ax.plot(np.arange(range_future), np.array(y_test),label='Actual',color="#004C99")
    ax.plot(np.arange(range_future),np.array(prediction),label='Prediction',color="#D96552")
    ax.set_facecolor("#D3D3D3")
    # Horizontal grid lines only. The flag is positional because its keyword was
    # renamed from `b` to `visible`, and `b=` is rejected by current Matplotlib.
    ax.grid(True, axis = 'y')
    plt.ylabel('USD')
    plt.legend(loc='upper left',prop={'size': 15})
    # The x-axis is just a sample index, so its ticks and labels are hidden.
    plt.tick_params(axis='x', which='both', bottom=False, top=False, labelbottom=False)

In [33]:
# prediction_lstm has already been inverse-transformed, while y_test is still
# min-max scaled; undo the scaling on the actual values too so both curves
# share the same units on the 'USD' axis.
plot_future(prediction_lstm, scaler.inverse_transform(y_test))

Calculate MAE for Performance

In [34]:
# Raw model output for the test windows (not inverse-transformed).
y_pred = model_lstm.predict(X_test)
from sklearn.metrics import mean_absolute_error
# y_test is still min-max scaled and y_pred is raw model output, so this MAE is
# expressed in the scaler's normalised units rather than in USD.
mean_absolute_error(y_test, y_pred)