In [25]:
# Import Libraries

import numpy as np
import pandas as pd
import hvplot.pandas

In [2]:
# Set the random seed for reproducibility
# Note: This is for the homework solution, but it is good practice to comment this out and run multiple experiments to evaluate your model

from numpy.random import seed
seed(1)
from tensorflow import random
random.set_seed(2)

In [3]:
# Load 'btc_sentiment.csv'

sentiment = pd.read_csv('btc_sentiment.csv', index_col="date", infer_datetime_format=True, parse_dates=True)
sentiment = sentiment.drop(columns="fng_classification")
sentiment.head()

Unnamed: 0_level_0,fng_value
date,Unnamed: 1_level_1
2019-07-29,19
2019-07-28,16
2019-07-27,47
2019-07-26,24
2019-07-25,42


In [4]:
# Load 'btc_historical'

historical = pd.read_csv('btc_historic.csv', index_col="Date", infer_datetime_format=True, parse_dates=True)['Close']
historical = historical.sort_index()
historical.tail()

Date
2019-07-25    9882.429688
2019-07-26    9847.450195
2019-07-27    9478.320313
2019-07-28    9531.769531
2019-07-29    9529.889648
Name: Close, dtype: float64

In [5]:
# Join the data into a single DataFrame

btc = sentiment.join(historical, how="inner")
btc.tail()

Unnamed: 0,fng_value,Close
2019-07-25,42,9882.429688
2019-07-26,24,9847.450195
2019-07-27,47,9478.320313
2019-07-28,16,9531.769531
2019-07-29,19,9529.889648


In [6]:
btc.head()

Unnamed: 0,fng_value,Close
2018-02-01,30,9114.719727
2018-02-02,15,8870.820313
2018-02-03,40,9251.269531
2018-02-04,24,8218.049805
2018-02-05,11,6937.080078


In [7]:
# This function accepts the column number for the features (X) and the target (y)
# It chunks the data up with a rolling window of Xt-n to predict Xt
# It returns a numpy array of X any y

def window_data(btc, window, feature_col_number, target_col_number):
    X = []
    y = []
    for i in range(len(btc) - window - 1):
        features = btc.iloc[i:(i + window), feature_col_number]
        target = btc.iloc[(i + window), target_col_number]
        X.append(features)
        y.append(target)
    return np.array(X), np.array(y).reshape(-1, 1)

In [8]:
# Predict Closing Prices using a 10 day window of previous closing prices
# Then, experiment with window sizes anywhere from 1 to 10 and see how the model performance changes

window_size = 10

# Column index 0 is the 'fng_value' column
# Column index 1 is the `Close` column

feature_column = 1
target_column = 1
X, y = window_data(btc, window_size, feature_column, target_column)

In [9]:
# Use 70% of the data for training and the remainder for testing

split = int(.7 * len(X))
X_train = X[:split - 1]
X_test = X[split:]
y_train = y[:split - 1]
y_test = y[split:]

In [10]:
from sklearn.preprocessing import MinMaxScaler

# Use the MinMaxScaler to scale data between 0 and 1.

x_train_scaler = MinMaxScaler()
x_test_scaler = MinMaxScaler()
y_train_scaler = MinMaxScaler()
y_test_scaler = MinMaxScaler()

# Fit the scaler training data

x_train_scaler.fit(X_train)
y_train_scaler.fit(y_train)

# Scale the training data

X_train = x_train_scaler.transform(X_train)
y_train = y_train_scaler.transform(y_train)

In [11]:
# fit the scaler for the testing the data

x_test_scaler.fit(X_test)
y_test_scaler.fit(y_test)

# Scale the test data

X_test = x_test_scaler.transform(X_test)
y_test = y_test_scaler.transform(y_test)

In [12]:
# Reshape the features for the model

X_train = X_train.reshape((X_train.shape[0], X_train.shape[1], 1))
X_test = X_test.reshape((X_test.shape[0], X_test.shape[1], 1))

## Build and Train the Model

In [13]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dense, Dropout

In [14]:
# Build the LSTM model. 
# The return sequences need to be set to True if you are adding additional LSTM layers, but 
# You don't have to do this for the final layer. 
# Note: The dropouts help prevent overfitting
# Note: The input shape is the number of time steps and the number of indicators
# Note: Batching inputs has a different input shape of Samples/TimeSteps/Features

model = Sequential()

number_units = 30
dropout_fraction = 0.2

# Layer 1

model.add(LSTM(
    units=number_units,
    return_sequences=True,
    input_shape=(X_train.shape[1], 1))
    )
model.add(Dropout(dropout_fraction))
# Layer 2

model.add(LSTM(units=number_units, return_sequences=True))
model.add(Dropout(dropout_fraction))
# Layer 3

model.add(LSTM(units=number_units))
model.add(Dropout(dropout_fraction))
# Output layer

model.add(Dense(1))

In [15]:
# Compile the model

model.compile(optimizer="adam", loss = "mean_squared_error")

In [16]:
# Summarize the model

model.summary()

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
lstm (LSTM)                  (None, 10, 30)            3840      
_________________________________________________________________
dropout (Dropout)            (None, 10, 30)            0         
_________________________________________________________________
lstm_1 (LSTM)                (None, 10, 30)            7320      
_________________________________________________________________
dropout_1 (Dropout)          (None, 10, 30)            0         
_________________________________________________________________
lstm_2 (LSTM)                (None, 30)                7320      
_________________________________________________________________
dropout_2 (Dropout)          (None, 30)                0         
_________________________________________________________________
dense (Dense)                (None, 1)                 3

In [17]:
# Train the model
# Use at least 10 epochs
# Do not shuffle the data
# Experiment with the batch size, but a smaller batch size is recommended

model.fit(X_train, y_train, epochs=10, shuffle=False, batch_size=1, verbose=1)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<tensorflow.python.keras.callbacks.History at 0x229f26a0748>

## Model Performance

In [18]:
# Evaluate the model

model.evaluate(X_test, y_test)



0.04872175678610802

In [19]:
# Make some predictions

predicted = model.predict(X_test)

In [20]:
predicted

array([[0.0563641 ],
       [0.05869537],
       [0.06159847],
       [0.06438437],
       [0.06688484],
       [0.06960344],
       [0.0709496 ],
       [0.07107282],
       [0.07031783],
       [0.06920239],
       [0.06798703],
       [0.06698795],
       [0.06629479],
       [0.06578927],
       [0.06507555],
       [0.0647285 ],
       [0.06492997],
       [0.06538276],
       [0.06591439],
       [0.06669518],
       [0.06758224],
       [0.06821198],
       [0.06856868],
       [0.06868652],
       [0.06864816],
       [0.06866183],
       [0.0692234 ],
       [0.07011358],
       [0.07104992],
       [0.07198651],
       [0.0730134 ],
       [0.07380316],
       [0.07426901],
       [0.07449985],
       [0.07452438],
       [0.07413819],
       [0.07351522],
       [0.07327057],
       [0.07341339],
       [0.07408349],
       [0.07513486],
       [0.07631572],
       [0.07754049],
       [0.08168179],
       [0.08860809],
       [0.09651043],
       [0.10441642],
       [0.111

In [21]:
# Recover the original prices instead of the scaled version

predicted_prices = y_test_scaler.inverse_transform(predicted)
real_prices = y_test_scaler.inverse_transform(y_test.reshape(-1, 1))

In [22]:
# Create a DataFrame of Real and Predicted values

stocks = pd.DataFrame({
    "Real": real_prices.ravel(),
    "Predicted": predicted_prices.ravel()
}, index = btc.index[-len(real_prices): ]) 
stocks.head()

Unnamed: 0,Real,Predicted
2019-02-20,3924.23999,4248.814941
2019-02-21,3974.050049,4270.220703
2019-02-22,3937.040039,4296.877441
2019-02-23,3983.530029,4322.45752
2019-02-24,4149.089844,4345.416504


In [23]:
# Plot the real vs predicted values as a line chart

stocks.hvplot()

In [24]:
# FIN