In [1]:
#Imports for the notebook. `os` is not referenced in the cells visible here.
import os 
import pandas as pd 
#dataPrep is presumably a project-local helper module providing the data loaders - verify
import dataPrep as dP
#Widen the pandas text display so wide frames are not wrapped or truncated
pd.set_option('display.width',200)
pd.set_option('display.max_columns',200)
import warnings 
#NOTE(review): this silences every warning for the whole session, including
#deprecation and numerical warnings from pandas/keras - consider narrowing the filter
warnings.filterwarnings('ignore')

In [2]:
#Get the bitstamp and the coinbase data
#Both loaders live in the dataPrep module (imported as dP); the frame summaries
#shown under this cell are presumably printed by the loaders themselves - verify
bitstampData = dP.getBitstampData()
coinbaseData = dP.coinbaseData()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4363457 entries, 0 to 4363456
Data columns (total 8 columns):
 #   Column             Dtype  
---  ------             -----  
 0   Timestamp          int64  
 1   Open               float64
 2   High               float64
 3   Low                float64
 4   Close              float64
 5   Volume_(BTC)       float64
 6   Volume_(Currency)  float64
 7   Weighted_Price     float64
dtypes: float64(7), int64(1)
memory usage: 266.3 MB
None
Timestamp                  0
Open                 1236977
High                 1236977
Low                  1236977
Close                1236977
Volume_(BTC)         1236977
Volume_(Currency)    1236977
Weighted_Price       1236977
Date                       0
dtype: int64


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2099760 entries, 0 to 2099759
Data columns (total 8 columns):
 #   Column             Dtype  
---  ------             -----  
 0   Timestamp          int64  
 1   Open               flo

We have Bitcoin price data from two sources: one from Bitstamp and the other from Coinbase. Both are given as time series, so let's look at the tasks that can be performed on them using time series analysis (ARIMA, RNN and LSTM).
Start by extracting the relevant data:
Date as the Index 
Weighted Price at different points in time 
Amount of Bitcoin transacted in the time window
Volume of currency transacted in the time window

To construct the time series for analysis, let's choose one value for each day. The most obvious way to do this is to take the last value recorded on each day and use it as a proxy for the entire day. This works reasonably well for prices, but not so well for the volume of currency or the weighted price, so below we aggregate each day using its mean and its maximum instead.

In [14]:
#Group by date and extract the maximum and the mean values
def getMeanAndMaxValues(df):
    """Aggregate a frame by its 'Date' column.

    Returns a pair (meanValues, maxValues): two frames indexed by the distinct
    'Date' values, holding the per-group mean and the per-group maximum of the
    remaining columns.
    """
    perDate = df.groupby('Date')
    return perDate.mean(),perDate.max()
#Per-date mean and max frames for each exchange
muBitstamp,maxBitstamp = getMeanAndMaxValues(bitstampData)
muCoinbase,maxCoinbase = getMeanAndMaxValues(coinbaseData)
#Free up some memory
#NOTE: once the raw frames are deleted, re-running this cell raises NameError
#until the data-loading cell has been run again
del bitstampData,coinbaseData

<class 'pandas.core.frame.DataFrame'>
Index: 3033 entries, 2011-12-31 to 2020-04-22
Data columns (total 7 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   Open               3033 non-null   float64
 1   High               3033 non-null   float64
 2   Low                3033 non-null   float64
 3   Close              3033 non-null   float64
 4   Volume_(BTC)       3033 non-null   float64
 5   Volume_(Currency)  3033 non-null   float64
 6   Weighted_Price     3033 non-null   float64
dtypes: float64(7)
memory usage: 189.6+ KB
None
                Open      High       Low     Close  Volume_(BTC)  Volume_(Currency)  Weighted_Price
Date                                                                                               
2011-12-31  4.465000  4.482500  4.465000  4.482500     23.829470         106.330084        4.471603
2012-01-01  4.806667  4.806667  4.806667  4.806667      7.200667          35.259720        4.806667
2012

"\ngroup = bitstampData.groupby('Date')\nmeanValues = group.mean()\nmaxValues = group.max()\n"

In [26]:
#Predict from the Bitstamp exchange data, since it has more points available
#than the data from Coinbase.
#Use the weighted prices and the closing prices.
import numpy as np
#Per-date series taken from the mean and max aggregates
meanWeightedPrices = muBitstamp['Weighted_Price']
maxWeightedPrices = maxBitstamp['Weighted_Price']
meanClosingPrices = muBitstamp['Close']
maxClosingPrices = maxBitstamp['Close']
#Train/test split: the last 100 rows are held out as testing data
weightedTrainMean = meanWeightedPrices.iloc[:len(meanWeightedPrices) - 100]
weightedTrainMax = maxWeightedPrices.iloc[:len(maxWeightedPrices) - 100]
weightedTestMean = meanWeightedPrices.iloc[len(weightedTrainMean):]
weightedTestMax = maxWeightedPrices.iloc[len(weightedTrainMax):]
#Turn the mean series into column arrays of shape (rows, 1);
#the max series are left as pandas Series
weightedTrainMean = np.array(weightedTrainMean).reshape(-1,1)
weightedTestMean = np.array(weightedTestMean).reshape(-1,1)
#print(weightedTrainMean.shape)
#print(weightedTestMean.shape)

(2933, 1)
(100, 1)


In [30]:
#Preprocessing step 2: Scale the values between 0 and 1 using scikit library
from sklearn.preprocessing import MinMaxScaler
def getTrainingData(X,timeStep,scaler=None,fit=True):
    """Scale a time series and slice it into sliding windows with labels.

    Parameters
    ----------
    X : 2-D array
        The time series as a column array. Only column 0 is used to build the
        windows and labels.
    timeStep : int
        Number of consecutive scaled values that make up one input window.
    scaler : object, optional
        Object exposing fit_transform(X) and transform(X). When omitted, a new
        MinMaxScaler(feature_range=(0,1)) is created, as before. Pass your own
        instance to keep a reference to the fitted scaler, so that test data
        can be scaled with the training parameters and predictions can be
        mapped back to prices.
    fit : bool, optional
        True (default) fits the scaler on X before transforming. False applies
        an already fitted scaler unchanged (use this for test data).

    Returns
    -------
    (x_train, y_train) : tuple of np.ndarray
        x_train[k] holds the timeStep scaled values preceding y_train[k].
        With more rows than timeStep the shapes are
        (rows - timeStep, timeStep) and (rows - timeStep,); otherwise both
        arrays are empty.
    """
    if scaler is None:
        scaler = MinMaxScaler(feature_range=(0,1))
    #Fitting on anything but the training series would leak test statistics
    X = scaler.fit_transform(X) if fit else scaler.transform(X)
    targets = range(timeStep,X.shape[0])
    x_train = [X[i-timeStep:i,0] for i in targets]
    y_train = [X[i,0] for i in targets]
    return np.array(x_train),np.array(y_train)
#Window the training series with a look-back of 100 steps
X_weightedPrice_mean,Y_weightedPrice_mean = getTrainingData(weightedTrainMean,timeStep = 100)
#Append a trailing axis of length 1: (samples, steps) -> (samples, steps, 1)
X_weightedPrice_mean = X_weightedPrice_mean.reshape(
    X_weightedPrice_mean.shape[0],
    X_weightedPrice_mean.shape[1],
    1,
)
print(X_weightedPrice_mean.shape)
print(Y_weightedPrice_mean.shape)

(2833, 100, 1)
(2833,)


In [31]:
#Build, compile and train the RNN
#m and n are the second and third dimensions of the training tensor and are
#passed to the first layer as its input_shape
_,m,n = X_weightedPrice_mean.shape
from keras.models import Sequential
from keras.layers import Dense, SimpleRNN, Dropout,Flatten
model = Sequential()
#Every recurrent layer returns its full output sequence, so the following
#layer (and finally Flatten) receives one output per time step
model.add(SimpleRNN(128,activation = 'relu',return_sequences = True,input_shape = (m,n)))
model.add(SimpleRNN(64,activation = 'relu',return_sequences = True))
model.add(Dropout(0.5))
model.add(SimpleRNN(64,activation = 'relu',return_sequences = True))
model.add(Flatten())
#Single linear output unit: the predicted (scaled) next value
model.add(Dense(1))
#metrics has to be passed by keyword: the previous `metrics - [...]` was a
#positional expression after keyword arguments, i.e. a SyntaxError.
#Accuracy is not meaningful for a continuous target, so track the mean
#absolute error alongside the MSE loss instead.
model.compile(loss = 'mean_squared_error',optimizer = 'adam',metrics = ['mae'])
model.summary()
#NOTE: `history` is only bound if fit runs to completion; interrupting the
#training leaves it undefined for later cells
history = model.fit(X_weightedPrice_mean,Y_weightedPrice_mean,epochs = 100,batch_size = 64)

Using TensorFlow backend.


Model: "sequential_1"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
simple_rnn_1 (SimpleRNN)     (None, 100, 128)          16640     
_________________________________________________________________
simple_rnn_2 (SimpleRNN)     (None, 100, 64)           12352     
_________________________________________________________________
dropout_1 (Dropout)          (None, 100, 64)           0         
_________________________________________________________________
simple_rnn_3 (SimpleRNN)     (None, 100, 64)           8256      
_________________________________________________________________
flatten_1 (Flatten)          (None, 6400)              0         
_________________________________________________________________
dense_1 (Dense)              (None, 1)                 6401      
Total params: 43,649
Trainable params: 43,649
Non-trainable params: 0
__________________________________________________

KeyboardInterrupt: 

In [32]:
#Show what the History object returned by model.fit recorded during training.
#NOTE: `history` is assigned by the model.fit call in the training cell, so this
#raises NameError if that call was interrupted or has not been run
history.history 


NameError: name 'history' is not defined