# Development of data pipelines from mongodb to ML model

- Removal of matplotlib finance module (deprecated)
- Usage of raw_data_pipeline folder for development of data pipeline
- Usage of in-line markdown cells in-notebook for readability and consistency
    

In [17]:
# Imports
import pandas as pd
import numpy as np
import requests
import json
from json import loads
import datetime 
import matplotlib.pyplot as plt
import time
import msgpack

# Removal of matplotlib finance module imports (deprecated; implement modern replacement)

# API-specific imports (local install required; do NOT use default pip install)
import gdax
    # Install locally: with the environment activated and the development branch of
    # gdax-python checked out, run 'python setup.py install'
    # Do not use default gdax pip install package - that version of the package is currently broken
        # Default pip install has broken mongo connection and websocket connection close() error
        # Development branch of gdax-python has merged pull requests that fix those issues

# Pymongo import (connection to local client DB)
import pymongo
from pymongo import MongoClient

# Preprocessing imports
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler, MinMaxScaler, OneHotEncoder
from keras.utils import to_categorical 

# ML imports 
import tensorflow as tf
import keras
from keras.models import Sequential
from keras.layers import Dense, Activation, Dropout
from keras.layers import Embedding, Flatten
from keras.layers import LSTM, GRU
from keras.models import load_model
from keras.models import model_from_json
from keras import backend as K
from keras import optimizers
from keras.layers import Bidirectional
from keras.layers import TimeDistributed

# Auto Support and Resistance/autoSR() import requirements
from sklearn.cluster import MeanShift, estimate_bandwidth
pd.core.common.is_list_like = pd.api.types.is_list_like
from pandas_datareader import data, wb

In [18]:
# Import error for Pandas Datareader, cannot import name "is_list_like"
    # https://github.com/pydata/pandas-datareader/issues/545
    # Workaround for pandas 0.23: https://stackoverflow.com/a/50415484
        # Before pandas_datareader import:
        # pd.core.common.is_list_like = pd.api.types.is_list_like

In [19]:
###########################################################################    
#num_cores = 4
#config = tf.ConfigProto(intra_op_parallelism_threads=num_cores,\
        #inter_op_parallelism_threads=num_cores, allow_soft_placement=True,\
        #device_count = {'CPU' : 1, 'GPU' : 1})
#session = tf.Session(config=config)
#K.set_session(session)
###########################################################################

### Force Keras/TF to use CPU backend when GPU present by setting device_count (above) to:
    # {'CPU' : 1, 'GPU' : 0}

# Import to check for GPU availability for tensorflow backend
from tensorflow.python.client import device_lib

# Verify GPU availability for tensorflow backend
    # First print: every local device reported by TensorFlow's device_lib
    # Last print: the GPU list reported by the Keras TensorFlow backend
    # NOTE(review): _get_available_gpus is an underscore-prefixed (private) Keras helper;
    # presumably version-specific - confirm it still exists before upgrading Keras
print(device_lib.list_local_devices())
print("==============================================")
print(K.tensorflow_backend._get_available_gpus())

[name: "/device:CPU:0"
device_type: "CPU"
memory_limit: 268435456
locality {
}
incarnation: 9617758536166070181
]
[]


In [20]:
# Pandas Version
pd.__version__

'0.23.3'

In [21]:
# Jupyter/Ipython notebook environment variables
%matplotlib inline

In [22]:
# Global variables

# Boolean to drop existing mongo collection/scrape upon scrape() init
    # Default = False
dropFlag = False

# Boolean to set size_delta to l2update values for first update to snapshot
    # Initial value = False
firstUpdate_both = False

# Value to track if feature_creation_inital() was run
    # Initial value = False
inital_feature_run = False

# Import 1 hour of raw data + L2 updates into localized mongodb instance for pipeline development

1. Start new mongodb instance in new Terminal
2. Copy mongo_raw from raw_data to raw_data_pipeline folder
3. Navigate to raw_data_pipeline folder 
4. Import mongo_raw.json into localized mongo instance:
`mongoimport --db btcusd_db --collection btcusd_collection --file mongo_raw.json`
5. Verify db and collection presence with Terminal instance of mongod and/or MongoDB Compass

In [23]:
# Connection setup (GDAX client + MongoDB handles)
    # The localized mongo instance must already hold db 'btcusd_db' with collection 'btcusd_collection'

# Client for the GDAX public endpoint
public_client = gdax.PublicClient()

# Handles to the mongo client, database and collection
mongo_client = MongoClient('mongodb://localhost:27017/')
db = mongo_client['btcusd_db']
btcusd_collection = db['btcusd_collection']

In [24]:
# Intermediate on-disk format for pipeline target: 
    # HDF5, Parquet, Feather, Msgpack
import feather

In [25]:
# Function to load and parse data from Mongo into dataframes
def load_parse_feather():
    """Parse the Mongo order-book scrape into three feather files.

    Reads every 'snapshot' and 'l2update' document from ``db.btcusd_collection``
    (``db`` is the module-level Mongo database handle), reshapes them into flat
    [side, price, size] tables and writes them to the raw_data_pipeline folder:

    * raw_data_pipeline/l2update.feather      - one row per L2 change, plus 'time'
    * raw_data_pipeline/snapshot_asks.feather - ask levels of the first snapshot
    * raw_data_pipeline/snapshot_bids.feather - bid levels of the first snapshot

    Returns:
        None. The function is run for its side effect of writing the files.

    Raises:
        ValueError: if the collection holds no 'snapshot' or no 'l2update' documents.
    """
    # Collection specification (in database)
    input_data = db.btcusd_collection

    # Create individual dataframes for main response types: snapshot, l2update
    snapshot = pd.DataFrame(list(input_data.find({'type': 'snapshot'})))
    l2update = pd.DataFrame(list(input_data.find({'type': 'l2update'})))

    # Fail with a readable message instead of an opaque KeyError on a missing column
    if snapshot.empty:
        raise ValueError("no 'snapshot' documents found in btcusd_collection")
    if l2update.empty:
        raise ValueError("no 'l2update' documents found in btcusd_collection")

    ### snapshot/orderbook state response load and parse ###

    # Only the first snapshot document is used as the order book state
        # Ask = sell price, bid = buy price
    first_snapshot = snapshot.iloc[0]
    snapshot_asks_df = _snapshot_side_frame(first_snapshot['asks'], "sell")
    snapshot_bids_df = _snapshot_side_frame(first_snapshot['bids'], "buy")

    ### L2 update response load and parse ###

    l2update_formatted = _flatten_l2_changes(l2update['changes'], l2update['time'])

    # Save parsed data to feather (API -> Mongo -> Dataframe -> .feather)
        # Save data to .feather format in raw_data_pipeline folder
    l2update_formatted.to_feather("raw_data_pipeline/l2update.feather")
    snapshot_asks_df.to_feather("raw_data_pipeline/snapshot_asks.feather")
    snapshot_bids_df.to_feather("raw_data_pipeline/snapshot_bids.feather")


def _snapshot_side_frame(levels, side):
    """Build a [side, price, size] dataframe from one side of a snapshot.

    levels: list of [price, size] arrays (the snapshot 'asks' or 'bids' field)
    side:   label stored in the 'side' column ("sell" for asks, "buy" for bids)
    """
    frame = pd.DataFrame(levels).rename(columns={0: 'price', 1: 'size'})
    frame['side'] = side
    return frame[['side', 'price', 'size']]


def _flatten_l2_changes(changes, times):
    """Return one [side, price, size, time] row per individual L2 change.

    changes: per-message lists of [side, price, size] arrays ('changes' column)
    times:   per-message timestamps ('time' column), same length as changes

    Each change is paired with the timestamp of the message it came from.
    Joining the flattened changes to the per-message time column by position
    instead would shift or drop timestamps as soon as a single message carries
    more than one change.
    """
    rows = [
        (side, price, size, stamp)
        for message_changes, stamp in zip(changes, times)
        for side, price, size in message_changes
    ]
    return pd.DataFrame.from_records(rows, columns=['side', 'price', 'size', 'time'])

In [26]:
load_parse_feather()
# Troubleshooting: if DataFrame.rename() fails with an unexpected keyword argument 'axis',
# the installed pandas does not support that keyword yet - a pandas upgrade is required:
    # https://stackoverflow.com/questions/47800034/pandas-dataframe-rename-unexpected-keyword-argument-axis-when-using-mapper/47800303