In [1]:
# Test loading data from MongoDB into clean pandas DataFrames
#   Pipeline: MongoDB -> pandas DataFrame -> CSV

In [2]:
# Imports
import json
import os

import numpy as np
import pandas as pd

In [3]:
# Pymongo import for connection to local client DB
import pymongo
from pymongo import MongoClient

# Connect to the local MongoDB server and pick the source database/collection.
mongo_client = MongoClient('mongodb://localhost:27017/')
db = mongo_client['btcusd_db']  # database specification
input_data = db['btcusd_collection']  # collection specification (inside the database)

# Pull every document of the collection (no filter) into one dataframe.
all_documents_cursor = input_data.find()
data = pd.DataFrame(list(all_documents_cursor))

In [4]:
# Sanity check: preview the first 10 rows to verify that the documents were
# loaded from MongoDB into the dataframe.
data.head(10)

Unnamed: 0,_id,asks,bids,changes,channels,maker_order_id,message,price,product_id,reason,sequence,side,size,taker_order_id,time,trade_id,type
0,5ad545e5e3ae712cb8091acb,"[[8042.91, 10.65614728], [8042.92, 0.00118097]...","[[8042.9, 9.11800903], [8042.83, 0.002487], [8...",,,,,,BTC-USD,,,,,,,,snapshot
1,5ad545e5e3ae712cb8091acc,,,,,6279e0f7-567a-4c93-9b0c-5169ce21e43d,,8042.91,BTC-USD,,5693129000.0,sell,0.6535,2a6c1214-6232-4567-9ff6-5e5fb452b428,2018-04-17T00:54:57.887000Z,41766820.0,last_match
2,5ad545e5e3ae712cb8091acd,,,,"[{'name': 'level2', 'product_ids': ['BTC-USD']...",,,,,,,,,,,,subscriptions
3,5ad545e5e3ae712cb8091ace,,,,,,Failed to unsubscribe,,,You need to specify at least one product ID fo...,,,,,,,error
4,5ad545e5e3ae712cb8091acf,,,"[[buy, 8041.33000000, 0]]",,,,,BTC-USD,,,,,,2018-04-17T00:55:04.358Z,,l2update
5,5ad545e5e3ae712cb8091ad0,,,"[[buy, 8041.43000000, 0.02]]",,,,,BTC-USD,,,,,,2018-04-17T00:55:04.375Z,,l2update
6,5ad545e5e3ae712cb8091ad1,,,"[[buy, 7940.12000000, 0]]",,,,,BTC-USD,,,,,,2018-04-17T00:55:04.395Z,,l2update
7,5ad545e5e3ae712cb8091ad2,,,"[[buy, 8039.00000000, 0.001]]",,,,,BTC-USD,,,,,,2018-04-17T00:55:04.412Z,,l2update
8,5ad545e5e3ae712cb8091ad3,,,"[[buy, 7972.56000000, 0]]",,,,,BTC-USD,,,,,,2018-04-17T00:55:04.413Z,,l2update
9,5ad545e5e3ae712cb8091ad4,,,"[[buy, 8005.01000000, 0]]",,,,,BTC-USD,,,,,,2018-04-17T00:55:04.415Z,,l2update


In [5]:
# Show information about the test dataset (MongoDB -> dataframe):
# per-column non-null counts, dtypes and the frame's memory usage.
data.info(verbose=True,  memory_usage=True)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 18353 entries, 0 to 18352
Data columns (total 17 columns):
_id               18353 non-null object
asks              1 non-null object
bids              1 non-null object
changes           18093 non-null object
channels          1 non-null object
maker_order_id    257 non-null object
message           1 non-null object
price             257 non-null object
product_id        18351 non-null object
reason            1 non-null object
sequence          257 non-null float64
side              257 non-null object
size              257 non-null object
taker_order_id    257 non-null object
time              18350 non-null object
trade_id          257 non-null float64
type              18353 non-null object
dtypes: float64(2), object(15)
memory usage: 2.4+ MB


In [6]:
# Display the distinct response types found in the 'type' column;
# these values drive the per-type split performed further down.
data['type'].unique()

array(['snapshot', 'last_match', 'subscriptions', 'error', 'l2update',
       'match'], dtype=object)

In [7]:
def load_messages_of_type(collection, message_type):
    """Return every document in `collection` whose 'type' field equals
    `message_type`, materialised as a DataFrame."""
    matching_documents = collection.find({'type': message_type})
    return pd.DataFrame(list(matching_documents))


# Build one dataframe per response type.
#   Main relevant response types: snapshot, l2update, match, last_match
snapshot = load_messages_of_type(input_data, 'snapshot')
l2update = load_messages_of_type(input_data, 'l2update')
match = load_messages_of_type(input_data, 'match')

# Remaining (bookkeeping) response types.
last_match = load_messages_of_type(input_data, 'last_match')
subscriptions = load_messages_of_type(input_data, 'subscriptions')
error = load_messages_of_type(input_data, 'error')
#   The error for the unsubscribe message is an issue with the GDAX python API websocket call.

In [8]:
# Preview the dataframe built from the 'snapshot' documents.
snapshot.head()

Unnamed: 0,_id,asks,bids,product_id,type
0,5ad545e5e3ae712cb8091acb,"[[8042.91, 10.65614728], [8042.92, 0.00118097]...","[[8042.9, 9.11800903], [8042.83, 0.002487], [8...",BTC-USD,snapshot


In [9]:
# Preview the dataframe built from the 'l2update' documents.
l2update.head()

Unnamed: 0,_id,changes,product_id,time,type
0,5ad545e5e3ae712cb8091acf,"[[buy, 8041.33000000, 0]]",BTC-USD,2018-04-17T00:55:04.358Z,l2update
1,5ad545e5e3ae712cb8091ad0,"[[buy, 8041.43000000, 0.02]]",BTC-USD,2018-04-17T00:55:04.375Z,l2update
2,5ad545e5e3ae712cb8091ad1,"[[buy, 7940.12000000, 0]]",BTC-USD,2018-04-17T00:55:04.395Z,l2update
3,5ad545e5e3ae712cb8091ad2,"[[buy, 8039.00000000, 0.001]]",BTC-USD,2018-04-17T00:55:04.412Z,l2update
4,5ad545e5e3ae712cb8091ad3,"[[buy, 7972.56000000, 0]]",BTC-USD,2018-04-17T00:55:04.413Z,l2update


In [10]:
# Preview the dataframe built from the 'match' documents.
match.head()

Unnamed: 0,_id,maker_order_id,price,product_id,sequence,side,size,taker_order_id,time,trade_id,type
0,5ad545ece3ae712cb8091bd6,88a7fa8d-d0ca-466e-8efd-7c930a01510b,8042.9,BTC-USD,5693130267,buy,0.1552,b8f2f93a-aec2-4bbb-bb9c-01c43ad25cda,2018-04-17T00:55:11.045000Z,41766821,match
1,5ad54605e3ae712cb8091f86,6279e0f7-567a-4c93-9b0c-5169ce21e43d,8042.91,BTC-USD,5693131676,sell,0.0151,c4af3abb-d2cf-4fcd-9fc7-bc7aa7c38a73,2018-04-17T00:55:36.145000Z,41766822,match
2,5ad54611e3ae712cb809218a,88a7fa8d-d0ca-466e-8efd-7c930a01510b,8042.9,BTC-USD,5693132452,buy,0.0389,1e7e3262-475f-4910-9c31-6f71ddadbde8,2018-04-17T00:55:48.261000Z,41766823,match
3,5ad5461de3ae712cb80923eb,6279e0f7-567a-4c93-9b0c-5169ce21e43d,8042.91,BTC-USD,5693133369,sell,0.0811,ec3795e7-5d19-4bc9-8d5c-54821e5545a5,2018-04-17T00:56:00.104000Z,41766824,match
4,5ad54629e3ae712cb8092545,6279e0f7-567a-4c93-9b0c-5169ce21e43d,8042.91,BTC-USD,5693133890,sell,0.16050544,de79c882-101d-4e4c-89d2-2c8443012d9e,2018-04-17T00:56:11.887000Z,41766825,match


In [11]:
# Preview the dataframe built from the 'last_match' documents.
last_match.head()

Unnamed: 0,_id,maker_order_id,price,product_id,sequence,side,size,taker_order_id,time,trade_id,type
0,5ad545e5e3ae712cb8091acc,6279e0f7-567a-4c93-9b0c-5169ce21e43d,8042.91,BTC-USD,5693129275,sell,0.6535,2a6c1214-6232-4567-9ff6-5e5fb452b428,2018-04-17T00:54:57.887000Z,41766820,last_match


In [12]:
# Inspect the 'channels' value of the first subscriptions row (index label 0).
subscriptions['channels'][0]

[{'name': 'level2', 'product_ids': ['BTC-USD']},
 {'name': 'matches', 'product_ids': ['BTC-USD']}]

In [13]:
# Preview the snapshot dataframe again before it is reshaped in the next cell
# (per the original note, the snapshot is the initial state of the L2 order book).
# This cell only displays the frame; it does not modify it.
snapshot.head()

Unnamed: 0,_id,asks,bids,product_id,type
0,5ad545e5e3ae712cb8091acb,"[[8042.91, 10.65614728], [8042.92, 0.00118097]...","[[8042.9, 9.11800903], [8042.83, 0.002487], [8...",BTC-USD,snapshot


In [14]:
def _book_side_frame(levels, side):
    """Turn one side of the snapshot (a list of [price, size] pairs) into a
    dataframe with 'price', 'size' and a constant 'side' column."""
    # snapshot array format: [price, size] -> positional columns 0 and 1
    side_frame = pd.DataFrame(levels).rename(columns={0: 'price', 1: 'size'})
    side_frame['side'] = side
    return side_frame


# Pull the nested list-of-lists out of the row labelled 0 of the snapshot frame.
snapshot_asks = snapshot['asks'][0]
snapshot_bids = snapshot['bids'][0]

# Ask = sell price, bid = buy price
snapshot_asks_df = _book_side_frame(snapshot_asks, "sell")
snapshot_bids_df = _book_side_frame(snapshot_bids, "buy")

In [15]:
# Preview the ask side of the snapshot (rows tagged side == "sell").
snapshot_asks_df.head()

Unnamed: 0,price,size,side
0,8042.91,10.65614728,sell
1,8042.92,0.00118097,sell
2,8042.93,0.00130538,sell
3,8042.94,0.00118106,sell
4,8042.95,1.26596588,sell


In [16]:
# Preview the bid side of the snapshot (rows tagged side == "buy").
snapshot_bids_df.head()

Unnamed: 0,price,size,side
0,8042.9,9.11800903,buy
1,8042.83,0.002487,buy
2,8042.5,0.42,buy
3,8042.0,0.001,buy
4,8041.33,0.02,buy


In [17]:
# Look at the last rows of the match dataframe.
match.tail()

Unnamed: 0,_id,maker_order_id,price,product_id,sequence,side,size,taker_order_id,time,trade_id,type
251,5ad54806e3ae712cb8095fe9,e59a4d82-ed7b-4819-a05c-db6d6e68c8ba,8039.99,BTC-USD,5693156291,buy,0.0662,d7e90f32-e773-4fd9-b960-2f28bbef30c3,2018-04-17T01:04:09.716000Z,41767072,match
252,5ad54812e3ae712cb80960bd,58c9cf26-f1cd-415f-ae8e-d08204375b85,8040.0,BTC-USD,5693156607,sell,0.00124006,24c0d885-938c-43f5-99c6-f048276d355b,2018-04-17T01:04:21.144000Z,41767073,match
253,5ad54813e3ae712cb80960c8,58c9cf26-f1cd-415f-ae8e-d08204375b85,8040.0,BTC-USD,5693156625,sell,0.0284,d3d20cfd-9e38-461b-87a6-faac41e1d892,2018-04-17T01:04:21.782000Z,41767074,match
254,5ad5482ce3ae712cb8096216,e59a4d82-ed7b-4819-a05c-db6d6e68c8ba,8039.99,BTC-USD,5693157127,buy,0.0241,03abbae6-4926-4b08-aca5-6da1e20c848e,2018-04-17T01:04:47.294000Z,41767075,match
255,5ad54837e3ae712cb8096253,58c9cf26-f1cd-415f-ae8e-d08204375b85,8040.0,BTC-USD,5693157221,sell,0.1175,6ad73528-9351-4bd6-9c39-c0510a1e8843,2018-04-17T01:04:58.719000Z,41767076,match


In [18]:
# Plan: restructure l2update so that the [side, price, size] values held in the
# 'changes' column end up in separate columns for parsing.

# First check whether any _id value is shared between match and l2update.
#   If there is no overlap, the _id columns can be removed from both dataframes for now;
#   they may only be relevant for real-time input with the FULL response (Level 3 order book).
for left_frame, right_frame in ((l2update, match), (match, l2update)):
    print(left_frame['_id'].isin(right_frame['_id']).unique())

[False]
[False]


In [35]:
# Keep only the columns needed downstream.
# The explicit .copy() makes l2update_clean an independent frame: without it the
# selection is still flagged as derived from l2update, and adding columns to it
# later (side/price/size) triggers pandas' SettingWithCopyWarning.
l2update_clean = l2update[['changes', 'time']].copy()
print(l2update_clean.head())
print(l2update_clean.tail())

                         changes                      time
0      [[buy, 8041.33000000, 0]]  2018-04-17T00:55:04.358Z
1   [[buy, 8041.43000000, 0.02]]  2018-04-17T00:55:04.375Z
2      [[buy, 7940.12000000, 0]]  2018-04-17T00:55:04.395Z
3  [[buy, 8039.00000000, 0.001]]  2018-04-17T00:55:04.412Z
4      [[buy, 7972.56000000, 0]]  2018-04-17T00:55:04.413Z
                                   changes                      time
18088           [[sell, 8078.07000000, 0]]  2018-04-17T01:05:03.247Z
18089            [[buy, 8039.05000000, 0]]  2018-04-17T01:05:03.330Z
18090  [[sell, 8040.52000000, 0.00532786]]  2018-04-17T01:05:03.431Z
18091            [[buy, 8039.06000000, 0]]  2018-04-17T01:05:03.452Z
18092      [[buy, 8039.00000000, 0.00375]]  2018-04-17T01:05:03.486Z


In [188]:
# Create a test dataframe for l2update experiments.
# Use .copy(): a plain `l2update_test = l2update_clean` only binds a second name
# to the same object, so every column added while experimenting would also show
# up in l2update_clean (which is later written to CSV).
# Note: the copy duplicates the frame, not the nested list objects stored in
# 'changes'; those are still shared, so do not mutate them in place.
l2update_test = l2update_clean.copy()
# Information on index number layout for [side,price,size]:
l2update_test['changes'][0][0]

['buy', '8041.33000000', '0']

In [200]:
# Preview the test frame.
# NOTE(review): the saved output of this cell shows a 'side' column that no
# visible cell creates, and the execution counts around here are out of order
# (In [35], [188], [200], [87], [88], [26]). The output presumably depends on a
# cell that was since deleted or edited -- confirm with Restart & Run All.
l2update_test.head()


Unnamed: 0,changes,time,side
0,"[[buy, 8041.33000000, 0]]",2018-04-17T00:55:04.358Z,"[[buy, 8041.33000000, 0]]"
1,"[[buy, 8041.43000000, 0.02]]",2018-04-17T00:55:04.375Z,"[[buy, 8041.43000000, 0.02]]"
2,"[[buy, 7940.12000000, 0]]",2018-04-17T00:55:04.395Z,"[[buy, 7940.12000000, 0]]"
3,"[[buy, 8039.00000000, 0.001]]",2018-04-17T00:55:04.412Z,"[[buy, 8039.00000000, 0.001]]"
4,"[[buy, 7972.56000000, 0]]",2018-04-17T00:55:04.413Z,"[[buy, 7972.56000000, 0]]"


In [87]:
#print(l2update['changes'][0][0][0])
#print(l2update['changes'][0][0][1])
#print(l2update['changes'][0][0][2])

buy
8041.33000000
0


In [None]:
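# NOTE: draft only, do not re-enable as written. Each right-hand side below is a
# single scalar taken from row 0 (e.g. 'buy'), so assigning it would broadcast
# row 0's side/price/size to every row instead of extracting per-row values.
#l2update_clean['side'] = l2update['changes'][0][0][0]
#l2update_clean['price'] = l2update['changes'][0][0][1]
#l2update_clean['size'] = l2update['changes'][0][0][2]
#l2update_clean = l2update_clean.drop(['changes'], axis=1)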

In [88]:
#l2update_clean.head()

Unnamed: 0,changes,time
0,"[[buy, 8041.33000000, 0]]",2018-04-17T00:55:04.358Z
1,"[[buy, 8041.43000000, 0.02]]",2018-04-17T00:55:04.375Z
2,"[[buy, 7940.12000000, 0]]",2018-04-17T00:55:04.395Z
3,"[[buy, 8039.00000000, 0.001]]",2018-04-17T00:55:04.412Z
4,"[[buy, 7972.56000000, 0]]",2018-04-17T00:55:04.413Z


In [26]:
# Save test data to .csv format
#   https://pandas.pydata.org/pandas-docs/stable/generated/pandas.DataFrame.to_csv.html
OUTPUT_DIR = "test_data"

# to_csv does not create missing directories, so make sure the target exists
# (otherwise the export fails on a fresh checkout).
if not os.path.isdir(OUTPUT_DIR):
    os.makedirs(OUTPUT_DIR)

# (file name, dataframe) pairs, written in the same order as before.
frames_to_save = [
    ("raw_mongo_input.csv", data),
    ("l2update.csv", l2update_clean),
    ("snapshot_asks.csv", snapshot_asks_df),
    ("snapshot_bids.csv", snapshot_bids_df),
    ("last_match.csv", last_match),
    ("match.csv", match),
    ("subscriptions.csv", subscriptions),
]

# Every file is written without the dataframe index. Previously last_match.csv
# was the only export missing index=False, which gave it an extra unnamed index
# column and a layout inconsistent with match.csv.
for file_name, frame in frames_to_save:
    frame.to_csv(
        os.path.join(OUTPUT_DIR, file_name),
        header=True,
        encoding='utf-8',
        index=False,
    )