In [None]:
import pandas as pd
import pandas_profiling
import missingno as msno
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np

import itertools

In [None]:
# Load the two pre-processed frames (order-book data and trade data).
# Note: unpickling can execute arbitrary code, so these files must come from
# a trusted source (they are expected to be produced by this project).
order_book_df, trade_df = (
    pd.read_pickle(pickle_path)
    for pickle_path in ('data/processed/order_book_df', 'data/processed/trade_df')
)

In [None]:
# Bar chart of how many trade_df rows carry each `source` value.
source_counts = trade_df['source'].value_counts()
source_counts.plot.bar()

In [None]:
# Bar chart of how many trade_df rows carry each `to` value.
to_counts = trade_df['to'].value_counts()
to_counts.plot.bar()

In [None]:
# Bar chart of how many trade_df rows carry each `from` value.
from_counts = trade_df['from'].value_counts()
from_counts.plot.bar()

In [None]:
# Bar chart of how many trade_df rows carry each `type` value.
type_counts = trade_df['type'].value_counts()
type_counts.plot.bar()

In [None]:
# Print the distinct values of every order-book column, one blank line
# between columns.
for col in order_book_df.columns:
    print(order_book_df[col].unique(), end='\n\n')

In [None]:
# Peek at the first five order-book rows.
order_book_df.head(n=5)

In [None]:
# Histograms of the `dt` column on a shared x axis:
# trades in the top panel, order book in the bottom panel.
fig, axes = plt.subplots(nrows=2, ncols=1, sharex=True)
trade_ax, order_book_ax = axes
trade_df['dt'].hist(ax=trade_ax)
order_book_df['dt'].hist(ax=order_book_ax)

In [None]:
# Date window read by filter_df: it opens at the earliest order-book `dt`
# and closes at the latest trade `dt`.
# NOTE(review): the two bounds come from different frames — confirm this is
# the intended window rather than the overlap of both frames.
min_date, max_date = order_book_df['dt'].min(), trade_df['dt'].max()

In [None]:
def filter_df(df, source, to, from_, type_, start=None, end=None):
    """Return the rows of ``df`` for one market side inside a date window.

    A row is kept when its ``source``, ``to``, ``from`` and ``type`` columns
    equal the given values and its ``dt`` value lies in ``[start, end]``
    (both bounds inclusive).

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with ``dt``, ``source``, ``to``, ``from`` and ``type`` columns.
    source, to, from_, type_
        Values compared with ``==`` against the ``source``, ``to``, ``from``
        and ``type`` columns respectively.
    start, end : optional
        Inclusive bounds for ``dt``. When left as ``None`` they fall back to
        the notebook-level ``min_date`` / ``max_date``, so existing
        five-argument calls behave exactly as before.

    Returns
    -------
    pandas.DataFrame
        The rows of ``df`` that satisfy every condition.
    """
    # Resolve the window lazily so the globals are only needed when the
    # caller does not pass explicit bounds.
    if start is None:
        start = min_date
    if end is None:
        end = max_date

    keep = (
        (df['dt'] >= start)
        & (df['dt'] <= end)
        & (df['source'] == source)
        & (df['to'] == to)
        & (df['from'] == from_)
        & (df['type'] == type_)
    )
    return df[keep]

In [None]:
# Statistics computed per order-book snapshot (one group per `dt`).
# Plain lists are used instead of nested {'name': 'func'} dicts: pandas >= 1.0
# rejects the nested "renamer" form with SpecificationError, while the list
# form produces the same (column, stat) column pairs, which are flattened to
# names such as 'price_mean' / 'size_sum' below.
agg_functions = {
    'price': ['min', 'max', 'mean', 'median'],
    'size': ['min', 'max', 'mean', 'median', 'count', 'sum'],
}

sources = ['EXMO', 'BTCE']
froms = ['DOGE', 'DASH', 'USD', 'BTC', 'LTC', 'ETH', 'NMC', 'NVC', 'PPC', 'EUR']
tos = ['USD', 'RUB', 'BTC', 'RUR', 'EUR']
types = ['BID', 'ASK']

# Order-book side -> label used for the matching rows in trade_df.
trade_type_by_side = {'BID': 'BUY', 'ASK': 'SELL'}

# A row with size_count = 0 is written at this timestamp (added if missing) so
# that the per-second resample below runs through it.
RESAMPLE_END = pd.to_datetime('2016-03-07 23:59:59')

# Results keyed by '<source>_<from>_<to>_<BUY|SELL>'.
dfs = {}

for source, from_, to, type_ in itertools.product(sources, froms, tos, types):
    # Keyword arguments on purpose: filter_df declares `to` before `from_`,
    # so passing the loop variables positionally swapped the two currencies.
    filtered_order_book_df = (
        filter_df(order_book_df, source=source, to=to, from_=from_, type_=type_)
        .groupby('dt')
        .agg(agg_functions)
    )
    # Flatten the (column, stat) pairs into 'column_stat' names.
    filtered_order_book_df.columns = [
        '_'.join(col).strip() for col in filtered_order_book_df.columns.values
    ]
    filtered_order_book_df.loc[RESAMPLE_END, 'size_count'] = 0
    mask = '_'.join([source, from_, to, type_])

    # One row per second; gaps are filled with the last known values.
    resampled_order_book_df = filtered_order_book_df.resample('s').first().ffill()

    # A single row means only the RESAMPLE_END padding row exists, i.e. there
    # was no order-book data for this combination.
    if resampled_order_book_df.shape[0] > 1:
        # Trades are labelled BUY/SELL rather than BID/ASK; keep the loop
        # variable untouched and use a separate name for the trade side.
        trade_type = trade_type_by_side[type_]
        mask = '_'.join([source, from_, to, trade_type])

        filtered_trade_df = filter_df(
            trade_df, source=source, to=to, from_=from_, type_=trade_type
        )
        # Attach the order-book statistics of the same second to every trade.
        joined_df = filtered_trade_df.set_index('dt').join(resampled_order_book_df)
        print('{}: {} (from {}) + {} -> {}'.format(
            mask,
            resampled_order_book_df.shape[0],
            filtered_order_book_df.shape[0],
            filtered_trade_df.shape[0],
            joined_df.shape,
        ))
        dfs[mask] = {
            'order_book': resampled_order_book_df,
            # Previously referenced an undefined `df`; the trades selected
            # for this market are what belongs here.
            'trade': filtered_trade_df.set_index('dt'),
            'joined': joined_df,
        }
        joined_df.to_pickle('data/processed/joined/{}_joined'.format(mask))
        

In [None]:
# One figure per market with three stacked panels on a shared x axis:
# order-book size_sum, order-book price_mean, and the joined frame's price.
for mask, frames in dfs.items():
    resampled_order_book_df = frames['order_book']
    filtered_trade_df = frames['trade']
    joined_df = frames['joined']

    fig, axes = plt.subplots(nrows=3, ncols=1, sharex=True)
    fig.suptitle(mask)

    # (frame, column) drawn in each panel, top to bottom.
    panels = (
        (resampled_order_book_df, 'size_sum'),
        (resampled_order_book_df, 'price_mean'),
        (joined_df, 'price'),
    )
    for ax, (frame, column) in zip(axes, panels):
        frame[[column]].plot(ax=ax)