## Load RData

In [None]:
import rpy2.robjects as robjects
import numpy as np

# Path of the RData file to import.
RDATA_FN = 'data/raw/data_env.rdata'

# Call R's `load` on the file through the embedded R session.
r_load = robjects.r['load']
r_load(RDATA_FN)

# Print what R's `ls()` reports once the file has been loaded.
r_ls = robjects.r['ls']
print(r_ls())

In [None]:
# Fetch the three tables from the embedded R session as pandas DataFrames.
# Without the pandas converter, rpy2 returns its own R data.frame wrappers,
# which the pandas calls used further down (`pd.to_datetime`,
# `.astype('category')`, `.to_pickle`) cannot operate on.
from rpy2.robjects import pandas2ri

# NOTE(review): global activation is the form that works on older rpy2
# releases; on recent rpy2 a local converter is preferred — confirm against
# the installed version.
pandas2ri.activate()

order_book_df = robjects.r['order_book']
order_log_df = robjects.r['order_log']
trade_df = robjects.r['trade']

## Preprocess raw data

In [None]:
import pandas as pd
import pandas_profiling
import missingno as msno
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np

from pandas_keeper import col_metatypes, fvdf

# Silence all four NumPy floating-point error categories. The trailing
# semicolon stops the notebook from echoing seterr's return value.
np.seterr(divide='ignore', over='ignore', under='ignore', invalid='ignore');

### Order Book 

In [None]:
# Reload the cached order book. The save step of this notebook writes each
# frame to 'data/processed/<name>.pkl', so read it back from that same path.
# NOTE(review): if an older, extension-less pickle is the one on disk, rename
# it to match.
order_book_df = pd.read_pickle('data/processed/order_book_df.pkl')

In [None]:
# Inspect the order book frame with pandas_keeper's `fvdf` helper
# (presumably a per-column overview — confirm against pandas_keeper).
fvdf(order_book_df)

In [None]:
# Relative-frequency histogram of the number of non-null `type` entries per
# `dt` value. Each group carries weight 1/len(counts), so the bar heights
# sum to 1.
counts = order_book_df.groupby('dt')['type'].count()
weights = np.ones_like(counts) / counts.size
plt.hist(counts, bins=100, weights=weights);

In [None]:
# Un-normalised histogram of the number of non-null `type` entries per `dt`
# value, drawn with pandas' own plotting helper.
(
    order_book_df
    .groupby('dt')['type']
    .count()
    .hist(bins=100)
)

In [None]:
# Relative-frequency histogram of how often each `dt` value occurs.
# Each distinct `dt` carries weight 1/len(counts), so the bar heights sum to 1.
counts = order_book_df['dt'].value_counts()
weights = np.ones_like(counts) / counts.size
plt.hist(counts, bins=100, weights=weights);

In [None]:
# Give the order book columns proper dtypes: parse `dt` as datetimes and
# store the listed label columns as pandas categoricals. Columns are
# assigned back onto the existing frame, so the frame object itself is
# unchanged.
cat_cols = ['from', 'to', 'source', 'type']
order_book_df['dt'] = pd.to_datetime(order_book_df['dt'])
order_book_df[cat_cols] = order_book_df[cat_cols].astype('category')
fvdf(order_book_df)

### Order Log

In [None]:
# Inspect the order log frame with pandas_keeper's `fvdf` helper
# (presumably a per-column overview — confirm against pandas_keeper).
fvdf(order_log_df)

In [None]:
# Give the order log columns proper dtypes: parse `dt` as datetimes and
# store the listed label columns as pandas categoricals. Columns are
# assigned back onto the existing frame, so the frame object itself is
# unchanged.
cat_cols = ['from', 'to', 'source']
order_log_df['dt'] = pd.to_datetime(order_log_df['dt'])
order_log_df[cat_cols] = order_log_df[cat_cols].astype('category')
fvdf(order_log_df)

### Trade 

In [None]:
# Inspect the trade frame with pandas_keeper's `fvdf` helper
# (presumably a per-column overview — confirm against pandas_keeper).
fvdf(trade_df)

In [None]:
# Give the trade columns proper dtypes: parse `dt` as datetimes and store
# the listed label columns as pandas categoricals. Columns are assigned back
# onto the existing frame, so the frame object itself is unchanged.
cat_cols = ['from', 'to', 'source', 'type']
trade_df['dt'] = pd.to_datetime(trade_df['dt'])
trade_df[cat_cols] = trade_df[cat_cols].astype('category')
fvdf(trade_df)

### Save pandas_profiling reports and pickles

In [None]:
# Write a pandas_profiling HTML report and a pickle for every preprocessed
# frame, each named after the variable that holds it.
#
# The frames are listed explicitly with their names. The previous version
# iterated over a `dfs` list that is not defined in the visible cells and
# recovered each name by scanning `locals()` for an identical object, which
# can pick up an alias (e.g. the loop's own `df`) instead of the real name.
import os

named_dfs = {
    'order_book_df': order_book_df,
    'order_log_df': order_log_df,
    'trade_df': trade_df,
}

# Create the target directories up front so neither writer below has to
# depend on them already existing.
for out_dir in ('profiles', 'data/processed'):
    os.makedirs(out_dir, exist_ok=True)

for name, df in named_dfs.items():
    print(name)
    metatypes = col_metatypes(df)
    # Columns whose metatype is list or dict are left out of the report only
    # (presumably the profiler cannot handle nested cells — confirm).
    nested_cols = [col for col, kind in metatypes.items() if kind in (list, dict)]
    profile = pandas_profiling.ProfileReport(df.drop(nested_cols, axis=1))
    profile.to_file(outputfile='profiles/{}.html'.format(name))
    # The pickle keeps the full frame, nested columns included.
    df.to_pickle('data/processed/{}.pkl'.format(name))