In [11]:
import datetime
import os

import numpy as np
import pandas as pd

from helpers import common as cm

In [2]:
# Load the raw Bittrex chart data through the project helper `cm.load_data`.
# NOTE(review): presumably returns a pandas DataFrame (later cells assign
# `.columns` and call `.apply` / `.groupby` on it) — confirm against helpers.common.
df_raw = cm.load_data('datasets/BittrexChart')

In [8]:
    # Rename every column of df_raw positionally: the n-th name below replaces
    # the n-th existing column, so the list must contain exactly one entry per
    # column of the loaded frame.
    # NOTE(review): the names are listed alphabetically, which assumes
    # cm.load_data returns its columns in that same order — confirm, otherwise
    # values end up under the wrong label.
    df_raw.columns = [
        'created_at',
        'currency',
        'exchange',
        'price',
        'price_close',
        'price_high',
        'price_low',
        'price_open',
        'quantity',
        'timestamp',
        'timestamp_close',
        'timestamp_open',
        'uuid',
    ]

In [14]:
def extract_time_components(x):
    """Split an epoch timestamp into calendar components.

    The timestamp is interpreted by ``datetime.datetime.fromtimestamp``, i.e.
    in the local time zone of the machine running the notebook.

    Args:
        x: Seconds since the epoch, as accepted by ``fromtimestamp``.

    Returns:
        tuple: ``(yyyymmdd, hour, minute)`` as three ints, for example
        ``(20180131, 23, 59)``.
    """
    moment = datetime.datetime.fromtimestamp(x)
    # Same number as int(strftime('%Y%m%d')), built arithmetically.
    yyyymmdd = moment.year * 10000 + moment.month * 100 + moment.day
    return yyyymmdd, moment.hour, moment.minute

def add_time_component_columns(df):
    """Add ``date``, ``hour`` and ``minute`` columns derived from ``timestamp_close``.

    Every ``timestamp_close`` value is passed through
    ``extract_time_components``. The three columns are written onto ``df``
    itself (the frame is modified in place) and that same frame is returned.

    Args:
        df: DataFrame with a ``timestamp_close`` column of epoch timestamps.

    Returns:
        The frame that was passed in, now carrying the three extra columns.
    """
    components = df['timestamp_close'].apply(extract_time_components)
    # Build each column on its own instead of unpacking ``zip(*components)``:
    # with an empty frame the zip yields nothing and the 3-way unpack raises.
    df['date'] = [parts[0] for parts in components]
    df['hour'] = [parts[1] for parts in components]
    df['minute'] = [parts[2] for parts in components]
    # Return the argument rather than the notebook-level ``df_raw`` global, so
    # the function also works for frames other than ``df_raw``.
    return df

# add_time_component_columns writes the new columns onto the frame it is given,
# so df_raw itself gains 'date', 'hour' and 'minute' here as well.
df_raw_with_time = add_time_component_columns(df_raw)

In [28]:
def group_by(df, columns):
    """Split ``df`` into one sub-frame per distinct value of ``columns``.

    Args:
        df: DataFrame to split.
        columns: A column label, or a list of column labels, to group on.

    Returns:
        list: ``(key, sub_frame)`` pairs, one per group, in pandas' sorted
        group order. ``key`` is a scalar when grouping on a single column
        (including a one-element list) and a tuple when grouping on several.
    """
    # Collapse a one-element list to its bare label: newer pandas versions
    # report 1-tuples as keys for length-1 lists, while callers here expect
    # plain scalar keys.
    if isinstance(columns, list) and len(columns) == 1:
        by = columns[0]
    else:
        by = columns
    # Iterating the groupby yields each group directly; this avoids the
    # deprecated ``axis`` keyword and one ``get_group`` lookup per key.
    return [(key, frame) for key, frame in df.groupby(by)]
    
def group_by_date(df):
    """Return the ``(date, sub_frame)`` groups of ``df`` in ascending ``date`` order."""
    groups = list(group_by(df, ['date']))
    groups.sort(key=lambda pair: pair[0])
    return groups

def group_by_currency(df):
    """Return the ``(currency, sub_frame)`` groups of ``df`` in ascending ``currency`` order."""
    groups = list(group_by(df, ['currency']))
    groups.sort(key=lambda pair: pair[0])
    return groups

def group_by_hour(df):
    """Return the ``(hour, sub_frame)`` groups of ``df`` in ascending ``hour`` order."""
    groups = list(group_by(df, ['hour']))
    groups.sort(key=lambda pair: pair[0])
    return groups

def group_by_minute(df):
    """Return the ``(minute, sub_frame)`` groups of ``df`` in ascending ``minute`` order."""
    groups = list(group_by(df, ['minute']))
    groups.sort(key=lambda pair: pair[0])
    return groups

def transform_data_with_new_columns(df):
    """Collapse one group of rows into a single volume/price summary.

    Rows are ordered by ``timestamp_close``; when several rows share the same
    ``timestamp_close`` only the one that appears last in ``df`` is kept.

    Args:
        df: Non-empty DataFrame with ``timestamp_close``, ``quantity``,
            ``price_open`` and ``price_close`` columns.

    Returns:
        tuple: ``(volume, price_open, price_close, price_high, price_low)``:
            * ``volume``: sum of ``quantity`` over the retained rows
              (NaN entries are skipped).
            * ``price_open``: ``price_open`` of the earliest retained row.
            * ``price_close``: ``price_close`` of the latest retained row.
            * ``price_high`` / ``price_low``: largest / smallest
              ``price_close`` among the retained rows (NaN entries skipped).

    Raises:
        IndexError: If ``df`` has no rows.
    """
    # A stable sort keeps rows with equal timestamps in their original order,
    # so keep='last' always retains the duplicate listed last in ``df``. The
    # default sort algorithm gives no such guarantee for ties.
    group_sorted = df.sort_values(
        'timestamp_close', kind='mergesort'
    ).drop_duplicates(
        'timestamp_close', keep='last'
    )
    closing_prices = group_sorted['price_close']
    volume = group_sorted['quantity'].sum()
    price_open = group_sorted.iloc[0]['price_open']
    price_close = closing_prices.iloc[-1]
    # Series.max()/min() skip NaN; the builtin max()/min() return a result that
    # depends on where a NaN happens to sit in the sequence.
    # NOTE(review): high/low are taken from the closing prices, not from the
    # raw 'price_high' / 'price_low' columns — confirm this is intended.
    price_high = closing_prices.max()
    price_low = closing_prices.min()
    return volume, price_open, price_close, price_high, price_low

def transform_all(df):
    """Aggregate ``df`` into one summary row per (currency, date, hour, minute).

    Each group of rows sharing the same ``currency``, ``date``, ``hour`` and
    ``minute`` is reduced by ``transform_data_with_new_columns``. A running
    count of processed groups is printed every 100000 groups.

    Args:
        df: DataFrame with ``currency``, ``date``, ``hour`` and ``minute``
            columns plus whatever ``transform_data_with_new_columns`` reads.

    Returns:
        dict: Maps each currency to a DataFrame with the columns ``date``,
        ``hour``, ``minute``, ``currency``, ``volume``, ``price_open``,
        ``price_close``, ``price_high``, ``price_low``; rows are ordered by
        ``(date, hour, minute)`` and currencies are inserted in sorted order.
    """
    new_columns = [
        'date',
        'hour',
        'minute',
        'currency',
        'volume',
        'price_open',
        'price_close',
        'price_high',
        'price_low',
    ]
    rows_by_currency = {}
    count = 0
    # One multi-key groupby visits the groups in sorted
    # (currency, date, hour, minute) order, which is the order the nested
    # per-level regrouping produced, without rebuilding a groupby per level.
    grouped = df.groupby(['currency', 'date', 'hour', 'minute'])
    for (currency, date, hour, minute), g_by_m in grouped:
        values = transform_data_with_new_columns(g_by_m)
        rows_by_currency.setdefault(currency, []).append(
            (date, hour, minute, currency) + values
        )
        count += 1
        if count % 100000 == 0:
            print(count)
    return {
        currency: pd.DataFrame(data=rows, columns=new_columns)
        for currency, rows in rows_by_currency.items()
    }

In [29]:
# Aggregate the raw rows into one summary frame per currency.
# transform_all prints a running group count every 100000 groups; in the saved
# run that counter reached 4400000, so expect this cell to take a long time.
df_transformed_initial = transform_all(df_raw_with_time)

100000
200000
300000
400000
500000
600000
700000
800000
900000
1000000
1100000
1200000
1300000
1400000
1500000
1600000
1700000
1800000
1900000
2000000
2100000
2200000
2300000
2400000
2500000
2600000
2700000
2800000
2900000
3000000
3100000
3200000
3300000
3400000
3500000
3600000
3700000
3800000
3900000
4000000
4100000
4200000
4300000
4400000


In [33]:
exchange = 'Bittrex'

# Write one CSV per currency to data/currencies/<exchange>/<currency>.csv.
output_dir = 'data/currencies/{}'.format(exchange)
# DataFrame.to_csv does not create missing directories, so make sure the
# target exists before the first write (no-op when it is already there).
os.makedirs(output_dir, exist_ok=True)

for currency, df in df_transformed_initial.items():
    df.to_csv('{}/{}.csv'.format(output_dir, currency), index=False)

In [35]:
import os

# Reload the per-currency CSV files written for `exchange` into a dict keyed
# by currency name (the file name without its '.csv' extension).
df_transformed = {}
# Only real '.csv' files are considered: hidden entries, sub-directories and
# other file types would otherwise be turned into bogus currency names.
# Sorting makes the load order independent of os.listdir's arbitrary order.
currencies = sorted(
    os.path.splitext(name)[0]
    for name in os.listdir('data/currencies/{}'.format(exchange))
    if name.endswith('.csv') and not name.startswith('.')
)

for currency in currencies:
    df_transformed[currency] = cm.load_data('data/currencies/{}/{}.csv'.format(exchange, currency))

In [36]:
# NOTE(review): indexing an empty dict always raises KeyError: 'hey'.
# Presumably a leftover used to stop "Run All" at this point — confirm and
# remove it from the finished notebook.
{}['hey']

KeyError: 'hey'