In [10]:
import datetime
import numpy as np
import os
import pandas as pd

def load_data(file_path):
    return pd.read_csv(file_path)

In [11]:
%matplotlib inline

import matplotlib.pyplot as plt
import numpy as np

from IPython.display import Image

In [12]:
def group_by(df, columns):
    group = df.groupby(columns, axis=0)
    return [(key, group.get_group(key)) for key in group.groups.keys()]

def group_by_hour(df):
    return sorted(group_by(df, ['hour']), key=lambda x: x[0])
    
def group_by_date(df):
    return sorted(group_by(df, ['date']), key=lambda x: x[0])

def group_by_currency(df):
    return sorted(group_by(df, ['currency']), key=lambda x: x[0])

def transform_data_for_hour(df):
    group_sorted = df.sort_values(
        ['timestamp_close'], ascending=[1]
    ).drop_duplicates(
        'timestamp_close', keep='last'
    )
    volume = sum(group_sorted['quantity'])
    price_open = group_sorted.iloc[0]['price_open']
    closing_prices = group_sorted['price_close']
    price_close = closing_prices.iloc[len(group_sorted) - 1]
    price_high = max(closing_prices)
    price_low = min(closing_prices)
    return volume, price_open, price_close, price_high, price_low

In [13]:
new_columns = [
    'date',
    'hour',
    'currency',
    'volume',
    'price_open',
    'price_close',
    'price_high',
    'price_low',
]

def transform_all(df):
    d = {}
    for currency, g_by_c in group_by_currency(df):
        arr = []
        for date, g_by_d in group_by_date(g_by_c):
            for hour, g_by_h in group_by_hour(g_by_d):
                values = transform_data_for_hour(g_by_h)
                arr.append((date, hour, currency) + values)
        d[currency] = pd.DataFrame(data=arr, columns=new_columns)
    return d

def flatten(l):
    return [item for sublist in l for item in sublist]

In [14]:
data = load_data('datasets/BittrexChart')

In [15]:
data.columns = [
    'created_at',
    'currency',
    'exchange',
    'price',
    'price_close',
    'price_high',
    'price_low',
    'price_open',
    'quantity',
    'timestamp',
    'timestamp_close',
    'timestamp_open',
    'uuid',
]

In [16]:
def date_and_hour(x):
    date = datetime.datetime.fromtimestamp(x)
    return int(date.strftime('%Y%m%d')), int(date.strftime('%H'))

data['date'], data['hour'] = zip(*data['timestamp_close'].apply(date_and_hour))

In [17]:
import datetime
import matplotlib.pyplot as plt
import matplotlib.dates as mdates
import matplotlib.cbook as cbook

years = mdates.YearLocator()   # every year
months = mdates.MonthLocator()  # every month
yearsFmt = mdates.DateFormatter('%Y')

def date_and_hour(row):
    date = row['date']
    hour = row['hour']
    return datetime.datetime.strptime(str(date), '%Y%m%d') + datetime.timedelta(hours=hour)
    
def plot_currency(currency, d1):
    plt.figure(figsize=(20, 10))

    plt.gca().xaxis.set_major_locator(mdates.DayLocator())
    plt.gca().xaxis.set_major_formatter(mdates.DateFormatter('%d'))
    plt.gca().xaxis.set_minor_locator(mdates.HourLocator())
    
    datemin = datetime.datetime.strptime(str(min(d1['date'])), "%Y%m%d").date() - datetime.timedelta(days=1)
    datemax = datetime.datetime.strptime(str(max(d1['date'])), "%Y%m%d").date() + datetime.timedelta(days=1)

    plt.axis([
        datemin,
        datemax,
        min(d1['price_close']),
        max(d1['price_close']),
    ])
    
    plt.plot(
        d1.apply(date_and_hour, axis=1),
        d1['price_close'],
        'b.'
    )
    
    plt.xlabel(currency)
    
# plot_currency('ZCL', df_transformed['ZCL'])
# plot_currency('LTC', df_transformed['LTC'])
# plot_currency('ETH', df_transformed['ETH'])

In [18]:
df_transformed_initial = transform_all(data)

In [19]:
for currency, df in df_transformed_initial.items():
    df.to_csv('data/currencies/{}.csv'.format(currency), index=False)

In [20]:
import os

df_transformed = {}
currencies = []
for name in os.listdir('data/currencies'):
    if not name.startswith('.') :
        currencies.append(name.split('.csv')[0])

for currency in currencies:
    df_transformed[currency] = load_data('data/currencies/{}.csv'.format(currency))

In [21]:
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.pipeline import FeatureUnion, Pipeline
from sklearn.preprocessing import Imputer, StandardScaler

columns_as_feature = [
    'volume',
    'price_open',
    'price_close',
    'price_high',
    'price_low',
]

class DateFrameSelector(BaseEstimator, TransformerMixin):
    def __init__(self, attribute_names):
        self.attribute_names = attribute_names
        
    def fit(self, X, y=None):
        return self
    
    def transform(self, X, y=None):
        return X[self.attribute_names].values
    
def build_pipeline(numerical_attributes):
    scalar_pipeline = Pipeline([
        ('selector', DateFrameSelector(numerical_attributes)),
        ('standard_scalar', StandardScaler()),
    ])
    return FeatureUnion(transformer_list=[
        ('scalar_pipeline', scalar_pipeline),
    ])

def scale_data(df):
    pipeline = build_pipeline(columns_as_feature)
    df_scaled = pd.DataFrame(data=pipeline.fit_transform(df), columns=columns_as_feature)
    df_scaled[['date', 'hour_of_day']] = df[['date', 'hour']]
    return df_scaled

In [22]:
hours = np.ndarray(shape=(24, 1), dtype=np.float64)
for i in range(0, 24):
    hours[i] = np.array([i], dtype=np.float64)
hours_scaled = StandardScaler().fit_transform(hours)

In [32]:
scaled_data = {}
original_data = {}

for currency, df in df_transformed.items():
    scaled_data[currency] = {}
    original_data[currency] = {}
    
    for idx, row in scale_data(df).iterrows():
        date = row['date']
        hour_of_day = row['hour_of_day']
        
        if not scaled_data[currency].get(date, False):
            scaled_data[currency][date] = {}
            
        scaled_data[currency][date][hour_of_day] = row[columns_as_feature]
    
    for idx, row in df.iterrows():
        date = row['date']
        hour_of_day = row['hour']
            
        if not original_data[currency].get(date, False):
            original_data[currency][date] = {}
            
        original_data[currency][date][hour_of_day] = row[columns_as_feature]

In [24]:
currencies = sorted(list(scaled_data.keys()))
all_hours = sorted(flatten(hours))
all_dates = sorted(list(set(
    flatten([scaled_data[curr].keys() for curr in currencies])
)))

In [25]:
shared_initial_columns = [
    'date',
    'hour',
    'hour_scaled',
]
currency_feature_columns = flatten(
    [['{}_{}'.format(col_name, curr)for curr in currencies for col_name in columns_as_feature]]
)
df_shared_columns = shared_initial_columns + currency_feature_columns

In [33]:
vectors = []
vectors_original = []

for date in all_dates:
    for hour in all_hours:
        vector = [
            date,
            hour,
            hours_scaled[int(hour)][0],
        ]
        vector_original = [
            date,
            hour,
            hours_scaled[int(hour)][0],
        ]
        for currency in currencies:
            d1 = scaled_data[currency]  
            if d1.get(date, False) and not d1[date].get(hour, pd.Series()).empty:
                vector += list(d1[date][hour])
            else:
                vector += [np.nan for i in range(0, len(columns_as_feature))]

            d2 = original_data[currency]
            if d2.get(date, False) and not d2[date].get(hour, pd.Series()).empty:
                vector_original += list(d2[date][hour])
            else:
                vector_original += [np.nan for i in range(0, len(columns_as_feature))]

        vectors.append(vector)
        vectors_original.append(vector_original)

In [34]:
df_shared_new = pd.DataFrame(vectors, columns=df_shared_columns)
df_shared_new.to_csv('data/shared/transformed.csv', index=False)

df_shared_original_new = pd.DataFrame(vectors_original, columns=df_shared_columns)
df_shared_original_new.to_csv('data/shared/original.csv', index=False)

---

# Training section below

In [845]:
from sklearn.linear_model import LinearRegression, LogisticRegression
from sklearn.preprocessing import PolynomialFeatures

currency_to_predict = 'ZCL'
features_to_use = columns_as_feature
currencies_to_use = currencies

model_to_use = PolynomialFeatures()
use_logistic_regression = type(model_to_use) is LogisticRegression

In [846]:
def build_label(label):
    return pd.DataFrame([label], columns=['label'])

In [847]:
df_t = df_transformed[currency_to_predict]
labels = pd.DataFrame(columns=['label'])

for date in all_dates:
    for hour in all_hours:
        label = df_t.loc[(df_t['date'] == date) & (df_t['hour'] == hour + 1)]
        
        if label['price_close'].empty:
            price_close = np.nan
        else:
            price_close = float(label['price_close'])
        labels = labels.append(
            build_label(
                price_close,
            ),
        )

In [850]:
df_shared = load_data('data/shared/transformed.csv')

In [851]:
# Chop off last row since there is are no future change to know
def reset_index(df):
    return df.reset_index().drop(['index'], axis=1)

if use_logistic_regression:
    rows_to_trim_start = 36 # Logistic regression
    number_of_rows_to_trim_end = 1 # Logistic regression
else:
    rows_to_trim_start = 0
    number_of_rows_to_trim_end = 0

df_shared_final = reset_index(df_shared[rows_to_trim_start:row_count - number_of_rows_to_trim_end])

In [852]:
row_count = df_shared.shape[0] - 2

In [853]:
# Add labels
if use_logistic_regression:
    labels = pd.DataFrame(columns=['label'])
    
    for i in range(0, row_count - 1):
        label_idx = len(shared_initial_columns) + (
            (
                currencies_to_use.index(currency_to_predict) * len(features_to_use)
            ) + 2
        )
        label_now = df_shared.iloc[i][label_idx]
        label_next_hour = df_shared.iloc[i + 1][label_idx]

        if label_next_hour > label_now:
            label = 1
        else:
            label = 0

        labels = labels.append(build_label(label))

In [854]:
labels_final = reset_index(labels[rows_to_trim_start:row_count])
len(df_shared_final), len(labels_final)

(374, 374)

In [855]:
from sklearn.model_selection import StratifiedShuffleSplit

def labels_to_array(df):
    return list(df['label'])

def training_and_test_sets(df):
    # Split data
    stratified_split = StratifiedShuffleSplit(
        n_splits=1,
        test_size=0.16,
        random_state=42,
    )
    # for train_index, test_index in 
    gen = stratified_split.split(
        df,
        df[['date', 'hour']],
    )
    for training_indices, test_indices in gen:
        training_set = df.loc[training_indices].drop(['date', 'hour'], axis=1)
        training_set_labels = labels_final.loc[training_indices]

        test_set = df.loc[test_indices].drop(['date', 'hour'], axis=1)
        test_set_labels = labels_final.loc[test_indices]
    
    return (training_set, training_set_labels, test_set, test_set_labels)

In [856]:
all_features = ['hour_scaled'] + currency_feature_columns

In [857]:
from sklearn.model_selection import GridSearchCV

def train(opts={}):
    features_to_use = opts.get('features_to_use', all_features)
    r_model = opts['model_to_use']
    verbose = opts.get('verbose', False)
    
    training_set, training_set_labels, test_set, test_set_labels = training_and_test_sets(df_shared_final)
    training_set_to_use = training_set[features_to_use]
    test_set_to_use = test_set[features_to_use]
    
    model_name = type(model_to_use)
    param_grid = {
        LinearRegression: {  
        },
        LogisticRegression: {
            # {'C': 10000.0, 'max_iter': 100, 'random_state': 0, 'solver': 'liblinear'}
            # C_range = np.logspace(-2, 10, num=3)
            # solver = ['newton-cg', 'lbfgs', 'liblinear', 'sag']
            
            'C': [10000],
            'max_iter': [100],
            'random_state': [0],
            'solver': ['liblinear'],
        },
        PolynomialFeatures: {
            'degree': [2 + i for i in range(1, 4)],
            'include_bias': [True, False],
            'interaction_only': [False, True],
        },
    }[model_name]
    
    scoring = {
        LinearRegression: 'neg_mean_squared_error',
        LogisticRegression: 'accuracy',
        PolynomialFeatures: 'neg_mean_squared_error',
    }[model_name]
    
    gs = GridSearchCV(
        r_model,
        param_grid,
        cv=2,
        scoring=scoring,
        verbose=(10 if verbose else 0),
    )
    gs.fit(training_set_to_use, labels_to_array(training_set_labels))

    if verbose:
        print(gs.best_params_)
        
    gs.best_estimator_
    gs.cv_results_
    
    model = gs.best_estimator_
    score = model.score(test_set_to_use, labels_to_array(test_set_labels))

    if verbose:
        print('Score: {}'.format(score))

    feature_importances = sorted(
        zip(
            model.coef_[0] if use_logistic_regression else model.coef_,
            features_to_use,
        ),
        key=lambda x: np.absolute(x[0]),
        reverse=True,
    )
    best_features = [f for s, f in feature_importances]

    if verbose:
        print('Number of features: {}'.format(len(best_features)))
        
    return (
        score,
        best_features,
        model,
        feature_importances,
    )

In [858]:
scores_and_features = []
best_features_to_use = None

n_times_to_try = 200
for i in range(0, n_times_to_try):
    if best_features_to_use:
        end_idx = max([n_times_to_try - i, 1])
        arr = best_features_to_use[0:end_idx]
    else:
        arr = all_features
    score, best_features_to_use, model, feature_importances = train({
        'features_to_use': arr,
        'model_to_use': model_to_use,
    }) 
    scores_and_features.append((score, best_features_to_use, model, feature_importances))

AttributeError: 'PolynomialFeatures' object has no attribute 'predict'

In [None]:
best_scores = sorted(scores_and_features, key=lambda x: x[0], reverse=True)
for tup in best_scores[0:3]:
    print(tup[0], len(tup[1]))
best_score = best_scores[0]
best_score[1]

In [None]:
from sklearn.externals import joblib

model_name = '{}_{}'.format(str(type(model_to_use)).lower(), currency_to_predict.lower())

joblib.dump(best_score[2], 'models/{}.pkl'.format(model_name))
loaded_model = joblib.load('models/{}.pkl'.format(model_name))

In [859]:
from sklearn.metrics import mean_squared_error

# Test loaded data
_, _, test_set, test_set_labels = training_and_test_sets(df_shared_final)
if use_logistic_regression:
    score = loaded_model.score(test_set[best_score[1]], labels_to_array(test_set_labels))
else:
    y = labels_to_array(test_set_labels)
    y_predict = loaded_model.predict(test_set[best_score[1]])
    score = mean_squared_error(y, y_predict)
    percent_diff = [np.absolute(x) for x in (y - y_predict) / y]
    print(sum(percent_diff) / len(percent_diff)) # Off by 12% on average
    
print(score)

597.385757042
0.49667175761


In [None]:
pd.DataFrame(best_score[3], columns=['weight', 'feature']).to_csv(
    'models/{}_features.csv'.format(model_name),
    index=False,
)

# Polynomial Features
---

In [933]:
# next_hour_price = loaded_model.predict(df_shared_final[best_score[1]])
# df_shared_final['next_hour_price'] = StandardScaler().fit_transform(pd.DataFrame(next_hour_price))
# df_shared_final

training_set, training_set_labels, test_set, test_set_labels = training_and_test_sets(df_shared_final)
poly = PolynomialFeatures(degree=3)
X_ = poly.fit_transform(training_set)

KeyboardInterrupt: 

In [None]:
lg = LinearRegression()
lg.fit(X_, labels_to_array(training_set_labels))

In [None]:
X_test_ = poly.fit_transform(test_set)
predictions = lg.predict(X_test_)

In [None]:
from sklearn.metrics import r2_score

r2_score(labels_to_array(test_set_labels), predictions)

In [None]:
y = labels_to_array(test_set_labels)
percent_diff = [np.absolute(x) for x in (y - predictions) / y]
print(sum(percent_diff) / len(percent_diff)) # Off by 12% on average

In [None]:
n_output_features = poly.n_output_features_
feature_names = poly.get_feature_names()

feature_importances = sorted(
    zip(
        lg.coef_,
        feature_names,
        [i for i in range(0, n_output_features)],
    ),
    key=lambda x: np.absolute(x[0]),
    reverse=True,
)
best_features = [(f, i) for s, f, i in feature_importances]

In [None]:
best_feature_names = [f for f, i in best_features[0:100]]

In [None]:
training_set, training_set_labels, test_set, test_set_labels = training_and_test_sets(df_shared_final)
poly = PolynomialFeatures(degree=2)
X_ = poly.fit_transform(training_set)
X_df = pd.DataFrame(X_, columns=feature_names)

lg = LinearRegression()
lg.fit(X_df[best_feature_names], labels_to_array(training_set_labels))

In [None]:
X_test_ = poly.fit_transform(test_set)
X_test_df_ = pd.DataFrame(X_test_, columns=feature_names)
predictions = lg.predict(X_test_df_[best_feature_names])

In [None]:
from sklearn.metrics import r2_score

r2_score(labels_to_array(test_set_labels), predictions)

In [None]:
y = labels_to_array(test_set_labels)
percent_diff = [np.absolute(x) for x in (y - predictions) / y]
print(sum(percent_diff) / len(percent_diff)) # Off by 12% on average