In [27]:
import numpy as np
import pandas as pd

from helpers import cleaner as cl
from helpers import common as cm
from helpers import trainer as tr

In [28]:
from sklearn.linear_model import LinearRegression, LogisticRegression
from sklearn.metrics import make_scorer, mean_squared_error, precision_score
from sklearn.model_selection import GridSearchCV
from sklearn.preprocessing import PolynomialFeatures

In [29]:
# Exchange whose data directory is read below (substituted into the
# 'data/shared/{}/...' paths).
exchange = 'Bittrex'

In [30]:
# Load the transformed and the original CSV for the selected exchange.
transformed_csv = 'data/shared/{}/transformed.csv'.format(exchange)
df_shared = cm.load_data(transformed_csv)

original_csv = 'data/shared/{}/original.csv'.format(exchange)
df_original = cm.load_data(original_csv)

# Shared
---

In [31]:
# Prediction target: the label column in the original data is named
# '<label_column_name>_<currency_to_predict>'.
label_column_name = 'price_close'
currency_to_predict = 'LTC'
column_name = '{}_{}'.format(label_column_name, currency_to_predict)

# All columns present in the shared (transformed) frame.
all_columns = df_shared.columns

In [32]:
# Lookup lists supplied by the common helpers.
currencies = cm.all_currencies()

# Distinct values of the 'date' column of the original data, ascending.
unique_dates = set(df_original['date'])
all_dates = sorted(unique_dates)

all_hours = cm.all_hours()
all_minutes = cm.all_minutes()

In [33]:
# Build a frame with a single 'label' column filled from the target column
# (`column_name`) of the original data.
labels_original = pd.DataFrame(columns=['label'])
# NOTE(review): the right-hand side is a one-column DataFrame (double
# brackets), not a Series; presumably the new frame picks up df_original's
# index here -- confirm on the pandas version in use.
labels_original['label'] = df_original[[column_name]]

# Logistic Regression
---

In [67]:
def _is_greater(x, y):
    """Return whether ``x`` is greater than ``y`` (the labelling rule used here)."""
    return x > y


# NOTE(review): the meaning of the third argument (5) and of the rule's two
# inputs is defined inside cl.clean_data_using_labels -- confirm there.
df_cleaned, labels_cleaned = cl.clean_data_using_labels(df_shared, labels_original, 5, _is_greater)

In [68]:
# Produce the training/test features and labels for the classifier.
# NOTE(review): how the listed columns are used is decided inside
# tr.training_and_test_sets -- confirm there.
time_columns = ['date', 'hour', 'minute']
X_train, y_train, X_test, y_test = tr.training_and_test_sets(df_cleaned, labels_cleaned, time_columns)

In [69]:
# Wider search space, kept for reference only: it is NOT passed to the grid
# search below. Assign it to `param_grid` to run the full search
# (3 C values x 3 max_iter values x 4 solvers).
full_param_grid = {
    'C': np.logspace(-4, 4, num=3),
    'max_iter': [1000, 2000, 3000],
    'random_state': [0],
    'solver': [
        'newton-cg',
        'lbfgs',
        'liblinear',
        'sag',
    ],
}

# Single pinned configuration that is actually searched.
param_grid = {
    'C': [10000],
    'max_iter': [1000],
    'random_state': [0],
    'solver': ['sag'],
}

# 5-fold cross-validated search over `param_grid`, scored by precision.
gs = GridSearchCV(
    LogisticRegression(),
    param_grid,
    cv=5,
    scoring=make_scorer(precision_score),
    verbose=0,
)

In [70]:
def _first_coef_row(estimator):
    """Return the first row of the fitted estimator's ``coef_``."""
    return estimator.coef_[0]


# Options consumed by tr.train. Key order matches the original literal.
# Disabled by the author: 'n_features_to_try': [len(X_train.columns)]
training_opts = dict(
    features_to_use=X_train.columns,
    coef_extractor=_first_coef_row,
    grid_search=gs,
    data=X_train,
    labels=y_train,
    columns_to_stratify=None,
)

In [71]:
# Run the helper's training routine and unpack the four values it returns.
training_result = tr.train(training_opts)
score, model, feature_importances, features_to_use = training_result



476: 0.5941488317298252




441: 0.5933634400157078




400: 0.5974867465148243




361: 0.5994502258001179




324: 0.5947378755154134




289: 0.5967013548007069




256: 0.5955232672295308




225: 0.5947378755154134




196: 0.5925780483015904




169: 0.5927743962301197




144: 0.5967013548007069
121: 0.5915963086589436
100: 0.5913999607304143
81: 0.5886510897310033
64: 0.5833496956607108
49: 0.5874730021598272
36: 0.5843314353033575
25: 0.5833496956607108
16: 0.5768702140192421
9: 0.5794227370901237
4: 0.5778519536618889


  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)


1: 0.5169840958177891


Top 3:
0.5994502258 361


Top 3:
0.597486746515 400


Top 3:
0.596701354801 289


  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)


In [72]:
# Precision on the test set, using only the features returned by training.
X_test_selected = X_test[features_to_use]
y_predict = model.predict(X_test_selected)
precision_score(y_test, y_predict)

0.47252155172413796

# Linear Regression
---

In [None]:
def _relative_difference(x, y):
    """Return the difference of ``x`` and ``y`` relative to ``y``: ``(x - y) / y``."""
    return (x - y) / y


# Alternative rule tried by the author: lambda x, y: x
# NOTE(review): the meaning of the third argument (1) and of the rule's two
# inputs is defined inside cl.clean_data_using_labels -- confirm there.
df_cleaned, labels_cleaned = cl.clean_data_using_labels(df_shared, labels_original, 1, _relative_difference)

In [None]:
# Produce the training/test features and labels for the regressor.
# NOTE(review): how the listed columns are used is decided inside
# tr.training_and_test_sets -- confirm there.
time_columns = ['date', 'hour', 'minute']
X_train, y_train, X_test, y_test = tr.training_and_test_sets(df_cleaned, labels_cleaned, time_columns)

In [None]:
r_model = LinearRegression()

# No hyper-parameters are searched; the grid search is used only for its
# 3-fold cross-validation of the single default model.
param_grid = {}

# mean_squared_error is a loss (lower is better). make_scorer defaults to
# greater_is_better=True, which would make GridSearchCV rank the candidate
# with the HIGHEST error as best; greater_is_better=False negates the
# score so the lowest-error candidate wins. Scores in gs.cv_results_ are
# therefore negative MSE values.
gs = GridSearchCV(
    r_model,
    param_grid,
    cv=3,
    scoring=make_scorer(mean_squared_error, greater_is_better=False),
    verbose=0,
)

In [None]:
# Convert the training labels with the common helper, then fit the search.
y_train_array = cm.labels_to_array(y_train)
gs.fit(X_train, y_train_array)

In [None]:
best_model = gs.best_estimator_
cv_results = gs.cv_results_

# Pair each coefficient with its column name, then order the pairs by
# absolute coefficient value, largest first.
feature_importances = list(zip(best_model.coef_, X_train.columns))
feature_importances.sort(key=lambda pair: np.absolute(pair[0]), reverse=True)

# Column names only, in the same ranked order.
best_features = [name for _, name in feature_importances]

# Display the 20 highest-ranked (coefficient, column) pairs.
feature_importances[:20]

In [None]:
# Mean squared error of the best estimator's predictions on the test set.
test_predictions = best_model.predict(X_test)
y_predict = test_predictions
mean_squared_error(y_test, y_predict)