In [1]:
import numpy as np
import pandas as pd

from helpers import cleaner as cl
from helpers import common as cm
from helpers import trainer as tr

In [2]:
from sklearn.linear_model import LinearRegression, LogisticRegression
from sklearn.metrics import make_scorer, mean_squared_error, precision_score
from sklearn.model_selection import GridSearchCV
from sklearn.preprocessing import PolynomialFeatures

In [3]:
# Exchange whose data set is analysed; this name is interpolated into the
# 'data/shared/<exchange>/...' CSV paths when the frames are loaded.
exchange = 'Bittrex'

In [4]:
# Resolve the two per-exchange CSV locations first, then load each one
# through the common helper.
transformed_path = 'data/shared/{}/transformed.csv'.format(exchange)
original_path = 'data/shared/{}/original.csv'.format(exchange)

df_shared = cm.load_data(transformed_path)
df_original = cm.load_data(original_path)

# Shared
---

In [5]:
# Target definition: the label column is '<label_column_name>_<currency>',
# i.e. 'price_close_LTC' with the values below.
label_column_name = 'price_close'
currency_to_predict = 'LTC'
column_name = '{}_{}'.format(label_column_name, currency_to_predict)

# Column index of the transformed frame.
all_columns = df_shared.columns

In [6]:
# Lookup lists provided by the common helper module.
currencies = cm.all_currencies()
# Distinct values of the 'date' column of the original frame, ascending.
all_dates = sorted(set(df_original['date']))
all_hours = cm.all_hours()
all_minutes = cm.all_minutes()

In [7]:
# Build a one-column frame named 'label' that holds the target column
# (`column_name`) taken from the original frame.
# NOTE(review): the right-hand side is a one-column DataFrame (double
# brackets), assigned into an initially empty frame; the resulting index
# depends on how pandas aligns that assignment -- confirm it matches
# df_original's index on the pandas version in use.
labels_original = pd.DataFrame(columns=['label'])
labels_original['label'] = df_original[[column_name]]

# Logistic Regression
---

In [8]:
# Derive the cleaned feature frame and the classification labels.
# The label function compares its two arguments and yields a boolean
# (x > y), which makes this a binary target.
# NOTE(review): the meaning of the third argument (5) and of what x and y
# stand for is defined in helpers/cleaner -- presumably a look-ahead offset
# and a later/earlier price pair; confirm there.
df_cleaned, labels_cleaned = cl.clean_data_using_labels(
    df_shared,
    labels_original,
    5,
    lambda x, y: x > y,
)

In [9]:
# Split the cleaned features and labels into training and test partitions.
# NOTE(review): the list ['date', 'hour', 'minute'] is handed to the helper
# unchanged; presumably these columns are dropped from the features or
# drive the split -- confirm in helpers/trainer.
X_train, y_train, X_test, y_test = tr.training_and_test_sets(
    df_cleaned,
    labels_cleaned,
    ['date', 'hour', 'minute'],
)

In [14]:
# Hyper-parameter grid for the logistic-regression search.
#
# A wider grid used to be assigned to `param_grid` right before this one
# (C in np.logspace(-2, 10, num=3), i.e. 1e-2 / 1e4 / 1e10; max_iter in
# 1000 / 2000 / 3000; solvers newton-cg, lbfgs, liblinear and sag).  It was
# overwritten by the dict below before GridSearchCV ever saw it, so it was a
# dead store; only the grid that is actually used is kept.
param_grid = {
    'C': [10000],
    'max_iter': [3000],
    'random_state': [0],
    'solver': ['sag'],
}

# 2-fold cross-validated search scored by precision (higher is better, which
# matches make_scorer's default greater_is_better=True).
gs = GridSearchCV(
    LogisticRegression(),
    param_grid,
    cv=2,
    scoring=make_scorer(precision_score),
    verbose=0,
)

In [15]:
def _first_coef_row(estimator):
    """Return element 0 of the estimator's ``coef_`` attribute."""
    return estimator.coef_[0]


# Options bundle handed to the project's training helper: start from every
# column of the training frame and reuse the grid search configured above.
training_opts = {
    'features_to_use': X_train.columns,
    'coef_extractor': _first_coef_row,
    'grid_search': gs,
    'data': X_train,
    'labels': y_train,
    'columns_to_stratify': None,
}

# tr.train returns a 4-tuple; unpack it into the names used further down.
train_result = tr.train(training_opts)
score, model, feature_importances, features_to_use = train_result



476 0.59414883173 476




475 0.59414883173 475




472 0.593952483801 472




467 0.594345179658 467


KeyboardInterrupt: 

In [None]:
# Precision on the test partition of the model returned by tr.train.
# NOTE: `model` is only bound once tr.train(...) has returned.  The recorded
# output of the training cell ends in KeyboardInterrupt, so on a fresh
# kernel this cell needs a completed training run first.
y_predict = model.predict(X_test)
precision_score(y_test, y_predict)

# Linear Regression
---

In [None]:
# Rebuild the cleaned frame and labels for the regression task.  This
# rebinds df_cleaned / labels_cleaned, replacing the classification versions.
# The label function returns the relative difference (x - y) / y rather than
# a boolean.
# NOTE(review): the meaning of the third argument (1 here, 5 in the
# classification section) and of x / y is defined in helpers/cleaner --
# confirm there.
df_cleaned, labels_cleaned = cl.clean_data_using_labels(
    df_shared,
    labels_original,
    1,
    lambda x, y: (x - y) / y
#     lambda x, y: x   <- disabled alternative: use x unchanged as the label
)

In [None]:
# Split the regression frame/labels into training and test partitions.
# This rebinds X_train / y_train / X_test / y_test, replacing the
# classification splits.
# NOTE(review): the role of the ['date', 'hour', 'minute'] list is defined by
# the helper in helpers/trainer -- confirm there.
X_train, y_train, X_test, y_test = tr.training_and_test_sets(
    df_cleaned,
    labels_cleaned,
    ['date', 'hour', 'minute'],
)

In [None]:
r_model = LinearRegression()

# No hyper-parameters are tuned: an empty grid makes GridSearchCV evaluate
# the single default configuration with cross-validation.
param_grid = {}

# GridSearchCV always *maximises* the scorer.  Mean squared error is a loss
# (lower is better), so the scorer must be built with
# greater_is_better=False, which negates the value; otherwise the search
# would rank the candidate with the largest error as best.
gs = GridSearchCV(
    r_model,
    param_grid,
    cv=3,
    scoring=make_scorer(mean_squared_error, greater_is_better=False),
    verbose=0,
)

In [None]:
# Run the cross-validated fit on the training partition.  The labels go
# through cm.labels_to_array first (presumably to get a plain array in the
# shape scikit-learn expects -- confirm in helpers/common).
gs.fit(X_train, cm.labels_to_array(y_train))

In [None]:
# Pull the refitted estimator and the per-fold results out of the search.
best_model = gs.best_estimator_
cv_results = gs.cv_results_

# Pair every coefficient with its column name, then order the pairs by
# absolute coefficient, largest first (stable sort, so ties keep the
# original column order).
feature_importances = list(zip(best_model.coef_, X_train.columns))
feature_importances.sort(
    key=lambda pair: np.absolute(pair[0]),
    reverse=True,
)

# Column names only, in ranked order.
best_features = [f for s, f in feature_importances]

# Show the 20 highest-ranked (coefficient, column) pairs.
feature_importances[0:20]

In [None]:
# Mean squared error of the best estimator on the test partition.
# NOTE(review): y_test is passed as-is, whereas y_train went through
# cm.labels_to_array before fitting -- confirm that y_test and y_predict
# have matching shapes.
y_predict = best_model.predict(X_test)
mean_squared_error(y_test, y_predict)