In [16]:
import numpy as np
import pandas as pd

from helpers import cleaner as cl
from helpers import common as cm
from helpers import trainer as tr

In [17]:
from sklearn.linear_model import LinearRegression, LogisticRegression
from sklearn.metrics import make_scorer, mean_squared_error, precision_score
from sklearn.model_selection import GridSearchCV
from sklearn.preprocessing import PolynomialFeatures

In [18]:
# Name of the exchange; it is substituted into the data file paths.
exchange = 'Bittrex'

In [19]:
# Resolve both CSV locations for the selected exchange up front.
transformed_path = 'data/shared/{}/transformed.csv'.format(exchange)
original_path = 'data/shared/{}/original.csv'.format(exchange)

# df_shared <- transformed.csv, df_original <- original.csv
df_shared = cm.load_data(transformed_path)
df_original = cm.load_data(original_path)

# Shared
---

In [20]:
# Variables
# The target column is named '<label_column_name>_<currency_to_predict>'.
label_column_name = 'price_close'
currency_to_predict = 'LTC'
column_name = '{}_{}'.format(label_column_name, currency_to_predict)

# Column index of the shared (transformed) frame.
all_columns = df_shared.columns

In [21]:
# Lookup values supplied by the shared helper module.
currencies = cm.all_currencies()
all_hours = cm.all_hours()
all_minutes = cm.all_minutes()

# Distinct values of the 'date' column, in ascending order.
all_dates = sorted(set(df_original['date']))

In [22]:
# One-column frame called 'label' that carries the values (and index) of the
# target column selected from df_original.
labels_original = pd.DataFrame({'label': df_original[column_name]})

# Logistic Regression
---

In [23]:
def is_greater(x, y):
    """Label rule for classification: whether ``x`` is strictly greater than ``y``."""
    return x > y


# Positional arguments: feature frame, raw labels, the literal 1, label rule.
# NOTE(review): what the `1` means and which values arrive as x / y is decided
# inside cl.clean_data_using_labels — confirm there.
df_cleaned, labels_cleaned = cl.clean_data_using_labels(
    df_shared, labels_original, 1, is_greater,
)

In [24]:
# Column names passed to the splitter as its third argument.
# NOTE(review): how the helper uses them is not visible here — confirm.
split_columns = ['date', 'hour', 'minute']

X_train, y_train, X_test, y_test = tr.training_and_test_sets(
    df_cleaned, labels_cleaned, split_columns,
)

In [25]:
# Candidate hyper-parameters: 3 C values x 3 iteration caps x 4 solvers
# (36 combinations in total).
param_grid = dict(
    C=np.logspace(-2, 10, num=3),  # 1e-2, 1e4, 1e10
    max_iter=[1000, 2000, 3000],
    random_state=[0],
    solver=['newton-cg', 'lbfgs', 'liblinear', 'sag'],
)

# 3-fold cross-validated search; candidates are ranked by precision.
precision_scorer = make_scorer(precision_score)
gs = GridSearchCV(
    estimator=LogisticRegression(),
    param_grid=param_grid,
    scoring=precision_scorer,
    cv=3,
    verbose=10,
)

In [27]:
# Options bundle handed to tr.train_using_best_features.
training_opts = dict(
    features_to_use=X_train.columns[2:],  # every column after the first two
    n_times_to_try=1,
    coef_extractor=lambda estimator: estimator.coef_[0],  # first row of coef_
    grid_search=gs,
    data=X_train,
    labels=y_train,
    columns_to_stratify=None,
)

score, model, feature_importances, features_to_use = tr.train_using_best_features(
    training_opts,
)

Fitting 3 folds for each of 36 candidates, totalling 108 fits
[CV] C=0.01, max_iter=1000, random_state=0, solver=newton-cg .........
[CV]  C=0.01, max_iter=1000, random_state=0, solver=newton-cg, score=0.546006, total=   1.4s
[CV] C=0.01, max_iter=1000, random_state=0, solver=newton-cg .........


[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    1.4s remaining:    0.0s


[CV]  C=0.01, max_iter=1000, random_state=0, solver=newton-cg, score=0.510040, total=   1.4s
[CV] C=0.01, max_iter=1000, random_state=0, solver=newton-cg .........


[Parallel(n_jobs=1)]: Done   2 out of   2 | elapsed:    2.8s remaining:    0.0s


[CV]  C=0.01, max_iter=1000, random_state=0, solver=newton-cg, score=0.493540, total=   1.6s
[CV] C=0.01, max_iter=1000, random_state=0, solver=lbfgs .............


[Parallel(n_jobs=1)]: Done   3 out of   3 | elapsed:    4.4s remaining:    0.0s


[CV]  C=0.01, max_iter=1000, random_state=0, solver=lbfgs, score=0.546006, total=   0.7s
[CV] C=0.01, max_iter=1000, random_state=0, solver=lbfgs .............


[Parallel(n_jobs=1)]: Done   4 out of   4 | elapsed:    5.1s remaining:    0.0s


[CV]  C=0.01, max_iter=1000, random_state=0, solver=lbfgs, score=0.510040, total=   0.7s
[CV] C=0.01, max_iter=1000, random_state=0, solver=lbfgs .............


[Parallel(n_jobs=1)]: Done   5 out of   5 | elapsed:    5.9s remaining:    0.0s


[CV]  C=0.01, max_iter=1000, random_state=0, solver=lbfgs, score=0.493103, total=   0.8s
[CV] C=0.01, max_iter=1000, random_state=0, solver=liblinear .........


[Parallel(n_jobs=1)]: Done   6 out of   6 | elapsed:    6.7s remaining:    0.0s


[CV]  C=0.01, max_iter=1000, random_state=0, solver=liblinear, score=0.535961, total=   2.2s
[CV] C=0.01, max_iter=1000, random_state=0, solver=liblinear .........


[Parallel(n_jobs=1)]: Done   7 out of   7 | elapsed:    8.9s remaining:    0.0s


[CV]  C=0.01, max_iter=1000, random_state=0, solver=liblinear, score=0.508991, total=   2.1s
[CV] C=0.01, max_iter=1000, random_state=0, solver=liblinear .........


[Parallel(n_jobs=1)]: Done   8 out of   8 | elapsed:   11.0s remaining:    0.0s


[CV]  C=0.01, max_iter=1000, random_state=0, solver=liblinear, score=0.489238, total=   2.1s
[CV] C=0.01, max_iter=1000, random_state=0, solver=sag ...............


[Parallel(n_jobs=1)]: Done   9 out of   9 | elapsed:   13.2s remaining:    0.0s


[CV]  C=0.01, max_iter=1000, random_state=0, solver=sag, score=0.546006, total=   9.8s
[CV] C=0.01, max_iter=1000, random_state=0, solver=sag ...............
[CV]  C=0.01, max_iter=1000, random_state=0, solver=sag, score=0.510040, total=  10.0s
[CV] C=0.01, max_iter=1000, random_state=0, solver=sag ...............
[CV]  C=0.01, max_iter=1000, random_state=0, solver=sag, score=0.493540, total=   7.9s
[CV] C=0.01, max_iter=2000, random_state=0, solver=newton-cg .........
[CV]  C=0.01, max_iter=2000, random_state=0, solver=newton-cg, score=0.546006, total=   1.3s
[CV] C=0.01, max_iter=2000, random_state=0, solver=newton-cg .........
[CV]  C=0.01, max_iter=2000, random_state=0, solver=newton-cg, score=0.510040, total=   1.3s
[CV] C=0.01, max_iter=2000, random_state=0, solver=newton-cg .........
[CV]  C=0.01, max_iter=2000, random_state=0, solver=newton-cg, score=0.493540, total=   1.4s
[CV] C=0.01, max_iter=2000, random_state=0, solver=lbfgs .............
[CV]  C=0.01, max_iter=2000, rando



[CV]  C=10000.0, max_iter=1000, random_state=0, solver=sag, score=0.585759, total= 1.0min
[CV] C=10000.0, max_iter=1000, random_state=0, solver=sag ............




[CV]  C=10000.0, max_iter=1000, random_state=0, solver=sag, score=0.579611, total=  59.7s
[CV] C=10000.0, max_iter=1000, random_state=0, solver=sag ............




[CV]  C=10000.0, max_iter=1000, random_state=0, solver=sag, score=0.569119, total=  57.1s
[CV] C=10000.0, max_iter=2000, random_state=0, solver=newton-cg ......
[CV]  C=10000.0, max_iter=2000, random_state=0, solver=newton-cg, score=0.577458, total=   9.8s
[CV] C=10000.0, max_iter=2000, random_state=0, solver=newton-cg ......
[CV]  C=10000.0, max_iter=2000, random_state=0, solver=newton-cg, score=0.581384, total=   8.3s
[CV] C=10000.0, max_iter=2000, random_state=0, solver=newton-cg ......
[CV]  C=10000.0, max_iter=2000, random_state=0, solver=newton-cg, score=0.556526, total=  10.1s
[CV] C=10000.0, max_iter=2000, random_state=0, solver=lbfgs ..........
[CV]  C=10000.0, max_iter=2000, random_state=0, solver=lbfgs, score=0.577102, total=  10.1s
[CV] C=10000.0, max_iter=2000, random_state=0, solver=lbfgs ..........
[CV]  C=10000.0, max_iter=2000, random_state=0, solver=lbfgs, score=0.580744, total=   9.1s
[CV] C=10000.0, max_iter=2000, random_state=0, solver=lbfgs ..........
[CV]  C=1000



[CV]  C=10000.0, max_iter=2000, random_state=0, solver=sag, score=0.582642, total= 2.0min
[CV] C=10000.0, max_iter=2000, random_state=0, solver=sag ............




[CV]  C=10000.0, max_iter=2000, random_state=0, solver=sag, score=0.580628, total= 2.0min
[CV] C=10000.0, max_iter=2000, random_state=0, solver=sag ............
[CV]  C=10000.0, max_iter=2000, random_state=0, solver=sag, score=0.565011, total= 1.8min
[CV] C=10000.0, max_iter=3000, random_state=0, solver=newton-cg ......
[CV]  C=10000.0, max_iter=3000, random_state=0, solver=newton-cg, score=0.577458, total=   7.5s
[CV] C=10000.0, max_iter=3000, random_state=0, solver=newton-cg ......
[CV]  C=10000.0, max_iter=3000, random_state=0, solver=newton-cg, score=0.581384, total=   7.1s
[CV] C=10000.0, max_iter=3000, random_state=0, solver=newton-cg ......
[CV]  C=10000.0, max_iter=3000, random_state=0, solver=newton-cg, score=0.556526, total=   8.5s
[CV] C=10000.0, max_iter=3000, random_state=0, solver=lbfgs ..........
[CV]  C=10000.0, max_iter=3000, random_state=0, solver=lbfgs, score=0.577102, total=   8.6s
[CV] C=10000.0, max_iter=3000, random_state=0, solver=lbfgs ..........
[CV]  C=10000.



[CV]  C=10000000000.0, max_iter=1000, random_state=0, solver=sag, score=0.585759, total=  59.1s
[CV] C=10000000000.0, max_iter=1000, random_state=0, solver=sag ......




[CV]  C=10000000000.0, max_iter=1000, random_state=0, solver=sag, score=0.579611, total=  58.3s
[CV] C=10000000000.0, max_iter=1000, random_state=0, solver=sag ......




[CV]  C=10000000000.0, max_iter=1000, random_state=0, solver=sag, score=0.569119, total=  53.3s
[CV] C=10000000000.0, max_iter=2000, random_state=0, solver=newton-cg 
[CV]  C=10000000000.0, max_iter=2000, random_state=0, solver=newton-cg, score=0.577458, total=   6.9s
[CV] C=10000000000.0, max_iter=2000, random_state=0, solver=newton-cg 
[CV]  C=10000000000.0, max_iter=2000, random_state=0, solver=newton-cg, score=0.581384, total=   7.2s
[CV] C=10000000000.0, max_iter=2000, random_state=0, solver=newton-cg 
[CV]  C=10000000000.0, max_iter=2000, random_state=0, solver=newton-cg, score=0.556526, total=   6.8s
[CV] C=10000000000.0, max_iter=2000, random_state=0, solver=lbfgs ....
[CV]  C=10000000000.0, max_iter=2000, random_state=0, solver=lbfgs, score=0.578053, total=   8.3s
[CV] C=10000000000.0, max_iter=2000, random_state=0, solver=lbfgs ....
[CV]  C=10000000000.0, max_iter=2000, random_state=0, solver=lbfgs, score=0.578466, total=   6.3s
[CV] C=10000000000.0, max_iter=2000, random_sta



[CV]  C=10000000000.0, max_iter=2000, random_state=0, solver=sag, score=0.582642, total= 1.8min
[CV] C=10000000000.0, max_iter=2000, random_state=0, solver=sag ......




[CV]  C=10000000000.0, max_iter=2000, random_state=0, solver=sag, score=0.580628, total= 1.8min
[CV] C=10000000000.0, max_iter=2000, random_state=0, solver=sag ......
[CV]  C=10000000000.0, max_iter=2000, random_state=0, solver=sag, score=0.565011, total= 1.7min
[CV] C=10000000000.0, max_iter=3000, random_state=0, solver=newton-cg 
[CV]  C=10000000000.0, max_iter=3000, random_state=0, solver=newton-cg, score=0.577458, total=   6.8s
[CV] C=10000000000.0, max_iter=3000, random_state=0, solver=newton-cg 
[CV]  C=10000000000.0, max_iter=3000, random_state=0, solver=newton-cg, score=0.581384, total=   7.1s
[CV] C=10000000000.0, max_iter=3000, random_state=0, solver=newton-cg 
[CV]  C=10000000000.0, max_iter=3000, random_state=0, solver=newton-cg, score=0.556526, total=   6.7s
[CV] C=10000000000.0, max_iter=3000, random_state=0, solver=lbfgs ....
[CV]  C=10000000000.0, max_iter=3000, random_state=0, solver=lbfgs, score=0.578053, total=   8.4s
[CV] C=10000000000.0, max_iter=3000, random_state

[Parallel(n_jobs=1)]: Done 108 out of 108 | elapsed: 42.6min finished


TypeError: sorted() takes at most 3 arguments (4 given)

In [32]:
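# NOTE: left disabled. `model` is assigned by the training cell above, whose
# saved output ends in a TypeError, so it is presumably unbound at this point.
# y_predict = model.predict(X_test)
# precision_score(y_test, y_predict)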

# Linear Regression
---

In [None]:
def relative_change(x, y):
    """Label rule for regression: ``(x - y) / y``, the change of ``x`` relative to ``y``."""
    return (x - y) / y


# Same call shape as the classification section, but with a continuous label.
# Alternative rule kept for reference: lambda x, y: x
df_cleaned, labels_cleaned = cl.clean_data_using_labels(
    df_shared, labels_original, 1, relative_change,
)

In [None]:
# Column names passed to the splitter as its third argument.
# NOTE(review): how the helper uses them is not visible here — confirm.
split_columns = ['date', 'hour', 'minute']

X_train, y_train, X_test, y_test = tr.training_and_test_sets(
    df_cleaned, labels_cleaned, split_columns,
)

In [None]:
# Ordinary least-squares regressor; no hyper-parameters are tuned for it.
r_model = LinearRegression()

# Empty grid -> exactly one candidate (the estimator's defaults), so the search
# effectively just provides 3-fold cross-validated scores for that one model.
param_grid = {}

# GridSearchCV always treats a larger score as better. Mean squared error is a
# loss, so the scorer has to be created with greater_is_better=False (reported
# scores are then -MSE). With the default, a non-empty grid would rank the
# candidate with the *largest* error first.
gs = GridSearchCV(
    r_model,
    param_grid,
    cv=3,
    scoring=make_scorer(mean_squared_error, greater_is_better=False),
    verbose=0,
)

In [None]:
# Fit the search on X_train against the helper-converted training labels.
y_train_array = cm.labels_to_array(y_train)
gs.fit(X_train, y_train_array)

In [None]:
# Pull the refitted winner and the per-candidate CV table out of the search.
best_model = gs.best_estimator_
cv_results = gs.cv_results_

# Pair each coefficient with a column name, then order the pairs by absolute
# coefficient value, largest first.
feature_importances = list(zip(best_model.coef_, X_train.columns))
feature_importances.sort(key=lambda pair: np.absolute(pair[0]), reverse=True)

# Column names only, in the same order.
best_features = [name for coef, name in feature_importances]

# Show the 20 highest-ranked (coefficient, column) pairs.
feature_importances[:20]

In [None]:
# Held-out evaluation: predict on the test split and display the MSE.
y_predict = best_model.predict(X_test)
mean_squared_error(y_true=y_test, y_pred=y_predict)