In [72]:
import warnings
warnings.filterwarnings('ignore')
import glob
import os
import datetime
import matplotlib.pyplot as plt
import mplfinance
import pandas as pd
import numpy as np

from sklearn import preprocessing
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_validate
from sklearn.model_selection import TimeSeriesSplit
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import KFold
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.ensemble import HistGradientBoostingClassifier
from xgboost import XGBClassifier
from xgboost import plot_importance
from lightgbm import LGBMClassifier
from sklearn.metrics import precision_score, recall_score, confusion_matrix, roc_auc_score
from sklearn.metrics import f1_score
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.metrics import accuracy_score
from sklearn import svm

import seaborn as sns; sns.set()

In [3]:
BTC = pd.read_csv('./data/BTC.csv', parse_dates=['Date'], index_col = 'Date')
GOLD = pd.read_csv('./data/ZG.csv', parse_dates=['Date'], index_col = 'Date')
NDAQ = pd.read_csv('./data/IXIC.csv', parse_dates=['Date'], index_col = 'Date')
US10YT = pd.read_csv('./data/US10YT.csv', parse_dates=['Date'], index_col = 'Date')
VIX = pd.read_csv('./data/VIX.csv', parse_dates=['Date'], index_col = 'Date')

In [4]:
# t1 = BTC.merge(GOLD, left_index=True,
#               right_index=True, how = 'outer',
#               suffixes=('_BTC', '_GLD'))
# list_df = ['NDAQ', 'US10YT', 'VIX']
# for x in list_df:
#     t1 = t1.merge(globals()[x].add_suffix('_'+str(x)),
#                  left_index=True, right_index=True,
#                  how='outer')

In [5]:
t1 = BTC.merge(GOLD, on='Date', how='left', suffixes=('_BTC', '_GLD'))

In [6]:
t1 = BTC.merge(GOLD, on='Date', how='left', suffixes=('_BTC', '_GLD'))
list_df = ['NDAQ', 'US10YT', 'VIX']
for x in list_df:
    t1 = t1.merge(globals()[x].add_suffix('_'+str(x)),
                 on='Date', how='left')
t1.info()

<class 'pandas.core.frame.DataFrame'>
DatetimeIndex: 3654 entries, 2012-04-13 to 2022-04-14
Data columns (total 29 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   Close_BTC      3654 non-null   float64
 1   Open_BTC       3654 non-null   float64
 2   High_BTC       3654 non-null   float64
 3   Low_BTC        3654 non-null   float64
 4   Volume_BTC     3654 non-null   float64
 5   Change_BTC     3654 non-null   float64
 6   Close_GLD      2141 non-null   float64
 7   Open_GLD       2141 non-null   float64
 8   High_GLD       2141 non-null   float64
 9   Low_GLD        2141 non-null   float64
 10  Volume_GLD     2141 non-null   float64
 11  Change_GLD     2141 non-null   float64
 12  Close_NDAQ     2518 non-null   float64
 13  Open_NDAQ      2518 non-null   float64
 14  High_NDAQ      2518 non-null   float64
 15  Low_NDAQ       2518 non-null   float64
 16  Volume_NDAQ    2518 non-null   float64
 17  Change_NDAQ    2518 non-null   flo

In [7]:
t1.isna().sum()

Close_BTC           0
Open_BTC            0
High_BTC            0
Low_BTC             0
Volume_BTC          0
Change_BTC          0
Close_GLD        1513
Open_GLD         1513
High_GLD         1513
Low_GLD          1513
Volume_GLD       1513
Change_GLD       1513
Close_NDAQ       1136
Open_NDAQ        1136
High_NDAQ        1136
Low_NDAQ         1136
Volume_NDAQ      1136
Change_NDAQ      1136
Close_US10YT      644
Open_US10YT       644
High_US10YT       644
Low_US10YT        644
Change_US10YT     644
Close_VIX        1128
Open_VIX         1128
High_VIX         1128
Low_VIX          1128
Volume_VIX       1128
Change_VIX       1128
dtype: int64

In [8]:
# 휴일장의 Change, Volume 값을 0으로 만들어야함
# 미국10년 국채엔 Volume 컬럼 없음
t1.Volume_GLD.fillna(0, inplace=True)
t1.Change_GLD.fillna(0, inplace=True)
t1.Volume_NDAQ.fillna(0, inplace=True)
t1.Change_NDAQ.fillna(0, inplace=True)
t1.Change_US10YT.fillna(0, inplace=True)
t1.Volume_VIX.fillna(0, inplace=True)
t1.Change_VIX.fillna(0, inplace=True)
    

In [9]:
# df = t1.dropna()
# df = t1.fillna(method='ffill')
# df = t1.fillna(method='bfill')
df = t1.fillna(0)
df.dropna(inplace=True)
df.isna().sum()

Close_BTC        0
Open_BTC         0
High_BTC         0
Low_BTC          0
Volume_BTC       0
Change_BTC       0
Close_GLD        0
Open_GLD         0
High_GLD         0
Low_GLD          0
Volume_GLD       0
Change_GLD       0
Close_NDAQ       0
Open_NDAQ        0
High_NDAQ        0
Low_NDAQ         0
Volume_NDAQ      0
Change_NDAQ      0
Close_US10YT     0
Open_US10YT      0
High_US10YT      0
Low_US10YT       0
Change_US10YT    0
Close_VIX        0
Open_VIX         0
High_VIX         0
Low_VIX          0
Volume_VIX       0
Change_VIX       0
dtype: int64

In [10]:
# 장이 열리지 않은 날의 가격정보(low, high, open, close)는 남기고 거래량, 변화율은 0으로 데이터프레임에 저장
df.loc['2020-12-01':'2020-12-31', 'Close_GLD':'Change_GLD']

Unnamed: 0_level_0,Close_GLD,Open_GLD,High_GLD,Low_GLD,Volume_GLD,Change_GLD
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
2020-12-01,0.0,0.0,0.0,0.0,0.0,0.0
2020-12-02,0.0,0.0,0.0,0.0,0.0,0.0
2020-12-03,1858.0,1859.6,1862.3,1858.8,10.0,0.0057
2020-12-04,0.0,0.0,0.0,0.0,0.0,0.0
2020-12-05,0.0,0.0,0.0,0.0,0.0,0.0
2020-12-06,0.0,0.0,0.0,0.0,0.0,0.0
2020-12-07,0.0,0.0,0.0,0.0,0.0,0.0
2020-12-08,1892.7,1877.0,1886.6,1874.5,200.0,0.0045
2020-12-09,1856.0,1883.1,1883.1,1844.0,110.0,-0.0194
2020-12-10,1855.4,1866.3,1866.3,1866.3,10.0,-0.0003


In [11]:
df.head(7)

Unnamed: 0_level_0,Close_BTC,Open_BTC,High_BTC,Low_BTC,Volume_BTC,Change_BTC,Close_GLD,Open_GLD,High_GLD,Low_GLD,...,Open_US10YT,High_US10YT,Low_US10YT,Change_US10YT,Close_VIX,Open_VIX,High_VIX,Low_VIX,Volume_VIX,Change_VIX
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2012-04-13,4.9,4.9,4.9,4.7,93110.0,0.0,1659.1,1673.3,1674.8,1650.0,...,1.989,1.989,1.989,-0.0326,19.55,17.95,19.62,17.85,0.0,0.1366
2012-04-14,5.0,4.9,5.0,4.9,63890.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2012-04-15,5.0,5.0,5.0,4.9,33340.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2012-04-16,4.9,5.0,5.0,4.9,31160.0,0.0,1648.7,1650.8,1656.8,1641.2,...,1.982,1.982,1.982,-0.0035,19.55,18.87,20.42,18.6,0.0,0.0
2012-04-17,5.0,4.9,5.0,4.9,30650.0,0.0101,1650.3,1652.0,1653.0,1637.0,...,1.996,1.996,1.996,0.0071,18.46,18.66,18.66,17.58,0.0,-0.0558
2012-04-18,5.1,5.0,5.2,5.0,79130.0,0.0281,1638.8,1653.2,1653.3,1638.1,...,1.975,1.975,1.975,-0.0105,18.64,19.02,19.17,18.03,0.0,0.0098
2012-04-19,5.1,5.1,5.2,5.1,46680.0,0.0,1640.6,1642.1,1653.1,1632.0,...,1.968,1.968,1.968,-0.0035,18.36,18.51,19.69,17.69,0.0,-0.015


In [12]:
df.tail()

Unnamed: 0_level_0,Close_BTC,Open_BTC,High_BTC,Low_BTC,Volume_BTC,Change_BTC,Close_GLD,Open_GLD,High_GLD,Low_GLD,...,Open_US10YT,High_US10YT,Low_US10YT,Change_US10YT,Close_VIX,Open_VIX,High_VIX,Low_VIX,Volume_VIX,Change_VIX
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2022-04-10,42138.0,42760.0,43421.0,41884.0,255830000.0,-0.0147,0.0,0.0,0.0,0.0,...,2.719,2.723,2.719,0.007,0.0,0.0,0.0,0.0,0.0,0.0
2022-04-11,39497.0,42144.0,42418.0,39202.0,608380000.0,-0.0627,1948.2,1949.6,1974.6,1942.9,...,2.723,2.793,2.711,0.019,24.37,23.09,24.42,22.09,0.0,0.1517
2022-04-12,40078.0,39507.0,40678.0,39265.0,468270000.0,0.0147,1976.1,1957.4,1982.7,1953.0,...,2.803,2.836,2.674,-0.017,24.26,24.94,25.38,22.27,0.0,-0.0045
2022-04-13,41133.0,40072.0,41515.0,39581.0,406700000.0,0.0263,1984.7,1970.5,1985.8,1966.3,...,2.727,2.79,2.646,-0.009,21.82,23.52,24.45,21.37,0.0,-0.1006
2022-04-14,40481.0,41133.0,41494.0,40340.0,341380000.0,-0.0159,0.0,0.0,0.0,0.0,...,2.699,2.717,2.656,0.0024,21.75,21.82,21.87,21.44,0.0,-0.0032


In [13]:
df.to_csv('./data/ETF.csv')

In [14]:
len(df)

3654

In [15]:
df['target'] = df['Close_BTC'].pct_change()

In [16]:
df['target'].describe()

count    3653.000000
mean        0.004237
std         0.075543
min        -0.572084
25%        -0.013045
50%         0.001810
75%         0.019909
max         3.367452
Name: target, dtype: float64

In [17]:
df['target'] = np.where(df['target'] > 0 , 1, -1)
df['target'].value_counts()

 1    1923
-1    1731
Name: target, dtype: int64

In [18]:
df['target'] = df['target'].shift(-1)

In [19]:
df = df.dropna()
len(df)

3653

In [20]:
df['target'] = df['target'].astype(np.int64)

In [21]:
y_var = df['target']
x_var = df.drop(['target', 'Volume_VIX'], axis=1) 
x_var.info()

<class 'pandas.core.frame.DataFrame'>
DatetimeIndex: 3653 entries, 2012-04-13 to 2022-04-13
Data columns (total 28 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   Close_BTC      3653 non-null   float64
 1   Open_BTC       3653 non-null   float64
 2   High_BTC       3653 non-null   float64
 3   Low_BTC        3653 non-null   float64
 4   Volume_BTC     3653 non-null   float64
 5   Change_BTC     3653 non-null   float64
 6   Close_GLD      3653 non-null   float64
 7   Open_GLD       3653 non-null   float64
 8   High_GLD       3653 non-null   float64
 9   Low_GLD        3653 non-null   float64
 10  Volume_GLD     3653 non-null   float64
 11  Change_GLD     3653 non-null   float64
 12  Close_NDAQ     3653 non-null   float64
 13  Open_NDAQ      3653 non-null   float64
 14  High_NDAQ      3653 non-null   float64
 15  Low_NDAQ       3653 non-null   float64
 16  Volume_NDAQ    3653 non-null   float64
 17  Change_NDAQ    3653 non-null   flo

In [22]:
up=df[df['target']==1].target.count()
total=df.target.count()
print('up/down ratio: {0:.2f}'.format((up/total)))

up/down ratio: 0.53


In [23]:
X_train, X_test, y_train, y_test = train_test_split(x_var, 
                                                    y_var, 
                                                    test_size=0.3, 
                                                    shuffle=False, 
                                                    random_state=3)

train_count = y_train.count()
test_count = y_test.count()

print('train set label ratio')
print(y_train.value_counts()/train_count)
print('test set label ratio')
print(y_test.value_counts()/test_count)

train set label ratio
 1    0.527962
-1    0.472038
Name: target, dtype: float64
test set label ratio
 1    0.52281
-1    0.47719
Name: target, dtype: float64


In [24]:
def get_confusion_matrix(y_test, pred):
    confusion = confusion_matrix(y_test, pred)
    accuracy = accuracy_score(y_test, pred)
    precision = precision_score(y_test, pred)
    recall = recall_score(y_test, pred)
    f1 = f1_score(y_test, pred)
    roc_score = roc_auc_score(y_test, pred)
    print('confusion matrix')
    print('accuracy:{0:.4f},precision:{1:.4f},recall:{2:.4f},F1:{3:.4f},ROC AUC score:{4:.4f}'.format(accuracy, precision, recall, f1,roc_score))

In [77]:
n_estimators = range(10,400,10)

xg_parameters ={'max_depth' : [3,4,5,6],
                'objective':['binary:logistic'],
                'n_estimators': n_estimators, 
                'learning_rate':[0.01, 0.1], 
                'gamma': [0.5, 1, 2], 
                'random_state':[3],
               'early_stopping_rounds':[30],
               'eval_metric' : ['logloss'], 
                'eval_set' : [[X_test, y_test]],
               'scoring':['f1']}

# xgb_cv = TimeSeriesSplit(n_splits=5).split(X_train)
xgb_cv=KFold(n_splits=5)

In [78]:
# xgb_dis = XGBClassifier(eval_metric='error', n_estimators=400, learning_rate=0.1, max_depth=3, random_state=3, early_stopping_rounds=30)
xgb_model = XGBClassifier()

xgb_clf = GridSearchCV(xgb_model, xg_parameters, n_jobs=-1, 
                       cv=xgb_cv
#                    cv=xgb_cv, n_folds=5, shuffle=True, 
#                    scoring='roc_auc',
                      )

# xgb_dis.fit(X_train, y_train)
# xgb_pred = xgb_dis.predict(X_test)

xgb_clf.fit(X_train, y_train)
xgb_pred = xgb_clf.predict(X_test)

Parameters: { "early_stopping_rounds", "eval_set", "scoring" } might not be used.

  This could be a false alarm, with some parameters getting used by language bindings but
  then being mistakenly passed down to XGBoost core, or some parameter actually being used
  but getting flagged wrongly here. Please open an issue if you find any such cases.




In [81]:
print("xgb best params : ", xgb_clf.best_params_)
print(get_confusion_matrix(y_test, xgb_pred))

xgb best params :  {'early_stopping_rounds': 30, 'eval_metric': 'logloss', 'eval_set': [            Close_BTC  Open_BTC  High_BTC  Low_BTC   Volume_BTC  Change_BTC  \
Date                                                                          
2019-04-14     5134.8    5052.0    5153.5   5010.7     566380.0      0.0164   
2019-04-15     5032.3    5135.1    5168.0   4964.2     688200.0     -0.0200   
2019-04-16     5180.9    5031.4    5198.4   5014.4     631050.0      0.0295   
2019-04-17     5208.3    5182.1    5230.9   5165.5     806520.0      0.0053   
2019-04-18     5264.7    5208.3    5293.1   5205.0     727970.0      0.0108   
...               ...       ...       ...      ...          ...         ...   
2022-04-09    42767.0   42275.0   42809.0  42129.0  165160000.0      0.0116   
2022-04-10    42138.0   42760.0   43421.0  41884.0  255830000.0     -0.0147   
2022-04-11    39497.0   42144.0   42418.0  39202.0  608380000.0     -0.0627   
2022-04-12    40078.0   39507.0   40678.0  

In [82]:
xgb_clf.score(X_train, y_train)

0.6159561986703168

In [57]:
xgb_dis = XGBClassifier(eval_metric='error', n_estimators=400, learning_rate=0.025, max_depth=3, random_state=3, early_stopping_rounds=30)
xgb_dis.fit(X_train, y_train)
xgb_pred = xgb_dis.predict(X_test)

Parameters: { "early_stopping_rounds" } might not be used.

  This could be a false alarm, with some parameters getting used by language bindings but
  then being mistakenly passed down to XGBoost core, or some parameter actually being used
  but getting flagged wrongly here. Please open an issue if you find any such cases.




In [58]:
xgb_dis

XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
              colsample_bynode=1, colsample_bytree=1, early_stopping_rounds=30,
              enable_categorical=False, eval_metric='error', gamma=0, gpu_id=-1,
              importance_type=None, interaction_constraints='',
              learning_rate=0.025, max_delta_step=0, max_depth=3,
              min_child_weight=1, missing=nan, monotone_constraints='()',
              n_estimators=400, n_jobs=12, num_parallel_tree=1,
              predictor='auto', random_state=3, reg_alpha=0, reg_lambda=1,
              scale_pos_weight=1, subsample=1, tree_method='exact',
              validate_parameters=1, verbosity=None)

In [59]:
print(xgb_dis.score(X_train, y_train))

0.7305436057880329


In [60]:
get_confusion_matrix(y_test, xgb_pred)

confusion matrix
accuracy:0.5411,precision:0.5608,recall:0.5637,F1:0.5622,ROC AUC score:0.5400


In [28]:
n_estimators = range(10,200,10)

params = {
    'bootstrap': [True],
    'n_estimators':n_estimators,
    'max_depth':[4,6,8,10,12],
    'min_samples_leaf': [2, 3, 4, 5],
    'min_samples_split': [2, 4, 6, 8, 10],
    'max_features':[4]
}

In [29]:
my_cv = TimeSeriesSplit(n_splits=5).split(X_train)

In [30]:
my_cv

<generator object TimeSeriesSplit.split at 0x0000013CF8A71C80>

In [31]:
clf = GridSearchCV(RandomForestClassifier(), params, cv=my_cv, n_jobs=-1)

In [32]:
clf

GridSearchCV(cv=<generator object TimeSeriesSplit.split at 0x0000013CF8A71C80>,
             estimator=RandomForestClassifier(), n_jobs=-1,
             param_grid={'bootstrap': [True], 'max_depth': [4, 6, 8, 10, 12],
                         'max_features': [4], 'min_samples_leaf': [2, 3, 4, 5],
                         'min_samples_split': [2, 4, 6, 8, 10],
                         'n_estimators': range(10, 200, 10)})

In [33]:
import time

In [34]:
start_time = time.time()
clf.fit(X_train, y_train)
print(time.time() - start_time," seconds consumed")

242.0614469051361  seconds consumed


In [35]:
## Random Forest에서 best param
print('best parameter:\n', clf.best_params_)
print('best prediction:{0:.4f}'.format(clf.best_score_))

best parameter:
 {'bootstrap': True, 'max_depth': 4, 'max_features': 4, 'min_samples_leaf': 5, 'min_samples_split': 2, 'n_estimators': 60}
best prediction:0.5366


In [36]:
lgb = LGBMClassifier(n_estimators=400, learning_rate=0.1, max_depth=3, random_state=3)
lgb.fit(X_train, y_train)
lgb_pred = lgb.predict(X_test)

In [37]:
pred_con = clf.predict(X_test)
accuracy_con = accuracy_score(y_test, pred_con)
print('RF accuracy:{0:.4f}'.format(accuracy_con))
get_confusion_matrix(y_test, pred_con)

print("xgb score :", xgb_dis.score(X_train, y_train))
get_confusion_matrix(y_test, xgb_pred)
http://localhost:8889/notebooks/Trader/Trader.ipynb#
print("lgb score :", lgb.score(X_train, y_train))
get_confusion_matrix(y_test, lgb_pred)

RF accuracy:0.5155
confusion matrix
accuracy:0.5155,precision:0.5362,recall:0.5428,F1:0.5395,ROC AUC score:0.5142
xgb score : 0.879546343371138
confusion matrix
accuracy:0.5401,precision:0.5891,recall:0.3979,F1:0.4750,ROC AUC score:0.5469
lgb score : 0.8596010950332421
confusion matrix
accuracy:0.5246,precision:0.5463,recall:0.5358,F1:0.5410,ROC AUC score:0.5241


In [38]:
cols = {'close':'Close', 'open':'Open', 'high':'High', 'low':'Low', 'volume':'Volume'}
ETH = pd.read_csv('./data/ETHUSDT1d.csv', parse_dates=['time'], index_col = 'time')
ETH = ETH.rename_axis('Date')
ETH = ETH.iloc[:, 1:6]
ETH = ETH[['close', 'open', 'high', 'low', 'volume']]
ETH.rename(columns=cols, inplace=True)

In [39]:
test = ETH.merge(GOLD, on='Date', how='left', suffixes=('_ETH', '_GLD'))
list_df = ['NDAQ', 'US10YT', 'VIX']
for x in list_df:
    test = test.merge(globals()[x].add_suffix('_'+str(x)),
                 on='Date',
                 how='left')

In [40]:
# test.fillna(method='ffill', inplace=True)
# test.fillna(method='bfill', inplace=True)
test.fillna(0)
test.drop(['Volume_VIX'], axis=1, inplace=True)

In [41]:
pred_eth = clf.predict(test)

ValueError: Input contains NaN, infinity or a value too large for dtype('float32').

In [None]:
ETH['target'] = ETH['Close'].pct_change()
ETH['target'] = np.where(ETH['target'] > 0 , 1, -1)
ETH['target'] = ETH['target'].shift(-1)
ETH.fillna(method='ffill', inplace=True)

In [None]:
print(pred_eth[0], ETH.iloc[0]['target'])

In [None]:
good = 0
for i in range(len(ETH['target'])):
    if pred_eth[i] == int(ETH.iloc[i]['target']):
        good += 1
    else:
        continue

In [None]:
print("예측율 : ", good/len(ETH))