In [5]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import KFold, cross_val_score
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.metrics import make_scorer
import h2o
from h2o.automl import H2OAutoML
# from torch.nn import MSELoss
import math
import pickle


In [2]:
# Load './BTC_features.csv' into `df`; the path is relative to the notebook's
# working directory.
df = pd.read_csv('./BTC_features.csv')

In [39]:
# RMSPE (root mean squared percentage error) metric
def rmspe(y_true, y_pred):
    """Return the root mean squared percentage error of ``y_pred`` vs ``y_true``.

    Computes ``sqrt(mean(((y_true - y_pred) / y_true) ** 2))``.

    Both arguments are converted to float NumPy arrays first, so plain lists
    and tuples are accepted and pandas Series are compared element-by-element
    by position instead of being aligned on their index.

    Parameters
    ----------
    y_true : array-like
        Ground-truth values. A zero entry is not guarded against and yields
        ``inf``/``nan`` in the result (division by zero).
    y_pred : array-like
        Predicted values, broadcastable against ``y_true``.

    Returns
    -------
    float
        The RMSPE as a fraction (0.1 means 10%).
    """
    y_true = np.asarray(y_true, dtype=float)
    y_pred = np.asarray(y_pred, dtype=float)
    return np.sqrt(np.mean(np.square((y_true - y_pred) / y_true)))

# Model selection using AutoML

In [30]:
# Parse `time_id` into datetimes BEFORE taking the slice below. Assigning
# df['time_id'] afterwards replaces the column in `df` only, so a slice taken
# earlier would still hold the unparsed values and `.dt` would fail on it.
df['time_id'] = pd.to_datetime(df['time_id'])

# Keep every column up to and including 'trade.tau'. The explicit .copy()
# makes `df_ml` independent of `df`, so adding or modifying columns on it is
# unambiguous and does not trigger SettingWithCopyWarning.
df_ml = df.loc[:, :'trade.tau'].copy()

def categorize_hour(hour):
    """Map an hour value to one of four 6-hour buckets, returned as a float.

    0 <= hour < 6   -> 1.0
    6 <= hour < 12  -> 2.0
    12 <= hour < 18 -> 3.0
    anything else   -> 4.0 (including values below 0 or at/above 18)
    """
    # Everything outside [0, 18) falls into the catch-all fourth bucket.
    if not 0 <= hour < 18:
        return 4.0
    # Inside [0, 18) the bucket index is the number of whole 6-hour blocks.
    return float(hour // 6 + 1)

# Bucket the hour of `time_id` into the four categories from categorize_hour.
# assign() returns a new frame rather than writing a column into `df_ml`
# in place, so this is safe even if `df_ml` is still a slice of `df`.
df_ml = df_ml.assign(time_category=df_ml['time_id'].dt.hour.apply(categorize_hour))

# One-hot encoding (currently disabled)
# df_encoded = pd.get_dummies(df_ml['time_category'], prefix='time')

# Drop the timestamp and window-boundary columns.
df_ml = df_ml.drop(['time_id', 'window_start', 'window_end','window_end_150_ticker', 'window_end_300_ticker',
             'window_end_450_ticker', 'window_end_150_orderbook', 'window_end_300_orderbook', 'window_end_450_orderbook'], axis=1)
# X = pd.merge(X ,df_encoded, left_index=True, how = 'left')


# y = df_ml['dv1_realized_volatility']

# Turn +/-inf into NaN so the mean imputation below also covers those cells.
# A single whole-frame replace does the same job as detecting the affected
# numeric columns and looping over them.
df_ml = df_ml.replace([np.inf, -np.inf], np.nan)

# Impute missing values with the per-column mean. numeric_only=True keeps
# this working when a non-numeric column is present (pandas 2.x raises
# otherwise). Reassigning avoids inplace mutation.
# NOTE(review): the means are computed over all rows, i.e. before the
# train/test split, so test-period statistics leak into the training data.
df_ml = df_ml.fillna(df_ml.mean(numeric_only=True))

In [11]:
# X = pd.merge(X, df_ml['time_category'], left_index=True, right_index=True, how='left')

In [31]:
# Target column, and the predictor columns (every column except the target).
y_col = 'dv1_realized_volatility'
X_col = df_ml.columns.drop(y_col)


# Unshuffled 80/20 hold-out: with shuffle=False the row order is kept, so the
# final 20% of rows become the test set (random_state has no effect here).
train, test = train_test_split(df_ml, shuffle=False, test_size=0.2, random_state=42)

In [34]:
# Connect to the H2O cluster; the recorded run attached to an instance
# already running at http://localhost:54321.
h2o.init()

Checking whether there is an H2O instance running at http://localhost:54321. connected.


0,1
H2O_cluster_uptime:,11 mins 27 secs
H2O_cluster_timezone:,Asia/Seoul
H2O_data_parsing_timezone:,UTC
H2O_cluster_version:,3.42.0.2
H2O_cluster_version_age:,23 days
H2O_cluster_name:,H2O_from_python_seonukim_3flzit
H2O_cluster_total_nodes:,1
H2O_cluster_free_memory:,1.318 Gb
H2O_cluster_total_cores:,8
H2O_cluster_allowed_cores:,8


In [32]:
# Upload both pandas splits to the H2O cluster (training split first) and
# rebind `train` / `test` to the resulting H2OFrames.
train, test = [h2o.H2OFrame(split) for split in (train, test)]

Parse progress: |████████████████████████████████████████████████████████████████| (done) 100%
Parse progress: |████████████████████████████████████████████████████████████████| (done) 100%


In [36]:
# Run H2O AutoML with a fixed seed under a named project.
aml = H2OAutoML(seed=1, project_name = 'timeseries_forcasting')
# Use `X_col` (all columns except the target) as predictors. `X` only exists
# in commented-out code, so `list(X)` raises NameError on a fresh kernel.
aml.train(x=list(X_col), y=y_col, training_frame=train)

AutoML progress: |███████████████████████████████████████████████████████████████| (done) 100%


key,value
Stacking strategy,cross_validation
Number of base models (used / total),6/6
# GBM base models (used / total),1/1
# XGBoost base models (used / total),1/1
# DRF base models (used / total),2/2
# GLM base models (used / total),1/1
# DeepLearning base models (used / total),1/1
Metalearner algorithm,GBM
Metalearner fold assignment scheme,Random
Metalearner nfolds,5

Unnamed: 0,mean,sd,cv_1_valid,cv_2_valid,cv_3_valid,cv_4_valid,cv_5_valid
mae,4.86e-05,5e-07,4.81e-05,4.85e-05,4.93e-05,4.82e-05,4.87e-05
mean_residual_deviance,0.0,0.0,0.0,0.0,0.0,0.0,0.0
mse,0.0,0.0,0.0,0.0,0.0,0.0,0.0
r2,0.4297276,0.0059213,0.4244013,0.4367357,0.4229307,0.4336051,0.430965
residual_deviance,0.0,0.0,0.0,0.0,0.0,0.0,0.0
rmse,6.27e-05,7e-07,6.21e-05,6.26e-05,6.38e-05,6.23e-05,6.29e-05
rmsle,6.27e-05,7e-07,6.21e-05,6.26e-05,6.38e-05,6.23e-05,6.29e-05


In [40]:
# Score the hold-out frame with the best model on the AutoML leaderboard.
predictions = aml.leader.predict(test)

# Bring the target column and the predictions back from H2O as flat
# NumPy arrays so they can be compared element-wise.
y_true = np.ravel(test[y_col].as_data_frame().to_numpy())
y_pred = np.ravel(predictions.as_data_frame().to_numpy())

# Compute and report RMSPE on the hold-out set
score = rmspe(y_true, y_pred)
print("RMSPE:", score)

stackedensemble prediction progress: |███████████████████████████████████████████| (done) 100%
RMSPE: 0.49925616872828554


# updated features

In [19]:
# Load the updated feature table from a parquet file.
# NOTE(review): this is an absolute, user-specific path, so the cell only
# runs on the author's machine; consider a configurable data directory.
df_2 = pd.read_parquet('/Users/seonukim/Documents/TMP/Data/my.parquet')
# Preview the first rows.
df_2.head()

Unnamed: 0,window_start,window_end,realized_volatility,num_trades,lowest_return,highest_return,high_low_gap,trade_vol,volume_power,time_id,...,dv1_realized_volatility_nn64_tvpl5_m_p2_mean,dv1_realized_volatility_nn64_tvpl10_m_p2_mean,dv1_realized_volatility_nn64_tvpl10two_c_mean,dv1_realized_volatility_nn64_two_m_mean,dv1_realized_volatility_nn64_sev_high_nn_m_mean,dv1_realized_volatility_nn64_sev_low_nn_m_mean,dv1_realized_volatility_nn64_sev_high_abs_nn_m_mean,dv1_realized_volatility_nn64_sev_low_abs_nn_m_mean,dv1_realized_volatility_nn64_all_nn_m_p1_mean,dv1_realized_volatility_nn64_all_nn_m_p2_mean
0,2022-12-16 21:06,2022-12-16 21:16,0.000148,2.595496,-0.000842,4.4e-05,0.000886,1.19205,-1.524966201,2022-12-16 21:06,...,0.000164,0.000179,0.000147,0.000148,0.000143,0.000184,0.000165,0.000159,0.000129,0.000133
1,2022-12-16 21:07,2022-12-16 21:17,0.000139,2.599883,-0.000842,-0.000133,0.000709,1.156965,-1.861047686,2022-12-16 21:07,...,0.000158,0.000187,0.000163,0.000161,0.000154,0.000156,0.000143,0.000167,0.000137,0.000131
2,2022-12-16 21:08,2022-12-16 21:18,0.000141,2.571709,-0.000177,0.000443,0.000621,0.996534,-2.354036803,2022-12-16 21:08,...,0.000183,0.000195,0.000167,0.000166,0.000136,0.000154,0.000147,0.000155,0.000138,0.00014
3,2022-12-16 21:09,2022-12-16 21:19,0.000147,2.542825,-8.9e-05,0.000488,0.000576,0.909188,-1.654264897,2022-12-16 21:09,...,0.000214,0.000194,0.000153,0.000149,0.000139,0.000144,0.000151,0.000177,0.000146,0.000153
4,2022-12-16 21:10,2022-12-16 21:20,0.000137,2.552668,-8.9e-05,0.000488,0.000576,0.918021,-1.820730623,2022-12-16 21:10,...,0.000184,0.000186,0.000148,0.000144,0.00013,0.000124,0.000102,0.000169,9.7e-05,0.0001


In [8]:
# Show the (rows, columns) shape of the updated feature table.
df_2.shape

(102335, 5029)

In [44]:
# selected_features= df_2.columns.to_list()

# # 리스트 변수를 pkl 파일로 저장
# with open('import.pkl', 'wb') as f:
#     pickle.dump(selected_features, f)

# Restore the saved feature names from the pickle file.
# NOTE(security): pickle.load can execute arbitrary code; only load
# 'import.pkl' if it was produced locally or comes from a trusted source.
with open('import.pkl', 'rb') as f:
    selected_features = pickle.load(f)

# Keep only the first 512 entries. list(...) works whether the pickled object
# is a plain Python list or a pandas Index/Series; calling `.to_list()` on the
# slice fails for a plain list, which is what the commented-out dump code
# above would have written.
selected_features = list(selected_features[:512])



In [45]:
# Add the target column so it is selected together with the features.
# Guarded so that re-running this cell (or a target already present in the
# list) does not produce a duplicate column name.
if 'dv1_realized_volatility' not in selected_features:
    selected_features.append('dv1_realized_volatility')


In [46]:
# Select the chosen columns as an explicit, independent copy so that later
# cleaning of `df_22` does not raise SettingWithCopyWarning.
df_22 = df_2[selected_features].copy()

In [25]:
# Connect to the H2O cluster (attaches to an already running instance).
h2o.init()
# The recorded run attached to a long-lived cluster reporting 0 free memory,
# and the subsequent frame upload failed with "Java heap space". Drop every
# object left on the cluster before uploading new frames.
# NOTE(review): this deletes ALL frames and models on the cluster, including
# any earlier AutoML results; remove this line if they must be kept, and
# restart the cluster with a larger heap instead.
h2o.remove_all()

Checking whether there is an H2O instance running at http://localhost:54321. connected.


0,1
H2O_cluster_uptime:,1 day 7 hours 7 mins
H2O_cluster_timezone:,Asia/Seoul
H2O_data_parsing_timezone:,UTC
H2O_cluster_version:,3.42.0.2
H2O_cluster_version_age:,26 days
H2O_cluster_name:,H2O_from_python_seonukim_nrf229
H2O_cluster_total_nodes:,1
H2O_cluster_free_memory:,0
H2O_cluster_total_cores:,8
H2O_cluster_allowed_cores:,8


In [47]:
# df_22 = df_2.drop(['time_id', 'window_start', 'window_end'], axis=1)

# Turn +/-inf into NaN so the mean imputation below also covers those cells.
# Reassigning the result (instead of writing into columns of `df_22`) avoids
# SettingWithCopyWarning when `df_22` was selected from another frame.
df_22 = df_22.replace([np.inf, -np.inf], np.nan)

# Impute missing values with the per-column mean. numeric_only=True keeps
# this working when a non-numeric column is present (pandas 2.x raises
# otherwise).
# NOTE(review): the means are computed over all rows, i.e. before the
# train/test split, so test-set statistics leak into the training data.
df_22 = df_22.fillna(df_22.mean(numeric_only=True))

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_22.fillna(df_22.mean(), inplace=True)


In [7]:
# half_length = len(df_22) // 2
# df_222 = df_22.iloc[:half_length]

In [49]:
# Predictor columns (everything except the target) and the target column.
X_col = df_22.drop('dv1_realized_volatility', axis=1).columns
y_col = 'dv1_realized_volatility'


# Unshuffled 80/20 hold-out, matching the split used for the first feature
# set. The rows are 10-minute windows that start one minute apart, so
# neighbouring rows overlap heavily; shuffle=True would scatter
# near-duplicate windows across train and test and inflate the score.
# NOTE(review): assumes `df_22` rows are in time order - confirm.
train, test = train_test_split(df_22, test_size=0.2, random_state=42, shuffle=False)

In [50]:
# Upload both pandas splits to the H2O cluster (training split first) and
# rebind `train` / `test` to the resulting H2OFrames.
# NOTE(review): the recorded run of this cell failed with an HTTP 500 caused
# by java.lang.OutOfMemoryError (Java heap space) while posting the file.
train, test = [h2o.H2OFrame(split) for split in (train, test)]

H2OServerError: HTTP 500 Server Error:
<html>
<head>
<meta http-equiv="Content-Type" content="text/html;charset=utf-8"/>
<title>Error 500 Server Error</title>
</head>
<body><h2>HTTP ERROR 500</h2>
<p>Problem accessing /3/PostFile. Reason:
<pre>    Server Error</pre></p><h3>Caused by:</h3><pre>java.lang.OutOfMemoryError: Java heap space
</pre>

</body>
</html>
