In [7]:
# import packages
import numpy as np
import pandas as pd
from tools import FeatureToolkit

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler
from sklearn.linear_model import LinearRegression, Ridge
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error

In [8]:
# load and prepare data
bnb = pd.read_csv('bnb.csv') # load data
bnb = bnb.sample(n=10000, replace=False, random_state=1) # 10000 samples
tool = FeatureToolkit()
bnb = tool.build_technical_indicators(bnb) # construct technical indicators
bnb.drop(columns=['Asset_ID', 'KAMA', 'PSAR+', 'PSAR-'], inplace=True) # drop columns with too many NAs
bnb.dropna(axis=0, inplace=True) # drop rows contain missing
r = bnb.index[np.isinf(bnb).any(1)]
bnb.drop([i for i in r], axis=0, inplace=True) # drop rows contain infinity

  dip[idx] = 100 * (self._dip[idx] / value)
  din[idx] = 100 * (self._din[idx] / value)


In [9]:
# set parameters
split_ratio = 0.2
scaler = MinMaxScaler()
x = bnb.drop(columns=['Target', 'Open', 'High', 'Low'])
y = bnb['Target']
clf = RandomForestRegressor()

In [10]:
# train
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=split_ratio, random_state=42)
x_train = scaler.fit_transform(x_train)
x_test = scaler.transform(x_test)
clf.fit(x_train, y_train)
y_pred = clf.predict(x_test)
print('Mean Squared Error:', mean_squared_error(y_test, y_pred))

Mean Squared Error: 1.7128461943223378e-05


In [11]:
# check feature importance
clf.feature_importances_

array([0.02547189, 0.02368129, 0.00825139, 0.03930373, 0.00938635,
       0.02491112, 0.0446105 , 0.01424301, 0.01709376, 0.01793543,
       0.02617453, 0.01430969, 0.00656022, 0.00766406, 0.02314102,
       0.01859061, 0.01916649, 0.00654783, 0.00893876, 0.01541647,
       0.01516959, 0.01472862, 0.01467353, 0.0076638 , 0.01549659,
       0.00737986, 0.01335427, 0.01967774, 0.01025579, 0.01930616,
       0.0179603 , 0.00044521, 0.0146985 , 0.03396438, 0.0107379 ,
       0.007751  , 0.01211679, 0.00367019, 0.01009773, 0.01438371,
       0.00786446, 0.01826602, 0.00835404, 0.01020118, 0.0139534 ,
       0.00451672, 0.00460957, 0.00399049, 0.01277358, 0.01456756,
       0.01713835, 0.01427951, 0.00631402, 0.00884661, 0.00719349,
       0.00803035, 0.01223214, 0.01841371, 0.00936148, 0.01049745,
       0.01769074, 0.01646166, 0.01690553, 0.00815518, 0.02052722,
       0.01570222, 0.0201972 , 0.00962301, 0.00779213, 0.01006023,
       0.01055092])

In [31]:
threshold = 0.01
not_important = {}
important_features = {}
feature_name = [item for item in x.columns]
feature_importance = clf.feature_importances_

In [32]:
for idx, val in enumerate(fi):
    if val > threshold:
        important_features[feature_name[idx]] = round(val, 4)
    else:
        not_important[feature_name[idx]] = round(val, 4)
        
print(important_features)
print()
print(not_important)

{'timestamp': 0.0255, 'Count': 0.0237, 'Volume': 0.0393, 'open_sub_close': 0.0249, 'high_div_low': 0.0446, 'ma8_vol': 0.0142, 'ma21_vol': 0.0171, 'ma50_vol': 0.0179, 'ma200_vol': 0.0262, 'AO': 0.0143, 'PVO': 0.0231, 'PVO_signal': 0.0186, 'ROC': 0.0192, 'RSI_stoch_d': 0.0154, 'RSI_stoch_k': 0.0152, 'stoch': 0.0147, 'stoch_signal': 0.0147, 'ult': 0.0155, 'ADI': 0.0134, 'CMF': 0.0197, 'EoM': 0.0103, 'EoM_signal': 0.0193, 'MFI': 0.018, 'OBV': 0.0147, 'VPT': 0.034, 'ATR': 0.0107, 'BOLL-': 0.0121, 'BOLL_percent': 0.0101, 'BOLL_width': 0.0144, 'DC-': 0.0183, 'DC_percent': 0.0102, 'DC_width': 0.014, 'KC_percent': 0.0128, 'KC_width': 0.0146, 'Ulcer': 0.0171, 'ADX': 0.0143, 'ema50_price': 0.0122, 'ema200_price': 0.0184, 'MACD_signal': 0.0105, 'MI': 0.0177, 'PSAR': 0.0165, 'STC': 0.0169, 'VI': 0.0205, 'VI+': 0.0157, 'VI-': 0.0202, 'DLR': 0.0101, 'DR': 0.0106}

{'Close': 0.0083, 'VWAP': 0.0094, 'PPO': 0.0066, 'PPO_signal': 0.0077, 'RSI': 0.0065, 'RSI_stoch': 0.0089, 'TSI': 0.0077, 'WRI': 0.0074, '