# OHLC - Analytics

### Import Libraries

In [21]:
import numpy as np
import pandas as pd
import numpy as np
import pandas_ta as ta
import seaborn as sns
import math

import matplotlib.pyplot as plt
# Figure defaults for the whole notebook: 120 DPI and constrained layout enabled.
plt.rcParams['figure.dpi'] = 120
plt.rcParams['figure.constrained_layout.use'] = True
import warnings
# NOTE(review): this silences every warning for the rest of the session, which
# also hides pandas deprecation / chained-assignment warnings raised by later
# cells — consider narrowing the filter.
warnings.filterwarnings('ignore')

### Load Price Data

In [22]:
import os
from pathlib import Path

# Look for a local checkout of the data repository two directories above the
# current working directory; fall back to the raw GitHub URL when it is absent.
notebook_path = os.getcwd()
algo_dir = Path(notebook_path).parent.parent
csv_file = str(algo_dir) + '/vn-stock-data/VN30ps/VN30F1M_5minutes.csv'
is_file = os.path.isfile(csv_file)

if not is_file:
    print('remote')
    csv_source = "https://raw.githubusercontent.com/zuongthaotn/vn-stock-data/main/VN30ps/VN30F1M_5minutes.csv"
else:
    csv_source = csv_file

# Same parsing options for both sources: 'Date' becomes a parsed datetime index.
dataset = pd.read_csv(csv_source, index_col='Date', parse_dates=True)

In [23]:
# Work on a copy: helper columns are added to `data_tmp` below, not to `dataset`.
data_tmp = dataset.copy()
def bar9h(tick):
    """Return the value of the 09:00 bar from a datetime-indexed Series.

    Parameters
    ----------
    tick : pandas.Series
        Values indexed by a ``DatetimeIndex`` (used below as a per-day
        aggregator for ``resample('D').agg``).

    Returns
    -------
    scalar or None
        The first value whose timestamp has hour 9 and minute 0, or ``None``
        when the Series contains no such bar.
    """
    at_nine = tick[(tick.index.hour == 9) & (tick.index.minute == 0)]
    if len(at_nine) == 0:
        return None
    # Positional access must be explicit: `series[0]` on a non-integer index
    # relies on a deprecated fallback of Series.__getitem__.
    return at_nine.iloc[0]
def bar14h(tick):
    """Return the value of the 14:00 bar from a datetime-indexed Series.

    Parameters
    ----------
    tick : pandas.Series
        Values indexed by a ``DatetimeIndex`` (used below as a per-day
        aggregator for ``resample('D').agg``).

    Returns
    -------
    scalar or None
        The first value whose timestamp has hour 14 and minute 0, or ``None``
        when the Series contains no such bar.
    """
    at_fourteen = tick[(tick.index.hour == 14) & (tick.index.minute == 0)]
    if len(at_fourteen) == 0:
        return None
    # Positional access must be explicit: `series[0]` on a non-integer index
    # relies on a deprecated fallback of Series.__getitem__.
    return at_fourteen.iloc[0]
# Duplicate Close twice so the 09:00 and 14:00 bars can each get their own
# aggregator in the daily resample.
data_tmp['Close_9h'] = data_tmp['Close']
data_tmp['Close_14h'] = data_tmp['Close']

daily_aggregations = {
    'Open': 'first',
    'Close': 'last',
    'High': 'max',
    'Low': 'min',
    'Close_9h': bar9h,
    'Close_14h': bar14h,
}
day_data = data_tmp.resample('D').agg(daily_aggregations)

# Calendar days without any bar have no Close; remove them before shifting so
# that shift(1) refers to the previous row that actually has data.
day_data = day_data.dropna(subset=['Close'])

# Previous-row (previous available day) values, keyed by the new column name.
shifted_sources = {
    'Close_shift': 'Close',
    'Close_14h_shift': 'Close_14h',
    'Day_High_shift': 'High',
    'Day_Low_shift': 'Low',
}
for shifted_name, source_name in shifted_sources.items():
    day_data[shifted_name] = day_data[source_name].shift(1)

kept_columns = ['Close', 'High', 'Low', 'Close_shift', 'Close_14h_shift', 'Day_High_shift', 'Day_Low_shift']
day_data = day_data[kept_columns].rename(
    columns={'Close': 'Day_Close', 'High': 'Day_High', 'Low': 'Day_Low'}
)

In [24]:
# Show the daily frame (the rich display truncates it to the first and last rows).
day_data

Unnamed: 0_level_0,Day_Close,Day_High,Day_Low,Close_shift,Close_14h_shift,Day_High_shift,Day_Low_shift
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
2018-08-13,954.2,954.6,942.3,,,,
2018-08-14,959.3,961.8,952.1,954.2,949.5,954.6,942.3
2018-08-15,946.6,962.3,946.6,959.3,960.8,961.8,952.1
2018-08-16,947.0,947.3,935.5,946.6,957.7,962.3,946.6
2018-08-17,947.0,954.6,945.0,947.0,944.0,947.3,935.5
...,...,...,...,...,...,...,...
2024-07-29,1287.8,1294.7,1287.8,1285.0,1278.4,1285.0,1276.8
2024-07-30,1292.4,1292.4,1284.2,1287.8,1292.9,1294.7,1287.8
2024-07-31,1304.3,1304.3,1291.1,1292.4,1284.9,1292.4,1284.2
2024-08-01,1275.0,1306.2,1271.0,1304.3,1300.5,1304.3,1291.1


In [25]:
# Tag every intraday bar with the start-of-day timestamp of its calendar day,
# then use that tag to look up the matching row of `day_data`.
bar_day = pd.PeriodIndex(dataset.index, freq='1D').to_timestamp()
data = dataset.copy().assign(time_d=bar_day)
data = data.merge(day_data, left_on="time_d", right_index=True, how="left")

In [26]:
# Keep only bars that actually moved (High differs from Low).
# `.copy()` makes the filtered frame independent of the frame it was sliced
# from, so the column assignments in the following cells write to a real
# DataFrame instead of a slice (chained-assignment hazard).
data = data[data.High != data.Low].copy()
def get_percent(r):
    """Position of the bar's close inside its day's low-high range.

    `r` is a row-like object exposing 'Close', 'Day_Low' and 'Day_High'.
    Returns (Close - Day_Low) / (Day_High - Day_Low), i.e. 0 at the day's low
    and 1 at the day's high.
    """
    day_low = r['Day_Low']
    day_range = r['Day_High'] - day_low
    above_low = r['Close'] - day_low
    return above_low / day_range
def get_percent_group(r):
    """Bucket the row's 'percent' value into tenths, rounding up.

    Returns ceil(percent * 10) as an int, so 0 maps to 0 and any value in
    (0, 0.1] maps to 1, and so on.
    """
    tenths = r['percent'] * 10
    return math.ceil(tenths)
# Row-wise: first the position within the day's range, then its tenth-bucket
# (the second call reads the 'percent' column written by the first).
data['percent'] = data.apply(get_percent, axis=1)
data['percent_group'] = data.apply(get_percent_group, axis=1)

In [27]:
# Show the intraday frame with the daily columns, `percent` and `percent_group`.
data

Unnamed: 0_level_0,Open,High,Low,Close,Volume,time_d,Day_Close,Day_High,Day_Low,Close_shift,Close_14h_shift,Day_High_shift,Day_Low_shift,percent,percent_group
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1
2018-08-13 09:00:00,943.5,943.6,942.9,943.1,1812,2018-08-13,954.2,954.6,942.3,,,,,0.065041,1
2018-08-13 09:05:00,943.1,943.5,942.9,943.3,1323,2018-08-13,954.2,954.6,942.3,,,,,0.081301,1
2018-08-13 09:10:00,943.2,943.3,942.6,943.1,1207,2018-08-13,954.2,954.6,942.3,,,,,0.065041,1
2018-08-13 09:15:00,943.1,943.1,942.3,942.6,1196,2018-08-13,954.2,954.6,942.3,,,,,0.024390,1
2018-08-13 09:20:00,942.6,943.7,942.4,943.7,1765,2018-08-13,954.2,954.6,942.3,,,,,0.113821,2
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2024-08-02 14:05:00,1269.2,1273.4,1269.2,1270.8,22701,2024-08-02,1275.9,1277.5,1256.3,1275.0,1283.5,1306.2,1271.0,0.683962,7
2024-08-02 14:10:00,1270.9,1271.6,1268.9,1271.2,12412,2024-08-02,1275.9,1277.5,1256.3,1275.0,1283.5,1306.2,1271.0,0.702830,8
2024-08-02 14:15:00,1271.4,1276.8,1270.7,1274.7,14456,2024-08-02,1275.9,1277.5,1256.3,1275.0,1283.5,1306.2,1271.0,0.867925,9
2024-08-02 14:20:00,1274.6,1277.5,1274.5,1276.8,10914,2024-08-02,1275.9,1277.5,1256.3,1275.0,1283.5,1306.2,1271.0,0.966981,10


In [28]:
# Change of the bar's close versus the previous day's close, scaled by 1000.
previous_close = data['Close_shift']
data['last_close_rate'] = 1000 * (data['Close'] - previous_close) / previous_close

# Distance of the bar's close from the previous day's 14:00 close, measured in
# units of the previous day's high-low range.
previous_range = data['Day_High_shift'] - data['Day_Low_shift']
data['last_highlow_rate'] = (data['Close'] - data['Close_14h_shift']) / previous_range

In [29]:
# Discard every row that still has a missing value in any column
# (e.g. the first day, which has no previous-day values).
data = data.dropna()

In [30]:
# Single source of truth for the split date; rows at or after it are held out.
TEST_START = pd.Timestamp('2024-01-01 00:00:00')

# Hold out the most recent data for testing. Using `>=` here and `<` below makes
# the two sets complementary, so a bar stamped exactly at TEST_START is not
# silently lost from both.
data_test = data[data.index >= TEST_START].copy()

# Exclude this year's data from training to avoid overfitting.
data = data[data.index < TEST_START]

In [31]:
# data = data.drop(data[(data.percent > 0.2) & (data.percent < 0.8)].sample(frac=.75, random_state=1).index)

In [32]:
import xgboost as xg 
from sklearn.metrics import mean_squared_error as MSE

In [33]:
feature_cols = ["last_close_rate", "last_highlow_rate"]
target_cols = ['percent_group']

# Train on the pre-split frame, evaluate on the held-out frame; targets are kept
# as single-column DataFrames.
X_train, y_train = data[feature_cols], data[target_cols]
X_test, y_test = data_test[feature_cols], data_test[target_cols]

In [34]:
# Instantiation 
# Classifier hyper-parameters, kept together so they are easy to tune.
xgb_params = {"n_estimators": 100, "random_state": 42}
xgb_r = xg.XGBClassifier(**xgb_params)

# Fit on the training features; the target is the `percent_group` bucket.
xgb_r.fit(X_train, y_train)

In [35]:
# Predict the model 
# Predicted bucket for every held-out bar.
y_pred = xgb_r.predict(X_test)

# Root-mean-squared error between the true and predicted bucket numbers.
mse = MSE(y_test, y_pred)
rmse = np.sqrt(mse)
print("RMSE : % f" % rmse)

RMSE :  3.069908


In [36]:
# One-column frame of predictions carrying the index of the rows they were
# computed for.
X_result = pd.DataFrame({'Predicts': y_pred}, index=X_test.index)

# Attach the predictions by index. Dropping any existing 'Predicts' column first
# makes this cell safe to re-run: the previous merge-based version produced
# 'Predicts_x' / 'Predicts_y' on a second execution and could multiply rows if
# the index contained duplicates.
data_test = (
    data_test
    .drop(columns=['Predicts'], errors='ignore')
    .assign(Predicts=X_result['Predicts'])
    .dropna()
)

In [37]:
# Show the held-out frame, now including the `Predicts` column.
data_test

Unnamed: 0_level_0,Open,High,Low,Close,Volume,time_d,Day_Close,Day_High,Day_Low,Close_shift,Close_14h_shift,Day_High_shift,Day_Low_shift,percent,percent_group,last_close_rate,last_highlow_rate,Predicts
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1
2024-01-02 09:00:00,1138.5,1139.7,1138.5,1138.9,6206,2024-01-02,1133.5,1141.8,1131.0,1134.6,1131.9,1139.5,1130.9,0.731481,8,3.789882,0.813953,7
2024-01-02 09:05:00,1138.9,1139.0,1138.6,1139.0,1424,2024-01-02,1133.5,1141.8,1131.0,1134.6,1131.9,1139.5,1130.9,0.740741,8,3.878019,0.825581,7
2024-01-02 09:10:00,1139.0,1141.5,1138.9,1141.4,6265,2024-01-02,1133.5,1141.8,1131.0,1134.6,1131.9,1139.5,1130.9,0.962963,10,5.993302,1.104651,9
2024-01-02 09:15:00,1141.3,1141.8,1140.3,1140.8,3739,2024-01-02,1133.5,1141.8,1131.0,1134.6,1131.9,1139.5,1130.9,0.907407,10,5.464481,1.034884,9
2024-01-02 09:20:00,1140.7,1141.2,1140.2,1140.2,1943,2024-01-02,1133.5,1141.8,1131.0,1134.6,1131.9,1139.5,1130.9,0.851852,9,4.935660,0.965116,5
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2024-08-02 14:05:00,1269.2,1273.4,1269.2,1270.8,22701,2024-08-02,1275.9,1277.5,1256.3,1275.0,1283.5,1306.2,1271.0,0.683962,7,-3.294118,-0.360795,6
2024-08-02 14:10:00,1270.9,1271.6,1268.9,1271.2,12412,2024-08-02,1275.9,1277.5,1256.3,1275.0,1283.5,1306.2,1271.0,0.702830,8,-2.980392,-0.349432,7
2024-08-02 14:15:00,1271.4,1276.8,1270.7,1274.7,14456,2024-08-02,1275.9,1277.5,1256.3,1275.0,1283.5,1306.2,1271.0,0.867925,9,-0.235294,-0.250000,6
2024-08-02 14:20:00,1274.6,1277.5,1274.5,1276.8,10914,2024-08-02,1275.9,1277.5,1256.3,1275.0,1283.5,1306.2,1271.0,0.966981,10,1.411765,-0.190341,7


In [38]:
# Held-out bars whose actual bucket is 9 or 10.
data_test.loc[data_test['percent_group'] > 8]

Unnamed: 0_level_0,Open,High,Low,Close,Volume,time_d,Day_Close,Day_High,Day_Low,Close_shift,Close_14h_shift,Day_High_shift,Day_Low_shift,percent,percent_group,last_close_rate,last_highlow_rate,Predicts
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1
2024-01-02 09:10:00,1139.0,1141.5,1138.9,1141.4,6265,2024-01-02,1133.5,1141.8,1131.0,1134.6,1131.9,1139.5,1130.9,0.962963,10,5.993302,1.104651,9
2024-01-02 09:15:00,1141.3,1141.8,1140.3,1140.8,3739,2024-01-02,1133.5,1141.8,1131.0,1134.6,1131.9,1139.5,1130.9,0.907407,10,5.464481,1.034884,9
2024-01-02 09:20:00,1140.7,1141.2,1140.2,1140.2,1943,2024-01-02,1133.5,1141.8,1131.0,1134.6,1131.9,1139.5,1130.9,0.851852,9,4.935660,0.965116,5
2024-01-02 09:25:00,1140.2,1140.5,1139.6,1139.7,3265,2024-01-02,1133.5,1141.8,1131.0,1134.6,1131.9,1139.5,1130.9,0.805556,9,4.494976,0.906977,4
2024-01-02 09:30:00,1139.9,1140.0,1139.6,1139.9,2584,2024-01-02,1133.5,1141.8,1131.0,1134.6,1131.9,1139.5,1130.9,0.824074,9,4.671250,0.930233,5
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2024-08-01 11:10:00,1299.7,1300.2,1299.0,1299.6,1964,2024-08-01,1275.0,1306.2,1271.0,1304.3,1300.5,1304.3,1291.1,0.812500,9,-3.603465,-0.068182,6
2024-08-01 11:20:00,1298.8,1300.0,1298.2,1299.3,3764,2024-08-01,1275.0,1306.2,1271.0,1304.3,1300.5,1304.3,1291.1,0.803977,9,-3.833474,-0.090909,7
2024-08-02 14:15:00,1271.4,1276.8,1270.7,1274.7,14456,2024-08-02,1275.9,1277.5,1256.3,1275.0,1283.5,1306.2,1271.0,0.867925,9,-0.235294,-0.250000,6
2024-08-02 14:20:00,1274.6,1277.5,1274.5,1276.8,10914,2024-08-02,1275.9,1277.5,1256.3,1275.0,1283.5,1306.2,1271.0,0.966981,10,1.411765,-0.190341,7


In [41]:
# Held-out bars where both the actual and the predicted bucket are 9 or 10.
actual_top = data_test['percent_group'] > 8
predicted_top = data_test['Predicts'] > 8
data_test[actual_top & predicted_top]

Unnamed: 0_level_0,Open,High,Low,Close,Volume,time_d,Day_Close,Day_High,Day_Low,Close_shift,Close_14h_shift,Day_High_shift,Day_Low_shift,percent,percent_group,last_close_rate,last_highlow_rate,Predicts
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1
2024-01-02 09:10:00,1139.0,1141.5,1138.9,1141.4,6265,2024-01-02,1133.5,1141.8,1131.0,1134.6,1131.9,1139.5,1130.9,0.962963,10,5.993302,1.104651,9
2024-01-02 09:15:00,1141.3,1141.8,1140.3,1140.8,3739,2024-01-02,1133.5,1141.8,1131.0,1134.6,1131.9,1139.5,1130.9,0.907407,10,5.464481,1.034884,9
2024-01-02 09:45:00,1139.9,1140.7,1139.7,1140.7,2011,2024-01-02,1133.5,1141.8,1131.0,1134.6,1131.9,1139.5,1130.9,0.898148,9,5.376344,1.023256,9
2024-01-02 09:55:00,1140.5,1140.8,1140.3,1140.8,1424,2024-01-02,1133.5,1141.8,1131.0,1134.6,1131.9,1139.5,1130.9,0.907407,10,5.464481,1.034884,9
2024-01-08 09:20:00,1171.5,1171.9,1170.7,1171.8,2479,2024-01-08,1162.0,1173.3,1160.6,1166.0,1157.9,1166.0,1154.2,0.881890,9,4.974271,1.177966,9
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2024-07-23 11:10:00,1299.5,1299.7,1298.8,1299.4,2758,2024-07-23,1284.2,1303.7,1280.1,1300.0,1297.2,1304.4,1291.3,0.817797,9,-0.461538,0.167939,10
2024-07-23 11:15:00,1299.3,1300.0,1299.1,1299.4,957,2024-07-23,1284.2,1303.7,1280.1,1300.0,1297.2,1304.4,1291.3,0.817797,9,-0.461538,0.167939,10
2024-08-01 09:05:00,1305.3,1306.2,1305.3,1306.1,2761,2024-08-01,1275.0,1306.2,1271.0,1304.3,1300.5,1304.3,1291.1,0.997159,10,1.380051,0.424242,9
2024-08-01 09:10:00,1306.1,1306.1,1305.6,1306.0,1570,2024-08-01,1275.0,1306.2,1271.0,1304.3,1300.5,1304.3,1291.1,0.994318,10,1.303381,0.416667,9
