## Import Library

In [1]:
import numpy as np
import pandas as pd
import numpy as np
import seaborn as sns

import matplotlib.pyplot as plt
plt.rcParams['figure.figsize'] = [12, 6]
plt.rcParams['figure.dpi'] = 120

import warnings
warnings.filterwarnings('ignore')

## Load Price Data

In [2]:
import os
from pathlib import Path
notebook_path = os.getcwd()
algo_dir = Path(notebook_path).parent.parent
csv_file = str(algo_dir) + '/vn-stock-data/VN30ps/VN30F1M_5minutes.csv'
is_file = os.path.isfile(csv_file)
if is_file:
    raw_data = pd.read_csv(csv_file, index_col='Date', parse_dates=True)
else:
    raw_data = pd.read_csv("https://raw.githubusercontent.com/zuongthaotn/vn-stock-data/main/VN30ps/VN30F1M_5minutes.csv", index_col='Date', parse_dates=True)
raw_data.tail(10)

Unnamed: 0_level_0,Open,High,Low,Close,Volume
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
2024-07-10 13:50:00,1315.3,1316.8,1314.3,1316.2,5186
2024-07-10 13:55:00,1316.6,1317.4,1315.9,1316.3,5079
2024-07-10 14:00:00,1316.4,1317.0,1315.0,1315.0,3423
2024-07-10 14:05:00,1315.2,1317.0,1315.1,1316.0,4463
2024-07-10 14:10:00,1316.0,1316.2,1312.3,1314.6,10712
2024-07-10 14:15:00,1314.6,1315.9,1312.1,1314.3,9556
2024-07-10 14:20:00,1314.2,1314.4,1310.3,1310.7,10901
2024-07-10 14:25:00,1310.7,1312.9,1310.0,1310.3,8964
2024-07-10 14:30:00,1309.7,1309.7,1309.7,1309.7,280
2024-07-10 14:45:00,1310.0,1310.0,1310.0,1310.0,5651


In [3]:
def cal_first_close(tick):
  if not tick.empty:
    return tick[0]


def cal_high_before(tick):
  tick = tick[100*tick.index.hour+tick.index.minute > 910]
  tick = tick[100*tick.index.hour+tick.index.minute < 1355]
  return tick.max()

def cal_high_after(tick):
  tick = tick[100*tick.index.hour+tick.index.minute > 1355]
  tick = tick[100*tick.index.hour+tick.index.minute < 1430]
  return tick.max()


def cal_low_before(tick):
  tick = tick[100*tick.index.hour+tick.index.minute > 910]
  tick = tick[100*tick.index.hour+tick.index.minute < 1355]
  return tick.min()


def cal_low_after(tick):
  tick = tick[100*tick.index.hour+tick.index.minute > 1355]
  tick = tick[100*tick.index.hour+tick.index.minute < 1430]
  return tick.min()


def cal_price(tick):
  tick = tick[100*tick.index.hour+tick.index.minute == 1355]
  if not tick.empty:
    return tick[0]

def cal_close(tick):
  tick = tick[100*tick.index.hour+tick.index.minute == 1425]
  if not tick.empty:
    return tick[0]

In [4]:
data = raw_data.copy()
data['first_close'] = data.Close
data['price'] = data.Close
data['prev_high'] = data.High
data['prev_low'] = data.Low
data['next_high'] = data.High
data['next_low'] = data.Low
price = data.resample("D").agg({
    'first_close': cal_first_close,
    'prev_high':cal_high_before,
    'prev_low': cal_low_before,
    'next_high':cal_high_after,
    'next_low': cal_low_after,
    'price': cal_price,
    'Close': cal_close
    })
price = price.dropna()

In [5]:
price['percent'] = 100 * (price.price - price.Close.shift(1)) / price.Close.shift(1)
price['returns'] = (price.price - price.first_close) / (price.prev_high - price.prev_low)
price['return'] = 1000 * (price.Close - price.price) / price.price
price.tail(10)

Unnamed: 0_level_0,first_close,prev_high,prev_low,next_high,next_low,price,Close,percent,returns,return
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
2024-06-27,1285.0,1289.0,1280.8,1291.7,1286.5,1286.5,1289.8,-0.232648,0.182927,2.565099
2024-06-28,1292.0,1292.4,1281.2,1288.1,1272.5,1283.9,1277.9,-0.457435,-0.723214,-4.673261
2024-07-01,1277.5,1279.2,1271.7,1285.8,1276.7,1279.4,1284.8,0.11738,0.253333,4.220728
2024-07-02,1286.6,1299.0,1284.6,1298.8,1293.7,1297.1,1296.6,0.957347,0.729167,-0.385475
2024-07-03,1296.5,1306.5,1294.5,1309.5,1305.0,1305.1,1305.5,0.655561,0.716667,0.30649
2024-07-04,1306.7,1313.3,1306.1,1309.5,1303.5,1308.3,1308.0,0.214477,0.222222,-0.229305
2024-07-05,1309.7,1314.5,1307.2,1316.5,1307.6,1308.8,1316.2,0.061162,-0.123288,5.654034
2024-07-08,1316.1,1316.8,1305.6,1313.0,1307.0,1310.7,1312.0,-0.41787,-0.482143,0.991836
2024-07-09,1315.1,1320.3,1304.5,1321.9,1314.4,1318.2,1319.5,0.472561,0.196203,0.986193
2024-07-10,1319.6,1321.9,1313.6,1317.0,1310.0,1316.3,1310.3,-0.242516,-0.39759,-4.558231


In [6]:
price = price[price['return'] > -30]
price = price[price['return'] <  30]
len(price)

1472

In [7]:
def group_data(r):
    group = 4
    if r['next_high'] - r['price'] > 3 and r['price'] - r['next_low'] > 3:
        # Do nothing group
        group = 0
    elif r['return'] > 0:
        # Long group
        group = 1
    else:
        # Short group
        group = 2
    return group
price['group'] = price.apply(lambda r: group_data(r), axis=1)

In [8]:
price.dropna(inplace=True)
price

Unnamed: 0_level_0,first_close,prev_high,prev_low,next_high,next_low,price,Close,percent,returns,return,group
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
2018-08-14,954.9,958.6,952.1,961.8,959.0,959.5,960.1,0.565978,0.707692,0.625326,1
2018-08-15,958.7,962.3,954.7,959.3,949.9,959.2,951.2,-0.093740,0.065789,-8.340284,2
2018-08-16,942.0,943.5,935.5,947.3,942.6,943.0,944.5,-0.862069,0.125000,1.590668,1
2018-08-17,953.0,954.6,946.4,949.9,945.0,946.5,947.0,0.211752,-0.792683,0.528262,1
2018-08-20,945.8,951.5,944.1,946.5,942.6,946.5,944.7,-0.052798,0.094595,-1.901743,2
...,...,...,...,...,...,...,...,...,...,...,...
2024-07-04,1306.7,1313.3,1306.1,1309.5,1303.5,1308.3,1308.0,0.214477,0.222222,-0.229305,2
2024-07-05,1309.7,1314.5,1307.2,1316.5,1307.6,1308.8,1316.2,0.061162,-0.123288,5.654034,1
2024-07-08,1316.1,1316.8,1305.6,1313.0,1307.0,1310.7,1312.0,-0.417870,-0.482143,0.991836,1
2024-07-09,1315.1,1320.3,1304.5,1321.9,1314.4,1318.2,1319.5,0.472561,0.196203,0.986193,0


In [28]:
# No trade
len(price[price['group']==0])

220

In [29]:
# Long
len(price[price['group']==1])

655

In [30]:
# Short
len(price[price['group']==2])

596

In [9]:
price['percent'].describe()

count    1471.000000
mean        0.046648
std         1.258515
min        -7.200119
25%        -0.411080
50%         0.091168
75%         0.626759
max         7.145188
Name: percent, dtype: float64

In [10]:
len(np.unique(price['percent']))

1464

## Kmeans Clustering

In [11]:
from sklearn.model_selection import train_test_split
from sklearn import metrics

In [12]:
 # split dataset in features and target variable
feature_cols = ["percent", "returns"]
X = price[feature_cols]  # Features
y = price['group'] # Target variable
# Split dataset into training set and test set
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, shuffle=False)  # 80% training and 20% test

In [13]:
from sklearn.cluster import KMeans
kmeans = KMeans(n_clusters=3, random_state=0).fit(X_train)

In [14]:
# Predict the response for test dataset
y_pred = kmeans.predict(X_test)
y_pred
# Model Accuracy, how often is the classifier correct?
print("Accuracy:", metrics.accuracy_score(y_test, y_pred))
print("R2_score:", metrics.r2_score(y_test, y_pred))

Accuracy: 0.30847457627118646
R2_score: -2.0662447452302852


In [15]:
X_result = X_test.copy()
X_result = X_result.assign(Group=y_test)
X_result = X_result.assign(Predicts=y_pred)
X_result

Unnamed: 0_level_0,percent,returns,Group,Predicts
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
2021-04-06,-0.079777,-0.103175,2,0
2021-11-02,-0.059133,-0.037500,1,0
2020-05-04,-1.892969,0.294521,2,0
2019-04-10,-0.281690,0.550000,2,0
2019-05-20,1.658821,0.880000,1,1
...,...,...,...,...
2021-10-15,0.485566,-0.086538,2,1
2020-08-13,0.556539,0.052632,1,1
2024-06-17,-1.273933,-0.500000,1,0
2018-10-12,1.778549,0.966258,0,1


In [16]:
X_result.sort_index(inplace=True)
X_result

Unnamed: 0_level_0,percent,returns,Group,Predicts
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
2018-09-07,0.630746,0.553719,1,1
2018-09-12,0.672128,0.434211,2,1
2018-09-21,0.597877,0.013514,1,1
2018-09-24,0.296342,0.408163,1,1
2018-09-27,0.335912,0.380952,2,1
...,...,...,...,...
2024-06-13,0.286058,0.151899,1,1
2024-06-17,-1.273933,-0.500000,1,0
2024-06-25,0.062155,-0.597403,0,0
2024-07-02,0.957347,0.729167,2,1


In [17]:
import xgboost as xgb
# Create XGBClassifier model
model_xgb = xgb.XGBClassifier(n_estimators=100, random_state=42)

# Train XGBClassifier
model_xgb = model_xgb.fit(X_train, y_train)

In [18]:
# Predict the response for test dataset
y_pred = model_xgb.predict(X_test)
# Model Accuracy, how often is the classifier correct?
print("Accuracy:", metrics.accuracy_score(y_test, y_pred))
print("R2_score:", metrics.r2_score(y_test, y_pred))

Accuracy: 0.44745762711864406
R2_score: -0.5194946181918969


In [19]:
X_result = X_test.copy()
X_result = X_result.assign(Group=y_test)
X_result = X_result.assign(Predicts=y_pred)
X_result

Unnamed: 0_level_0,percent,returns,Group,Predicts
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
2021-04-06,-0.079777,-0.103175,2,1
2021-11-02,-0.059133,-0.037500,1,1
2020-05-04,-1.892969,0.294521,2,1
2019-04-10,-0.281690,0.550000,2,1
2019-05-20,1.658821,0.880000,1,2
...,...,...,...,...
2021-10-15,0.485566,-0.086538,2,1
2020-08-13,0.556539,0.052632,1,2
2024-06-17,-1.273933,-0.500000,1,1
2018-10-12,1.778549,0.966258,0,2


In [20]:
from sklearn.ensemble import GradientBoostingClassifier
# Create GradientBoostingClassifier model
gbc = GradientBoostingClassifier(learning_rate=0.01, random_state=50, n_estimators=150)

# Train
gbc = gbc.fit(X_train, y_train)

# Predict the response for test dataset
y_pred = gbc.predict(X_test)
# Model Accuracy, how often is the classifier correct?
print("Accuracy:", metrics.accuracy_score(y_test, y_pred))
print("R2_score:", metrics.r2_score(y_test, y_pred))

Accuracy: 0.4745762711864407
R2_score: -0.32189217905483414


In [21]:
from sklearn.tree import DecisionTreeClassifier
# Create Decision Tree classifier object
clf = DecisionTreeClassifier(criterion="entropy", max_depth=6, max_leaf_nodes=50)

# Train Decision Tree Classifier
clf = clf.fit(X_train, y_train)

# Predict the response for test dataset
y_pred = clf.predict(X_test)
# Model Accuracy, how often is the classifier correct?
print("Accuracy:", metrics.accuracy_score(y_test, y_pred))
print("R2_score:", metrics.r2_score(y_test, y_pred))

Accuracy: 0.4610169491525424
R2_score: -0.7375386889638282


In [22]:
from sklearn.ensemble import RandomForestClassifier

#Create a Gaussian Classifier
clf=RandomForestClassifier(n_estimators=100)

#Train the model using the training sets 	y_pred=clf.predict(X_test)
clf.fit(X_train,y_train)

y_pred=clf.predict(X_test)
# Model Accuracy, how often is the classifier correct?
print("Accuracy:", metrics.accuracy_score(y_test, y_pred))
print("R2_score:", metrics.r2_score(y_test, y_pred))

Accuracy: 0.4711864406779661
R2_score: -0.512680740980274


In [23]:
labels = list((range(1, 31)))
price['percent_group'] = pd.qcut(price.percent, q=30, labels=labels)
price['returns_group'] = pd.qcut(price.returns, q=30, labels=labels)
price[['percent_group', 'returns_group', 'group']]

Unnamed: 0_level_0,percent_group,returns_group,group
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
2018-08-14,22,26,1
2018-08-15,12,16,2
2018-08-16,5,17,1
2018-08-17,18,3,1
2018-08-20,13,16,2
...,...,...,...
2024-07-04,18,19,2
2024-07-05,15,13,1
2024-07-08,8,8,1
2024-07-09,22,18,0


In [24]:
len(np.unique(price['percent_group']))

30

In [25]:
len(np.unique(price['returns_group']))

30

In [26]:
 # split dataset in features and target variable
feature_cols = ["percent_group", "returns_group"]
X = price[feature_cols]  # Features
y = price['group'] # Target variable
# Split dataset into training set and test set
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)  # 80% training and 20% test
from sklearn.cluster import KMeans
kmeans = KMeans(n_clusters=3, random_state=0).fit(X_train)
# Predict the response for test dataset
y_pred = kmeans.predict(X_test)
y_pred
# Model Accuracy, how often is the classifier correct?
print("Accuracy:", metrics.accuracy_score(y_test, y_pred))
print("R2_score:", metrics.r2_score(y_test, y_pred))

Accuracy: 0.3220338983050847
R2_score: -1.3197310202829975


In [34]:
Long = price[price.percent > .26]
Long = Long[Long.returns > .39]
len(Long[Long['group'] == 1]) / len(Long)

0.5459610027855153

In [35]:
len(Long[Long['group'] == 0]) / len(Long)

0.11420612813370473

In [36]:
len(Long[Long['group'] == 2]) / len(Long)

0.3398328690807799