# Daily Candlestick Color - Detection(Predict, Classification, Clustering)

### Import Library

In [1]:
import numpy as np
import pandas as pd
import numpy as np
import pandas_ta as ta
import seaborn as sns

import matplotlib.pyplot as plt
plt.rcParams['figure.figsize'] = [12, 6]
plt.rcParams['figure.dpi'] = 120
import warnings
warnings.filterwarnings('ignore')

### Load Price Data

In [2]:
import os
from pathlib import Path
notebook_path = os.getcwd()
current_dir = Path(notebook_path)
csv_file = str(current_dir) + '/VN30F1M_5minutes.csv'
is_file = os.path.isfile(csv_file)
if is_file:
    dataset = pd.read_csv(csv_file, index_col='Date', parse_dates=True)
else:
    print('remote')
    dataset = pd.read_csv("https://raw.githubusercontent.com/zuongthaotn/vn-stock-data/main/VN30ps/VN30F1M_5minutes.csv", index_col='Date', parse_dates=True)

In [3]:
data = dataset.copy()

In [4]:
data

Unnamed: 0_level_0,Open,High,Low,Close,Volume
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
2018-08-13 09:00:00,943.5,943.6,942.9,943.1,1812
2018-08-13 09:05:00,943.1,943.5,942.9,943.3,1323
2018-08-13 09:10:00,943.2,943.3,942.6,943.1,1207
2018-08-13 09:15:00,943.1,943.1,942.3,942.6,1196
2018-08-13 09:20:00,942.6,943.7,942.4,943.7,1765
...,...,...,...,...,...
2025-02-14 14:15:00,1343.0,1343.0,1340.3,1341.3,7141
2025-02-14 14:20:00,1340.9,1341.9,1340.5,1341.4,4593
2025-02-14 14:25:00,1341.1,1342.5,1340.7,1342.5,4207
2025-02-14 14:30:00,1342.5,1342.5,1342.5,1342.5,150


## Prepare labels

In [5]:
data['RSI'] = ta.rsi(data["Close"], length=25)

In [6]:
daily_data = data.resample('D').agg({
        'Volume': 'sum',
        'Open': 'first',
        'Close': 'last',
        'High': 'max',
        'Low': 'min',
        'RSI': 'last'
    })
daily_data.dropna(subset=['Close'], inplace=True)
daily_data["color"] = daily_data.apply(lambda r: "Doji" if r["Close"] == r["Open"] else ("Green" if r["Close"] > r["Open"] else "Red") , axis=1)
daily_data["color_int"] = daily_data.apply(lambda r: 0 if r["color"] == "Doji" else (1 if r["color"] == "Green" else 2), axis=1)

## Tính Volume Profile theo ngày

In [7]:
def volume_profile(df):
    low_price = df["Low"].min()
    high_price = df["High"].max()
    n_bins = 24
    
    # --- Tạo bins ---
    bins = np.linspace(low_price, high_price, n_bins + 1)
    bin_centers = (bins[:-1] + bins[1:]) / 2
    volume_profile = pd.Series(0.0, index=bin_centers)
    
    # --- Phân bổ volume vào bins ---
    for _, row in df.iterrows():
        # các giá nằm trong khoảng nến
        mask = (bin_centers >= row["Low"]) & (bin_centers <= row["High"])
        count = mask.sum()
        if count > 0:
            vol_per_bin = row["Volume"] / count
            volume_profile.loc[mask] += vol_per_bin

    return volume_profile

def cal_poc(vp):
    poc_price = vp.idxmax()
    return poc_price

In [8]:
%%time
daily_data["POC"] = 0
for i, row in daily_data.iterrows():
    current_date = row.name.strftime('%Y-%m-%d ').format()
    # Tạo khoảng thời gian trong ngày
    start_time = pd.Timestamp(current_date + " 08:00:00")
    end_time   = pd.Timestamp(current_date + " 15:00:00")
    
    # Lọc dữ liệu 5m của ngày đó
    tmp_df = data[(data.index >= start_time) & (data.index <= end_time)]
    tmp_vp = volume_profile(tmp_df)
    poc = cal_poc(tmp_vp)
    daily_data.loc[i, 'POC'] = poc

CPU times: user 28.2 s, sys: 0 ns, total: 28.2 s
Wall time: 28.2 s


In [9]:
daily_data

Unnamed: 0_level_0,Volume,Open,Close,High,Low,RSI,color,color_int,POC
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
2018-08-13,79327,943.5,954.2,954.6,942.3,74.598880,Green,1,943.068750
2018-08-14,79266,955.5,959.3,961.8,952.1,61.041527,Green,1,955.939583
2018-08-15,89449,958.0,946.6,962.3,946.6,30.475502,Red,2,958.047917
2018-08-16,71410,942.4,947.0,947.3,935.5,57.919290,Green,1,941.645833
2018-08-17,98531,952.2,947.0,954.6,945.0,44.311240,Red,2,953.200000
...,...,...,...,...,...,...,...,...,...
2025-02-10,185266,1333.4,1330.5,1337.6,1325.2,42.009268,Red,2,1332.691667
2025-02-11,143632,1332.5,1334.1,1334.1,1327.6,57.469921,Green,1,1330.443750
2025-02-12,139180,1337.4,1329.5,1342.0,1329.5,27.606077,Red,2,1339.135417
2025-02-13,181417,1331.9,1337.5,1338.5,1326.2,58.987050,Green,1,1335.168750


In [11]:
daily_data["POC-1"] = daily_data["POC"].shift(1)
daily_data["POC-2"] = daily_data["POC"].shift(2)
daily_data["next_Open"] = daily_data["Open"].shift(-1)
daily_data["next_color"] = daily_data["color_int"].shift(-1)
daily_data.dropna(subset=['next_Open', "POC-2"], inplace=True)

## AI prediction

In [20]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler
import xgboost as xgb
from sklearn import metrics
from xgboost import plot_importance

### Xgboost

In [15]:
# Define Features and Target Variable
features = ["POC-2", "POC-1", "POC", "Close", "next_Open"]
X = daily_data
y = daily_data["next_color"]

# Train-Test Split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
X_train = X_train[features]
X_test2 = X_test[features]

In [16]:
# Create XGBClassifier model
model_xgb = xgb.XGBClassifier(n_estimators=100, random_state=42)

# Train XGBClassifier
model_xgb = model_xgb.fit(X_train, y_train)

# Predict the response for test dataset
y2_pred = model_xgb.predict(X_test2)
# Model Accuracy, how often is the classifier correct?
print("Accuracy:", metrics.accuracy_score(y_test, y2_pred))

Accuracy: 0.5061728395061729


### Kmeans

In [17]:
from sklearn.cluster import KMeans

In [18]:
kmeans = KMeans(n_clusters=3, random_state=0).fit(X_train)
pred_label = kmeans.predict(X_test2)

In [19]:
print("Accuracy:", metrics.accuracy_score(y_test, pred_label))

Accuracy: 0.3734567901234568
