# Distribution by Bayes - Analytics

In [26]:
# Apply Bayes formula to calculate the probability of the next candle being green or red

### Import Library

In [27]:
import numpy as np
import pandas as pd
import numpy as np
import pandas_ta as ta
import seaborn as sns

import matplotlib.pyplot as plt
plt.rcParams['figure.figsize'] = [12, 6]
plt.rcParams['figure.dpi'] = 120
import warnings
warnings.filterwarnings('ignore')

### Load Price Data

In [28]:
import os
from pathlib import Path
notebook_path = os.getcwd()
current_dir = Path(notebook_path)
csv_file = str(current_dir) + '/VN30F1M_1H.csv'
is_file = os.path.isfile(csv_file)
if is_file:
    dataset = pd.read_csv(csv_file, index_col='Date', parse_dates=True)
else:
    print('remote')
    dataset = pd.read_csv("https://raw.githubusercontent.com/zuongthaotn/vn-stock-data/main/VN30ps/VN30F1M_1H.csv", index_col='Date', parse_dates=True)

In [29]:
data = dataset.copy()

In [30]:
data['H'] = data.index.hour
data['color'] = data.apply(lambda r: "doji" if r["Close"] == r["Open"] else ("green" if r["Close"] > r["Open"] else "red"), axis=1)
data['color_shift1'] = data['color'].shift(1)
data['color_shift2'] = data['color'].shift(2)
data['color_shift3'] = data['color'].shift(3)
data.dropna(inplace=True)

In [31]:
data_train = data[data.index < '2024-11-01 00:00:00']

In [32]:
data_train

Unnamed: 0_level_0,Open,High,Low,Close,Volume,H,color,color_shift1,color_shift2,color_shift3
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
2018-08-13 13:00:00,947.5,950.2,946.7,949.9,21499,13,green,green,red,green
2018-08-13 14:00:00,949.7,954.6,948.9,954.2,14072,14,green,green,green,red
2018-08-14 09:00:00,955.5,957.0,954.3,955.8,13314,9,green,green,green,green
2018-08-14 10:00:00,955.8,957.3,952.2,954.0,17740,10,red,green,green,green
2018-08-14 11:00:00,954.1,955.0,952.1,954.3,8554,11,green,red,green,green
...,...,...,...,...,...,...,...,...,...,...
2024-10-31 09:00:00,1339.0,1340.5,1336.5,1337.6,34015,9,red,green,green,red
2024-10-31 10:00:00,1337.5,1337.9,1333.7,1336.1,34325,10,red,red,green,green
2024-10-31 11:00:00,1336.0,1338.4,1336.0,1337.8,17452,11,green,red,red,green
2024-10-31 13:00:00,1338.0,1345.3,1337.4,1344.2,65431,13,green,green,red,red


In [33]:
# Reference
# https://viblo.asia/p/mo-hinh-phan-lop-naive-bayes-vyDZO0A7lwj
# https://www.machinelearningplus.com/predictive-modeling/how-naive-bayes-algorithm-works-with-example-and-full-code/
# https://machinelearningcoban.com/2017/08/08/nbc/
# https://towardsdatascience.com/bernoulli-naive-bayes-explained-a-visual-guide-with-code-examples-for-beginners-aec39771ddd6/

In [34]:
total_green = len(data_train[data_train.color=="green"])
total_red = len(data_train[data_train.color=="red"])
total_doji = len(data_train[data_train.color=="doji"])

In [35]:
total_doji

127

In [36]:
total_s1green = len(data_train[data_train.color_shift1=="green"])
total_s1red = len(data_train[data_train.color_shift1=="red"])
total_s1doji = len(data_train[data_train.color_shift1=="doji"])
#
total_s2green = len(data_train[data_train.color_shift2=="green"])
total_s2red = len(data_train[data_train.color_shift2=="red"])
total_s2doji = len(data_train[data_train.color_shift2=="doji"])
#
total_s3green = len(data_train[data_train.color_shift3=="green"])
total_s3red = len(data_train[data_train.color_shift3=="red"])
total_s3doji = len(data_train[data_train.color_shift3=="doji"])
#
total_9h = len(data_train[data_train.H==9])
total_10h = len(data_train[data_train.H==10])
total_11h = len(data_train[data_train.H==11])
total_13h = len(data_train[data_train.H==13])
total_14h = len(data_train[data_train.H==14])

In [37]:
P_s1green_on_green = total_s1green / total_green

In [42]:
# G·ªçi x1 l√† s1_color
s1arr = np.array([
    ["green", total_s1green, total_s1green/total_green, total_s1green/total_red, total_s1green/total_doji],
    ["red", total_s1red, total_s1red/total_green, total_s1red/total_red, total_s1red/total_doji],
    ["doji", total_s1doji, total_s1doji/total_green, total_s1doji/total_red, total_s1doji/total_doji]
])

s1df = pd.DataFrame(s1arr, columns=['S1_Color', 'count', 'P(x1|green)', 'P(x1|red)', 'P(x1|doji)'])

In [43]:
s1df

Unnamed: 0,S1_Color,count,P(x1|green),P(x1|red),P(x1|doji)
0,green,3860,1.0002591344908007,1.0255047821466523,30.39370078740157
1,red,3763,0.9751230888831304,0.9997343251859724,29.62992125984252
2,doji,127,0.0329100803316921,0.033740701381509,1.0


# Apply Bernoulli Naive Bayes Simplified by Sklearn

In [57]:
import pandas as pd
from sklearn.naive_bayes import BernoulliNB
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import accuracy_score, classification_report
from sklearn.model_selection import train_test_split

In [48]:
df = data.copy()

In [49]:
label_encoders = {}
for col in ["color_shift1", "color_shift2", "color_shift3", "color"]:
    le = LabelEncoder()
    df[col] = le.fit_transform(df[col])
    label_encoders[col] = le

In [50]:
df_train = df[df.index < '2024-11-01 00:00:00']
df_test = df[df.index > '2024-11-01 00:00:00']

In [51]:
X_train, y_train = df_train[["H", "color", "color_shift1", "color_shift2", "color_shift3"]], df_train["color"]
X_test, y_test = df_test[["H", "color", "color_shift1", "color_shift2", "color_shift3"]], df_test["color"]

In [52]:
# Train the model
nb_clf = BernoulliNB()
nb_clf.fit(X_train, y_train)

0,1,2
,alpha,1.0
,force_alpha,True
,binarize,0.0
,fit_prior,True
,class_prior,


In [55]:
y_pred = nb_clf.predict(X_test)

In [58]:
# === 6Ô∏è‚É£ ƒê√°nh gi√° ===
print("\nüîπ Accuracy:", accuracy_score(y_test, y_pred))
print("\nüîπ B√°o c√°o ph√¢n lo·∫°i:")
print(classification_report(y_test, y_pred))


üîπ Accuracy: 0.5247610773240661

üîπ B√°o c√°o ph√¢n lo·∫°i:
              precision    recall  f1-score   support

           0       1.00      1.00      1.00        22
           1       0.52      0.97      0.67       580
           2       0.53      0.03      0.06       549

    accuracy                           0.52      1151
   macro avg       0.68      0.67      0.58      1151
weighted avg       0.53      0.52      0.39      1151

