# Trend (MA) & Sklearn

### Import Library

In [1]:
import numpy as np
import pandas as pd
import numpy as np
import seaborn as sns

import matplotlib.pyplot as plt
plt.rcParams['figure.figsize'] = [12, 6]
plt.rcParams['figure.dpi'] = 120

import warnings
warnings.filterwarnings('ignore')

### Load Price Data

In [2]:
import os
from pathlib import Path
notebook_path = os.getcwd()
algo_dir = Path(notebook_path).parent.parent
csv_file = str(algo_dir) + '/vn-stock-data/VN30ps/VN30F1M_5minutes.csv'
is_file = os.path.isfile(csv_file)
if is_file:
    dataset = pd.read_csv(csv_file, index_col='Date', parse_dates=True)
else:
    dataset = pd.read_csv("https://raw.githubusercontent.com/zuongthaotn/vn-stock-data/main/VN30ps/VN30F1M_5minutes.csv", index_col='Date', parse_dates=True)

In [3]:
data = dataset.copy()
data["ma_line"] = data["Close"].rolling(20).mean()
data['above_ma'] = data.apply(lambda r: 1 if r['Close'] > r['ma_line'] else 0, axis=1)
data['below_ma'] = data.apply(lambda r: 1 if r['Close'] < r['ma_line'] else 0, axis=1)
data['total_above_ma'] = data['above_ma'].rolling(150).sum()
data['total_below_ma'] = data['below_ma'].rolling(150).sum()
data['trend'] = data.apply(lambda r: 'switch' if r['total_above_ma'] == r['total_below_ma'] else ('up' if r['total_above_ma'] > r['total_below_ma'] else 'down'), axis=1)
data.dropna(inplace=True)

In [4]:
def cal_high_after(tick):
  tick = tick[100*tick.index.hour+tick.index.minute > 1300]
  tick = tick[100*tick.index.hour+tick.index.minute < 1430]
  return tick.max()

def cal_low_after(tick):
  tick = tick[100*tick.index.hour+tick.index.minute > 1300]
  tick = tick[100*tick.index.hour+tick.index.minute < 1430]
  return tick.min()


def cal_price(tick):
  tick = tick[100*tick.index.hour+tick.index.minute == 1300]
  if not tick.empty:
    return tick[0]

def cal_close(tick):
  tick = tick[100*tick.index.hour+tick.index.minute == 1425]
  if not tick.empty:
    return tick[0]

In [5]:
data2 = dataset.copy()
data2['price'] = data2.Close
data2['next_high'] = data2.High
data2['next_low'] = data2.Low
price = data2.resample("D").agg({
    'next_high':cal_high_after,
    'next_low': cal_low_after,
    'price': cal_price,
    'Close': cal_close
    })
def group_data(r):
    group = 4
    if r['next_high'] - r['price'] > 3 and r['price'] - r['next_low'] > 3:
        # Do nothing group
        group = 0
    elif r['return'] > 0:
        # Long group
        group = 1
    else:
        # Short group
        group = 2
    return group
price['return'] = 1000 * (price.Close - price.price) / price.price
price['group'] = price.apply(lambda r: group_data(r), axis=1)
price = price.dropna()
price = price[['group']]

In [6]:
data = data.assign(time_d=pd.PeriodIndex(data.index, freq='1D').to_timestamp())
data = pd.merge(data, price, left_on="time_d", right_index=True, how="left")
data = data.dropna()
data

Unnamed: 0_level_0,Open,High,Low,Close,Volume,ma_line,above_ma,below_ma,total_above_ma,total_below_ma,trend,time_d,group
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
2018-08-15 14:25:00,951.4,952.4,951.0,951.2,3377,955.960,0,1,82.0,49.0,up,2018-08-15,0.0
2018-08-15 14:30:00,951.5,951.5,951.5,951.5,24,955.625,0,1,82.0,50.0,up,2018-08-15,0.0
2018-08-15 14:45:00,946.6,946.6,946.6,946.6,2498,955.055,0,1,82.0,51.0,up,2018-08-15,0.0
2018-08-16 09:00:00,942.4,942.4,941.0,942.0,1666,954.330,0,1,82.0,52.0,up,2018-08-16,1.0
2018-08-16 09:05:00,942.0,942.1,941.0,941.8,1002,953.660,0,1,82.0,53.0,up,2018-08-16,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...
2024-07-10 14:15:00,1314.6,1315.9,1312.1,1314.3,9556,1316.600,0,1,77.0,73.0,up,2024-07-10,2.0
2024-07-10 14:20:00,1314.2,1314.4,1310.3,1310.7,10901,1316.190,0,1,76.0,74.0,up,2024-07-10,2.0
2024-07-10 14:25:00,1310.7,1312.9,1310.0,1310.3,8964,1315.755,0,1,75.0,75.0,switch,2024-07-10,2.0
2024-07-10 14:30:00,1309.7,1309.7,1309.7,1309.7,280,1315.265,0,1,74.0,76.0,down,2024-07-10,2.0


In [7]:
_1pm_data = data[100*data.index.hour + data.index.minute == 1300]
_1pm_data

Unnamed: 0_level_0,Open,High,Low,Close,Volume,ma_line,above_ma,below_ma,total_above_ma,total_below_ma,trend,time_d,group
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
2018-08-16 13:00:00,938.5,939.5,938.5,938.7,1192,939.465,0,1,73.0,77.0,down,2018-08-16,1.0
2018-08-17 13:00:00,954.3,954.4,953.5,953.7,1494,951.925,1,0,76.0,74.0,up,2018-08-17,2.0
2018-08-20 13:00:00,950.7,951.0,949.9,950.3,1940,947.570,1,0,63.0,87.0,down,2018-08-20,2.0
2018-08-21 13:00:00,947.5,948.1,947.5,948.0,803,946.775,1,0,90.0,60.0,up,2018-08-21,1.0
2018-08-22 13:00:00,956.7,957.5,956.6,957.5,1138,958.690,0,1,88.0,62.0,up,2018-08-22,2.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...
2024-07-04 13:00:00,1308.9,1309.4,1308.8,1309.2,1765,1309.130,1,0,123.0,27.0,up,2024-07-04,2.0
2024-07-05 13:00:00,1308.9,1309.4,1308.7,1309.4,2903,1310.655,0,1,99.0,51.0,up,2024-07-05,1.0
2024-07-08 13:00:00,1308.0,1308.4,1307.2,1307.7,1858,1309.980,0,1,84.0,66.0,up,2024-07-08,1.0
2024-07-09 13:00:00,1312.4,1316.7,1312.4,1316.7,7374,1309.700,1,0,75.0,75.0,switch,2024-07-09,1.0


In [8]:
len(_1pm_data[_1pm_data['group'] == 0])

360

In [9]:
len(_1pm_data[_1pm_data['group'] == 1])

557

In [10]:
len(_1pm_data[_1pm_data['group'] == 2])

552

### Kmeans Clustering

In [11]:
from sklearn.model_selection import train_test_split
from sklearn import metrics

In [12]:
 # split dataset in features and target variable
feature_cols = ["total_above_ma", "total_below_ma"]
X = _1pm_data[feature_cols]  # Features
y = _1pm_data['group'] # Target variable
# Split dataset into training set and test set
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)  # 80% training and 20% test

In [13]:
from sklearn.cluster import KMeans
kmeans = KMeans(n_clusters=3, random_state=0).fit(X_train)

In [14]:
# Predict the response for test dataset
y_pred = kmeans.predict(X_test)

In [15]:
y_test

Date
2023-11-07 13:00:00    2.0
2021-10-12 13:00:00    2.0
2020-01-22 13:00:00    1.0
2023-11-20 13:00:00    0.0
2020-01-02 13:00:00    1.0
                      ... 
2021-08-13 13:00:00    0.0
2024-07-02 13:00:00    1.0
2023-12-08 13:00:00    0.0
2019-11-18 13:00:00    2.0
2024-03-21 13:00:00    0.0
Name: group, Length: 294, dtype: float64

In [16]:
# Model Accuracy, how often is the classifier correct?
print("Accuracy:", metrics.accuracy_score(y_test, y_pred))
print("R2_score:", metrics.r2_score(y_test, y_pred))

Accuracy: 0.3843537414965986
R2_score: -0.6872466075494752


In [17]:
from sklearn.ensemble import GradientBoostingClassifier
# Create GradientBoostingClassifier model
gbc = GradientBoostingClassifier(learning_rate=0.01, random_state=50, n_estimators=150)

# Train
gbc = gbc.fit(X_train, y_train)

# Predict the response for test dataset
y_pred = gbc.predict(X_test)
# Model Accuracy, how often is the classifier correct?
print("Accuracy:", metrics.accuracy_score(y_test, y_pred))
print("R2_score:", metrics.r2_score(y_test, y_pred))

Accuracy: 0.32653061224489793
R2_score: -0.68180387655738
