# 1. Crawl the dataset

In [None]:
from datetime import datetime
import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns
import numpy as np
import tensorflow as tf
from tensorflow.keras.layers import Conv1D, MaxPooling1D, Flatten, Dense, LSTM
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error

In [None]:
sma# Vietnam stock information crawling
# https://github.com/thinh-vu/vnstock

!pip install vnstock

from vnstock import *### Split the dataset into time windows to get data samples.  

compdata_list = []

vnindex30 = ['ACB', 'BCM', 'BID', 'BVH', 'CTG', 'FPT', 'GAS', 'GVR', 'HDB', 'HPG', 
             'MBB', 'MSN', 'MWG', 'NVL', 'PDR', 'PLX', 'POW', 'SAB', 'SSI', 'STB', 
             'TCB', 'TPB', 'VCB', 'VHM', 'VIB', 'VIC', 'VJC', 'VNM', 'VPB', 'VRE']

for comp in vnindex30:
    comp_data = stock_historical_data(comp, "2005-01-01", "2023-05-24")   
    compdata_list.append(comp_data)

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [None]:
# Preview the first downloaded price table (the entry matching vnindex30[0]).
compdata_list[0]

Unnamed: 0,Open,High,Low,Close,Volume,TradingDate
0,5050.0,8234.0,6532.0,7147.0,56500,2006-11-21
1,6038.0,7861.0,7136.0,7394.0,62300,2006-11-22
2,6093.0,7959.0,7136.0,7597.0,69900,2006-11-23
3,5928.0,8344.0,7685.0,8058.0,63600,2006-11-24
4,6203.0,8860.0,7410.0,8058.0,42400,2006-11-27
...,...,...,...,...,...,...
4105,24900.0,25100.0,24850.0,25000.0,3353042,2023-05-18
4106,25050.0,25050.0,24750.0,24950.0,4008117,2023-05-19
4107,25000.0,25050.0,24900.0,25050.0,4585906,2023-05-22
4108,25200.0,25600.0,25000.0,25350.0,21726379,2023-05-23


# 2. Data Processing

## Helper functions

In [None]:
### Split the dataset into time windows to get data samples.
# Only the close price is used (column index 3 in the data frame).

def split_into_windows(company_data, window_size, X_data, y_data):
  """Append sliding-window samples built from column 3 of `company_data`.

  For every start row `s`, the feature is the `window_size` consecutive values
  `s .. s + window_size - 1` and the label is the value at row
  `s + window_size` (the row right after the window).

  Args:
    company_data: DataFrame whose positional column 3 holds the price series.
    window_size: number of consecutive rows in one feature window.
    X_data: list that receives one `(window_size, 1)` array per sample.
    y_data: list that receives one length-1 array (the label) per sample.

  Returns:
    None. `X_data` and `y_data` are extended in place.

  Every row that has a full window before it is used as a label, including
  the last row of the frame (the previous loop bound stopped one sample
  early, so the most recent price was never used as a label). A frame with
  `window_size` rows or fewer yields no samples.
  """
  # Read the column once instead of one .iloc lookup per cell.
  prices = company_data.iloc[:, 3].to_numpy()

  for start in range(len(prices) - window_size):
    stop = start + window_size
    # Feature: the window itself, shaped (window_size, 1).
    X_data.append(np.array(prices[start:stop]).reshape(window_size, 1))
    # Label: the value immediately following the window.
    y_data.append(np.array([prices[stop]]))

In [None]:
### Split the data into training, validation and test sets
# The samples are time-ordered, so both splits are done with 'shuffle=False'.

def train_val_test_split(X_data, y_data, window_size):
  """Split windowed samples into train / validation / test arrays.

  A first unshuffled split holds out 20% of the samples as the test set; a
  second unshuffled split holds out 20% of the remainder as the validation
  set.

  Args:
    X_data: list of feature windows.
    y_data: list of labels, aligned with `X_data`.
    window_size: number of rows per window, used to reshape the features.

  Returns:
    `(X_train, y_train, X_val, y_val, X_test, y_test)` as NumPy arrays, the
    feature arrays reshaped to `(n_samples, window_size, 1)`.
  """
  X_fit, X_test, y_fit, y_test = train_test_split(X_data, y_data, test_size=0.2, shuffle=False)
  X_train, X_val, y_train, y_val = train_test_split(X_fit, y_fit, test_size=0.2, shuffle=False)

  def as_windows(samples):
    # List of windows -> array shaped for the network input.
    stacked = np.array(samples)
    return stacked.reshape(stacked.shape[0], window_size, 1)

  X_train, X_val, X_test = (as_windows(part) for part in (X_train, X_val, X_test))
  y_train, y_val, y_test = (np.array(part) for part in (y_train, y_val, y_test))

  return X_train, y_train, X_val, y_val, X_test, y_test

In [None]:
### Normalization
def normalize(X_train, y_train, X_val, y_val):
  """Min-max normalise training and validation samples window by window.

  Each feature window is scaled with its own minimum and maximum,
  `x_norm = (x - min(x)) / (max(x) - min(x))`, and the matching label is
  scaled with the same two values, so a label may fall outside [0, 1].

  Args:
    X_train, X_val: feature arrays indexed by sample on the first axis.
    y_train, y_val: label arrays aligned with the feature arrays.

  Returns:
    `(X_train_norm, y_train_norm, X_val_norm, y_val_norm)` as new float
    arrays; the inputs are left untouched.

  A flat window (max == min) is scaled with a range of 1 instead of 0, so
  its features become 0 and its label becomes `y - min` rather than NaN/inf.
  """
  def _scale(X, y):
    # Float copies: protect the inputs and avoid truncation of integer data.
    X_norm = np.array(X, dtype=float)
    y_norm = np.array(y, dtype=float)
    for i in range(len(X_norm)):
      # Both statistics are read before the window is overwritten below.
      low = np.min(X_norm[i])
      span = np.max(X_norm[i]) - low
      if span == 0:
        span = 1.0  # flat window: avoid dividing by zero
      X_norm[i] = (X_norm[i] - low) / span
      y_norm[i] = (y_norm[i] - low) / span
    return X_norm, y_norm

  X_train_norm, y_train_norm = _scale(X_train, y_train)
  X_val_norm, y_val_norm = _scale(X_val, y_val)

  return X_train_norm, y_train_norm, X_val_norm, y_val_norm

In [None]:
### Build the model

def build_model(window_size):
  """Return an uncompiled 1-D convolutional regression network.

  Three Conv1D + MaxPooling1D stages (64, 128, 64 filters) are followed by a
  flatten layer, a 100-unit ReLU dense layer and a single linear output unit.

  Args:
    window_size: number of time steps per input window; the declared input
      shape is `(window_size, 1)`.

  Returns:
    A `tf.keras.Sequential` model. Compiling and fitting are left to the caller.
  """
  layers = [
      Conv1D(64, kernel_size=3, activation='relu', input_shape=(window_size, 1), padding='same'),
      MaxPooling1D(pool_size=2),
      Conv1D(128, kernel_size=3, activation='relu', padding='same'),
      MaxPooling1D(2),
      Conv1D(64, kernel_size=3, activation='relu', padding='same'),
      MaxPooling1D(2),
      Flatten(),
      Dense(100, activation='relu'),
      Dense(1),  # no activation given, i.e. the linear default f(x) = x
  ]
  return tf.keras.Sequential(layers)

In [None]:
def MinMaxScaling(X_test):
  """Min-max normalise every sample of `X_test` with its own min and max.

  Applies `x_norm = (x - min(x)) / (max(x) - min(x))` to each entry along
  the first axis.

  Args:
    X_test: array of samples, indexed by sample on the first axis.

  Returns:
    A new float array of the same shape; `X_test` itself is not modified.

  A flat sample (max == min) is scaled with a range of 1 instead of 0, so it
  maps to all zeros rather than NaN.
  """
  # Float copy: protects the input and avoids truncation of integer data.
  X_test_norm = np.array(X_test, dtype=float)
  for i in range(len(X_test_norm)):
    # Both statistics are read before the sample is overwritten below.
    low = np.min(X_test_norm[i])
    span = np.max(X_test_norm[i]) - low
    if span == 0:
      span = 1.0  # flat sample: avoid dividing by zero
    X_test_norm[i] = (X_test_norm[i] - low) / span

  return X_test_norm

In [None]:
def generate_features_prediction(company_data, window_size):
  """Build the most recent window of prices as a single prediction sample.

  Args:
    company_data: DataFrame whose positional column 3 holds the price series.
    window_size: number of trailing rows to use.

  Returns:
    `(X_predict, X_predict_norm)`, both float arrays of shape
    `(1, window_size, 1)`: the raw window and its min-max normalised version,
    `x_norm = (x - min(x)) / (max(x) - min(x))`.

  A flat window (max == min) is scaled with a range of 1 instead of 0, so it
  normalises to zeros rather than NaN.
  """
  # Column 3 is the price column the training windows are built from. This
  # function previously read column 4, which is Volume in the table previewed
  # earlier in this notebook.
  last_rows = company_data.iloc[len(company_data) - window_size : len(company_data), 3]

  # One sample, window_size time steps, one feature. Forced to float so the
  # normalised values are not truncated for integer data.
  X_predict = np.array(last_rows, dtype=float).reshape(1, window_size, 1)

  low = np.min(X_predict[0])
  span = np.max(X_predict[0]) - low
  if span == 0:
    span = 1.0  # flat window: avoid dividing by zero
  X_predict_norm = (X_predict - low) / span

  return X_predict, X_predict_norm

In [None]:
def predict_next_day(company_data, window_size, model, X_predict, X_predict_norm):
    """Predict from normalised windows and map the result back to price scale.

    Args:
        company_data: not used by this function.
        window_size: not used by this function.
        model: object exposing `predict(array)`.
        X_predict: raw windows; sample `k` supplies the min and max used to
            de-normalise prediction `k`.
        X_predict_norm: normalised windows passed to `model.predict`.

    Returns:
        The array returned by `model.predict`, overwritten in place with
        `norm * (max(x) - min(x)) + min(x)` for each sample.
    """
    predictions = model.predict(X_predict_norm)

    # Undo the per-window min-max scaling, writing into the prediction array.
    for idx in range(len(predictions)):
        window = X_predict[idx]
        low = np.min(window)
        high = np.max(window)
        predictions[idx] = predictions[idx] * (high - low) + low

    return predictions

In [None]:
def predict_next_month(company_data, window_size, model, X_predict, X_predict_norm, n_days=30):
  """Forecast several consecutive days by feeding predictions back in.

  Each step calls `predict_next_day` on the current window, appends the
  predicted price to the raw window, drops the oldest entry, and then
  re-normalises the whole raw window.

  Args:
    company_data: passed through to `predict_next_day`.
    window_size: number of time steps per window.
    model: passed through to `predict_next_day`.
    X_predict: raw window(s), shape `(n, window_size, 1)`.
    X_predict_norm: min-max normalised version of `X_predict`, used for the
      first step only.
    n_days: number of steps to forecast (default 30, the previous fixed value).

  Returns:
    A list with one prediction array per forecast day, in order.

  The normalised window is recomputed from the slid raw window on every step.
  Previously only the new value was normalised (with the old window's
  min/max) and appended to the old normalised values, so from the second
  step on the model input no longer matched the min/max that
  `predict_next_day` uses to de-normalise. The caller's arrays are not
  modified; new arrays are built at each step.
  """
  next_Month_predict = []
  for _ in range(n_days):
    next_Day_predict = predict_next_day(company_data, window_size, model, X_predict, X_predict_norm)
    next_Month_predict.append(next_Day_predict)

    # Slide the raw window: drop the oldest step, append the new prediction.
    X_predict = np.concatenate((X_predict[:, 1:window_size, :], next_Day_predict[:, np.newaxis]), axis=1)

    # Re-normalise every window with its own min/max so the model input stays
    # consistent with the raw window used for de-normalisation.
    low = X_predict.min(axis=(1, 2), keepdims=True)
    span = X_predict.max(axis=(1, 2), keepdims=True) - low
    span = np.where(span == 0, 1.0, span)  # flat window: avoid dividing by zero
    X_predict_norm = (X_predict - low) / span

  return next_Month_predict

## Main function

In [None]:
def oneday_pipeline(company, company_data, window_size, nextMonth_prediction):
    """Train a next-day model for one company, evaluate it and plot a 30-step forecast.

    Steps, in order: build sliding-window samples, split them into
    train/validation/test, min-max normalise each window, train the network
    from `build_model` for 10 epochs, print the test MSE (computed on the
    normalised values), de-normalise the test predictions, then roll the
    model forward 30 steps with `predict_next_month` and plot everything.

    Args:
        company: key under which the forecast is stored; also appended to the
            plot title.
        company_data: price table passed to `split_into_windows` and
            `generate_features_prediction`.
        window_size: number of time steps per sample.
        nextMonth_prediction: mapping updated in place with
            `nextMonth_prediction[company] = <array of shape (1, 30)>`.

    Returns:
        None. Side effects: prints the test MSE, shows a matplotlib figure and
        mutates `nextMonth_prediction`.

    NOTE(review): the inline normalisation below divides by
    `max_feature - min_feature`, which is zero for a window whose values are
    all equal — confirm such windows cannot occur or guard against them.
    """
    X_data = []
    y_data = []

    ### Split the dataset into time windows to get data samples.
    split_into_windows(company_data, window_size, X_data, y_data)

    ### Train-Val-Test Split
    X_train, y_train, X_val, y_val, X_test, y_test = train_val_test_split(X_data, y_data, window_size)

    ### Normalizing training and validation data
    X_train_norm, y_train_norm, X_val_norm, y_val_norm = normalize(X_train, y_train, X_val, y_val)
    
    ### Create and train the model
    model = build_model(window_size)

    # Compile and train the model with Mean Squared Error loss function
    model.compile(optimizer= tf.keras.optimizers.Adam(learning_rate=0.01), loss='mse', metrics=['mse'])
    # `history` is not read again in this function.
    history = model.fit(X_train_norm, y_train_norm, validation_data=(X_val_norm,y_val_norm), epochs=10, batch_size=512)

    ### Normalizing test data
    X_test_norm = MinMaxScaling(X_test)
    # Each test label is scaled with the min/max of its own feature window.
    y_test_norm = y_test.copy()
    for i in range(0, len(X_test)):
      min_feature = np.min(X_test[i])
      max_feature = np.max(X_test[i])
      y_test_norm[i] = (y_test[i] - min_feature) / (max_feature - min_feature)

    ### Get prediction on the test data and convert the result back to stock price (i.e., de-normalization)
    # Get prediction on the test data
    y_pred_norm = model.predict(X_test_norm)
    # The MSE is measured in normalised units, before de-normalisation.
    print("MSE on the test set: ", mean_squared_error(y_pred_norm, y_test_norm))

    # Convert the result back to stock price (i.e., de-normalization) for visualization purpose.
    # `y_pred_denorm` is the same array object as `y_pred_norm`, so the loop
    # below overwrites the normalised predictions in place.
    y_pred_denorm = y_pred_norm
    for i in range(0, len(y_pred_denorm)): # denorm_x = norm_x * (max(x) - min(x)) + min(x)
        min_feature = np.min(X_test[i])
        max_feature = np.max(X_test[i])
        y_pred_denorm[i] = y_pred_norm[i] * (max_feature - min_feature) + min_feature
    
    ### Prediction
    X_predict, X_predict_norm = generate_features_prediction(company_data, window_size)

    ## Next month: 30 recursive one-step forecasts, reshaped to (1, 30)
    next_month_stock_price = predict_next_month(company_data, window_size, model, X_predict, X_predict_norm)
    next_month_stock_price = np.reshape(next_month_stock_price, (1, 30, 1))
    next_month_stock_price = next_month_stock_price.tolist()[0]
    next_month_stock_price = np.reshape(next_month_stock_price, (1, 30))
    # x positions for the forecast: the 30 indices right after the test predictions.
    next_month_day = [i for i in range(y_pred_norm.shape[0], y_pred_norm.shape[0] + 30)]
    nextMonth_prediction[company] = next_month_stock_price


    # Visualize predicted stock price versus real stock price
    plt.figure(figsize=(16, 8), dpi=300)
    plt.plot(y_pred_denorm, label='Predicted price', color = 'blue')
    plt.plot(y_test, label='Real price', color = 'green')
    plt.plot(next_month_day,next_month_stock_price[0], label = 'Predicted price the following month', color="red")
    plt.title('Stock trend prediction in one day' + " " + company, fontsize=16)
    plt.xlabel('Time (days)', fontsize=14)
    plt.ylabel('Close price in $', fontsize=14)
    plt.grid() # Add grid
    plt.legend() # Add legend
    plt.show()

In [None]:
def change_tracking(company_list, nextWeej):
  """Tabulate the reference close price and forecast prices per company.

  Args:
    company_list: iterable of company names to include, one row each.
    nextWeej: accepted for backward compatibility; not used.

  Returns:
    DataFrame with one row per company and the columns `company_name`,
    `latest_price`, `one_day`, `one_week`, `one_month`, holding the close on
    2023-04-24 and forecast entries 0, 6 and -1 respectively.

  The previous version declared four columns but wrote five values per row,
  so it raised IndexError on the first company; the frame now has a column
  for each value and one row per entry of `company_list` instead of a fixed
  30 rows.

  NOTE(review): this function reads the module-level names `compdata`
  (expected to have `company_name`, `Date` and `Close` columns) and
  `nextMonth_prediction` (expected to map a company to a 2-D array whose
  first row is the forecast). Neither is created by this function — confirm
  both exist with that layout before calling it.
  """
  columns = ["company_name", "latest_price", "one_day", "one_week", "one_month"]
  rows = []
  for comp in company_list:
    on_reference_date = (compdata["company_name"] == comp) & (compdata["Date"] == '2023-04-24')
    closes = compdata.loc[on_reference_date, "Close"]
    # Positional access: the filtered Series keeps its original index labels.
    latest_price = closes.iloc[0] if len(closes) else np.nan
    forecast = nextMonth_prediction[comp][0]
    rows.append([comp, latest_price, forecast[0], forecast[6], forecast[-1]])
  return pd.DataFrame(rows, columns=columns)

In [None]:
## Visualize the change in price
def change_visualize(nextWeek_prediction):
  """Plot one line per company on a shared figure.

  Args:
    nextWeek_prediction: mapping of company name to a sequence whose first
      element (`value[0]`) is the series to plot.

  Returns:
    None. Shows a matplotlib figure.
  """
  plt.figure(figsize=(16,9))

  # One labelled line per company, in the mapping's iteration order.
  for name, series in nextWeek_prediction.items():
      plt.plot(series[0], label=name)

  plt.xlabel('Time')
  plt.ylabel('Price Change')
  plt.title('Change in Price over Time')
  plt.legend(loc = 'best')
  plt.show()

# 3. Train model

In [None]:
def oneweek_pipeline(company, company_data, window_size, nextWeek_prediction):
    """Train a week-ahead model for one company, evaluate it and plot 7 forecasts.

    This is a self-contained pipeline: windowing, splitting, normalisation,
    model construction, training (1 epoch), test evaluation and forecasting
    are all done inline.

    Args:
        company: key under which the forecast is stored.
        company_data: price table; positional column 3 is read throughout.
        window_size: number of time steps per sample.
        nextWeek_prediction: mapping updated in place with
            `nextWeek_prediction[company] = <array of shape (7,)>`.

    Returns:
        None. Side effects: prints the test MSE (computed on normalised
        values), shows a matplotlib figure and mutates `nextWeek_prediction`.

    NOTE(review): every normalisation below divides by
    `max_feature - min_feature`, which is zero for a window whose values are
    all equal — confirm such windows cannot occur or guard against them.
    """
    X_data = []
    y_data = []
    X_predict = []

    ### Split the dataset into time windows to get data samples.
    # The code reads positional column 3. NOTE(review): an earlier version of
    # this comment said "open price (index 2)"; in the table previewed earlier
    # in this notebook column 3 is Close — confirm which column is intended.
    for i in range(0, len(company_data) - window_size - 7):
      first = company_data.iloc[i, 3]
      data_feature = []
      data_label = []
      
      # Get a window_size time frame for data feature
      for j in range(window_size):
          data_feature.append(company_data.iloc[i + j, 3])
          
      # The label is the value at row i + window_size + 7, i.e. 8 rows after
      # the last row of the window (i + window_size - 1).
      data_label.append(company_data.iloc[i + window_size + 7, 3])
          
      # Append new data sample (feature and label) to X_data and y_data
      X_data.append(np.array(data_feature).reshape(window_size, 1))
      y_data.append(np.array(data_label))

    ### Split the data into training, validation and test set
    # Split data into train, val and test. Note that 'shuffle=False' due to time-series data.
    X_train, X_test, y_train, y_test = train_test_split(X_data, y_data, test_size=0.2, shuffle=False)
    X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.2, shuffle=False)

    # Convert from lists to Numpy arrays for reshaping purpose
    X_train = np.array(X_train)
    X_val = np.array(X_val)
    X_test = np.array(X_test)
    y_train = np.array(y_train)
    y_val = np.array(y_val)
    y_test = np.array(y_test)

    # Reshape the numpy array to fit the neural network input shape requirement
    X_train = X_train.reshape(X_train.shape[0], window_size, 1)
    X_val = X_val.reshape(X_val.shape[0], window_size, 1)
    X_test = X_test.reshape(X_test.shape[0], window_size, 1)

    ### Normalization
    # MinMax normalize the training data: x = (x - min(x)) / (max(x) - min(x)).
    # Each label is scaled with the min/max of its own feature window.
    X_train_norm = X_train.copy()
    y_train_norm = y_train.copy()
    for i in range(0, len(X_train)):
        min_feature = np.min(X_train[i])
        max_feature = np.max(X_train[i])
        X_train_norm[i] = (X_train[i] - min_feature) / (max_feature - min_feature)
        y_train_norm[i] = (y_train[i] - min_feature) / (max_feature - min_feature)

    # MinMax normalize the validation data: x = (x - min(x)) / (max(x) - min(x))
    X_val_norm = X_val.copy()
    y_val_norm = y_val.copy()
    for i in range(0, len(X_val)):
        min_feature = np.min(X_val[i])
        max_feature = np.max(X_val[i])
        X_val_norm[i] = (X_val[i] - min_feature) / (max_feature - min_feature)
        y_val_norm[i] = (y_val[i] - min_feature) / (max_feature - min_feature)

    ### Create and train the model

    # Build the model architecture
    model = tf.keras.Sequential()
    model.add(Conv1D(64, kernel_size=3, activation='relu', input_shape=(window_size, 1), padding='same'))
    model.add(MaxPooling1D(pool_size=2))
    model.add(Conv1D(128, kernel_size=3, activation='relu', padding='same'))
    model.add(MaxPooling1D(2))
    model.add(Conv1D(64, kernel_size=3, activation='relu', padding='same'))
    model.add(MaxPooling1D(2))
    model.add(Flatten())
    model.add(Dense(100, activation='relu'))
    model.add(Dense(1)) # By default: activation='linear', i.e., passing through f(x) = x

    # Compile and train the model with Mean Squared Error loss function.
    # Training runs for a single epoch here; `history` is not read again.
    model.compile(optimizer= tf.keras.optimizers.Adam(learning_rate=0.01), loss='mse', metrics=['mse'])
    history = model.fit(X_train_norm, y_train_norm, validation_data=(X_val_norm,y_val_norm), epochs=1, batch_size=512)


    # MinMax normalize the test data: norm_x = (x - min(x)) / (max(x) - min(x))
    X_test_norm = X_test.copy()
    y_test_norm = y_test.copy()
    for i in range(0, len(X_test)):
        min_feature = np.min(X_test[i])
        max_feature = np.max(X_test[i])
        X_test_norm[i] = (X_test[i] - min_feature) / (max_feature - min_feature)
        y_test_norm[i] = (y_test[i] - min_feature) / (max_feature - min_feature)

    ### Get prediction on the test data and convert the result back to stock price (i.e., de-normalization)
    # Get prediction on the test data
    y_pred_norm = model.predict(X_test_norm)
    # The MSE is measured in normalised units, before de-normalisation.
    print("MSE on the test set: ", mean_squared_error(y_pred_norm, y_test_norm))


    # Convert the result back to stock price (i.e., de-normalization) for visualization purpose.
    # `y_pred_denorm` is the same array object as `y_pred_norm`, so the loop
    # below overwrites the normalised predictions in place.
    y_pred_denorm = y_pred_norm
    for i in range(0, len(y_pred_denorm)): # denorm_x = norm_x * (max(x) - min(x)) + min(x)
        min_feature = np.min(X_test[i])
        max_feature = np.max(X_test[i])
        y_pred_denorm[i] = y_pred_norm[i] * (max_feature - min_feature) + min_feature
    
    ### Generate week prediction
    # Seven windows, starting at rows len - window_size - 8 .. len - window_size - 2.
    # NOTE(review): the last of these windows ends at row len - 3, so the two
    # most recent rows are in no prediction window — confirm this offset is intended.
    for i in range(len(company_data) - window_size - 7 - 1, len(company_data) - window_size - 1):
      data_feature = []
      
      # Get a window_size time frame for data feature
      for j in range(window_size):
          data_feature.append(company_data.iloc[i + j, 3])
          
      # Append the new feature window to X_predict (there is no label here)
      X_predict.append(np.array(data_feature).reshape(window_size, 1))
    
    # From here on X_predict is an array of shape (7, window_size, 1), no longer a list.
    X_predict = np.array(X_predict)
    X_predict = X_predict.reshape(X_predict.shape[0], window_size, 1)
    X_predict_norm = X_predict.copy()
    
    for i in range(0, len(X_predict)):
      min_feature = np.min(X_predict[i])
      max_feature = np.max(X_predict[i])
      X_predict_norm[i] = (X_predict[i] - min_feature) / (max_feature - min_feature)

    y_predict_norm = model.predict(X_predict_norm)

    # Same aliasing as above: the de-normalised values overwrite y_predict_norm.
    y_predict_denorm = y_predict_norm
    for i in range(0, len(y_predict_denorm)): # denorm_x = norm_x * (max(x) - min(x)) + min(x)
      min_feature = np.min(X_predict[i])
      max_feature = np.max(X_predict[i])
      y_predict_denorm[i] = y_predict_norm[i] * (max_feature - min_feature) + min_feature

    # Flatten the seven forecasts to shape (7,) before storing and plotting.
    y_predict_denorm = np.reshape(y_predict_denorm, (1, 7, 1))
    y_predict_denorm = y_predict_denorm.tolist()[0]
    y_predict_denorm = np.reshape(y_predict_denorm, (7,))
    # x positions for the forecast: the 7 indices right after the test predictions.
    next_week_day = [i for i in range(y_pred_norm.shape[0], y_pred_norm.shape[0] + 7)]
    nextWeek_prediction[company] = y_predict_denorm

    # Visualize predicted stock price versus real stock price.
    # NOTE(review): the axis labels say 'Open price' and 'weeks', while the data
    # plotted comes from column 3 with one point per row — confirm the labels.
    plt.figure(figsize=(16, 8), dpi=300)
    plt.plot(y_pred_denorm, label='Predicted price')
    plt.plot(next_week_day,y_predict_denorm, label='Predicted next price')
    plt.plot(y_test, label='Real price')
    plt.title('Stock trend prediction in one week', fontsize=16)
    plt.xlabel('Time (weeks)', fontsize=14)
    plt.ylabel('Open price in $', fontsize=14)
    plt.grid() # Add grid
    plt.legend() # Add legend
    plt.show()

In [None]:
# Dictionary that collects the one-week forecast of every company
vnindex30_nextWeek = {}

# Train one model per company; each call stores its forecast in the dictionary
for i, company in enumerate(vnindex30):
  window_size = 30
  company_data = compdata_list[i]
  oneweek_pipeline(company, company_data, window_size, vnindex30_nextWeek)

Output hidden; open in https://colab.research.google.com to view.

In [None]:
# Inspect the stored forecasts: one array per company key.
vnindex30_nextWeek

{'ACB': array([24337.52929688, 24332.42773438, 24332.28710938, 24331.19921875,
        24331.25976562, 24331.06835938, 24303.59765625]),
 'BCM': array([79383.3203125, 79242.1640625, 79170.6953125, 79022.296875 ,
        78784.4453125, 78642.0078125, 78502.21875  ]),
 'BID': array([43583.1171875 , 43577.26171875, 43568.8984375 , 43559.46875   ,
        43556.7421875 , 43550.3203125 , 43545.4609375 ]),
 'BVH': array([45488.4921875 , 45488.58203125, 45475.9609375 , 45468.47265625,
        45411.00390625, 44974.37890625, 44524.61328125]),
 'CTG': array([28041.05273438, 28030.90039062, 28026.86132812, 27707.73828125,
        27699.        , 27694.41796875, 27683.42382812]),
 'FPT': array([79393.203125 , 79457.578125 , 79596.859375 , 79693.0859375,
        80119.6484375, 80232.453125 , 80281.65625  ]),
 'GAS': array([92075.59375  , 92078.21875  , 92067.1015625, 92059.390625 ,
        92061.4296875, 92054.3125   , 92045.4921875]),
 'GVR': array([15403.00390625, 15433.49902344, 15639.63574219,

In [None]:
# Summary table: reference close price, two forecast prices and their relative
# change, one row per downloaded company.
#
# The table is built from a list of rows in a single DataFrame call. It
# previously started as a fixed 30-row empty frame that was filled cell by
# cell, which failed for more than 30 companies and relied on a label-based
# `[0]` lookup on a filtered Series.
#
# NOTE(review): "latest_price" is the close on 2023-04-24, while the download
# cell requests data up to 2023-05-24 — confirm the reference date.
# NOTE(review): "one_week" uses forecast entry 0 and "one_month" uses entry 4
# of the week forecast — confirm these are the intended entries.
# NOTE(review): the name `change_tracking` is also the name of a function
# defined earlier in this notebook; after this cell it refers to the table.
tracking_rows = []
for i in range(len(compdata_list)):
  # company name
  comp = vnindex30[i]
  # reference price (NaN when the company has no row on the reference date)
  compdata = compdata_list[i]
  reference_closes = compdata.loc[compdata["TradingDate"] == '2023-04-24', "Close"]
  latest_price = reference_closes.iloc[0] if len(reference_closes) else np.nan
  # forecast entries used for the two horizons
  one_week = vnindex30_nextWeek[comp][0]
  one_month = vnindex30_nextWeek[comp][4]
  tracking_rows.append([
      comp,
      latest_price,
      one_week,
      one_month,
      "{:.2%}".format((one_week - latest_price) / latest_price),
      "{:.2%}".format((one_month - latest_price) / latest_price),
  ])

change_tracking = pd.DataFrame(
    tracking_rows,
    columns=["company_name", "latest_price", "one_week", "one_month", "change in 1 week", "change in 1 month"],
)

In [None]:
# Display the per-company summary table.
change_tracking

Unnamed: 0,company_name,latest_price,one_week,one_month,change in 1 week,change in 1 month
0,ACB,24200.0,24255.990234,24253.84375,0.23%,0.22%
1,BCM,79100.0,78350.859375,78023.40625,-0.95%,-1.36%
2,BID,43700.0,43654.207031,43637.019531,-0.10%,-0.14%
3,BVH,46400.0,44948.5625,44900.957031,-3.13%,-3.23%
4,CTG,28900.0,28204.705078,27902.373047,-2.41%,-3.45%
5,FPT,79100.0,77816.742188,77919.039062,-1.62%,-1.49%
6,GAS,93100.0,93596.8125,93364.75,0.53%,0.28%
7,GVR,15300.0,15090.967773,15326.458008,-1.37%,0.17%
8,HDB,18750.0,18527.964844,18693.802734,-1.18%,-0.30%
9,HPG,20650.0,20704.039062,20725.111328,0.26%,0.36%
