In [85]:
# Mount Google Drive into the Colab runtime at /content/drive; the CSV paths
# read later in this notebook live under that mount point.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [86]:
import pandas as pd
import numpy as np
from statsmodels.tsa.stattools import adfuller
from statsmodels.tsa.api import VAR
import warnings
warnings.filterwarnings("ignore")
import plotly.graph_objects as go

In [99]:
# Read the train and test price CSVs from the mounted Drive folder and convert
# each frame's 'time' column to pandas datetimes.
train_data, test_data = (
    pd.read_csv(csv_path).assign(time=lambda frame: pd.to_datetime(frame['time']))
    for csv_path in (
        '/content/drive/MyDrive/Khóa Luận Tốt Nghiệp/data/ComVN30_train.csv',
        '/content/drive/MyDrive/Khóa Luận Tốt Nghiệp/data/ComVN30_test.csv',
    )
)

# Distinct ticker symbols present in the training set.
stock_codes = train_data['ticker'].unique()

In [100]:
# Print the training frame's schema: column names, non-null counts and dtypes.
train_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 36940 entries, 0 to 36939
Data columns (total 7 columns):
 #   Column  Non-Null Count  Dtype         
---  ------  --------------  -----         
 0   time    36940 non-null  datetime64[ns]
 1   open    36940 non-null  int64         
 2   high    36940 non-null  int64         
 3   low     36940 non-null  int64         
 4   close   36940 non-null  int64         
 5   volume  36940 non-null  int64         
 6   ticker  36940 non-null  object        
dtypes: datetime64[ns](1), int64(5), object(1)
memory usage: 2.0+ MB


In [101]:
# Show the last five rows of the training frame.
train_data.tail()

Unnamed: 0,time,open,high,low,close,volume,ticker
36935,2023-12-25,94900,96200,94800,96000,2017000,FPT
36936,2023-12-26,96500,98600,96400,97200,3359900,FPT
36937,2023-12-27,97400,97800,96900,96900,1355900,FPT
36938,2023-12-28,97000,97000,96300,96600,1196600,FPT
36939,2023-12-29,96600,97000,96100,96100,1866600,FPT


In [102]:
# Show the first five rows of the test frame.
test_data.head()

Unnamed: 0,time,open,high,low,close,volume,ticker
0,2024-01-02,33000,33200,32500,32500,18052300,SSI
1,2024-01-03,32350,32950,32299,32950,11408700,SSI
2,2024-01-04,33000,34200,33000,33600,58713300,SSI
3,2024-01-05,33800,34150,33600,34150,25137200,SSI
4,2024-01-08,34450,34500,34000,34100,20729000,SSI


In [103]:
from sklearn.metrics import mean_squared_error, mean_absolute_error
import numpy as np
import matplotlib.pyplot as plt

# Define the stationarity check (Augmented Dickey-Fuller test).
def check_stationarity(timeseries, alpha=0.05):
    """Run an Augmented Dickey-Fuller test on a series and print the verdict.

    Parameters
    ----------
    timeseries : array-like
        One-dimensional sequence of observations (e.g. one ticker's close
        prices). Missing values are dropped before the test is run.
    alpha : float, default 0.05
        Significance level. The series is reported as stationary when the
        test's p-value is less than or equal to ``alpha``.

    Returns
    -------
    None
        The ADF statistic, the p-value and the verdict are printed.
    """
    # Drop missing observations up front so they never reach adfuller.
    observations = pd.Series(timeseries).dropna()
    result = adfuller(observations)
    adf_statistic, p_value = result[0], result[1]

    print(f'ADF Statistic: {adf_statistic:f}')
    print(f'p-value: {p_value:f}')
    # A p-value above alpha means the unit-root null hypothesis is not rejected.
    if p_value > alpha:
        print('Series is not stationary')
    else:
        print('Series is stationary')

In [104]:
def _dedupe_and_pivot_close(prices):
    """Collapse duplicate (time, ticker) rows and build a gap-filled close matrix.

    Parameters
    ----------
    prices : pandas.DataFrame
        Long-format price frame with at least 'time', 'ticker' and 'close'
        columns.

    Returns
    -------
    tuple of (pandas.DataFrame, pandas.DataFrame)
        The de-duplicated long frame (duplicates averaged), and the wide
        close-price matrix indexed by 'time' with one column per ticker.
    """
    # Handle duplicate entries: average rows sharing the same (time, ticker).
    deduped = prices.groupby(['time', 'ticker']).mean().reset_index()

    # Prepare the data for the VAR model: one row per date, one column per ticker.
    close_wide = deduped.pivot(index='time', columns='ticker', values='close')

    # Handle NaN values. DataFrame.fillna(method=...) is deprecated in pandas,
    # so the dedicated ffill()/bfill() methods are used instead.
    # NOTE(review): bfill fills leading gaps with later observations, i.e. it
    # uses future values for the earliest dates - confirm this is acceptable.
    return deduped, close_wide.ffill().bfill()


train_data, train_data_pivot = _dedupe_and_pivot_close(train_data)
test_data, test_data_pivot = _dedupe_and_pivot_close(test_data)

In [105]:
# Display both wide close-price matrices (train, test) as a tuple.
train_data_pivot, test_data_pivot

(ticker          ACB      BCM      BID      BVH      CTG      FPT      GAS  \
 time                                                                        
 2019-01-02   9230.0  22290.0  22360.0  78110.0  12690.0  17800.0  60180.0   
 2019-01-03   8890.0  20430.0  21350.0  78290.0  12080.0  17630.0  58720.0   
 2019-01-04   8950.0  21080.0  21050.0  77760.0  12250.0  17760.0  58590.0   
 2019-01-07   9070.0  20980.0  21550.0  77760.0  12120.0  18060.0  59830.0   
 2019-01-08   9010.0  21080.0  21350.0  77410.0  11950.0  18140.0  61220.0   
 ...             ...      ...      ...      ...      ...      ...      ...   
 2023-12-25  23350.0  61900.0  43200.0  39300.0  26900.0  96000.0  76500.0   
 2023-12-26  23250.0  62300.0  43000.0  39500.0  26800.0  97200.0  76400.0   
 2023-12-27  23300.0  62600.0  43000.0  39550.0  26850.0  96900.0  76300.0   
 2023-12-28  23750.0  62700.0  42700.0  39600.0  27100.0  96600.0  76000.0   
 2023-12-29  23900.0  62900.0  43400.0  39500.0  27100.0  96100.

In [106]:
# Run the stationarity check on every ticker's close-price column.
for stock_code, close_series in train_data_pivot.items():
    print(f'Checking stationarity for {stock_code}')
    check_stationarity(close_series)

Checking stationarity for ACB
ADF Statistic: -0.994659
p-value: 0.755207
Series is not stationary
Checking stationarity for BCM
ADF Statistic: -1.504014
p-value: 0.531525
Series is not stationary
Checking stationarity for BID
ADF Statistic: -1.879486
p-value: 0.341772
Series is not stationary
Checking stationarity for BVH
ADF Statistic: -2.351075
p-value: 0.156024
Series is not stationary
Checking stationarity for CTG
ADF Statistic: -1.704095
p-value: 0.429004
Series is not stationary
Checking stationarity for FPT
ADF Statistic: 0.107561
p-value: 0.966574
Series is not stationary
Checking stationarity for GAS
ADF Statistic: -2.324913
p-value: 0.164059
Series is not stationary
Checking stationarity for GVR
ADF Statistic: -1.667198
p-value: 0.448087
Series is not stationary
Checking stationarity for HDB
ADF Statistic: -0.910435
p-value: 0.784517
Series is not stationary
Checking stationarity for HPG
ADF Statistic: -1.347381
p-value: 0.607227
Series is not stationary
Checking stationarity

In [107]:
# Build and train the model: a VAR over the wide close-price matrix.
# NOTE(review): the ADF output recorded in this notebook reports the tested
# price series as non-stationary, while the model here is fit on undifferenced
# levels - consider differencing before fitting; confirm with the author.
model = VAR(train_data_pivot)
# Fit with maxlags=15 and ic='aic', i.e. AIC-based lag selection up to 15 lags.
fitted_model = model.fit(maxlags=15, ic='aic')
# Last expression of the cell, so the regression summary is displayed.
fitted_model.summary()

  Summary of Regression Results   
Model:                         VAR
Method:                        OLS
Date:           Sat, 03, Aug, 2024
Time:                     14:48:03
--------------------------------------------------------------------
No. of Equations:         30.0000    BIC:                    380.027
Nobs:                     1249.00    HQIC:                   377.643
Log likelihood:          -287179.    FPE:               2.42614e+163
AIC:                      376.207    Det(Omega_mle):    1.16276e+163
--------------------------------------------------------------------
Results for equation ACB
            coefficient       std. error           t-stat            prob
-------------------------------------------------------------------------
const       -160.173258       276.305708           -0.580           0.562
L1.ACB         0.858308         0.020531           41.805           0.000
L1.BCM         0.003467         0.002011            1.724           0.085
L1.BID        -0

In [108]:
# Report the lag order (k_ar) of the fitted VAR model, label and value on separate lines.
print("Độ trể được chọn cho mô hình:", fitted_model.k_ar, sep="\n")

Độ trể được chọn cho mô hình:
1


In [109]:
# Forecast one step per row of the test window, seeding the model with the
# last k_ar rows of the training matrix.
forecast_steps = len(test_data_pivot)
lag_order = fitted_model.k_ar
forecast = fitted_model.forecast(train_data_pivot.values[-lag_order:], steps=forecast_steps)

# The forecast array's columns follow the training matrix the model was fit
# on, so label them from train_data_pivot. Labelling from the test pivot
# would mislabel columns (or fail on a shape mismatch) whenever the test set
# has a different ticker set.
forecast_df = pd.DataFrame(forecast, index=test_data_pivot.index, columns=train_data_pivot.columns)
forecast_df

ticker,ACB,BCM,BID,BVH,CTG,FPT,GAS,GVR,HDB,HPG,...,TCB,TPB,VCB,VHM,VIB,VIC,VJC,VNM,VPB,VRE
time,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2024-01-02,23954.935020,62988.373148,43286.692953,39589.244083,27242.337809,96003.853057,75567.298043,21315.029615,20251.391857,27880.533154,...,31754.487748,17436.303853,80639.453104,43341.484736,18530.284193,44539.306917,108171.134062,66537.039251,19207.073015,23142.222048
2024-01-03,24002.531186,63069.085771,43178.843769,39671.575509,27363.556430,95922.692540,75656.067728,21443.862783,20216.662008,27803.872940,...,31729.507021,17476.804901,80965.262089,43476.831032,18557.636408,44502.221577,108337.679042,66379.877893,19220.569010,23019.251293
2024-01-04,24043.213159,63144.793635,43075.221492,39745.812015,27466.581175,95852.827837,75757.092040,21582.035091,20192.705147,27722.094357,...,31721.588871,17520.409119,81274.102208,43607.438756,18581.806160,44484.448894,108500.036722,66257.578247,19238.266278,22925.254743
2024-01-05,24077.430967,63217.680140,42974.923214,39811.208953,27553.890506,95791.379375,75863.257273,21725.951551,20176.992174,27636.933775,...,31727.869942,17566.262466,81563.666287,43734.295588,18602.729182,44482.231064,108658.449854,66161.409173,19258.476551,22855.322579
2024-01-08,24105.646831,63289.529217,42877.304715,39867.369350,27627.592774,95736.117989,75969.155300,21872.731001,20167.471312,27549.841910,...,31745.983854,17613.703073,81832.468382,43858.070377,18620.480663,44492.293407,108813.046529,66084.436948,19279.937984,22805.322493
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2024-04-22,23905.822246,72517.218082,40369.586714,38658.255142,27996.187041,94598.346807,76246.096241,23686.670837,19684.346351,27366.438427,...,35384.309276,19763.501169,81790.203966,46759.138398,20211.377288,43367.954620,110942.004727,62864.642112,20683.809472,23554.594355
2024-04-23,23916.552627,72619.113831,40358.458921,38664.468828,28010.052877,94610.914353,76303.296036,23673.484590,19681.766624,27398.078753,...,35409.928906,19767.032484,81777.694178,46769.167855,20241.113329,43359.210839,110895.690048,62852.508226,20710.450822,23554.464407
2024-04-24,23927.529348,72719.055615,40347.202637,38671.119904,28024.124374,94623.783367,76361.367016,23660.769892,19679.418906,27429.399462,...,35435.041449,19770.022860,81765.849985,46779.502458,20270.372869,43351.692008,110848.624755,62841.344033,20736.915240,23554.268481
2024-04-25,23938.738144,72817.079908,40335.820433,38678.183275,28038.389634,94636.934647,76420.226717,23648.538321,19677.297434,27460.391951,...,35459.674813,19772.498980,81754.647370,46790.152800,20299.154126,43345.370875,110800.854448,62831.126672,20763.194751,23554.017669


In [112]:
# Evaluation metrics: RMSE and MAE of the forecast against the actual test
# closes, computed per ticker.
evaluation_metrics = {
    stock_code: {
        'RMSE': np.sqrt(mean_squared_error(test_data_pivot[stock_code], forecast_df[stock_code])),
        'MAE': mean_absolute_error(test_data_pivot[stock_code], forecast_df[stock_code]),
    }
    for stock_code in test_data_pivot.columns
}

# One row per ticker, one column per metric.
evaluation_metrics_df = pd.DataFrame(evaluation_metrics).transpose()
evaluation_metrics_df

Unnamed: 0,RMSE,MAE
ACB,3184.369879,2985.773427
BCM,7852.864157,5501.548939
BID,9539.564124,8921.433211
BVH,2930.46883,2454.774066
CTG,6009.332242,5648.771219
FPT,15021.354845,12348.710875
GAS,3117.910148,2424.099042
GVR,5760.453663,4769.558263
HDB,2956.761635,2646.874261
HPG,2653.836096,2249.228397


In [111]:
# Draw one figure per ticker: training history, actual test closes and the
# forecast (dashed).
for stock_code in test_data_pivot.columns:
    fig, ax = plt.subplots(figsize=(14, 7))
    ax.plot(train_data_pivot[stock_code], label='Train Close')
    ax.plot(test_data_pivot[stock_code], label='Actual Close')
    ax.plot(forecast_df[stock_code], label='Forecasted Close', linestyle='--')
    ax.set_title(f'Actual vs Forecasted Close Prices for {stock_code}')
    ax.set_xlabel('Date')
    ax.set_ylabel('Close Price')
    ax.legend()
    plt.show()

Output hidden; open in https://colab.research.google.com to view.