In [97]:
import pandas as pd
import numpy as np

# Locations of the unfiltered price/volume matrices. Each file is read below as
# tab-separated text with a 'time' column that becomes the index.
__closepath__ = 'data/vn_stock/price_volume/close_matrix_20120101-20240101.txt'
__openpath__ = 'data/vn_stock/price_volume/open_matrix_20120101-20240101.txt'
__highpath__ = 'data/vn_stock/price_volume/high_matrix_20120101-20240101.txt'
__lowpath__ = 'data/vn_stock/price_volume/low_matrix_20120101-20240101.txt'
# NOTE(review): unlike the four names above, this one has no trailing double underscore.
__volumepath = 'data/vn_stock/price_volume/volume_matrix_20120101-20240101.txt'

def load_and_process_data(file_path):
    """Read a tab-separated matrix file into a float DataFrame indexed by 'time'.

    Parameters
    ----------
    file_path : path or buffer accepted by ``pd.read_csv``
        Tab-delimited file containing a 'time' column.

    Returns
    -------
    pandas.DataFrame
        All non-'time' columns cast to float, with 'time' as the index
        (index values are left exactly as parsed by ``read_csv``).

    Raises
    ------
    KeyError
        If the file has no 'time' column.
    ValueError
        If a data column cannot be cast to float.
    """
    raw = pd.read_csv(file_path, sep='\t')
    return raw.set_index('time').astype(float)

# Load the five unfiltered matrices in one pass (same order as the path list).
# NOTE: `open` shadows the Python builtin for the rest of the notebook.
close, open, high, low, volume = (
    load_and_process_data(matrix_path)
    for matrix_path in (__closepath__, __openpath__, __highpath__, __lowpath__, __volumepath)
)

# VN30

In [72]:
# Rank tickers by total traded volume per calendar year and keep the 30 largest.
# The helper 'Year' column is added to `volume` itself; the volume-masking cell
# that follows relies on it being present.
volume.index = pd.to_datetime(volume.index)
volume['Year'] = volume.index.year

yearly_sum = volume.groupby('Year').sum()
rank_labels = [f'top_{rank}' for rank in range(1, 31)]

# One list of 30 ticker symbols per year, ordered from largest yearly volume down.
top_30_tickers = yearly_sum.apply(lambda totals: totals.nlargest(30).index.tolist(), axis=1)
top_30_tickers_df = pd.DataFrame(
    top_30_tickers.tolist(),
    index=top_30_tickers.index,
    columns=rank_labels,
)
top_30_tickers_df

Unnamed: 0_level_0,top_1,top_2,top_3,top_4,top_5,top_6,top_7,top_8,top_9,top_10,...,top_21,top_22,top_23,top_24,top_25,top_26,top_27,top_28,top_29,top_30
Year,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2012,PVX,VND,SHB,SCR,ITA,MBB,SSI,SAM,EIB,STB,...,WSS,SBS,SHN,APS,DCS,BVS,HAG,NVT,PXL,ASM
2013,SHB,PVX,ITA,SCR,FLC,HQC,PVT,VCG,VND,REE,...,HAR,CTG,VNE,TLH,PVL,VSH,DLG,DCS,PET,MCG
2014,FLC,PVX,ITA,KLF,SHB,SCR,SSI,HQC,HAG,OGC,...,HAR,SHN,VND,MBB,LCG,HUT,REE,CII,ASM,PET
2015,FLC,KLF,OGC,CII,FIT,DLG,HAI,SCR,SSI,ITA,...,SBT,JVC,PVS,DXG,ASM,HAR,TIG,HPG,VIX,TTF
2016,FLC,ITA,HAG,HQC,SCR,HPG,VHG,KBC,DLG,HHS,...,HSG,VCG,PVD,CII,HKB,BID,PVX,ASM,HUT,TLH
2017,FLC,SHB,HQC,ITA,KLF,HAG,OGC,HPG,STB,DXG,...,ASM,PVS,HSG,PVX,HHS,VCG,ACB,SBT,DCM,CTG
2018,SHB,STB,FLC,HAG,MBB,CTG,PVS,HPG,SSI,ACB,...,BID,PVD,ITA,HNG,HDB,LPB,VCB,VRE,DIG,HUT
2019,ROS,FLC,HPG,MBB,CTG,STB,SHB,HSG,ITA,DLG,...,AMD,TCB,HBC,POW,BSR,HAI,ACB,SSI,HDB,DXG
2020,STB,HPG,FLC,ROS,ITA,HSG,HQC,MBB,TCB,CTG,...,VPB,TCH,GEX,DXG,LDG,HUT,BSR,AMD,HAI,VRE
2021,STB,HPG,FLC,SHB,ROS,HQC,MBB,TCB,POW,ITA,...,KBC,SCR,DXG,GEX,PVD,ACB,TCH,LDG,KLF,FIT


## volume

In [73]:
# Build the top-30 volume matrix: for every ranked year, zero the volume of all
# tickers that are not in that year's top-30 list.
#
# The helper 'Year' column is removed once, up front. Dropping it inside the
# loop only worked from the second iteration on because `.loc` assignment
# re-created the missing column, and it also put 'Year' in the zeroing list.
volume_top30 = volume.drop(columns=['Year'], errors='ignore')
row_years = volume_top30.index.year

for year in top_30_tickers_df.index:
    members = set(top_30_tickers_df.loc[year].dropna())
    outsiders = [ticker for ticker in volume_top30.columns if ticker not in members]
    volume_top30.loc[row_years == year, outsiders] = 0

volume_top30.to_csv('data/vn_stock/price_volume/volume_matrix_top30_20120101-20240101.txt', sep='\t')

## close

In [74]:
# Restrict the close matrix to each year's top-30 universe: for every ranked
# year, prices of tickers outside that year's list are overwritten with 0.
close_top30 = close.copy()
close_top30.index = pd.to_datetime(close_top30.index)
row_years = close_top30.index.year

for year in top_30_tickers_df.index.unique():
    members = set(top_30_tickers_df.loc[year].dropna())
    outsiders = [col for col in close.columns if col not in members]
    close_top30.loc[row_years == year, outsiders] = 0

close_top30.to_csv('data/vn_stock/price_volume/close_matrix_top30_20120101-20240101.txt', sep='\t')


## open

In [75]:
# Restrict the open matrix to each year's top-30 universe: for every ranked
# year, prices of tickers outside that year's list are overwritten with 0.
# (`open` here is the DataFrame loaded earlier, not the Python builtin.)
open_top30 = open.copy()
open_top30.index = pd.to_datetime(open_top30.index)
row_years = open_top30.index.year

for year in top_30_tickers_df.index.unique():
    members = set(top_30_tickers_df.loc[year].dropna())
    outsiders = [col for col in open.columns if col not in members]
    open_top30.loc[row_years == year, outsiders] = 0

open_top30.to_csv('data/vn_stock/price_volume/open_matrix_top30_20120101-20240101.txt', sep='\t')

## high

In [76]:
# Restrict the high matrix to each year's top-30 universe: for every ranked
# year, prices of tickers outside that year's list are overwritten with 0.
high_top30 = high.copy()
high_top30.index = pd.to_datetime(high_top30.index)
row_years = high_top30.index.year

for year in top_30_tickers_df.index.unique():
    members = set(top_30_tickers_df.loc[year].dropna())
    outsiders = [col for col in high.columns if col not in members]
    high_top30.loc[row_years == year, outsiders] = 0

high_top30.to_csv('data/vn_stock/price_volume/high_matrix_top30_20120101-20240101.txt', sep='\t')

## low

In [77]:
# Restrict the low matrix to each year's top-30 universe: for every ranked
# year, prices of tickers outside that year's list are overwritten with 0.
low_top30 = low.copy()
low_top30.index = pd.to_datetime(low_top30.index)
row_years = low_top30.index.year

for year in top_30_tickers_df.index.unique():
    members = set(top_30_tickers_df.loc[year].dropna())
    outsiders = [col for col in low.columns if col not in members]
    low_top30.loc[row_years == year, outsiders] = 0

low_top30.to_csv('data/vn_stock/price_volume/low_matrix_top30_20120101-20240101.txt', sep='\t')

## load price vol

In [78]:
import pandas as pd
import numpy as np

# Locations of the top-30 matrices. These are the same file names that the
# VN30 masking cells above pass to `to_csv`.
__closepath__ = 'data/vn_stock/price_volume/close_matrix_top30_20120101-20240101.txt'
__openpath__ = 'data/vn_stock/price_volume/open_matrix_top30_20120101-20240101.txt'
__highpath__ = 'data/vn_stock/price_volume/high_matrix_top30_20120101-20240101.txt'
__lowpath__ = 'data/vn_stock/price_volume/low_matrix_top30_20120101-20240101.txt'
# NOTE(review): unlike the four names above, this one has no trailing double underscore.
__volumepath = 'data/vn_stock/price_volume/volume_matrix_top30_20120101-20240101.txt'

def load_and_process_data(file_path):
    """Read a tab-separated matrix file into a float DataFrame indexed by 'time'.

    Parameters
    ----------
    file_path : path or buffer accepted by ``pd.read_csv``
        Tab-delimited file containing a 'time' column.

    Returns
    -------
    pandas.DataFrame
        All non-'time' columns cast to float, with 'time' as the index
        (index values are left exactly as parsed by ``read_csv``).

    Raises
    ------
    KeyError
        If the file has no 'time' column.
    ValueError
        If a data column cannot be cast to float.
    """
    raw = pd.read_csv(file_path, sep='\t')
    return raw.set_index('time').astype(float)

# Replace the in-memory matrices with the versions read from the paths set just
# above (same order as the path list).
# NOTE: `open` shadows the Python builtin for the rest of the notebook.
close, open, high, low, volume = (
    load_and_process_data(matrix_path)
    for matrix_path in (__closepath__, __openpath__, __highpath__, __lowpath__, __volumepath)
)

## vwap

In [79]:
# Cumulative volume-weighted average of the mean of high/low/close, accumulated
# from the first row of the matrix onward.
price = (high + low + close) / 3
cum_turnover = (price * volume).cumsum()
cum_volume = volume.cumsum()

# Round to 2 decimals; NaN results (e.g. 0/0 while no volume has accumulated) become 0.
vwap = (cum_turnover / cum_volume).round(2).fillna(0)
vwap.to_csv('data/vn_stock/price_volume/vwap_matrix_top30_20120101-20240101.txt', sep='\t')

## adv20

In [80]:
# 20-row rolling mean of volume. NaN results (e.g. the first 19 rows, where the
# window is incomplete) are replaced with 0 before writing.
adv20 = volume.rolling(window=20).mean().fillna(0)
adv20.to_csv('data/vn_stock/price_volume/adv20_matrix_top30_20120101-20240101.txt', sep='\t')

## adv60

In [81]:
# 60-row rolling mean of volume. NaN results (e.g. the first 59 rows, where the
# window is incomplete) are replaced with 0 before writing.
adv60 = volume.rolling(window=60).mean().fillna(0)
adv60.to_csv('data/vn_stock/price_volume/adv60_matrix_top30_20120101-20240101.txt', sep='\t')

## adv120

In [82]:
# 120-row rolling mean of volume. NaN results (e.g. the first 119 rows, where
# the window is incomplete) are replaced with 0 before writing.
adv120 = volume.rolling(window=120).mean().fillna(0)
adv120.to_csv('data/vn_stock/price_volume/adv120_matrix_top30_20120101-20240101.txt', sep='\t')

## daily_return

In [83]:
# Row-over-row percentage change of the mean of high/low/close.
price = (high + low + close) / 3

# The top-30 matrices hold 0 for tickers outside the yearly universe, so a
# ticker whose price goes from 0 to a positive value produces an infinite
# return. `fillna` does not touch inf, so map +/-inf to NaN first; every
# undefined return (NaN or inf) is then written as 0.
daily_return = (
    price.pct_change()
    .replace([np.inf, -np.inf], np.nan)
    .fillna(0)
)
daily_return.to_csv('data/vn_stock/price_volume/daily_return_matrix_top30_20120101-20240101.txt', sep='\t')

# VN100

In [85]:
# Rank on the unfiltered volume matrix. When the notebook runs top to bottom,
# the `volume` in memory at this point was reloaded from the top-30 file by the
# VN30 "load price vol" cell, so ranking it would see zeros for every ticker
# outside the top 30. Reload the raw matrix so the top-100 universe (and the
# volume-masking cell that follows) is based on real volumes.
volume = load_and_process_data('data/vn_stock/price_volume/volume_matrix_20120101-20240101.txt')
volume.index = pd.to_datetime(volume.index)

# The helper 'Year' column is added to `volume` itself; the volume-masking cell
# that follows relies on it being present.
volume['Year'] = volume.index.year
yearly_sum = volume.groupby('Year').sum()

# One list of 100 ticker symbols per year, ordered from largest yearly volume down.
top100_tickers = yearly_sum.apply(lambda x: x.nlargest(100).index.tolist(), axis=1)
top100_tickers_df = pd.DataFrame(top100_tickers.tolist(), index=top100_tickers.index, columns=[f'top_{i+1}' for i in range(100)])
top100_tickers_df

Unnamed: 0_level_0,top_1,top_2,top_3,top_4,top_5,top_6,top_7,top_8,top_9,top_10,...,top_91,top_92,top_93,top_94,top_95,top_96,top_97,top_98,top_99,top_100
Year,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2012,PVX,VND,SHB,SCR,ITA,MBB,SSI,SAM,EIB,STB,...,DBC,TDH,CTS,BCC,HBC,CMI,DIC,PVD,PXS,PSI
2013,SHB,PVX,ITA,SCR,FLC,HQC,PVT,VCG,VND,REE,...,HBC,VOS,SII,HCM,IDJ,SJS,VNM,KSD,SDD,TNT
2014,FLC,PVX,ITA,KLF,SHB,SCR,SSI,HQC,HAG,OGC,...,HBC,PVG,GAS,GMD,CSM,SBS,QCG,PFL,NVT,BVH
2015,FLC,KLF,OGC,CII,FIT,DLG,HAI,SCR,SSI,ITA,...,ACM,SHA,BII,PPC,TLH,MHC,IVS,VIP,MSN,PTL
2016,FLC,ITA,HAG,HQC,SCR,HPG,VHG,KBC,DLG,HHS,...,SAM,AAA,MHC,SGO,PVC,FID,KSH,MSN,PPC,SHA
2017,FLC,SHB,HQC,ITA,KLF,HAG,OGC,HPG,STB,DXG,...,TNI,VJC,GEX,HT1,PVT,LPB,TOP,HCD,NKG,IJC
2018,SHB,STB,FLC,HAG,MBB,CTG,PVS,HPG,SSI,ACB,...,DGW,JVC,DVN,HCD,VRC,NSH,SBS,SJF,SHN,TTB
2019,ROS,FLC,HPG,MBB,CTG,STB,SHB,HSG,ITA,DLG,...,EVG,NVL,PVT,VIB,BCG,VND,TTB,TIG,GEG,TPB
2020,STB,HPG,FLC,ROS,ITA,HSG,HQC,MBB,TCB,CTG,...,DST,TTB,FRT,VIX,HDC,DAH,APG,PET,PNJ,PLX
2021,STB,HPG,FLC,SHB,ROS,HQC,MBB,TCB,POW,ITA,...,ACM,TLH,APH,MBS,BII,VIC,OIL,DBC,TDH,TNI


## volume

In [86]:
# Build the top-100 volume matrix: for every ranked year, zero the volume of
# all tickers that are not in that year's top-100 list.
#
# The helper 'Year' column is removed once, up front. Dropping it inside the
# loop only worked from the second iteration on because `.loc` assignment
# re-created the missing column, and it also put 'Year' in the zeroing list.
volume_top100 = volume.drop(columns=['Year'], errors='ignore')
row_years = volume_top100.index.year

for year in top100_tickers_df.index:
    members = set(top100_tickers_df.loc[year].dropna())
    outsiders = [ticker for ticker in volume_top100.columns if ticker not in members]
    volume_top100.loc[row_years == year, outsiders] = 0

volume_top100.to_csv('data/vn_stock/price_volume/volume_matrix_top100_20120101-20240101.txt', sep='\t')

## close

In [87]:
# Start from the unfiltered close matrix. When the notebook runs top to bottom,
# the `close` in memory was reloaded from the top-30 file by the VN30
# "load price vol" cell; copying it would leave only top-30 tickers non-zero
# in the top-100 output.
close_top100 = load_and_process_data('data/vn_stock/price_volume/close_matrix_20120101-20240101.txt')
close_top100.index = pd.to_datetime(close_top100.index)
row_years = close_top100.index.year

# For every ranked year, zero the prices of tickers outside that year's top-100 list.
for year in top100_tickers_df.index.unique():
    members = set(top100_tickers_df.loc[year].dropna())
    outsiders = [col for col in close_top100.columns if col not in members]
    close_top100.loc[row_years == year, outsiders] = 0

close_top100.to_csv('data/vn_stock/price_volume/close_matrix_top100_20120101-20240101.txt', sep='\t')

## open

In [88]:
# Start from the unfiltered open matrix. When the notebook runs top to bottom,
# the `open` frame in memory was reloaded from the top-30 file by the VN30
# "load price vol" cell; copying it would leave only top-30 tickers non-zero
# in the top-100 output.
open_top100 = load_and_process_data('data/vn_stock/price_volume/open_matrix_20120101-20240101.txt')
open_top100.index = pd.to_datetime(open_top100.index)
row_years = open_top100.index.year

# For every ranked year, zero the prices of tickers outside that year's top-100 list.
for year in top100_tickers_df.index.unique():
    members = set(top100_tickers_df.loc[year].dropna())
    outsiders = [col for col in open_top100.columns if col not in members]
    open_top100.loc[row_years == year, outsiders] = 0

open_top100.to_csv('data/vn_stock/price_volume/open_matrix_top100_20120101-20240101.txt', sep='\t')

## high

In [89]:
# Start from the unfiltered high matrix. When the notebook runs top to bottom,
# the `high` in memory was reloaded from the top-30 file by the VN30
# "load price vol" cell; copying it would leave only top-30 tickers non-zero
# in the top-100 output.
high_top100 = load_and_process_data('data/vn_stock/price_volume/high_matrix_20120101-20240101.txt')
high_top100.index = pd.to_datetime(high_top100.index)
row_years = high_top100.index.year

# For every ranked year, zero the prices of tickers outside that year's top-100 list.
for year in top100_tickers_df.index.unique():
    members = set(top100_tickers_df.loc[year].dropna())
    outsiders = [col for col in high_top100.columns if col not in members]
    high_top100.loc[row_years == year, outsiders] = 0

high_top100.to_csv('data/vn_stock/price_volume/high_matrix_top100_20120101-20240101.txt', sep='\t')

## low

In [90]:
# Start from the unfiltered low matrix. When the notebook runs top to bottom,
# the `low` in memory was reloaded from the top-30 file by the VN30
# "load price vol" cell; copying it would leave only top-30 tickers non-zero
# in the top-100 output.
low_top100 = load_and_process_data('data/vn_stock/price_volume/low_matrix_20120101-20240101.txt')
low_top100.index = pd.to_datetime(low_top100.index)
row_years = low_top100.index.year

# For every ranked year, zero the prices of tickers outside that year's top-100 list.
for year in top100_tickers_df.index.unique():
    members = set(top100_tickers_df.loc[year].dropna())
    outsiders = [col for col in low_top100.columns if col not in members]
    low_top100.loc[row_years == year, outsiders] = 0

low_top100.to_csv('data/vn_stock/price_volume/low_matrix_top100_20120101-20240101.txt', sep='\t')

## load price vol

In [91]:
import pandas as pd
import numpy as np

# Locations of the top-100 matrices. These are the same file names that the
# VN100 masking cells above pass to `to_csv`.
__closepath__ = 'data/vn_stock/price_volume/close_matrix_top100_20120101-20240101.txt'
__openpath__ = 'data/vn_stock/price_volume/open_matrix_top100_20120101-20240101.txt'
__highpath__ = 'data/vn_stock/price_volume/high_matrix_top100_20120101-20240101.txt'
__lowpath__ = 'data/vn_stock/price_volume/low_matrix_top100_20120101-20240101.txt'
# NOTE(review): unlike the four names above, this one has no trailing double underscore.
__volumepath = 'data/vn_stock/price_volume/volume_matrix_top100_20120101-20240101.txt'

def load_and_process_data(file_path):
    """Read a tab-separated matrix file into a float DataFrame indexed by 'time'.

    Parameters
    ----------
    file_path : path or buffer accepted by ``pd.read_csv``
        Tab-delimited file containing a 'time' column.

    Returns
    -------
    pandas.DataFrame
        All non-'time' columns cast to float, with 'time' as the index
        (index values are left exactly as parsed by ``read_csv``).

    Raises
    ------
    KeyError
        If the file has no 'time' column.
    ValueError
        If a data column cannot be cast to float.
    """
    raw = pd.read_csv(file_path, sep='\t')
    return raw.set_index('time').astype(float)

# Replace the in-memory matrices with the versions read from the paths set just
# above (same order as the path list).
# NOTE: `open` shadows the Python builtin for the rest of the notebook.
close, open, high, low, volume = (
    load_and_process_data(matrix_path)
    for matrix_path in (__closepath__, __openpath__, __highpath__, __lowpath__, __volumepath)
)

## vwap

In [92]:
# Cumulative volume-weighted average of the mean of high/low/close, accumulated
# from the first row of the matrix onward.
price = (high + low + close) / 3
cum_turnover = (price * volume).cumsum()
cum_volume = volume.cumsum()

# Round to 2 decimals; NaN results (e.g. 0/0 while no volume has accumulated) become 0.
vwap = (cum_turnover / cum_volume).round(2).fillna(0)
vwap.to_csv('data/vn_stock/price_volume/vwap_matrix_top100_20120101-20240101.txt', sep='\t')

## adv20

In [93]:
# 20-row rolling mean of volume. NaN results (e.g. the first 19 rows, where the
# window is incomplete) are replaced with 0 before writing.
adv20 = volume.rolling(window=20).mean().fillna(0)
adv20.to_csv('data/vn_stock/price_volume/adv20_matrix_top100_20120101-20240101.txt', sep='\t')

## adv60

In [94]:
# 60-row rolling mean of volume. NaN results (e.g. the first 59 rows, where the
# window is incomplete) are replaced with 0 before writing.
adv60 = volume.rolling(window=60).mean().fillna(0)
adv60.to_csv('data/vn_stock/price_volume/adv60_matrix_top100_20120101-20240101.txt', sep='\t')

## adv120

In [95]:
# 120-row rolling mean of volume. NaN results (e.g. the first 119 rows, where
# the window is incomplete) are replaced with 0 before writing.
adv120 = volume.rolling(window=120).mean().fillna(0)
adv120.to_csv('data/vn_stock/price_volume/adv120_matrix_top100_20120101-20240101.txt', sep='\t')

## daily_return

In [96]:
# Row-over-row percentage change of the mean of high/low/close.
price = (high + low + close) / 3

# The top-100 matrices hold 0 for tickers outside the yearly universe, so a
# ticker whose price goes from 0 to a positive value produces an infinite
# return. `fillna` does not touch inf, so map +/-inf to NaN first; every
# undefined return (NaN or inf) is then written as 0.
daily_return = (
    price.pct_change()
    .replace([np.inf, -np.inf], np.nan)
    .fillna(0)
)
daily_return.to_csv('data/vn_stock/price_volume/daily_return_matrix_top100_20120101-20240101.txt', sep='\t')

# VNALL

## vwap

In [98]:
# The VNALL outputs are written to the unfiltered file names, so they must be
# computed from the unfiltered matrices. When the notebook runs top to bottom,
# the frames in memory at this point were reloaded from the top-100 files by
# the VN100 "load price vol" cell, so reload the raw data here. The following
# VNALL cells reuse these frames.
# NOTE: `open` shadows the Python builtin, as in the other load cells.
close = load_and_process_data('data/vn_stock/price_volume/close_matrix_20120101-20240101.txt')
open = load_and_process_data('data/vn_stock/price_volume/open_matrix_20120101-20240101.txt')
high = load_and_process_data('data/vn_stock/price_volume/high_matrix_20120101-20240101.txt')
low = load_and_process_data('data/vn_stock/price_volume/low_matrix_20120101-20240101.txt')
volume = load_and_process_data('data/vn_stock/price_volume/volume_matrix_20120101-20240101.txt')

# Cumulative volume-weighted average of the mean of high/low/close, accumulated
# from the first row of the matrix onward.
price = (high + low + close) / 3
vwap = (price * volume).cumsum() / volume.cumsum()

# Round to 2 decimals; NaN results (e.g. 0/0 while no volume has accumulated) become 0.
vwap = vwap.round(2).fillna(0)
vwap.to_csv('data/vn_stock/price_volume/vwap_matrix_20120101-20240101.txt', sep='\t')

## adv20

In [99]:
# 20-row rolling mean of volume. NaN results (e.g. the first 19 rows, where the
# window is incomplete) are replaced with 0 before writing.
adv20 = volume.rolling(window=20).mean().fillna(0)
adv20.to_csv('data/vn_stock/price_volume/adv20_matrix_20120101-20240101.txt', sep='\t')

## adv60

In [100]:
# 60-row rolling mean of volume. NaN results (e.g. the first 59 rows, where the
# window is incomplete) are replaced with 0 before writing.
adv60 = volume.rolling(window=60).mean().fillna(0)
adv60.to_csv('data/vn_stock/price_volume/adv60_matrix_20120101-20240101.txt', sep='\t')

## adv120

In [101]:
# 120-row rolling mean of volume. NaN results (e.g. the first 119 rows, where
# the window is incomplete) are replaced with 0 before writing.
adv120 = volume.rolling(window=120).mean().fillna(0)
adv120.to_csv('data/vn_stock/price_volume/adv120_matrix_20120101-20240101.txt', sep='\t')

## daily_return

In [102]:
# Row-over-row percentage change of the mean of high/low/close.
price = (high + low + close) / 3

# A price of 0 followed by a positive price yields an infinite return.
# `fillna` does not touch inf, so map +/-inf to NaN first; every undefined
# return (NaN or inf) is then written as 0.
daily_return = (
    price.pct_change()
    .replace([np.inf, -np.inf], np.nan)
    .fillna(0)
)
daily_return.to_csv('data/vn_stock/price_volume/daily_return_matrix_20120101-20240101.txt', sep='\t')