In [3]:
import pandas as pd
import numpy as np

# Locations of the tab-separated price/volume matrices (one file per field).
# The date range in each file name is 20120101-20240101.
__closepath__ = 'data/vn_stock/price_volume/close_matrix_20120101-20240101.txt'
__openpath__ = 'data/vn_stock/price_volume/open_matrix_20120101-20240101.txt'
__highpath__ = 'data/vn_stock/price_volume/high_matrix_20120101-20240101.txt'
__lowpath__ = 'data/vn_stock/price_volume/low_matrix_20120101-20240101.txt'
# NOTE(review): unlike the four names above, this one has no trailing `__`.
# It is left unchanged because the loading code refers to it by this exact name.
__volumepath = 'data/vn_stock/price_volume/volume_matrix_20120101-20240101.txt'

def load_and_process_data(file_path):
    """Read one tab-separated matrix file into a float DataFrame.

    Parameters
    ----------
    file_path : str
        Path to a tab-delimited text file that contains a ``time`` column.

    Returns
    -------
    pandas.DataFrame
        The file contents indexed by the ``time`` column, with every
        remaining column cast to ``float``.
    """
    return (
        pd.read_csv(file_path, sep='\t')
        .set_index('time')   # the 'time' column becomes the row labels
        .astype(float)       # every remaining column is numeric
    )

# Load the five matrices in one pass; the order of names on the left matches
# the order of paths on the right.
# NOTE(review): `open` shadows Python's builtin `open()` for the rest of the
# session. The name is kept here because other cells may refer to it.
close, open, high, low, volume = (
    load_and_process_data(matrix_path)
    for matrix_path in (
        __closepath__,
        __openpath__,
        __highpath__,
        __lowpath__,
        __volumepath,
    )
)

In [35]:
# Yearly top-30 tickers by total traded volume.
#
# The year of each row is derived WITHOUT modifying `volume`. Other cells reuse
# `volume` together with the `high`/`low`/`close` matrices and write
# volume-derived matrices to disk, so replacing its index or adding a 'Year'
# column here would misalign those computations and leak a bogus 'Year' ticker
# into their output files.
row_years = pd.to_datetime(volume.index).year.rename('Year')

# Total traded volume per ticker for each calendar year (rows: year, cols: ticker)
yearly_sum = volume.groupby(row_years).sum()

# Never request more ranks than there are tickers, so the column labels built
# below always match the length of each row's list.
top_n = min(30, yearly_sum.shape[1])

# For every year, the tickers with the largest summed volume, in descending order
top_30_tickers = yearly_sum.apply(lambda x: x.nlargest(top_n).index.tolist(), axis=1)

# One row per year, one column per rank (top_1 is the most traded ticker)
top_30_tickers_df = pd.DataFrame(
    top_30_tickers.tolist(),
    index=top_30_tickers.index,
    columns=[f'top_{i+1}' for i in range(top_n)],
)
top_30_tickers_df

Unnamed: 0_level_0,top_1,top_2,top_3,top_4,top_5,top_6,top_7,top_8,top_9,top_10,...,top_21,top_22,top_23,top_24,top_25,top_26,top_27,top_28,top_29,top_30
Year,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2012,PVX,VND,SHB,SCR,ITA,MBB,SSI,SAM,EIB,STB,...,WSS,SBS,SHN,APS,DCS,BVS,HAG,NVT,PXL,ASM
2013,SHB,PVX,ITA,SCR,FLC,HQC,PVT,VCG,VND,REE,...,HAR,CTG,VNE,TLH,PVL,VSH,DLG,DCS,PET,MCG
2014,FLC,PVX,ITA,KLF,SHB,SCR,SSI,HQC,HAG,OGC,...,HAR,SHN,VND,MBB,LCG,HUT,REE,CII,ASM,PET
2015,FLC,KLF,OGC,CII,FIT,DLG,HAI,SCR,SSI,ITA,...,SBT,JVC,PVS,DXG,ASM,HAR,TIG,HPG,VIX,TTF
2016,FLC,ITA,HAG,HQC,SCR,HPG,VHG,KBC,DLG,HHS,...,HSG,VCG,PVD,CII,HKB,BID,PVX,ASM,HUT,TLH
2017,FLC,SHB,HQC,ITA,KLF,HAG,OGC,HPG,STB,DXG,...,ASM,PVS,HSG,PVX,HHS,VCG,ACB,SBT,DCM,CTG
2018,SHB,STB,FLC,HAG,MBB,CTG,PVS,HPG,SSI,ACB,...,BID,PVD,ITA,HNG,HDB,LPB,VCB,VRE,DIG,HUT
2019,ROS,FLC,HPG,MBB,CTG,STB,SHB,HSG,ITA,DLG,...,AMD,TCB,HBC,POW,BSR,HAI,ACB,SSI,HDB,DXG
2020,STB,HPG,FLC,ROS,ITA,HSG,HQC,MBB,TCB,CTG,...,VPB,TCH,GEX,DXG,LDG,HUT,BSR,AMD,HAI,VRE
2021,STB,HPG,FLC,SHB,ROS,HQC,MBB,TCB,POW,ITA,...,KBC,SCR,DXG,GEX,PVD,ACB,TCH,LDG,KLF,FIT


In [2]:
# Per-day price proxy: the mean of high, low and close
price = (high + low + close).div(3)

# Running volume-weighted average price: cumulative (price * volume) divided by
# cumulative volume, rounded to 2 decimals, with missing values written as 0.
vwap = (
    price.mul(volume)
    .cumsum()
    .div(volume.cumsum())
    .round(2)
    .fillna(0)
)
vwap.to_csv('data/vn_stock/price_volume/vwap_matrix_20120101-20240101.txt', sep='\t')

In [3]:
# 20-row rolling mean of volume; rows without a full window (NaN) are stored as 0
adv20 = volume.rolling(20).mean().fillna(0)
adv20.to_csv('data/vn_stock/price_volume/adv20_matrix_20120101-20240101.txt', sep='\t')

In [4]:
# 60-row rolling mean of volume; rows without a full window (NaN) are stored as 0
adv60 = volume.rolling(60).mean().fillna(0)
adv60.to_csv('data/vn_stock/price_volume/adv60_matrix_20120101-20240101.txt', sep='\t')

In [5]:
# 120-row rolling mean of volume; rows without a full window (NaN) are stored as 0
adv120 = volume.rolling(120).mean().fillna(0)
adv120.to_csv('data/vn_stock/price_volume/adv120_matrix_20120101-20240101.txt', sep='\t')

In [6]:
# Per-day price proxy: the mean of high, low and close
price = (high + low + close).div(3)

# Row-over-row relative change of that price; missing values (including the
# first row, which has no predecessor) are written as 0.
daily_return = price.pct_change().fillna(0)
daily_return.to_csv('data/vn_stock/price_volume/daily_return_matrix_20120101-20240101.txt', sep='\t')

In [2]:
import requests
import pandas as pd
import time
from datetime import datetime, timedelta, timezone

def get_vn30f1m_trading(start='1325376000', end='1704067200', timeout=30):
    """Download 1-minute VN30F1M bars from the Entrade chart API.

    Parameters
    ----------
    start, end : str or int, optional
        Range bounds sent as the ``from`` / ``to`` query parameters, as Unix
        timestamps in seconds. The defaults are 2012-01-01 and 2024-01-01
        (00:00 UTC).
    timeout : float, optional
        Seconds to wait for the HTTP response, so a stalled connection cannot
        hang the notebook indefinitely.

    Returns
    -------
    pandas.DataFrame
        Columns ``date, open, high, low, close, volume``. ``date`` holds
        timezone-naive timestamps shifted from UTC by +7 hours. An empty
        DataFrame is returned when the request fails, the server answers with
        an HTTP error status, or the payload is empty; a message is printed
        in each of those cases.
    """
    url = f"https://services.entrade.com.vn/chart-api/chart?from={start}&resolution=1&symbol=VN30F1M&to={end}"
    try:
        resp = requests.get(url, timeout=timeout)
        # Turn 4xx/5xx answers into an exception instead of parsing an error body
        resp.raise_for_status()
        data = resp.json()
        if not data:
            # Stop here: continuing with an empty payload would only surface
            # as a confusing KeyError on the 't' column further down.
            print("No data returned for VN30F1M")
            return pd.DataFrame()
        # Keep the first six fields of the payload; they are renamed
        # positionally below, so their order in the response matters.
        vn30f1m = pd.DataFrame(data).iloc[:, :6].copy()
        # Epoch seconds -> naive timestamps at UTC+7 (vectorised)
        vn30f1m['t'] = pd.to_datetime(vn30f1m['t'].astype(int), unit='s') + pd.Timedelta(hours=7)
        vn30f1m.columns = ['date', 'open', 'high', 'low', 'close', 'volume']
        return vn30f1m
    except Exception as e:
        # Best-effort fetch: report the problem and hand back an empty frame
        print(f"Error fetching data for VN30F1M: {e}.")
        return pd.DataFrame()
    
# Fetch the VN30F1M minute bars and show them as the cell output.
# An empty frame here means the fetch failed: the function returns
# pd.DataFrame() from its except branch after printing the error.
df = get_vn30f1m_trading()
df

Unnamed: 0,date,open,high,low,close,volume
0,2018-08-13 09:00:00,943.5,943.5,942.9,942.9,975
1,2018-08-13 09:01:00,943.0,943.1,942.9,943.1,220
2,2018-08-13 09:02:00,943.0,943.6,943.0,943.5,121
3,2018-08-13 09:03:00,943.3,943.4,943.3,943.4,135
4,2018-08-13 09:04:00,943.2,943.2,943.0,943.1,361
...,...,...,...,...,...,...
296481,2023-12-29 14:27:00,1133.2,1134.2,1133.0,1134.2,1232
296482,2023-12-29 14:28:00,1134.9,1134.9,1133.6,1133.6,1362
296483,2023-12-29 14:29:00,1133.9,1134.4,1133.5,1133.9,1458
296484,2023-12-29 14:30:00,1134.0,1134.0,1134.0,1134.0,76
