# Download and store STOOQ data
[ref](https://github.com/stefan-jansen/machine-learning-for-trading/blob/main/data/create_stooq_data.ipynb)

In [1]:
import warnings
warnings.filterwarnings('ignore')

## Imports & Settings

In [2]:
from pathlib import Path
import requests
from io import BytesIO
from zipfile import ZipFile, BadZipFile

import numpy as np
import pandas as pd
import pandas_datareader.data as web
from sklearn.datasets import fetch_openml

pd.set_option('display.expand_frame_repr', False)

## Set Data Store path

In [26]:
STORAGE_PATH = "E:"

In [27]:
DATA_STORE = Path('assets.h5')
try:
    DATA_STORE = STORAGE_PATH/ DATA_STORE
except:
    pass
print(DATA_STORE.cwd())

C:\Users\taest\Documents\github\research_market_finance\MLFT\data


In [28]:
stooq_path = Path("stooq")
try:
    stooq_path = STORAGE_PATH/stooq_path
    print(stooq_path)
except:
    pass
if not stooq_path.exists():
    stooq_path.mkdir()
print(stooq_path.cwd())

E:stooq
C:\Users\taest\Documents\github\research_market_finance\MLFT\data


### Stooq Historical Market Data

- 여기서 파일을 받아 `stooq_path'에 저장한다  
[url](https://stooq.com/db/h/)

In [36]:
def get_price_data(market='us'):
    file_path = f'd_{market}_txt.zip'
    zip_path = stooq_path/file_path
    with ZipFile(zip_path) as zip_file:
        for i, file in enumerate(zip_file.namelist()):
            if not file.endswith('.txt'):
                continue
            local_file = stooq_path / file
            local_file.parent.mkdir(parents=True, exist_ok=True)
            with local_file.open('wb') as output:
                for line in zip_file.open(file).readlines():
                    output.write(line)

In [37]:
for market in ['us', 'jp', 'macro','hk', 'world']:
    get_price_data(market=market)

### Add symbols

- 아래 링크에 접속하여 meta 파일을 만들어야 함.   
 `파일명: 69.csv` https://stooq.com/db/l/?g=69
- meta 목록 확인: https://stooq.com/db/

In [38]:
meta_path = stooq_path/"data/meta"
if not meta_path.exists():
    meta_path.mkdir()

In [40]:
metadata_dict = {
    ('jp', 'tse etfs'): 34,
    ('jp', 'tse stocks'): 32,
    ('jp', 'macro'): 46,
    ('us', 'nasdaq etfs'): 69,
    ('us', 'nasdaq stocks'): 27,
    ('us', 'nyse etfs'): 70,
    ('us', 'nyse stocks'): 28,
    ('us', 'nysemkt stocks'): 26,
    ('us', 'macro'): 50,
    ('kr', 'macro'): 87,
    ('cn', 'macro'): 42,
    ('world', 'bonds'): 53,
    ('world', 'money market'): 39,
    ('world', 'crypto'): 59,
    ('world', 'other_fx'): 52,
    ('world', 'index'): 1,
    # ('world', 'stooq_stock_idx'): 25,
    ('world', 'major_fx'): 3,
    # ('hk','hk cbbcs'): 79,
    ('hk','hk etfs'): 74,
    # ('hk','hk corporate bonds'): 77,
    # ('hk','hk reits'): 80,
    # ('hk','hk drs'): 75,
    ('hk','hk stocks'): 73,
    # ('hk','hk dws'): 78,
    # ('hk','hk treasury bonds'): 76,
}

In [41]:
for (market, asset_class), code in metadata_dict.items():
    df = pd.read_csv(meta_path/f'{code}.csv', sep='        ').apply(lambda x: x.str.strip())
    df.columns = ['ticker', 'name']
    df = df.drop_duplicates('ticker').dropna()
    print(market, asset_class, f'# tickers: {df.shape[0]:,.0f}')
    path = stooq_path / 'tickers' / market
    if not path.exists():
        path.mkdir(parents=True)
    df.to_csv(path / f'{asset_class}.csv', index=False)   

jp tse etfs # tickers: 392
jp tse stocks # tickers: 3,941
jp macro # tickers: 31
us nasdaq etfs # tickers: 356
us nasdaq stocks # tickers: 4,669
us nyse etfs # tickers: 2,104
us nyse stocks # tickers: 3,554
us nysemkt stocks # tickers: 319
us macro # tickers: 79
kr macro # tickers: 22
cn macro # tickers: 11
world bonds # tickers: 125
world money market # tickers: 44
world crypto # tickers: 271
world other_fx # tickers: 1,840
world index # tickers: 61
world major_fx # tickers: 66
hk hk etfs # tickers: 191
hk hk stocks # tickers: 2,555


## Store price data in HDF5 format

In [42]:
def get_stooq_prices_and_tickers(frequency='daily',
                                 market='us',
                                 asset_class='nasdaq etfs'):
    prices = []
    
    tickers = (pd.read_csv(stooq_path / 'tickers' / market / f'{asset_class}.csv'))

    if frequency in ['5 min', 'hourly']:
        parse_dates = [['date', 'time']]
        date_label = 'date_time'
    else:
        parse_dates = ['date']
        date_label = 'date'
    names = ['ticker', 'freq', 'date', 'time', 
             'open', 'high', 'low', 'close','volume', 'openint']
    
    usecols = ['ticker', 'open', 'high', 'low', 'close', 'volume'] + parse_dates
    path = stooq_path / 'data' / frequency / market / asset_class
    print(path.as_posix())
    files = path.glob('**/*.txt')
    for i, file in enumerate(files, 1):
        if i % 500 == 0:
            print(i)
        if file.stem not in set(tickers.ticker.str.lower()):
            print(file.stem, 'not available')
            file.unlink()
        else:
            try:
                df = (pd.read_csv(
                    file,
                    names=names,
                    usecols=usecols,
                    header=0,
                    parse_dates=parse_dates))
                prices.append(df)
            except pd.errors.EmptyDataError:
                print('\tdata missing', file.stem)
                file.unlink()

    prices = (pd.concat(prices, ignore_index=True)
              .rename(columns=str.lower)
              .set_index(['ticker', date_label])
              .apply(lambda x: pd.to_numeric(x, errors='coerce')))
    return prices, tickers

In [43]:
# load some Japanese and all US assets for 2000-2019
markets = {'jp': ['tse stocks'],
           'us': ['nasdaq etfs', 'nasdaq stocks', 'nyse etfs', 'nyse stocks', 'nysemkt stocks']
          }
frequency = 'daily'

idx = pd.IndexSlice
for market, asset_classes in markets.items():
    for asset_class in asset_classes:
        print(f'\n{asset_class}')
        prices, tickers = get_stooq_prices_and_tickers(frequency=frequency, 
                                                       market=market, 
                                                       asset_class=asset_class)
        
        prices = prices.sort_index().loc[idx[:, '2000': '2023'], :]
        names = prices.index.names
        prices = (prices
                  .reset_index()
                  .drop_duplicates()
                  .set_index(names)
                  .sort_index())
        
        print('\nNo. of observations per asset')
        print(prices.groupby('ticker').size().describe())
        key = f'stooq/{market}/{asset_class.replace(" ", "/")}/'
        
        print(prices.info(show_counts=True))
        
        prices.to_hdf(DATA_STORE, key + 'prices', format='t')
        
        print(tickers.info())
        tickers.to_hdf(DATA_STORE, key + 'tickers', format='t')


tse stocks
E:stooq/data/daily/jp/tse stocks
500
1000
1500
2000
2500
3000
3500

No. of observations per asset
count    3941.000000
mean     3335.391525
std      1596.491588
min         1.000000
25%      2138.000000
50%      3977.000000
75%      4597.000000
max      5881.000000
dtype: float64
<class 'pandas.core.frame.DataFrame'>
MultiIndex: 13144778 entries, ('1301.JP', Timestamp('2005-03-22 00:00:00')) to ('9997.JP', Timestamp('2023-12-28 00:00:00'))
Data columns (total 5 columns):
 #   Column  Non-Null Count     Dtype  
---  ------  --------------     -----  
 0   open    13144778 non-null  float64
 1   high    13144778 non-null  float64
 2   low     13144778 non-null  float64
 3   close   13144778 non-null  float64
 4   volume  13144778 non-null  float64
dtypes: float64(5)
memory usage: 551.9+ MB
None
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3941 entries, 0 to 3940
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   t

---