In [None]:
!pip install yfinance --upgrade


In [None]:
!pip install pyarrow

In [None]:
!pip install fastparquet

In [None]:
!pip install numba

In [None]:
!pip install numexpr

In [None]:
!pip install bottleneck

In [None]:
import pandas as pd
import io
import requests

# Download the HKEX "List of Securities" workbook; stop the cell with a clear
# message if the request fails or returns an error status.
url = 'https://www.hkex.com.hk/eng/services/trading/securities/securitieslists/ListOfSecurities.xlsx'
try:
    response = requests.get(url, timeout=10.0)
    response.raise_for_status()
except requests.RequestException as e:
    raise SystemExit(f"Error, {url} is not available: {e}")

# Two leading rows are skipped before the header; 'Stock Code' is requested as
# text and used as the row index.
hk_raw = pd.read_excel(io.BytesIO(response.content), skiprows=2, dtype={'Stock Code': str}, index_col='Stock Code')

# Keep rows whose numeric code is below 10000 and whose category is 'Equity'.
hk_keep = (hk_raw.index.astype(int) < 10000) & (hk_raw['Category'] == 'Equity')

# Build the final frame in one chain instead of mutating in place.
# Tickers are formatted as four zero-padded digits + '.HK'; formatting from the
# numeric value (rather than slicing the last four characters) stays correct
# even if a code arrives without its leading zeros.
hk_stk = (
    hk_raw.loc[hk_keep]
    .rename(index=lambda code: f"{int(code):04d}.HK")
    .rename_axis(index='Ticker')
    .assign(exchange='HKG')
)
hk_stk

In [None]:
import pandas as pd
from io import BytesIO
import requests

# Exchange code -> URL of the JSON ticker listing downloaded for that exchange.
STOCK_SOURCES = dict(
    ASE='https://raw.githubusercontent.com/rreichel3/US-Stock-Symbols/refs/heads/main/amex/amex_full_tickers.json',
    NMS='https://raw.githubusercontent.com/rreichel3/US-Stock-Symbols/refs/heads/main/nasdaq/nasdaq_full_tickers.json',
    NYQ='https://raw.githubusercontent.com/rreichel3/US-Stock-Symbols/refs/heads/main/nyse/nyse_full_tickers.json',
)

def fetch_exchange_data(name, url):
    """
    Download one exchange's ticker listing and return it as a DataFrame.

    Parameters
    ----------
    name : str
        Exchange code; written to the ``exchange`` column of every row.
    url : str
        Address of a JSON document whose records carry a ``symbol`` field.

    Returns
    -------
    pandas.DataFrame
        The listing indexed by ``Ticker`` (taken from ``symbol``) with an
        added ``exchange`` column.

    Raises
    ------
    SystemExit
        If the HTTP request fails or returns an error status.
    """
    try:
        response = requests.get(url, timeout=10.0)
        response.raise_for_status()
    except requests.RequestException as e:
        raise SystemExit(f'Failed to fetch {name} data : {e}')

    # Parsing errors are not request errors, so they propagate unchanged.
    listing = (
        pd.read_json(BytesIO(response.content))
        .reset_index(drop=True)
        .set_index('symbol')
        .rename_axis(index='Ticker')
    )
    # Same column name as the HK frame ('exchange') so the two frames line up.
    listing['exchange'] = name
    return listing
    
# Fetch each configured exchange listing and stack them into one frame,
# in STOCK_SOURCES order.
us_stk = pd.concat(
    [fetch_exchange_data(*source) for source in STOCK_SOURCES.items()]
)
us_stk

In [None]:
import yfinance as yf
# Every HK and US ticker gathered by the listing cells.
stk_list = hk_stk.index.tolist() + us_stk.index.tolist()

# One year of history requested in a single call. With group_by='ticker' the
# outer column level is expected to be the ticker, which stack(level=0) moves
# into the row index. The former trailing .reindex() had no arguments and so
# changed nothing; it has been dropped.
stk_price = (
    yf.download(tickers=stk_list, period='1y', group_by='ticker', actions=True, threads=True)
    .stack(level=0)
)

stk_price

In [None]:
"""
Download daily history for a (potentially large) list of tickers from both
Hong Kong and U.S. markets, using yfinance in manageable batches.

Author : chi
"""

from __future__ import annotations

import itertools
from typing import Iterable, List

import pandas as pd
import yfinance as yf


# ──────────────────────────────────────────────────────────────────────────────
# Configuration
# ──────────────────────────────────────────────────────────────────────────────
# Number of tickers sent per yf.download call
# (author's note: yfinance becomes flaky with >50-75 symbols / call).
BATCH_SIZE = 25

# Default keyword arguments forwarded to yf.download; callers may override any
# of them through download_history(**kwargs).
DOWNLOAD_KW = {
    "period": "5y",
    "group_by": "ticker",
    "actions": True,
    "threads": True,
    "auto_adjust": True,
}


# ──────────────────────────────────────────────────────────────────────────────
# Utilities
# ──────────────────────────────────────────────────────────────────────────────
def chunked(iterable: Iterable[str], size: int) -> Iterable[List[str]]:
    """
    Split *iterable* into consecutive lists holding at most *size* items.

    Lazily yields one list per chunk; the final chunk may be shorter, and
    nothing is yielded once the input is exhausted.
    """
    source = iter(iterable)
    while True:
        piece = list(itertools.islice(source, size))
        if not piece:
            return
        yield piece


def unique_order_preserved(seq: Iterable[str]) -> List[str]:
    """
    Drop repeated items from *seq* while keeping the first occurrence of each.

    Returns a new list whose order follows first appearance in the input.
    """
    # dict keys are unique and remember insertion order.
    return list(dict.fromkeys(seq))


# ──────────────────────────────────────────────────────────────────────────────
# Core logic
# ──────────────────────────────────────────────────────────────────────────────
def download_history(tickers: List[str],
                     *,
                     batch_size: int = BATCH_SIZE,
                     **kwargs) -> pd.DataFrame:
    """
    Fetch price history for *tickers* in successive batches.

    Parameters
    ----------
    tickers : list of str
        Symbols to download; they are requested ``batch_size`` at a time.
    batch_size : int, keyword-only
        Maximum number of symbols per ``yf.download`` call.
    **kwargs
        Extra ``yf.download`` options; they override ``DOWNLOAD_KW``.

    Returns
    -------
    pandas.DataFrame
        Long-format frame indexed by ['Ticker', 'Date'] (in that level order),
        sorted by that index, with one column per downloaded data field.

    Raises
    ------
    RuntimeError
        If every batch failed or came back empty.
    """
    opts = {**DOWNLOAD_KW, **kwargs}
    frames: list[pd.DataFrame] = []

    for batch in chunked(tickers, batch_size):
        try:
            raw = yf.download(tickers=batch, **opts)          # wide frame, one column group per ticker
        except Exception as exc:                       # best effort: report and move on to the next batch
            print(f"⚠️  batch {batch[:3]}… failed: {exc}")
            continue

        if raw.empty:
            print(f"⚠️  batch {batch[:3]}… returned no data")
            continue

        # A one-symbol batch may come back with flat (single-level) columns,
        # depending on the yfinance version. Wrap it under its ticker so every
        # frame has the ticker as the outer column level before stacking.
        if len(batch) == 1 and not isinstance(raw.columns, pd.MultiIndex):
            raw = pd.concat({batch[0]: raw}, axis=1)

        frames.append(raw)

    if not frames:
        raise RuntimeError("No data fetched for any ticker.")

    # wide -> long: stack(level=0) moves the outer column level (the ticker)
    # into the row index, giving (Date, Ticker); swaplevel then makes the
    # index (Ticker, Date), which is how the levels are named below.
    df = (pd.concat(frames, axis=1)
            .stack(level=0)
            .swaplevel(0, 1)
            .rename_axis(index=["Ticker", "Date"])
            .sort_index())
    return df


def build_ticker_list(us_df: pd.DataFrame, hk_df: pd.DataFrame) -> List[str]:
    """
    Combine the index labels of both frames into one duplicate-free list.

    US labels come first, followed by HK labels; when a label repeats, only
    its first occurrence is kept.
    """
    combined = [*us_df.index.tolist(), *hk_df.index.tolist()]
    return unique_order_preserved(combined)


# ──────────────────────────────────────────────────────────────────────────────
# Public entry point
# ──────────────────────────────────────────────────────────────────────────────
def get_full_price_history(hk_stk: pd.DataFrame, us_stk: pd.DataFrame, **download_kw) -> pd.DataFrame:
    """
    Download price history for every ticker found in the two listing frames.

    The ticker list is built with build_ticker_list (US frame first, HK frame
    second), its size is printed, and the download is delegated to
    download_history with any extra keyword arguments passed through.
    """
    symbols = build_ticker_list(us_stk, hk_stk)
    print(f"📈 Downloading history for {len(symbols)} symbols …")
    history = download_history(symbols, **download_kw)
    return history


# ──────────────────────────────────────────────────────────────────────────────
# Example usage (runs the full download when this cell is executed)
# ──────────────────────────────────────────────────────────────────────────────

from pathlib import Path

stk_price = get_full_price_history(hk_stk, us_stk)

# Ticker and Date live in the row index of the downloaded frame, so the index
# has to be written; with index=False the saved rows could not be identified.
Path("data").mkdir(parents=True, exist_ok=True)   # make sure the target folder exists
stk_price.to_parquet("data/full_prices.parquet", compression="gzip", index=True)
print(stk_price.head())


In [None]:
# Write the price history to a gzip-compressed parquet file, index included.
stk_price.to_parquet(
    "data/full_prices.parquet",
    index=True,
    compression="gzip",
)

In [None]:
# Write with the pyarrow engine. The index is kept (index=True) so that the
# row labels of stk_price are stored in the file instead of being discarded.
stk_price.to_parquet("data/full_prices.parquet", index=True, engine='pyarrow', compression='gzip')

In [None]:
from __future__ import annotations

import pandas as pd
import yfinance as yf
from concurrent.futures import ThreadPoolExecutor, as_completed
from typing import Dict, List, Sequence


def fetch_info(ticker: str) -> Dict[str, object]:
    """
    Grab Yahoo 'info' for a single ticker.

    Every list-valued field is dropped (this includes the large
    'companyOfficers' entry); all other fields are kept and a 'Ticker' key is
    added.

    Returns
    -------
    dict
        The filtered info mapping plus ``"Ticker"``. If the lookup yields
        nothing, the dict holds only ``"Ticker"``. If the lookup raises, the
        dict is ``{"Ticker": ticker, "_error": <message>}``.
    """
    try:
        # Guard the lookup itself: a missing/None payload becomes an empty mapping.
        raw = yf.Ticker(ticker).info or {}
        info: Dict[str, object] = {
            key: value for key, value in raw.items() if not isinstance(value, list)
        }
    except Exception as exc:                 # best effort: keep the batch going and record why
        return {"Ticker": ticker, "_error": str(exc)}

    info["Ticker"] = ticker
    return info


def build_metadata_frame(
    tickers: Sequence[str],
    max_workers: int = 16,
) -> pd.DataFrame:
    """
    Download metadata for all tickers in parallel and return one DataFrame.

    Parameters
    ----------
    tickers : sequence of str
        Symbols to look up with fetch_info.
    max_workers : int
        Size of the thread pool.

    Returns
    -------
    pandas.DataFrame
        One row per requested ticker, indexed by ``Ticker``, in the same order
        as *tickers*. An empty input gives an empty frame with that index name.
    """
    symbols = list(tickers)
    if not symbols:
        # No records means no 'Ticker' column to index on; return an empty frame.
        return pd.DataFrame(index=pd.Index([], name="Ticker"))

    with ThreadPoolExecutor(max_workers=max_workers) as pool:
        # Executor.map yields results in input order, so row order is reproducible.
        rows: List[Dict[str, object]] = list(pool.map(fetch_info, symbols))

    return pd.DataFrame.from_records(rows).set_index("Ticker")


# ---------------------------------------------------------------------
# Usage
# ---------------------------------------------------------------------
# Look up metadata for every HK ticker followed by every US ticker.
tickers = [*hk_stk.index.tolist(), *us_stk.index.tolist()]
df = build_metadata_frame(tickers)
df
