### Imports and Config

In [1]:
# method: setup
import pandas as pd
from pathlib import Path

# Single project-relative data directory; created up front so later writes
# cannot fail on a missing folder (no-op when it already exists).
out_dir = Path("data")
out_dir.mkdir(parents=True, exist_ok=True)

# Input CSVs read by the loading cell, one per symbol.
aapl_clean = out_dir / "aapl_cleaned.csv"
msft_clean = out_dir / "msft_cleaned.csv"
vnindex_clean = out_dir / "vnindex_cleaned.csv"

### Helpers

In [2]:
# method: reusable io + checks
def load_df(path: Path) -> pd.DataFrame:
    """Load a price CSV and return one valid (date, close) row per date.

    Parameters
    ----------
    path : Path
        CSV file containing a ``date`` column (parsed as datetime) and a
        ``close`` column. Any other columns are ignored.

    Returns
    -------
    pd.DataFrame
        Only the ``date`` and ``close`` columns, sorted ascending by date,
        with no missing values, no repeated dates and a fresh 0..n-1 index.

    Raises
    ------
    ValueError
        If the file has no ``close`` column.
    """
    df = pd.read_csv(path, parse_dates=["date"])
    if "close" not in df.columns:
        raise ValueError(f"{path} missing 'close'")

    # Drop incomplete rows BEFORE de-duplicating. Otherwise a date whose last
    # duplicate has an empty close would be discarded entirely, even though an
    # earlier row for that date carries a usable value.
    prices = df[["date", "close"]].dropna()

    # A stable sort preserves file order within each date, so keep="last"
    # deterministically means "the last valid row as written in the file".
    prices = prices.sort_values("date", kind="mergesort")
    return prices.drop_duplicates(subset="date", keep="last").reset_index(drop=True)

def report(df: pd.DataFrame) -> dict:
    """Summarise a (date, close) frame as a dict of basic quality metrics.

    Keys: ``rows`` (row count), ``min`` / ``max`` (earliest / latest date),
    ``date_is_dt`` and ``close_is_num`` (dtype checks), ``dup`` (number of
    repeated dates), ``na_close`` (missing closes), ``sorted`` (dates are
    non-decreasing) and ``neg_close`` (closes below zero).
    """
    dates = df["date"]
    closes = df["close"]

    summary = {}
    summary["rows"] = len(df)
    summary["min"] = dates.min()
    summary["max"] = dates.max()
    summary["date_is_dt"] = pd.api.types.is_datetime64_any_dtype(dates)
    summary["close_is_num"] = pd.api.types.is_numeric_dtype(closes)
    summary["dup"] = int(dates.duplicated().sum())
    summary["na_close"] = int(closes.isna().sum())
    summary["sorted"] = dates.is_monotonic_increasing
    summary["neg_close"] = int(closes.lt(0).sum())
    return summary

def assert_ready(r: dict, min_rows=500):
    """Raise ``AssertionError`` naming every readiness check that ``r`` fails.

    Parameters
    ----------
    r : dict
        Summary as produced by ``report``; must contain the keys
        ``date_is_dt``, ``close_is_num``, ``dup``, ``na_close``, ``sorted``,
        ``neg_close`` and ``rows``.
    min_rows : int, default 500
        Minimum number of rows required.

    Raises
    ------
    AssertionError
        If any check fails. The message lists all failed checks so callers
        that print the exception show a useful reason.

    Notes
    -----
    The error is raised explicitly instead of via ``assert`` statements, so
    the validation still runs when Python is started with ``-O``.
    """
    problems = []
    if not (r["date_is_dt"] and r["close_is_num"]):
        problems.append(
            f"bad dtypes (date_is_dt={r['date_is_dt']}, close_is_num={r['close_is_num']})"
        )
    if r["dup"] != 0:
        problems.append(f"dup={r['dup']} duplicated dates")
    if r["na_close"] != 0:
        problems.append(f"na_close={r['na_close']} missing close values")
    if not r["sorted"]:
        problems.append("dates are not sorted ascending")
    if r["neg_close"] != 0:
        problems.append(f"neg_close={r['neg_close']} negative close values")
    if not (r["rows"] >= min_rows):
        problems.append(f"rows={r['rows']} is below min_rows={min_rows}")

    if problems:
        raise AssertionError("; ".join(problems))

### Load and Show Current Ranges

In [3]:
# method: load and inspect ranges
# Read each cleaned series into its own frame.
aapl = load_df(path=aapl_clean)
msft = load_df(path=msft_clean)
vnindex = load_df(path=vnindex_clean)

# One line per symbol: first date, last date and row count.
loaded_by_name = {"AAPL": aapl, "MSFT": msft, "VNINDEX": vnindex}
for name, df in loaded_by_name.items():
    r = report(df)
    print(f"{name}: {r['min'].date()} → {r['max'].date()} | rows={r['rows']}")

AAPL: 2020-06-04 → 2025-06-02 | rows=1255
MSFT: 2018-01-02 → 2025-10-17 | rows=1960
VNINDEX: 2018-01-02 → 2023-03-22 | rows=1303


### Compute True Overlap and Cut

In [4]:
# method: compute overlap = [max(start), min(end)]
# An empty frame would make min()/max() return NaT, which Python's max()/min()
# can silently skip or propagate depending on argument order; fail early.
_overlap_inputs = {"AAPL": aapl, "MSFT": msft, "VNINDEX": vnindex}
_empty_inputs = [label for label, frame in _overlap_inputs.items() if frame.empty]
if _empty_inputs:
    raise ValueError(f"cannot compute overlap, no rows for: {', '.join(_empty_inputs)}")

# Common window = latest first date .. earliest last date across all series.
start_common = max(aapl["date"].min(), msft["date"].min(), vnindex["date"].min())
end_common   = min(aapl["date"].max(), msft["date"].max(), vnindex["date"].max())

# Disjoint date ranges would otherwise produce empty "aligned" frames downstream.
if start_common > end_common:
    raise ValueError(
        f"series do not overlap: latest start {start_common.date()} "
        f"is after earliest end {end_common.date()}"
    )
print("overlap:", start_common.date(), "→", end_common.date())

def cut(df: pd.DataFrame, s, e) -> pd.DataFrame:
    """Return the rows of ``df`` whose ``date`` lies in the closed range [s, e].

    Both bounds are inclusive. The input frame is not modified; the result
    gets a fresh 0..n-1 index.
    """
    in_window = df["date"].between(s, e)
    return df.loc[in_window].reset_index(drop=True)

# Trim every series to the shared [start_common, end_common] window.
aapl_al, msft_al, vnindex_al = (
    cut(frame, start_common, end_common) for frame in (aapl, msft, vnindex)
)

# Show the resulting date span and row count of each trimmed series.
aligned_by_name = {
    "AAPL_aligned": aapl_al,
    "MSFT_aligned": msft_al,
    "VNINDEX_aligned": vnindex_al,
}
for name, df in aligned_by_name.items():
    print(f"{name}: {df['date'].min().date()} → {df['date'].max().date()} | rows={len(df)}")

overlap: 2020-06-04 → 2023-03-22
AAPL_aligned: 2020-06-04 → 2023-03-22 | rows=705
MSFT_aligned: 2020-06-04 → 2023-03-22 | rows=705
VNINDEX_aligned: 2020-06-04 → 2023-03-22 | rows=701


### Final Readiness Check on Aligned Data

In [5]:
# method: final validation before API
# Validate each trimmed series; a failure is reported, not re-raised, so all
# three symbols are always checked.
checks = (
    ("AAPL_aligned", aapl_al),
    ("MSFT_aligned", msft_al),
    ("VNINDEX_aligned", vnindex_al),
)
for name, df in checks:
    r = report(df)
    try:
        assert_ready(r, min_rows=500)
    except AssertionError as e:
        print(f"{name}: NOT READY -> {e}")
    else:
        print(f"{name}: READY | rows={r['rows']}")

AAPL_aligned: READY | rows=705
MSFT_aligned: READY | rows=705
VNINDEX_aligned: READY | rows=701


### Save Aligned CSVs

In [6]:
# Per-symbol output files, all written under out_dir.
aapl_out = out_dir / "aapl_aligned.csv"
msft_out = out_dir / "msft_aligned.csv"
vnindex_out = out_dir / "vnindex_aligned.csv"

# Write every trimmed frame without the pandas index column.
for frame, target in ((aapl_al, aapl_out), (msft_al, msft_out), (vnindex_al, vnindex_out)):
    frame.to_csv(target, index=False)

print("saved:")
for target in (aapl_out, msft_out, vnindex_out):
    print(" -", target)

# ---- ADDED from here (optional) ----
# Long-format file: the three frames stacked, each row tagged with its symbol.
tagged = [
    frame.assign(symbol=ticker)
    for ticker, frame in (("AAPL", aapl_al), ("MSFT", msft_al), ("VNINDEX", vnindex_al))
]
merged = pd.concat(tagged, ignore_index=True)
all_out = out_dir / "all_aligned.csv"
merged.to_csv(all_out, index=False)
print(" -", all_out)
# ---- End of added section ----

saved:
 - data/aapl_aligned.csv
 - data/msft_aligned.csv
 - data/vnindex_aligned.csv
 - data/all_aligned.csv
