# Forecast

In [33]:
import os
import sys
from pathlib import Path
import pandas as pd
import numpy as np

# 1. Thư mục Notebook hiện tại
current_dir = Path.cwd()

# 2. Thư mục project_root là cha của Notebook (chứa folder Module, Clean_Data_2023-2025, ...)
project_root_dir = current_dir.parent

# 3. Thêm project_root vào sys.path để import Module.*
if str(project_root_dir) not in sys.path:
    sys.path.append(str(project_root_dir))

print(f"Project root: {project_root_dir}")

from Module.Load_Data import CleanDataLoader

Project root: c:\Users\Admin\OneDrive\Máy tính\Code\Project Python for Data\PythonProject


# Dự báo theo môn học

---

# Dự báo theo tỉ lệ tổ hợp

In [34]:
import pandas as pd
from pathlib import Path
from Module.Load_Data import CleanDataLoader
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression


def load_all_block_analysis_with_features(
    clean_data: CleanDataLoader,
) -> pd.DataFrame:
    """
    - Đọc toàn bộ Export_Analysis_<BLOCK>.csv và Export_Distribution_<BLOCK>.csv
      từ Block_Data để tạo bảng feature cho train.
    - Thêm:
        + n_students: số thí sinh của khối đó trong từng năm (từ distribution)
        + total_students_year: tổng số thí sinh toàn năm (từ clean_data.get_total_students())
        + share_in_year: tỉ lệ thí sinh khối đó / tổng thí sinh năm
    """

    block_root: Path = clean_data.block_data_root
    if not block_root.exists():
        raise FileNotFoundError(f"Block_Data folder không tồn tại: {block_root}")

    all_dfs = []

    # Tiền tố folder: CleanData_<BLOCK> (ví dụ CleanData_A00)
    folder_prefix = clean_data._block_data_f_prefix + "_"   # "CleanData_"

    for sub in block_root.iterdir():
        if not sub.is_dir():
            continue

        folder_name = sub.name
        if not folder_name.startswith(folder_prefix):
            continue

        # Lấy mã khối: CleanData_A00 -> A00
        block_code = folder_name[len(folder_prefix):]

        # 1) Đọc file analysis
        try:
            df_anal = clean_data.get_block_data(block=block_code, kind="analysis").copy()
        except FileNotFoundError:
            continue

        if "khoi" not in df_anal.columns:
            df_anal["khoi"] = block_code

        # 2) Đọc file distribution để đếm n_students theo năm
        try:
            df_dist = clean_data.get_block_data(block=block_code, kind="distribution").copy()
        except FileNotFoundError:
            df_dist = pd.DataFrame(columns=["nam_hoc", "so_hoc_sinh"])

        if not df_dist.empty:
            # Xác định cột đếm số học sinh
            if "so_hoc_sinh" in df_dist.columns:
                cnt_col = "so_hoc_sinh"
                counts = (
                    df_dist.groupby("nam_hoc")[cnt_col]
                           .sum()
                           .reset_index(name="n_students")
                )
            elif "count" in df_dist.columns:
                cnt_col = "count"
                counts = (
                    df_dist.groupby("nam_hoc")[cnt_col]
                           .sum()
                           .reset_index(name="n_students")
                )
            else:
                # fallback: chỉ đếm số dòng theo năm
                counts = (
                    df_dist.groupby("nam_hoc")
                           .size()
                           .reset_index(name="n_students")
                )
        else:
            counts = pd.DataFrame(columns=["nam_hoc", "n_students"])

        # 3) Merge n_students vào analysis
        df_merged = df_anal.merge(counts, on="nam_hoc", how="left")
        all_dfs.append(df_merged)

    if not all_dfs:
        return pd.DataFrame()

    block_analysis_all = pd.concat(all_dfs, ignore_index=True)

    # --- 4. Lấy tổng số thí sinh theo năm từ CleanDataLoader ---
    #    File: Clean_Data_2023-2025/Export_Yearly_Total_Students.csv
    #    Hàm: clean_data.get_total_students()
    df_total = clean_data.get_total_students().copy()

    # Chuẩn hoá tên cột tổng số thí sinh về 'total_students_year'
    if "total_students_year" not in df_total.columns:
        for cand in ["tong_thi_sinh", "total_students", "so_thi_sinh",
                     "tong_hoc_sinh", "n_students"]:
            if cand in df_total.columns:
                df_total = df_total.rename(columns={cand: "total_students_year"})
                break

    if "nam_hoc" not in df_total.columns:
        raise ValueError(
            "DataFrame từ get_total_students() phải có cột 'nam_hoc'."
        )

    # Merge tổng thí sinh năm vào bảng feature
    block_analysis_all = block_analysis_all.merge(
        df_total[["nam_hoc", "total_students_year"]],
        on="nam_hoc",
        how="left"
    )

    # 5) Tính share_in_year = n_students / total_students_year
    if "n_students" in block_analysis_all.columns and "total_students_year" in block_analysis_all.columns:
        block_analysis_all["share_in_year"] = (
            block_analysis_all["n_students"] / block_analysis_all["total_students_year"]
        )

    return block_analysis_all


In [35]:
clean_data = CleanDataLoader(project_root=project_root_dir)

df_block_features = load_all_block_analysis_with_features(clean_data=clean_data)
df_block_features

Unnamed: 0,nam_hoc,mean,median,mode,std,min,max,khoi,n_students,total_students_year,share_in_year
0,2023,20.774499,21.150,22.10,3.094093,0.45,29.35,A00,325902,1017584,0.320270
1,2024,20.904633,21.300,22.80,3.380354,2.15,29.60,A00,343800,1061604,0.323850
2,2025,19.384368,19.250,22.00,4.337350,1.70,30.00,A00,165467,1153072,0.143501
3,2023,20.275288,20.450,20.50,3.339521,5.05,29.80,A01,315146,1017584,0.309700
4,2024,20.473793,20.750,21.00,3.350473,5.20,29.60,A01,329761,1061604,0.310625
...,...,...,...,...,...,...,...,...,...,...,...
152,2025,20.486105,20.750,21.50,2.593046,8.95,26.50,Y07,475,1153072,0.000412
153,2025,19.659654,19.900,21.00,2.518257,13.25,25.25,Y08,260,1153072,0.000225
154,2025,21.366093,21.750,22.75,2.741671,7.25,28.50,Y09,9493,1153072,0.008233
155,2025,17.617708,16.925,16.60,2.435760,13.95,24.00,Y10,48,1153072,0.000042


# Xử lý dữ liệu trước khi Train

In [36]:
import numpy as np

df = df_block_features.copy()

# 1. Chọn 8 khối chính + OTHER
MAIN_BLOCKS = ["A00", "A01", "B00", "C00", "D01", "D07", "D08", "D09"]

# Nếu khoi trong MAIN_BLOCKS thì giữ nguyên, ngược lại gom về OTHER
df["khoi_group"] = np.where(df["khoi"].isin(MAIN_BLOCKS), df["khoi"], "OTHER")

# 2. Index thời gian (0, 1, 2, ...)
df["year_idx"] = df["nam_hoc"] - df["nam_hoc"].min()

# 3. Sắp xếp theo nhóm khối + năm
df = df.sort_values(["khoi_group", "nam_hoc"]).reset_index(drop=True)

# 4. Tính lag & delta cho từng khoi_group
df["share_lag1"]   = df.groupby("khoi_group")["share_in_year"].shift(1)
df["delta_share"]  = df["share_in_year"] - df["share_lag1"]

# 5. Bỏ các dòng không có lag (năm đầu tiên của mỗi khoi_group)
df_model = df.dropna(subset=["share_lag1"]).copy()

# Các feature số dùng cho model
base_features = [
    "year_idx", "mean", "median", "std",
    "n_students", "total_students_year",
    "share_lag1", "delta_share"
]

df_model


Unnamed: 0,nam_hoc,mean,median,mode,std,min,max,khoi,n_students,total_students_year,share_in_year,khoi_group,year_idx,share_lag1,delta_share
1,2024,20.904633,21.300,22.80,3.380354,2.15,29.60,A00,343800,1061604,0.323850,A00,1,0.320270,0.003579
2,2025,19.384368,19.250,22.00,4.337350,1.70,30.00,A00,165467,1153072,0.143501,A00,2,0.323850,-0.180349
4,2024,20.473793,20.750,21.00,3.350473,5.20,29.60,A01,329761,1061604,0.310625,A01,1,0.309700,0.000925
5,2025,18.878852,18.850,20.00,3.627548,5.50,29.75,A01,148930,1153072,0.129159,A01,2,0.310625,-0.181466
7,2024,20.531107,20.750,21.20,2.981836,3.20,29.55,B00,342291,1061604,0.322428,B00,1,0.318946,0.003482
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
152,2025,20.486105,20.750,21.50,2.593046,8.95,26.50,Y07,475,1153072,0.000412,OTHER,2,0.000791,-0.000379
153,2025,19.659654,19.900,21.00,2.518257,13.25,25.25,Y08,260,1153072,0.000225,OTHER,2,0.000412,-0.000186
154,2025,21.366093,21.750,22.75,2.741671,7.25,28.50,Y09,9493,1153072,0.008233,OTHER,2,0.000225,0.008007
155,2025,17.617708,16.925,16.60,2.435760,13.95,24.00,Y10,48,1153072,0.000042,OTHER,2,0.008233,-0.008191


# Các mô hình dự báo

In [37]:
import numpy as np
# Linear Regression với Gradient Descent
class LinearRegressionGD:
    def __init__(
        self,
        lr: float = 0.01,
        n_iter: int = 2000,
        reg_lambda: float = 0.0,
        verbose: bool = False,
        random_state: int | None = None,
    ):
        """
        Linear Regression (OLS) huấn luyện bằng Gradient Descent.

        lr         : learning rate
        n_iter     : số vòng lặp GD
        reg_lambda : hệ số L2 regularization (0 = không regularize)
        """
        self.lr = lr
        self.n_iter = n_iter
        self.reg_lambda = reg_lambda
        self.verbose = verbose
        self.random_state = random_state

        self.W = None   # (d,)
        self.b = None   # scalar

    # Huấn luyện model với dữ liệu X, y
    def fit(self, X: np.ndarray, y: np.ndarray):
        X = np.asarray(X, dtype=float)
        y = np.asarray(y, dtype=float).reshape(-1)

        # Kích thước dữ liệu
        N, d = X.shape

        # Khởi tạo tham số
        rng = np.random.default_rng(self.random_state)
        self.W = rng.normal(loc=0.0, scale=0.01, size=d)
        self.b = 0.0

        # Thực hiện tìm Gradient Descent
        for it in range(self.n_iter):
            # y_hat = XW + b: Dự đoán
            y_hat = X @ self.W + self.b  # (N,)

            # residuals: đo sai số giữa dự đoán và thực tế ( Tính mất mát )
            err = y_hat - y             # (N,)

            # gradient (MSE + L2): 
            dW = (X.T @ err) / N + self.reg_lambda * self.W   # (d,)
            db = err.mean()                                   # scalar

            # update: Cập nhật tham số
            self.W -= self.lr * dW
            self.b -= self.lr * db

            # Kiểm tra loss mỗi 200 vòng hoặc vòng cuối
            if self.verbose and (it % 200 == 0 or it == self.n_iter - 1):
                mse = (err ** 2).mean()
                loss = mse + 0.5 * self.reg_lambda * np.sum(self.W ** 2)
                # print(f"Iter {it:4d} | loss = {loss:.8f}")

        return self

    # Dự đoán với dữ liệu X mới
    def predict(self, X: np.ndarray) -> np.ndarray:
        if self.W is None:
            raise ValueError("Model chưa được fit.")
        X = np.asarray(X, dtype=float)
        return X @ self.W + self.b


In [38]:
from sklearn.metrics import mean_absolute_error, r2_score

# One-hot cho khoi_group
khoi_dummies = pd.get_dummies(df_model["khoi_group"], prefix="khoi_group")

X_all = pd.concat([df_model[base_features], khoi_dummies], axis=1)
y_all = df_model["share_in_year"]

# Train 2023–2024, Test 2025
train_mask = df_model["nam_hoc"] < 2025
test_mask  = df_model["nam_hoc"] == 2025

X_train = X_all.loc[train_mask].values
y_train = y_all.loc[train_mask].values

X_test  = X_all.loc[test_mask].values
y_test  = y_all.loc[test_mask].values

# Scale feature để tránh overflow
scaler_lin = StandardScaler()
X_train_scaled = scaler_lin.fit_transform(X_train)
X_test_scaled  = scaler_lin.transform(X_test)


In [39]:
# Dùng class LinearRegressionGD bạn đã định nghĩa
lin_model = LinearRegressionGD(
    lr=1e-3,
    n_iter=10000,
    reg_lambda=1e-2,
    verbose=True,
    random_state=42
)

lin_model.fit(X_train_scaled, y_train)
y_pred_lin = lin_model.predict(X_test_scaled)

# Đánh giá
mae_lin = mean_absolute_error(y_test, y_pred_lin)
r2_lin  = r2_score(y_test, y_pred_lin)

print(f"[Linear] MAE = {mae_lin:.4f}, R² = {r2_lin:.4f}")

df_eval_lin = df_model.loc[test_mask, ["nam_hoc", "khoi", "khoi_group"]].copy()
df_eval_lin["share_true"] = y_test
df_eval_lin["share_pred"] = y_pred_lin
df_eval_lin["abs_err"]    = (df_eval_lin["share_true"] - df_eval_lin["share_pred"]).abs()

print(df_eval_lin.head(10))



[Linear] MAE = 0.0139, R² = 0.9711
    nam_hoc khoi khoi_group  share_true  share_pred   abs_err
2      2025  A00        A00    0.143501    0.154485  0.010984
5      2025  A01        A01    0.129159    0.126685  0.002475
8      2025  B00        B00    0.040420    0.068047  0.027627
11     2025  C00        C00    0.268857    0.305421  0.036563
14     2025  D01        D01    0.312425    0.333461  0.021037
17     2025  D07        D07    0.314339    0.338312  0.023974
20     2025  D08        D08    0.004081    0.022253  0.018171
23     2025  D09        D09    0.063478    0.059584  0.003895
72     2025  A02      OTHER    0.004950    0.029343  0.024393
73     2025  A03      OTHER    0.014652    0.035335  0.020683


In [40]:
df_model2 = df_model.copy()

# Chỉ lấy feature số
X_cols = base_features
for c in X_cols:
    df_model2[c] = pd.to_numeric(df_model2[c], errors="coerce")

df_model2 = df_model2.replace([np.inf, -np.inf], np.nan).dropna(subset=X_cols + ["khoi_group", "nam_hoc"])

In [41]:
# Split train/test theo năm (test = năm max, thường là 2025)
test_year = int(df_model2["nam_hoc"].max())
train_df = df_model2[df_model2["nam_hoc"] < test_year].copy()
test_df  = df_model2[df_model2["nam_hoc"] == test_year].copy()

X_train = train_df[X_cols].values
y_train = train_df["khoi_group"].astype(str).values
X_test  = test_df[X_cols].values
y_test  = test_df["khoi_group"].astype(str).values

# Nếu train bị thiếu class thì báo luôn (tránh lỗi sklearn)
if len(np.unique(y_train)) < 2:
    raise ValueError(f"Train set chỉ có {len(np.unique(y_train))} class. Kiểm tra lại dữ liệu train.")


In [44]:
class MultiShareModel:
    """
    Multinomial Logistic cho bài toán dự báo 'share' khi dữ liệu là 1 dòng / (khoi_group, năm).
    Ý tưởng:
      - Fit classifier y=khoi_group từ feature.
      - Với 1 năm, mỗi khoi_group có 1 dòng -> lấy decision score của đúng class cho từng dòng.
      - Softmax các score đó để ra phân phối share (tổng = 1).
    """
    def __init__(self, feature_cols, random_state=42):
        self.feature_cols = feature_cols
        self.random_state = random_state
        self.model = Pipeline([
            ("scaler", StandardScaler()),
            ("lr", LogisticRegression(
                solver="saga",
                penalty="l2",
                class_weight="balanced",
                max_iter=10000,
                random_state=random_state
            ))
        ])
        self._fitted = False

    @staticmethod
    def _softmax(v):
        v = np.asarray(v, dtype=float)
        v = v - np.max(v)
        e = np.exp(v)
        return e / (e.sum() + 1e-12)

    def _clean(self, df):
        out = df.copy()
        for c in self.feature_cols:
            out[c] = pd.to_numeric(out[c], errors="coerce")
        out = out.replace([np.inf, -np.inf], np.nan).dropna(subset=self.feature_cols + ["khoi_group", "nam_hoc"])
        out["nam_hoc"] = out["nam_hoc"].astype(int)
        out["khoi_group"] = out["khoi_group"].astype(str)
        return out

    def fit(self, df_train):
        df_train = self._clean(df_train)
        X = df_train[self.feature_cols].values
        y = df_train["khoi_group"].values
        if len(np.unique(y)) < 2:
            raise ValueError("Train set chỉ có <2 class. Kiểm tra lại dữ liệu train.")
        self.model.fit(X, y)
        self._fitted = True
        return self

    def predict_share_year(self, df_year):
        if not self._fitted:
            raise RuntimeError("Model chưa fit().")

        df_year = self._clean(df_year)
        # giữ đúng order các khoi trong năm
        groups = df_year["khoi_group"].tolist()

        lr = self.model.named_steps["lr"]
        classes = lr.classes_
        class_to_idx = {c:i for i, c in enumerate(classes)}

        X = df_year[self.feature_cols].values
        Xs = self.model.named_steps["scaler"].transform(X)
        scores = lr.decision_function(Xs)

        # Binary case: scores (n,) -> convert to (n,2)
        if scores.ndim == 1:
            scores = np.vstack([-scores, scores]).T

        # lấy score theo đúng class của từng dòng
        s = np.empty(len(groups), dtype=float)
        for i, g in enumerate(groups):
            s[i] = scores[i, class_to_idx[g]] if g in class_to_idx else -1e9

        share = self._softmax(s)
        out = pd.DataFrame({"khoi_group": groups, "share_pred": share})
        out["share_pred"] = out["share_pred"] / out["share_pred"].sum()
        return out

    def evaluate_year(self, df_all, year):
        df_all = self._clean(df_all)
        df_year = df_all[df_all["nam_hoc"] == int(year)].copy()

        df_true = (df_year.groupby("khoi_group", as_index=False)["share_in_year"]
                   .mean().rename(columns={"share_in_year": "share_true"}))
        df_pred = self.predict_share_year(df_year[["nam_hoc","khoi_group"] + self.feature_cols].copy())
        df_pred = df_pred.rename(columns={"share_pred":"share_pred"})

        df_eval = df_true.merge(df_pred, on="khoi_group", how="outer").fillna(0)
        df_eval["abs_err"] = (df_eval["share_true"] - df_eval["share_pred"]).abs()
        mae = df_eval["abs_err"].mean()
        return df_eval.sort_values("abs_err", ascending=False).reset_index(drop=True), mae

    def build_features_for_next_year(self, df_all):
        """
        Tạo feature cho năm (max_year+1) theo đúng logic notebook:
          - year_idx = max(year_idx)+1
          - share_lag1 = share_in_year năm cuối
          - delta_share = share_in_year(last) - share_in_year(last-1)
          - mean/median/std/n_students/total_students_year: copy từ năm cuối
        """
        df_all = self._clean(df_all)
        last_year = int(df_all["nam_hoc"].max())
        prev_year = last_year - 1

        df_last = df_all[df_all["nam_hoc"] == last_year].copy()
        df_prev = df_all[df_all["nam_hoc"] == prev_year][["khoi_group","share_in_year"]].copy()
        df_prev = df_prev.rename(columns={"share_in_year":"share_prev"})

        df_next = df_last.merge(df_prev, on="khoi_group", how="left")
        df_next["share_lag1"] = df_next["share_in_year"]
        df_next["delta_share"] = df_next["share_in_year"] - df_next["share_prev"].fillna(df_next["share_in_year"])
        df_next["year_idx"] = int(df_last["year_idx"].max()) + 1
        df_next["nam_hoc"] = last_year + 1

        # đảm bảo đủ feature_cols
        for c in self.feature_cols:
            if c not in df_next.columns:
                df_next[c] = 0
        df_next = df_next[["nam_hoc","khoi_group"] + self.feature_cols].copy()
        return df_next

    def forecast_next_year(self, df_all):
        df_next = self.build_features_for_next_year(df_all)
        return self.predict_share_year(df_next)

In [47]:
# TRAIN/EVAL MULTI
multi = MultiShareModel(feature_cols=base_features).fit(df_model)   # df_model đã dropna lag1
eval_multi, mae_multi = multi.evaluate_year(df, year=df["nam_hoc"].max())
print(eval_multi.head(10))
print("MAE MULTI:", round(mae_multi, 4))

pred_2026_multi = multi.forecast_next_year(df)
print(pred_2026_multi.head(10))

  khoi_group  share_true    share_pred   abs_err
0        D07    0.314339  8.418048e-01  0.527466
1        D01    0.312425  5.119076e-05  0.312373
2        C00    0.268857  1.316907e-08  0.268857
3        A00    0.143501  8.050070e-07  0.143500
4        A01    0.129159  3.595567e-08  0.129159
5      OTHER    0.039501  1.517424e-01  0.112241
6        D09    0.063478  4.665389e-08  0.063478
7        B00    0.040420  1.188579e-07  0.040420
8      OTHER    0.039501  1.241538e-09  0.039501
9      OTHER    0.039501  1.192279e-08  0.039501
MAE MULTI: 0.0528
  khoi_group    share_pred
0        A00  8.037195e-05
1        A01  1.886261e-06
2        B00  3.822099e-06
3        C00  2.817818e-07
4        D01  3.856405e-07
5        D07  7.471079e-01
6        D08  7.796743e-06
7        D09  1.272405e-07
8      OTHER  4.373399e-06
9      OTHER  2.644213e-07




In [None]:
from statsmodels.tsa.arima.model import ARIMA

# ARIMA forecast cho share theo từng khoi_group
class ArimaShareModel:
    """
    ARIMA forecast cho share theo từng khoi_group.
    - Fix warning bằng PeriodIndex(freq="Y")
    - Forecast từng group, clip >= 1e-9
    - Normalize tổng share = 1
    """
    def __init__(self, min_points_for_ar1=4):
        self.min_points_for_ar1 = min_points_for_ar1

    @staticmethod
    def _to_series(sub_df):
        years = sub_df["nam_hoc"].astype(int).tolist()
        vals = pd.to_numeric(sub_df["share_in_year"], errors="coerce").astype(float).tolist()
        s = pd.Series(vals, index=pd.PeriodIndex(years, freq="Y")).asfreq("Y")
        return s

    def _forecast_one(self, s: pd.Series):
        s = s.astype(float)
        if len(s) < 2:
            return float(s.iloc[-1])
        order = (1,0,0) if len(s) >= self.min_points_for_ar1 else (0,0,0)
        try:
            res = ARIMA(s, order=order).fit()
            return float(res.forecast(steps=1).iloc[0])
        except Exception:
            return float(s.iloc[-1])

    def forecast_year(self, df_all, target_year):
        df_all = df_all.copy()
        df_all["nam_hoc"] = df_all["nam_hoc"].astype(int)
        df_all["khoi_group"] = df_all["khoi_group"].astype(str)
        df_all["share_in_year"] = pd.to_numeric(df_all["share_in_year"], errors="coerce")
        df_all = df_all.dropna(subset=["share_in_year"])

        preds = {}
        for g, sub in df_all.groupby("khoi_group"):
            sub = sub.sort_values("nam_hoc")
            sub = sub[sub["nam_hoc"] < int(target_year)]
            if len(sub) == 0:
                continue
            s = self._to_series(sub)
            yhat = self._forecast_one(s)
            preds[g] = max(yhat, 1e-9)

        pred = pd.Series(preds, name=f"share_pred_{int(target_year)}")
        pred = pred / pred.sum()
        return pred.reset_index().rename(columns={"index":"khoi_group"})

    def evaluate_year(self, df_all, year):
        year = int(year)
        df_all = df_all.copy()
        df_all["nam_hoc"] = df_all["nam_hoc"].astype(int)

        df_true = df_all[df_all["nam_hoc"] == year][["khoi_group","share_in_year"]].copy()
        df_true = df_true.groupby("khoi_group", as_index=False)["share_in_year"].mean()
        df_true = df_true.rename(columns={"share_in_year":"share_true"})

        df_pred = self.forecast_year(df_all, target_year=year)
        pred_col = f"share_pred_{year}"

        df_eval = df_true.merge(df_pred, on="khoi_group", how="outer").fillna(0)
        df_eval["abs_err"] = (df_eval["share_true"] - df_eval[pred_col]).abs()
        mae = df_eval["abs_err"].mean()
        return df_eval.sort_values("abs_err", ascending=False).reset_index(drop=True), mae


In [50]:
# TRAIN/EVAL ARIMA
arima = ArimaShareModel()
eval_arima, mae_arima = arima.evaluate_year(df, year=df["nam_hoc"].max())
print(eval_arima.head(10))
print("MAE ARIMA:", round(mae_arima, 4))

pred_2026_arima = arima.forecast_year(df, target_year=df["nam_hoc"].max() + 1)
print(pred_2026_arima.head(10))



  khoi_group  share_true  share_pred_2025   abs_err
0        D07    0.314339         0.184648  0.129690
1        D01    0.312425         0.184078  0.128347
2        C00    0.268857         0.142757  0.126101
3        A00    0.143501         0.068976  0.074525
4        A01    0.129159         0.066428  0.062731
5        D08    0.004081         0.066093  0.062011
6      OTHER    0.039501         0.101124  0.061623
7        D09    0.063478         0.117214  0.053736
8        B00    0.040420         0.068682  0.028262
MAE ARIMA: 0.0808
  khoi_group  share_pred_2026
0        A00         0.080405
1        A01         0.076512
2        B00         0.069601
3        C00         0.163539
4        D01         0.207378
5        D07         0.208117
6        D08         0.063423
7        D09         0.118222
8      OTHER         0.012803
