In [1]:
import pandas as pd

in_path = "SnP_daily_update.csv"   # or your own path
raw = pd.read_csv(in_path, low_memory=False)

# Layout of the export (per the original notes): the CSV header holds field
# names ("Price", "Close", "Close.31", ...), data row 0 holds the ticker of
# each column, and data row 1 holds the "Date" label. The "Price" column is
# the one that actually contains the dates.
if "Price" not in raw.columns:
    raise ValueError(f"{in_path}: expected a 'Price' column holding the dates")

# Row 0 is the ticker of every column.
ticker_row = raw.iloc[0]

# Keep the date column (named "Price") plus every column whose ticker is AMZN.
# strip() tolerates stray whitespace around the ticker cell.
keep_cols = [
    c for c in raw.columns
    if (c == "Price") or (str(ticker_row[c]).strip().upper() == "AMZN")
]

# Fail loudly instead of silently writing a file that contains only dates.
if keep_cols == ["Price"]:
    raise ValueError(f"{in_path}: no column is tagged with ticker AMZN")

amzn_raw = raw[keep_cols].copy()

# Drop the two top rows: the ticker row and the "Date" label row.
amzn = amzn_raw.iloc[2:].copy()

# Clean column names: Price -> Date, Close.31 -> Close (drop the ".xx" suffix
# pandas appends to duplicated header names).
rename_map = {"Price": "Date"}
for c in amzn.columns:
    if c != "Price":
        rename_map[c] = c.split(".")[0]  # Close.31 -> Close

amzn = amzn.rename(columns=rename_map)

# Convert dtypes; unparseable cells become NaT / NaN rather than raising.
amzn["Date"] = pd.to_datetime(amzn["Date"], errors="coerce")
for c in amzn.columns:
    if c != "Date":
        amzn[c] = pd.to_numeric(amzn[c], errors="coerce")

# Sort chronologically and save.
amzn = amzn.sort_values("Date").reset_index(drop=True)
amzn.to_csv("SnP_daily_update_AMZN.csv", index=False)

print("Saved: SnP_daily_update_AMZN.csv")
print(amzn.head())


Saved: SnP_daily_update_AMZN.csv
        Date   Close    High     Low    Open     Volume
0 2010-01-04  6.6950  6.8305  6.6570  6.8125  151998000
1 2010-01-05  6.7345  6.7740  6.5905  6.6715  177038000
2 2010-01-06  6.6125  6.7365  6.5825  6.7300  143576000
3 2010-01-07  6.5000  6.6160  6.4400  6.6005  220604000
4 2010-01-08  6.6760  6.6840  6.4515  6.5280  196610000


In [2]:
import pandas as pd
import numpy as np

in_path = "SnP_daily_update_AMZN.csv"   # or your own path
df = pd.read_csv(in_path)

# Remember the input columns so the engineered ones can be identified later.
base_cols = list(df.columns)

# --- Basic cleaning / types ---
df["Date"] = pd.to_datetime(df["Date"], errors="coerce")
for c in ["Open", "High", "Low", "Close", "Volume"]:
    df[c] = pd.to_numeric(df[c], errors="coerce")

# Rows without a parseable date cannot be ordered, so drop them before sorting.
df = df.dropna(subset=["Date"]).sort_values("Date").reset_index(drop=True)

# --- Feature engineering ---

# 1) 1-day return
df["ret_1"] = df["Close"].pct_change(1)

# 2) k-day returns (Close_t / Close_{t-k} - 1). These are multi-day returns,
#    not lagged copies of the 1-day return.
for k in [2, 3, 4, 5]:
    df[f"ret_{k}"] = df["Close"].pct_change(k)

# 3) Moving average gap (10, 20): relative distance of Close from its SMA
df["sma_10"] = df["Close"].rolling(10).mean()
df["sma_20"] = df["Close"].rolling(20).mean()
df["ma_gap_10"] = (df["Close"] - df["sma_10"]) / df["sma_10"]
df["ma_gap_20"] = (df["Close"] - df["sma_20"]) / df["sma_20"]

# 4) Intraday range percent
df["range_pct"] = (df["High"] - df["Low"]) / df["Close"]

# 5) Rolling volatility (std of daily returns)
df["vol_10"] = df["ret_1"].rolling(10).std()
df["vol_20"] = df["ret_1"].rolling(20).std()

# 6) Volume change
df["vol_chg"] = df["Volume"].pct_change(1)

# 7) Volume relative to average (20)
df["vol_sma_20"] = df["Volume"].rolling(20).mean()
df["vol_ratio_20"] = df["Volume"] / df["vol_sma_20"]

# 8) Close-to-Open return
df["co_ret"] = (df["Close"] - df["Open"]) / df["Open"]

# 9) Upper/Lower wick (normalized by Close)
df["upper_wick"] = (df["High"] - np.maximum(df["Open"], df["Close"])) / df["Close"]
df["lower_wick"] = (np.minimum(df["Open"], df["Close"]) - df["Low"]) / df["Close"]

# Clean infinities in EVERY engineered column. Any ratio above can become
# +/-inf when its denominator is 0 (e.g. vol_chg after a zero-volume day).
# dropna() does not remove inf, so an uncleaned inf would reach the model.
cols_inf = [c for c in df.columns if c not in base_cols]
df[cols_inf] = df[cols_inf].replace([np.inf, -np.inf], np.nan)

# Save
df.to_csv("SnP_daily_update_AMZN_features.csv", index=False)
print("Saved: SnP_daily_update_AMZN_features.csv")


Saved: SnP_daily_update_AMZN_features.csv


In [7]:
import pandas as pd
import numpy as np

# Load the engineered features and put them in chronological order.
features_path = "SnP_daily_update_AMZN_features.csv"
df = (
    pd.read_csv(features_path)
    .assign(Date=lambda frame: pd.to_datetime(frame["Date"]))
    .sort_values("Date")
    .reset_index(drop=True)
)

# Target: next-day return, Close_{t+1} / Close_t - 1.
next_close = df["Close"].shift(-1)
df["y_ret_t1"] = next_close / df["Close"] - 1

# The final row has no following day, so its target is NaN
# (normally dropped at training time).
print(df.loc[:, ["Date", "Close", "y_ret_t1"]].tail())

out_path = "SnP_daily_update_AMZN_features_with_target.csv"
df.to_csv(out_path, index=False)



           Date       Close  y_ret_t1
4044 2026-02-02  242.960007 -0.017863
4045 2026-02-03  238.619995 -0.023594
4046 2026-02-04  232.990005 -0.044208
4047 2026-02-05  222.690002 -0.055548
4048 2026-02-06  210.320007       NaN


In [11]:
import pandas as pd
import numpy as np

from sklearn.model_selection import TimeSeriesSplit
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from sklearn.ensemble import RandomForestRegressor
from sklearn.inspection import permutation_importance
from sklearn.feature_selection import SelectKBest, f_regression
from sklearn.linear_model import LinearRegression
from sklearn.feature_selection import RFE

# =========================
# 1) Load data
# =========================
path = "SnP_daily_update_AMZN_features_with_target.csv"
df = pd.read_csv(path)

# Parse dates, discard rows whose date failed to parse, order chronologically.
df["Date"] = pd.to_datetime(df["Date"], errors="coerce")
df = df[df["Date"].notna()]
df = df.sort_values("Date")
df = df.reset_index(drop=True)

# =========================
# 2) Storytelling: the target and the features used
# =========================
# Target: y_ret_t1 = tomorrow's return (Close_{t+1} / Close_t - 1).
# Why a return: the model learns "changes" more easily than "price levels"
# (prices trend over long horizons and are not stationary).
target = "y_ret_t1"

# Features (>= 5), grouped by the story each group tells:
#   momentum/trend      - ret_1..ret_5, ma_gap_10/20 (how far price sits above/below its average)
#   volatility/risk     - range_pct, vol_10/20 (how wide today's swing is / how volatile the period is)
#   volume confirmation - vol_chg, vol_ratio_20 (confirmation from traded volume)
#   candle behaviour    - co_ret, upper_wick, lower_wick (buying/selling pressure within the day's bar)
feature_groups = {
    "momentum_trend": ["ret_1", "ret_2", "ret_3", "ret_4", "ret_5", "ma_gap_10", "ma_gap_20"],
    "volatility_risk": ["range_pct", "vol_10", "vol_20"],
    "volume_confirmation": ["vol_chg", "vol_ratio_20"],
    "candle_behavior": ["co_ret", "upper_wick", "lower_wick"],
}
feature_cols = [name for group in feature_groups.values() for name in group]

# Remove rows where a feature or the target is still NaN
# (produced by the rolling windows, e.g. SMA / volatility).
required_cols = feature_cols + [target]
df_model = df.dropna(subset=required_cols).reset_index(drop=True)

X = df_model[feature_cols].copy()
y = df_model[target].copy()

print("Data used for modeling:", df_model.shape)
print("Features used:", len(feature_cols), feature_cols)
print("Target:", target)

# =========================
# 3) Time-series train/test (guards against leakage)
# =========================
# TimeSeriesSplit keeps every training window strictly before its test window.
tscv = TimeSeriesSplit(n_splits=5)

# Example model: RandomForestRegressor.
# Rationale: captures non-linear relationships and offers model-based
# feature importance.
rf_params = dict(n_estimators=400, random_state=42, n_jobs=-1)
model = RandomForestRegressor(**rf_params)

# Walk-forward evaluation: one list of scores per metric, filled fold by fold.
metric_labels = {"MAE": " MAE :", "RMSE": " RMSE:", "R2": " R2  :"}
fold_scores = {metric: [] for metric in metric_labels}

for fold, (train_idx, test_idx) in enumerate(tscv.split(X), start=1):
    model.fit(X.iloc[train_idx], y.iloc[train_idx])

    y_true = y.iloc[test_idx]
    y_hat = model.predict(X.iloc[test_idx])

    current = {
        "MAE": mean_absolute_error(y_true, y_hat),
        "RMSE": np.sqrt(mean_squared_error(y_true, y_hat)),
        "R2": r2_score(y_true, y_hat),
    }

    print(f"\nFold {fold}")
    for metric, value in current.items():
        fold_scores[metric].append(value)
        print(metric_labels[metric], value)

print("\nAverage")
for metric, values in fold_scores.items():
    print(metric_labels[metric], np.mean(values))

# =========================
# 4) Feature Importance (per the hint)
# =========================
# 4.2 Permutation importance (highly recommended) -- computed OUT OF SAMPLE.
# Principle: shuffle one feature at a time and measure how much the score
# degrades; a large drop means the feature really matters.
# The forest is fitted on the training part of the last walk-forward fold and
# the shuffling is scored on that fold's held-out rows. Scoring on the rows
# the forest was fitted on would mostly measure what it memorised, not what
# helps it predict unseen days.
holdout_train_idx, holdout_test_idx = list(tscv.split(X))[-1]
model.fit(X.iloc[holdout_train_idx], y.iloc[holdout_train_idx])
perm = permutation_importance(
    model, X.iloc[holdout_test_idx], y.iloc[holdout_test_idx],
    n_repeats=10,
    random_state=42,
    n_jobs=-1,
    scoring="neg_mean_squared_error"
)
imp_perm = pd.Series(perm.importances_mean, index=feature_cols).sort_values(ascending=False)

# 4.1 Model-based importance (from the RandomForest)
# Answers "which features does the model split on most / which reduce split
# error most". Fitted on all rows, so `model` ends this section in the same
# state as before (trained on the full X, y).
model.fit(X, y)
imp_rf = pd.Series(model.feature_importances_, index=feature_cols).sort_values(ascending=False)

print("\n[Feature Importance] RandomForest (model-based)")
print(imp_rf.head(10))

print("\n[Feature Importance] Permutation (higher = more important)")
print(imp_perm.head(10))

# =========================
# 5) Feature Insight (optional extra): SelectKBest + RFE
# =========================
# 5.1 SelectKBest: linear relationship of each feature with the target, one
# feature at a time (fast, but blind to interactions).
k = 8
skb = SelectKBest(score_func=f_regression, k=k)
skb.fit(X, y)
skb_scores = pd.Series(skb.scores_, index=feature_cols).sort_values(ascending=False)

print(f"\n[Feature Insight] SelectKBest (Top {k} by f_regression)")
print(skb_scores.head(k))

# 5.2 RFE: iteratively removes the weakest feature according to a base model
# (LinearRegression). Good for exposing a "core feature set", but be careful
# when features are strongly correlated.
# RFE ranks features by coefficient magnitude, which depends on each feature's
# scale. Standardise first so the ranking reflects effect size rather than
# units. A zero std is replaced by 1 to avoid dividing by zero.
feature_std = X.std(ddof=0).replace(0, 1.0)
X_scaled = (X - X.mean()) / feature_std

rfe = RFE(estimator=LinearRegression(), n_features_to_select=8)
rfe.fit(X_scaled, y)
rfe_rank = pd.Series(rfe.ranking_, index=feature_cols).sort_values()

print("\n[Feature Insight] RFE ranking (1 = selected)")
print(rfe_rank.head(12))

# =========================
# 6) Short text summary (can be pasted into the report)
# =========================
# Emitted one line at a time, in order: header, target rationale,
# feature design, importance methodology.
report_lines = (
    "\n--- Short report summary (ใช้ในรายงานได้) ---",
    "Target ที่ใช้คือ y_ret_t1 (ผลตอบแทนวันพรุ่งนี้) เพราะช่วยให้โมเดลเรียนรู้การเปลี่ยนแปลงได้ดีกว่าระดับราคา.",
    "ฟีเจอร์ถูกออกแบบให้เล่าเรื่อง 4 มิติ: trend/momentum, volatility/risk, volume confirmation, และ candle behavior.",
    "Feature importance ใช้ 2 มุมมอง: model-based (RF) และ permutation (สลับฟีเจอร์ดูผลกระทบ) เพื่อความน่าเชื่อถือ.",
)
for report_line in report_lines:
    print(report_line)


Data used for modeling: (4028, 25)
Features used: 15 ['ret_1', 'ret_2', 'ret_3', 'ret_4', 'ret_5', 'ma_gap_10', 'ma_gap_20', 'range_pct', 'vol_10', 'vol_20', 'vol_chg', 'vol_ratio_20', 'co_ret', 'upper_wick', 'lower_wick']
Target: y_ret_t1

Fold 1
 MAE : 0.013963907039339087
 RMSE: 0.020951408042720807
 R2  : -0.16558354171099476

Fold 2
 MAE : 0.01183690644808675
 RMSE: 0.01764176539546164
 R2  : -0.07379682156729195

Fold 3
 MAE : 0.014923568987767253
 RMSE: 0.020867118015535615
 R2  : 0.018073031859547672

Fold 4
 MAE : 0.0180111820673533
 RMSE: 0.0248311174814252
 R2  : -0.057356091126401276

Fold 5
 MAE : 0.014685867091944383
 RMSE: 0.020489822351492524
 R2  : -0.08040567336743987

Average
 MAE : 0.014684286326898156
 RMSE: 0.020956246257327157
 R2  : -0.07181381918251603

[Feature Importance] RandomForest (model-based)
vol_chg         0.091458
vol_ratio_20    0.084983
lower_wick      0.081788
ma_gap_20       0.077214
upper_wick      0.074126
range_pct       0.071640
co_ret       

In [19]:
import pandas as pd
import numpy as np

from sklearn.feature_selection import SelectKBest, f_regression, RFE
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import LinearRegression

# =========================
# (A) Print settings: stop pandas from truncating rows/columns
# =========================
display_options = {
    "display.max_rows": None,
    "display.max_columns": None,
    "display.width": 200,
    "display.max_colwidth": None,
}
for option_name, option_value in display_options.items():
    pd.set_option(option_name, option_value)

# -----------------------------
# Load prepared dataset
# -----------------------------
path = "SnP_daily_update_AMZN_features_with_target.csv"
df = pd.read_csv(path)

# Parse dates, keep only rows with a valid date, order chronologically.
df["Date"] = pd.to_datetime(df["Date"], errors="coerce")
has_date = df["Date"].notna()
df = df[has_date].sort_values("Date").reset_index(drop=True)

# Same 15 features as the modelling cell, assembled group by group.
feature_cols = (
    [f"ret_{horizon}" for horizon in range(1, 6)]
    + ["ma_gap_10", "ma_gap_20"]
    + ["range_pct", "vol_10", "vol_20"]
    + ["vol_chg", "vol_ratio_20"]
    + ["co_ret", "upper_wick", "lower_wick"]
)
target = "y_ret_t1"

# Drop rows that still have NaN (rolling features + last day target).
required_cols = feature_cols + [target]
df_model = df.dropna(subset=required_cols).reset_index(drop=True)
X = df_model[feature_cols]
y = df_model[target]

# -----------------------------
# 6.1 SelectKBest (Univariate)
# -----------------------------
k = 8
skb = SelectKBest(score_func=f_regression, k=k)
skb.fit(X, y)

# Scores for every feature (sorted), plus a tidy table of the top k.
skb_scores = pd.Series(skb.scores_, index=feature_cols).sort_values(ascending=False)
skb_top = skb_scores.head(k).reset_index()
skb_top.columns = ["feature", "selectkbest_fscore"]

# -----------------------------
# 6.2 RandomForest model-based importance
# -----------------------------
rf = RandomForestRegressor(
    n_estimators=400,
    random_state=42,
    n_jobs=-1,
    max_depth=8,
    min_samples_leaf=5
)
rf.fit(X, y)

rf_imp = pd.Series(rf.feature_importances_, index=feature_cols).sort_values(ascending=False)
rf_top = rf_imp.reset_index()
rf_top.columns = ["feature", "rf_importance"]  # all features, not cut to a top 10

# -----------------------------
# 6.3 RFE (LinearRegression)
# -----------------------------
# RFE eliminates features by coefficient magnitude, which depends on each
# feature's scale. Standardise first so the ranking reflects effect size
# rather than units. A zero std is replaced by 1 to avoid dividing by zero.
feature_std = X.std(ddof=0).replace(0, 1.0)
X_scaled = (X - X.mean()) / feature_std

rfe = RFE(estimator=LinearRegression(), n_features_to_select=8)
rfe.fit(X_scaled, y)

rfe_rank = pd.Series(rfe.ranking_, index=feature_cols).sort_values()
rfe_all = rfe_rank.reset_index()
rfe_all.columns = ["feature", "rfe_rank"]

# -----------------------------
# (B) Un-truncated display: print every requested row in full
# -----------------------------
def _print_section(header_lines, table_heading, table_text):
    """Print a method's description lines, then its table heading, table and a blank line."""
    for header_line in header_lines:
        print(header_line)
    print(table_heading)
    print(table_text)
    print()


banner = "======================================================"
first_date = df_model["Date"].min().date()
last_date = df_model["Date"].max().date()

print(banner)
print("6) Methods (Medium-style) + Results for AMZN (FULL OUTPUT)")
print("Dataset:", path)
print("Usable rows:", len(df_model), " | Features:", len(feature_cols), " | Target:", target)
print("Date range:", first_date, "to", last_date)
print(banner + "\n")

_print_section(
    [
        "6.1 Univariate Selection (SelectKBest with f_regression)",
        "Idea: score each feature independently vs target (fast, easy).",
        "Limit: doesn't capture interactions between features.\n",
    ],
    "Top 8 features (highest f_score):",
    skb_top.to_string(index=False),
)

_print_section(
    [
        "6.2 Model-based Importance (RandomForest feature_importances_)",
        "Idea: features that reduce error in tree splits get higher importance.",
        "Limit: can be biased; importance may spread across correlated features.\n",
    ],
    "ALL features (sorted):",
    rf_imp.to_string(),  # printed as the full Series
)

_print_section(
    [
        "6.3 RFE (Recursive Feature Elimination) with LinearRegression",
        "Idea: repeatedly remove weakest features to keep a core subset.",
        "Limit: can be unstable when features are highly correlated.\n",
    ],
    "ALL features (sorted by rank):",
    rfe_all.to_string(index=False),
)

# Features RFE kept in its final subset carry rank 1.
is_selected = rfe_all["rfe_rank"] == 1
selected = rfe_all.loc[is_selected].copy()
print("Selected core features (rank=1):")
print(selected.to_string(index=False))
print()

# -----------------------------
# (C) Export: merge the results of all 3 methods into a single CSV
# -----------------------------
# One row per feature, using the feature name as the key:
#   selectkbest_fscore / rf_importance / rfe_rank - per-method values for every feature
#   selectkbest_top8 / rfe_selected               - 0/1 flags, handy for slides
# Rows are ordered by RF importance first, then by the SelectKBest score.
final = (
    pd.DataFrame({"feature": feature_cols})
    .assign(
        selectkbest_fscore=lambda t: t["feature"].map(skb_scores),
        rf_importance=lambda t: t["feature"].map(rf_imp),
        rfe_rank=lambda t: t["feature"].map(rfe_rank).astype(int),
        selectkbest_top8=lambda t: t["feature"].isin(skb_top["feature"]).astype(int),
        rfe_selected=lambda t: (t["rfe_rank"] == 1).astype(int),
    )
    .sort_values(["rf_importance", "selectkbest_fscore"], ascending=False)
    .reset_index(drop=True)
)

out_csv = "AMZN_methods_all_in_one.csv"
final.to_csv(out_csv, index=False)

print("Exported ONE CSV:", out_csv)
print("\nPreview exported table:")
print(final.head(15).to_string(index=False))



6) Methods (Medium-style) + Results for AMZN (FULL OUTPUT)
Dataset: SnP_daily_update_AMZN_features_with_target.csv
Usable rows: 4028  | Features: 15  | Target: y_ret_t1
Date range: 2010-02-02 to 2026-02-05

6.1 Univariate Selection (SelectKBest with f_regression)
Idea: score each feature independently vs target (fast, easy).
Limit: doesn't capture interactions between features.

Top 8 features (highest f_score):
   feature  selectkbest_fscore
lower_wick           18.422112
upper_wick            9.718989
 ma_gap_20            6.302124
 ma_gap_10            4.692296
     ret_4            3.574626
     ret_3            3.041269
     ret_5            2.657280
    co_ret            1.728718

6.2 Model-based Importance (RandomForest feature_importances_)
Idea: features that reduce error in tree splits get higher importance.
Limit: can be biased; importance may spread across correlated features.

ALL features (sorted):
lower_wick      0.106624
vol_chg         0.104709
ma_gap_20       0.100057