# This is a sample Jupyter Notebook

Below is an example of a code cell. 
Put your cursor into the cell and press Shift+Enter to execute it and select the next one, or click the 'Run Cell' button.

Press Double Shift to search everywhere for classes, files, tool windows, actions, and settings.

To learn more about Jupyter Notebooks in PyCharm, see [help](https://www.jetbrains.com/help/pycharm/ipython-notebook-support.html).
For an overview of PyCharm, go to Help -> Learn IDE features or refer to [our documentation](https://www.jetbrains.com/help/pycharm/getting-started.html).

In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split

from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report
from sklearn.model_selection import StratifiedKFold, GridSearchCV
from sklearn.metrics import classification_report, confusion_matrix, f1_score, roc_auc_score



In [None]:
# Load the raw dataset; the path is relative to the notebook's working directory.
csv_path = "spatial-rain-hii.csv"
df = pd.read_csv(csv_path)

In [None]:
# Preview the first five rows of the freshly loaded data (head() defaults to n=5).
df.head()

Cleaning

1. Check missing values

In [None]:

# Explicit missing-value check: number of NaN entries in each column.
print(df.isna().sum())

# Summary statistics for every column (numeric and non-numeric alike).
print(df.describe(include="all"))

2. ตัด Column PROV_T

In [None]:
# Remove the PROV_T column; PROV_ID stays and is used later as a feature.
# NOTE: re-running this cell without reloading df raises KeyError (column already gone).
df = df.drop("PROV_T", axis="columns")


In [None]:
# Confirm the PROV_T column is gone.
df.head()

3. สร้าง Label: AvgRain >= 90 คือฝนตก (Rain = 1)

In [None]:
# Binary target: 1 when AvgRain is at least 90, otherwise 0.
df["Rain"] = df["AvgRain"].ge(90).astype(int)

In [None]:
# Check the new Rain label column.
df.head()

Feature Engineering

1. Add columns

- Seasonality

In [None]:
# Map MONTH onto a circle with a period of 12 and store its sine/cosine,
# so the month is represented as a cyclical feature rather than a linear one.
# NOTE(review): assumes MONTH holds calendar month numbers — confirm against the data.
month_angle = 2 * np.pi * df["MONTH"] / 12
df["month_sin"] = np.sin(month_angle)
df["month_cos"] = np.cos(month_angle)

In [None]:
# Check the new month_sin / month_cos columns.
df.head()

In [None]:
# The raw MONTH column is replaced by its sine/cosine encoding, so drop it.
# NOTE: re-running this cell without rebuilding df raises KeyError (column already gone).
df = df.drop("MONTH", axis="columns")

In [None]:
# Confirm MONTH has been dropped.
df.head(n=3)

# Train/Test Split

In [None]:
# Features: province id plus the cyclical month encoding. Target: the binary Rain label.
feature_cols = ["PROV_ID", "month_sin", "month_cos"]
X = df[feature_cols]
y = df["Rain"]

# Hold out 20% of the rows for testing. Stratifying on y keeps the class ratio
# the same in both splits; the fixed seed makes the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    stratify=y,
    random_state=42,
)

In [None]:
# Peek at the first rows of each split to confirm features and labels line up.
split_samples = {
    "X_train": X_train,
    "y_train": y_train,
    "X_test": X_test,
    "y_test": y_test,
}
for position, (split_name, split_data) in enumerate(split_samples.items()):
    # Every header after the first is preceded by a blank line.
    lead = "\n" if position else ""
    print(f"{lead}{split_name} sample:")
    print(split_data.head())


********** Training ************
and scaling

# Logistic Regression

In [None]:
# Baseline logistic regression: standardise the features, then fit the classifier.
log_steps = [
    ("scaler", StandardScaler()),
    ("clf", LogisticRegression(max_iter=1000)),
]
log_pipe = Pipeline(steps=log_steps)
log_pipe.fit(X_train, y_train)

In [None]:
# Evaluate the baseline logistic regression on the held-out test split.
y_pred = log_pipe.predict(X_test)
y_proba = log_pipe.predict_proba(X_test)[:, 1]   # probability of the Rain=1 class

# F1 uses the hard predictions; ROC-AUC uses the predicted probabilities.
f1 = f1_score(y_test, y_pred)
auc = roc_auc_score(y_test, y_proba)

print("=== Logistic Regression ===")
print(classification_report(y_test, y_pred, digits=4))
print(confusion_matrix(y_test, y_pred))
print(f"F1-score (test): {f1:.4f}")
print(f"ROC-AUC   (test): {auc:.4f}")

# Decision Tree

In [None]:
# Baseline decision tree. The scaler is kept only so the pipeline has the same
# shape as the others; tree splits do not actually need scaled features.
tree_steps = [
    ("scaler", StandardScaler()),
    ("clf", DecisionTreeClassifier(random_state=42)),
]
tree_pipe = Pipeline(steps=tree_steps)
tree_pipe.fit(X_train, y_train)

In [None]:
# Evaluate the baseline decision tree on the held-out test split.
y_pred = tree_pipe.predict(X_test)
# Probabilities must come from THIS model; reusing y_proba from an earlier cell
# would report another model's ROC-AUC under the decision tree heading.
y_proba = tree_pipe.predict_proba(X_test)[:, 1]   # probability of the Rain=1 class

print("=== Decision Tree ===")
print(classification_report(y_test, y_pred, digits=4))
print(confusion_matrix(y_test, y_pred))

# F1 uses the hard predictions; ROC-AUC uses the predicted probabilities.
f1 = f1_score(y_test, y_pred)
auc = roc_auc_score(y_test, y_proba)
print(f"F1-score (test): {f1:.4f}")
print(f"ROC-AUC   (test): {auc:.4f}")

# Random Forest

In [None]:
# Baseline random forest. As with the single tree, the scaler is not needed by
# the model and is kept only for a uniform pipeline layout.
rf_steps = [
    ("scaler", StandardScaler()),
    ("clf", RandomForestClassifier(random_state=42)),
]
rf_pipe = Pipeline(steps=rf_steps)
rf_pipe.fit(X_train, y_train)

In [None]:
# Evaluate the baseline random forest on the held-out test split.
y_pred = rf_pipe.predict(X_test)
# Probabilities must come from THIS model; reusing y_proba from an earlier cell
# would report another model's ROC-AUC under the random forest heading.
y_proba = rf_pipe.predict_proba(X_test)[:, 1]   # probability of the Rain=1 class

print("=== Random Forest ===")
print(classification_report(y_test, y_pred, digits=4))
print(confusion_matrix(y_test, y_pred))

# F1 uses the hard predictions; ROC-AUC uses the predicted probabilities.
f1 = f1_score(y_test, y_pred)
auc = roc_auc_score(y_test, y_proba)
print(f"F1-score (test): {f1:.4f}")
print(f"ROC-AUC   (test): {auc:.4f}")

# Hyperparameter Tuning

## Grid Search

In [None]:
# Shared cross-validation configuration for every grid search in this notebook.
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
scoring = "f1"   # F1 balances precision and recall for the Rain=1 class


def run_grid_search(pipe, grid, scoring=scoring, cv=cv):
    """Tune ``pipe`` over ``grid`` with cross-validation and report test metrics.

    The search is fitted on the notebook-level ``X_train`` / ``y_train``. The
    best estimator (refitted by ``GridSearchCV``) is then evaluated on the
    notebook-level ``X_test`` / ``y_test``, printing the classification report,
    confusion matrix, F1 and ROC-AUC.

    Parameters
    ----------
    pipe : estimator
        Estimator handed to ``GridSearchCV``. After fitting it must provide
        ``predict`` and ``predict_proba``.
    grid : dict
        Parameter grid passed as ``param_grid``.
    scoring : str or callable, optional
        Metric used to pick the best parameters. Defaults to the ``scoring``
        value defined just above this function (``"f1"``).
    cv : cross-validation splitter, optional
        Defaults to the stratified 5-fold splitter defined just above.

    Returns
    -------
    estimator
        ``search.best_estimator_``: the pipeline refitted with the best
        parameters found.
    """
    search = GridSearchCV(
        estimator=pipe,
        param_grid=grid,
        scoring=scoring,     # use the configured metric instead of a hard-coded literal
        cv=cv,
        n_jobs=-1,
    )
    search.fit(X_train, y_train)
    best_model = search.best_estimator_
    print("Best params:", search.best_params_)
    # Mean cross-validated score of the winning candidate, for comparison
    # with the test-set numbers printed below.
    print(f"Best CV score ({scoring}): {search.best_score_:.4f}")

    # Predict on the test set with the refitted best model.
    y_pred = best_model.predict(X_test)
    y_proba = best_model.predict_proba(X_test)[:, 1]   # probability of the Rain=1 class

    # Report results.
    print(classification_report(y_test, y_pred, digits=4))
    print(confusion_matrix(y_test, y_pred))

    # F1 and ROC-AUC on the test set.
    f1 = f1_score(y_test, y_pred)
    auc = roc_auc_score(y_test, y_proba)
    print(f"F1-score (test): {f1:.4f}")
    print(f"ROC-AUC   (test): {auc:.4f}")

    return best_model

### - Logistic Regression

In [None]:
# Logistic regression pipeline to tune: standardise, then classify.
lr_steps = [
    ("scaler", StandardScaler()),
    ("clf", LogisticRegression(max_iter=4000, solver="lbfgs")),
]
lr_pipe = Pipeline(steps=lr_steps)

# Keys use the "<step>__<param>" form, which is also valid as keyword names.
grid_lr = dict(
    # C is the inverse regularisation strength: smaller values regularise more.
    clf__C=[0.1, 0.5, 1.0, 2.0, 10.0],
    # Optionally re-weight the classes in case the labels are skewed.
    clf__class_weight=[None, "balanced"],
)

best_lr = run_grid_search(lr_pipe, grid_lr)


### - Decision Tree

In [None]:
# Decision tree pipeline to tune.
dt_steps = [
    ("scaler", StandardScaler()),
    ("clf", DecisionTreeClassifier(random_state=42)),
]
dt_pipe = Pipeline(steps=dt_steps)

grid_dt = dict(
    # Limiting depth and leaf size guards against overfitting.
    clf__max_depth=[None, 5, 10, 15],
    clf__min_samples_leaf=[1, 3, 5, 10],
    # Optionally re-weight the classes in case the labels are skewed.
    clf__class_weight=[None, "balanced"],
)

best_dt = run_grid_search(dt_pipe, grid_dt)


### - Random Forest

In [None]:
# NOTE(review): redundant — RandomForestClassifier is already imported in the
# first code cell of this notebook; kept here so the cell's behaviour is unchanged.
from sklearn.ensemble import RandomForestClassifier

# Random forest pipeline to tune.
rf_steps = [
    ("scaler", StandardScaler()),
    ("clf", RandomForestClassifier(random_state=42, n_jobs=-1)),
]
rf_pipe = Pipeline(steps=rf_steps)

grid_rf = dict(
    # More trees give more stable predictions, but training is slower.
    clf__n_estimators=[200, 400, 600],
    clf__max_depth=[None, 10, 15, 20],
    clf__min_samples_leaf=[1, 2, 4],
    # 'sqrt' is usually a good choice for classification.
    clf__max_features=["sqrt", "log2"],
    clf__class_weight=[None, "balanced"],
)

best_rf = run_grid_search(rf_pipe, grid_rf)


In [None]:
# Wider random forest search. This cell overwrites rf_pipe, grid_rf and best_rf
# from the previous cell.
# Cost: 4 * 5 * 3 * 3 * 3 * 2 = 1080 candidates, each fitted once per CV fold
# (5 folds with the cv defined alongside run_grid_search), so expect a long run.
rf_steps = [
    ("scaler", StandardScaler()),
    ("clf", RandomForestClassifier(random_state=42, n_jobs=-1)),
]
rf_pipe = Pipeline(steps=rf_steps)

grid_rf = dict(
    # Number of trees (more is more stable but slower).
    clf__n_estimators=[200, 400, 600, 1000],
    # Maximum depth of each tree.
    clf__max_depth=[None, 10, 15, 25, 30],
    # Minimum number of samples required per split and per leaf.
    clf__min_samples_split=[2, 5, 10],
    clf__min_samples_leaf=[1, 2, 6],
    # Number of features considered at each split.
    clf__max_features=["sqrt", "log2", None],
    # Class weighting.
    clf__class_weight=[None, "balanced"],
)

best_rf = run_grid_search(rf_pipe, grid_rf)
