# This is a sample Jupyter Notebook

Below is an example of a code cell. 
Put your cursor into the cell and press Shift+Enter to execute it and select the next one, or click 'Run Cell' button.

Press Double Shift to search everywhere for classes, files, tool windows, actions, and settings.

To learn more about Jupyter Notebooks in PyCharm, see [help](https://www.jetbrains.com/help/pycharm/ipython-notebook-support.html).
For an overview of PyCharm, go to Help -> Learn IDE features or refer to [our documentation](https://www.jetbrains.com/help/pycharm/getting-started.html).

In [32]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split

from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report
from sklearn.model_selection import StratifiedKFold, GridSearchCV
from sklearn.metrics import classification_report, confusion_matrix, f1_score, roc_auc_score



In [3]:
df = pd.read_csv("spatial-rain-hii.csv")

In [4]:
df.head(5)

Unnamed: 0,YEAR,MONTH,PROV_ID,PROV_T,MinRain,MaxRain,AvgRain
0,2018,1,10,กรุงเทพมหานคร,54.299999,257.230011,142.119137
1,2018,1,11,สมุทรปราการ,76.25,256.100006,137.302046
2,2018,1,12,นนทบุรี,38.360001,161.470001,113.433771
3,2018,1,13,ปทุมธานี,51.439999,116.5,82.901688
4,2018,1,14,พระนครศรีอยุธยา,8.85,88.589996,39.960089


Cleaning

1. Check Missing value

In [5]:

print(df.describe(include="all"))

               YEAR        MONTH      PROV_ID         PROV_T      MinRain  \
count   7084.000000  7084.000000  7084.000000           7084  7084.000000   
unique          NaN          NaN          NaN             77          NaN   
top             NaN          NaN          NaN  กรุงเทพมหานคร          NaN   
freq            NaN          NaN          NaN             92          NaN   
mean    2021.347826     6.326087    51.129870            NaN    85.751294   
std        2.218826     3.414115    24.943954            NaN    99.093571   
min     2018.000000     1.000000    10.000000            NaN     0.000000   
25%     2019.000000     3.000000    31.000000            NaN     4.765000   
50%     2021.000000     6.000000    50.000000            NaN    56.670000   
75%     2023.000000     9.000000    72.000000            NaN   135.262496   
max     2025.000000    12.000000    96.000000            NaN  1264.140000   

            MaxRain      AvgRain  
count   7084.000000  7084.000000  
uniqu

2.ตัด Column PROV_ID

In [6]:
df = df.drop(columns=["PROV_T"])


In [7]:
df.head(5)

Unnamed: 0,YEAR,MONTH,PROV_ID,MinRain,MaxRain,AvgRain
0,2018,1,10,54.299999,257.230011,142.119137
1,2018,1,11,76.25,256.100006,137.302046
2,2018,1,12,38.360001,161.470001,113.433771
3,2018,1,13,51.439999,116.5,82.901688
4,2018,1,14,8.85,88.589996,39.960089


3.สร้าง Label >= 90 คือตก

In [8]:
df["Rain"] = (df["AvgRain"] >= 90).astype(int)

In [9]:
df.head(5)

Unnamed: 0,YEAR,MONTH,PROV_ID,MinRain,MaxRain,AvgRain,Rain
0,2018,1,10,54.299999,257.230011,142.119137,1
1,2018,1,11,76.25,256.100006,137.302046,1
2,2018,1,12,38.360001,161.470001,113.433771,1
3,2018,1,13,51.439999,116.5,82.901688,0
4,2018,1,14,8.85,88.589996,39.960089,0


Feature Engineering

1.Add coulumn

- Seasonality

In [10]:
df["month_sin"] = np.sin(2*np.pi*df["MONTH"]/12)
df["month_cos"] = np.cos(2*np.pi*df["MONTH"]/12)

In [11]:
df.head(5)

Unnamed: 0,YEAR,MONTH,PROV_ID,MinRain,MaxRain,AvgRain,Rain,month_sin,month_cos
0,2018,1,10,54.299999,257.230011,142.119137,1,0.5,0.866025
1,2018,1,11,76.25,256.100006,137.302046,1,0.5,0.866025
2,2018,1,12,38.360001,161.470001,113.433771,1,0.5,0.866025
3,2018,1,13,51.439999,116.5,82.901688,0,0.5,0.866025
4,2018,1,14,8.85,88.589996,39.960089,0,0.5,0.866025


In [12]:
df = df.drop(columns=["MONTH"])

In [13]:
df.head(3)

Unnamed: 0,YEAR,PROV_ID,MinRain,MaxRain,AvgRain,Rain,month_sin,month_cos
0,2018,10,54.299999,257.230011,142.119137,1,0.5,0.866025
1,2018,11,76.25,256.100006,137.302046,1,0.5,0.866025
2,2018,12,38.360001,161.470001,113.433771,1,0.5,0.866025


# Train/Test Split

In [14]:
X = df[["PROV_ID", "month_sin", "month_cos"]]
y = df["Rain"]

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

In [15]:
print("X_train sample:")
print(X_train.head())

print("\ny_train sample:")
print(y_train.head())

print("\nX_test sample:")
print(X_test.head())

print("\ny_test sample:")
print(y_test.head())


X_train sample:
      PROV_ID  month_sin     month_cos
3528       80  -0.866025  5.000000e-01
6240       13  -0.866025  5.000000e-01
670        67  -1.000000 -1.836970e-16
4248       23  -0.866025 -5.000000e-01
4836       77   1.000000  6.123234e-17

y_train sample:
3528    1
6240    1
670     1
4248    1
4836    0
Name: Rain, dtype: int64

X_test sample:
      PROV_ID  month_sin     month_cos
3426       50  -1.000000 -1.836970e-16
3915       82   1.000000  6.123234e-17
6698       96   1.000000  6.123234e-17
302        91   0.866025 -5.000000e-01
1897       62   0.500000  8.660254e-01

y_test sample:
3426    1
3915    1
6698    0
302     1
1897    0
Name: Rain, dtype: int64


********** Traning ************
and scaling

# Logistic Regression

In [37]:
log_pipe = Pipeline([
    ("scaler", StandardScaler()),
    ("clf", LogisticRegression(max_iter=1000))
])
log_pipe.fit(X_train, y_train)

0,1,2
,steps,"[('scaler', ...), ('clf', ...)]"
,transform_input,
,memory,
,verbose,False

0,1,2
,copy,True
,with_mean,True
,with_std,True

0,1,2
,penalty,'l2'
,dual,False
,tol,0.0001
,C,1.0
,fit_intercept,True
,intercept_scaling,1
,class_weight,
,random_state,
,solver,'lbfgs'
,max_iter,1000


In [38]:
y_pred = log_pipe.predict(X_test)
y_proba = log_pipe.predict_proba(X_test)[:, 1]   # ใช้ proba ของ Rain=1

print("=== Logistic Regression ===")
print(classification_report(y_test, y_pred, digits=4))
print(confusion_matrix(y_test, y_pred))

# เพิ่ม F1 และ ROC-AUC
f1 = f1_score(y_test, y_pred)
auc = roc_auc_score(y_test, y_proba)
print(f"F1-score (test): {f1:.4f}")
print(f"ROC-AUC   (test): {auc:.4f}")

=== Logistic Regression ===
              precision    recall  f1-score   support

           0     0.8571    0.8765    0.8667       664
           1     0.8889    0.8712    0.8799       753

    accuracy                         0.8737      1417
   macro avg     0.8730    0.8738    0.8733      1417
weighted avg     0.8740    0.8737    0.8737      1417

[[582  82]
 [ 97 656]]
F1-score (test): 0.8799
ROC-AUC   (test): 0.9193


# Decision Tree

In [40]:
tree_pipe = Pipeline([
    ("scaler", StandardScaler()),   # จริง ๆ ไม่จำเป็นกับ tree
    ("clf", DecisionTreeClassifier(random_state=42))
])
tree_pipe.fit(X_train, y_train)

0,1,2
,steps,"[('scaler', ...), ('clf', ...)]"
,transform_input,
,memory,
,verbose,False

0,1,2
,copy,True
,with_mean,True
,with_std,True

0,1,2
,criterion,'gini'
,splitter,'best'
,max_depth,
,min_samples_split,2
,min_samples_leaf,1
,min_weight_fraction_leaf,0.0
,max_features,
,random_state,42
,max_leaf_nodes,
,min_impurity_decrease,0.0


In [41]:
y_pred = tree_pipe.predict(X_test)
print("=== Decision Tree ===")
print(classification_report(y_test, y_pred, digits=4))
print(confusion_matrix(y_test, y_pred))
# เพิ่ม F1 และ ROC-AUC
f1 = f1_score(y_test, y_pred)
auc = roc_auc_score(y_test, y_proba)
print(f"F1-score (test): {f1:.4f}")
print(f"ROC-AUC   (test): {auc:.4f}")

=== Decision Tree ===
              precision    recall  f1-score   support

           0     0.8586    0.8780    0.8682       664
           1     0.8902    0.8725    0.8813       753

    accuracy                         0.8751      1417
   macro avg     0.8744    0.8753    0.8747      1417
weighted avg     0.8754    0.8751    0.8752      1417

[[583  81]
 [ 96 657]]
F1-score (test): 0.8813
ROC-AUC   (test): 0.9193


# Random Forest

In [42]:
rf_pipe = Pipeline([
    ("scaler", StandardScaler()),   # ไม่จำเป็นกับ forest เช่นกัน
    ("clf", RandomForestClassifier(random_state=42))
])
rf_pipe.fit(X_train, y_train)

0,1,2
,steps,"[('scaler', ...), ('clf', ...)]"
,transform_input,
,memory,
,verbose,False

0,1,2
,copy,True
,with_mean,True
,with_std,True

0,1,2
,n_estimators,100
,criterion,'gini'
,max_depth,
,min_samples_split,2
,min_samples_leaf,1
,min_weight_fraction_leaf,0.0
,max_features,'sqrt'
,max_leaf_nodes,
,min_impurity_decrease,0.0
,bootstrap,True


In [43]:
y_pred = rf_pipe.predict(X_test)
print("=== Random Forest ===")
print(classification_report(y_test, y_pred, digits=4))
print(confusion_matrix(y_test, y_pred))
# เพิ่ม F1 และ ROC-AUC
f1 = f1_score(y_test, y_pred)
auc = roc_auc_score(y_test, y_proba)
print(f"F1-score (test): {f1:.4f}")
print(f"ROC-AUC   (test): {auc:.4f}")

=== Random Forest ===
              precision    recall  f1-score   support

           0     0.8634    0.8660    0.8647       664
           1     0.8815    0.8792    0.8803       753

    accuracy                         0.8730      1417
   macro avg     0.8724    0.8726    0.8725      1417
weighted avg     0.8730    0.8730    0.8730      1417

[[575  89]
 [ 91 662]]
F1-score (test): 0.8803
ROC-AUC   (test): 0.9193


# Hyperparameter Tunning

## Grid Search

In [46]:
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
scoring = "f1"   # เน้นบาลานซ์ precision/recall ของ Rain=1

def run_grid_search(pipe, grid):
    search = GridSearchCV(
        estimator=pipe,
        param_grid=grid,
        scoring="f1",        # ใช้ f1 ในการเลือกพารามิเตอร์
        cv=cv,
        n_jobs=-1
    )
    search.fit(X_train, y_train)
    best_model = search.best_estimator_
    print("Best params:", search.best_params_)

    # Predict บน test set
    y_pred = best_model.predict(X_test)
    y_proba = best_model.predict_proba(X_test)[:, 1]

    # รายงานผล
    print(classification_report(y_test, y_pred, digits=4))
    print(confusion_matrix(y_test, y_pred))

    # ค่า F1 และ ROC-AUC บน test set
    f1 = f1_score(y_test, y_pred)
    auc = roc_auc_score(y_test, y_proba)
    print(f"F1-score (test): {f1:.4f}")
    print(f"ROC-AUC   (test): {auc:.4f}")

    return best_model

### - Logistic Regession

In [48]:
lr_pipe = Pipeline([
    ("scaler", StandardScaler()),
    ("clf", LogisticRegression(max_iter=4000, solver="lbfgs"))
])

grid_lr = {
    "clf__C": [0.1, 0.5, 1.0, 2.0, 10.0],   # คุมความแรง regularization
    "clf__class_weight": [None, "balanced"] # ถ่วง class ถ้า skew
}

best_lr = run_grid_search(lr_pipe, grid_lr)


Best params: {'clf__C': 0.5, 'clf__class_weight': None}
              precision    recall  f1-score   support

           0     0.8571    0.8765    0.8667       664
           1     0.8889    0.8712    0.8799       753

    accuracy                         0.8737      1417
   macro avg     0.8730    0.8738    0.8733      1417
weighted avg     0.8740    0.8737    0.8737      1417

[[582  82]
 [ 97 656]]
F1-score (test): 0.8799
ROC-AUC   (test): 0.9193


### - Decision Tree

In [36]:
dt_pipe = Pipeline([
    ("scaler", StandardScaler()),
    ("clf", DecisionTreeClassifier(random_state=42))
])

grid_dt = {
    "clf__max_depth": [None, 5, 10, 15],  # กัน overfit
    "clf__min_samples_leaf": [1, 3, 5, 10],
    "clf__class_weight": [None, "balanced"]
}

best_dt = run_grid_search(dt_pipe, grid_dt)


Best params: {'clf__class_weight': None, 'clf__max_depth': 5, 'clf__min_samples_leaf': 1}
              precision    recall  f1-score   support

           0     0.8830    0.8750    0.8790       664
           1     0.8906    0.8977    0.8942       753

    accuracy                         0.8871      1417
   macro avg     0.8868    0.8864    0.8866      1417
weighted avg     0.8871    0.8871    0.8871      1417

[[581  83]
 [ 77 676]]
F1-score (test): 0.8942
ROC-AUC   (test): 0.9432


### - Random Forest

In [49]:
from sklearn.ensemble import RandomForestClassifier

rf_pipe = Pipeline([
    ("scaler", StandardScaler()),
    ("clf", RandomForestClassifier(random_state=42, n_jobs=-1))
])

grid_rf = {
    "clf__n_estimators": [200, 400, 600],  # มากขึ้น = เสถียรขึ้น (แต่ช้าขึ้น)
    "clf__max_depth": [None, 10, 15, 20],
    "clf__min_samples_leaf": [1, 2, 4],
    "clf__max_features": ["sqrt", "log2"], # 'sqrt' มักดีสำหรับ classification
    "clf__class_weight": [None, "balanced"]
}

best_rf = run_grid_search(rf_pipe, grid_rf)


Best params: {'clf__class_weight': None, 'clf__max_depth': 10, 'clf__max_features': 'sqrt', 'clf__min_samples_leaf': 4, 'clf__n_estimators': 400}
              precision    recall  f1-score   support

           0     0.8805    0.8765    0.8785       664
           1     0.8915    0.8951    0.8933       753

    accuracy                         0.8864      1417
   macro avg     0.8860    0.8858    0.8859      1417
weighted avg     0.8864    0.8864    0.8864      1417

[[582  82]
 [ 79 674]]
F1-score (test): 0.8933
ROC-AUC   (test): 0.9481


In [None]:
rf_pipe = Pipeline([
    ("scaler", StandardScaler()),
    ("clf", RandomForestClassifier(random_state=42, n_jobs=-1))
])

grid_rf = {
    # จำนวนต้นไม้ (เยอะขึ้น เสถียรขึ้น แต่ช้าลง)
    "clf__n_estimators": [200, 400, 600, 1000],

    # ความลึกของต้นไม้
    "clf__max_depth": [None, 10, 15, 25, 30],

    # จำนวน sample ขั้นต่ำต่อ split และ leaf
    "clf__min_samples_split": [2, 5, 10],
    "clf__min_samples_leaf": [1, 2, 6],

    # ฟีเจอร์ที่ใช้ตอน split
    "clf__max_features": ["sqrt", "log2", None],

    # ถ่วงน้ำหนักคลาส
    "clf__class_weight": [None, "balanced"]
}

best_rf = run_grid_search(rf_pipe, grid_rf)
