# This is a sample Jupyter Notebook

Below is an example of a code cell. 
Put your cursor into the cell and press Shift+Enter to execute it and select the next one, or click 'Run Cell' button.

Press Double Shift to search everywhere for classes, files, tool windows, actions, and settings.

To learn more about Jupyter Notebooks in PyCharm, see [help](https://www.jetbrains.com/help/pycharm/ipython-notebook-support.html).
For an overview of PyCharm, go to Help -> Learn IDE features or refer to [our documentation](https://www.jetbrains.com/help/pycharm/getting-started.html).

In [2]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split

from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report
from sklearn.model_selection import StratifiedKFold, GridSearchCV
from sklearn.metrics import classification_report, confusion_matrix, f1_score, roc_auc_score, accuracy_score, roc_curve, auc
from skopt import BayesSearchCV
from skopt.space import Integer, Categorical, Real



In [3]:
# Load the rainfall CSV into the working DataFrame.
df = pd.read_csv(filepath_or_buffer="spatial-rain-hii.csv")

In [4]:
# Preview the first five rows of the freshly loaded data (head() defaults to 5 rows).
df.head()

Unnamed: 0,YEAR,MONTH,PROV_ID,PROV_T,MinRain,MaxRain,AvgRain
0,2018,1,10,กรุงเทพมหานคร,54.299999,257.230011,142.119137
1,2018,1,11,สมุทรปราการ,76.25,256.100006,137.302046
2,2018,1,12,นนทบุรี,38.360001,161.470001,113.433771
3,2018,1,13,ปทุมธานี,51.439999,116.5,82.901688
4,2018,1,14,พระนครศรีอยุธยา,8.85,88.589996,39.960089


Cleaning

1. Check missing values

In [5]:

# Summary statistics for every column (numeric and non-numeric alike).
print(df.describe(include="all"))

# Explicit per-column count of missing values, which is what this step is meant to verify.
print("\nMissing values per column:")
print(df.isna().sum())

               YEAR        MONTH      PROV_ID         PROV_T      MinRain  \
count   7084.000000  7084.000000  7084.000000           7084  7084.000000   
unique          NaN          NaN          NaN             77          NaN   
top             NaN          NaN          NaN  กรุงเทพมหานคร          NaN   
freq            NaN          NaN          NaN             92          NaN   
mean    2021.347826     6.326087    51.129870            NaN    85.751294   
std        2.218826     3.414115    24.943954            NaN    99.093571   
min     2018.000000     1.000000    10.000000            NaN     0.000000   
25%     2019.000000     3.000000    31.000000            NaN     4.765000   
50%     2021.000000     6.000000    50.000000            NaN    56.670000   
75%     2023.000000     9.000000    72.000000            NaN   135.262496   
max     2025.000000    12.000000    96.000000            NaN  1264.140000   

            MaxRain      AvgRain  
count   7084.000000  7084.000000  
uniqu

2. ตัด Column PROV_T

In [6]:
# Remove the PROV_T column; PROV_ID stays in the frame.
df = df.drop(["PROV_T"], axis="columns")


In [7]:
# Confirm that PROV_T is no longer among the columns (head() defaults to 5 rows).
df.head()

Unnamed: 0,YEAR,MONTH,PROV_ID,MinRain,MaxRain,AvgRain
0,2018,1,10,54.299999,257.230011,142.119137
1,2018,1,11,76.25,256.100006,137.302046
2,2018,1,12,38.360001,161.470001,113.433771
3,2018,1,13,51.439999,116.5,82.901688
4,2018,1,14,8.85,88.589996,39.960089


3. สร้าง Label: AvgRain >= 90 คือฝนตก (Rain = 1)

In [8]:
# Binary target: 1 when AvgRain is at least 90, otherwise 0.
df["Rain"] = df["AvgRain"].ge(90).astype(int)

In [9]:
# Inspect the new Rain column next to AvgRain (head() defaults to 5 rows).
df.head()

Unnamed: 0,YEAR,MONTH,PROV_ID,MinRain,MaxRain,AvgRain,Rain
0,2018,1,10,54.299999,257.230011,142.119137,1
1,2018,1,11,76.25,256.100006,137.302046,1
2,2018,1,12,38.360001,161.470001,113.433771,1
3,2018,1,13,51.439999,116.5,82.901688,0
4,2018,1,14,8.85,88.589996,39.960089,0


Feature Engineering

1. Add column

- Seasonality

In [10]:
# Map MONTH onto a 12-step circle and store its sine and cosine as features.
month_angle = 2 * np.pi * df["MONTH"] / 12
df["month_sin"] = np.sin(month_angle)
df["month_cos"] = np.cos(month_angle)

In [11]:
# Inspect the new month_sin / month_cos columns (head() defaults to 5 rows).
df.head()

Unnamed: 0,YEAR,MONTH,PROV_ID,MinRain,MaxRain,AvgRain,Rain,month_sin,month_cos
0,2018,1,10,54.299999,257.230011,142.119137,1,0.5,0.866025
1,2018,1,11,76.25,256.100006,137.302046,1,0.5,0.866025
2,2018,1,12,38.360001,161.470001,113.433771,1,0.5,0.866025
3,2018,1,13,51.439999,116.5,82.901688,0,0.5,0.866025
4,2018,1,14,8.85,88.589996,39.960089,0,0.5,0.866025


In [12]:
# MONTH is now represented by month_sin / month_cos, so remove the raw column.
df = df.drop(["MONTH"], axis="columns")

In [13]:
# Confirm that MONTH is no longer among the columns.
df.head(n=3)

Unnamed: 0,YEAR,PROV_ID,MinRain,MaxRain,AvgRain,Rain,month_sin,month_cos
0,2018,10,54.299999,257.230011,142.119137,1,0.5,0.866025
1,2018,11,76.25,256.100006,137.302046,1,0.5,0.866025
2,2018,12,38.360001,161.470001,113.433771,1,0.5,0.866025


# Train/Test Split

In [14]:
# Features: province id plus the sine/cosine month encoding. Target: the binary Rain label.
X = df.loc[:, ["PROV_ID", "month_sin", "month_cos"]]
y = df.loc[:, "Rain"]

# Hold out 20% of the rows for testing, stratified on the label, with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    stratify=y,
    test_size=0.2,
    random_state=42,
)

In [15]:
# Print the first rows of each split, in the order train features, train labels,
# test features, test labels.
for heading, part in (
    ("X_train sample:", X_train),
    ("\ny_train sample:", y_train),
    ("\nX_test sample:", X_test),
    ("\ny_test sample:", y_test),
):
    print(heading)
    print(part.head())


X_train sample:
      PROV_ID  month_sin     month_cos
3528       80  -0.866025  5.000000e-01
6240       13  -0.866025  5.000000e-01
670        67  -1.000000 -1.836970e-16
4248       23  -0.866025 -5.000000e-01
4836       77   1.000000  6.123234e-17

y_train sample:
3528    1
6240    1
670     1
4248    1
4836    0
Name: Rain, dtype: int64

X_test sample:
      PROV_ID  month_sin     month_cos
3426       50  -1.000000 -1.836970e-16
3915       82   1.000000  6.123234e-17
6698       96   1.000000  6.123234e-17
302        91   0.866025 -5.000000e-01
1897       62   0.500000  8.660254e-01

y_test sample:
3426    1
3915    1
6698    0
302     1
1897    0
Name: Rain, dtype: int64


********** Training ************
and scaling

# Logistic Regression

In [16]:
# # Disabled: logistic-regression baseline (StandardScaler followed by LogisticRegression),
# # fitted on the training split.
# log_pipe = Pipeline([
#     ("scaler", StandardScaler()),
#     ("clf", LogisticRegression(max_iter=1000))
# ])
# log_pipe.fit(X_train, y_train)

In [17]:
# # Disabled: evaluation of the logistic-regression baseline on the test split.
# # NOTE(review): if re-enabled, `auc = ...` below rebinds the `auc` function
# # imported from sklearn.metrics; consider a different variable name.
# y_pred = log_pipe.predict(X_test)
# y_proba = log_pipe.predict_proba(X_test)[:, 1]   # use the probability of Rain=1
#
# print("=== Logistic Regression ===")
# print(classification_report(y_test, y_pred, digits=4))
# print(confusion_matrix(y_test, y_pred))
#
# # add F1 and ROC-AUC
# f1 = f1_score(y_test, y_pred)
# auc = roc_auc_score(y_test, y_proba)
# print(f"F1-score (test): {f1:.4f}")
# print(f"ROC-AUC   (test): {auc:.4f}")

# Decision Tree

In [18]:
# # Disabled: decision-tree baseline fitted on the training split.
# tree_pipe = Pipeline([
#     ("scaler", StandardScaler()),   # not actually needed for a tree
#     ("clf", DecisionTreeClassifier(random_state=42))
# ])
# tree_pipe.fit(X_train, y_train)

In [19]:
# # Disabled: evaluation of the decision-tree baseline on the test split.
# # NOTE(review): y_proba is not recomputed from tree_pipe in this cell, so the
# # ROC-AUC below would reuse probabilities left over from another cell. Compute
# # tree_pipe.predict_proba(X_test)[:, 1] first if this cell is re-enabled.
# # NOTE(review): `auc = ...` also rebinds the `auc` function imported from sklearn.metrics.
# y_pred = tree_pipe.predict(X_test)
# print("=== Decision Tree ===")
# print(classification_report(y_test, y_pred, digits=4))
# print(confusion_matrix(y_test, y_pred))
# # add F1 and ROC-AUC
# f1 = f1_score(y_test, y_pred)
# auc = roc_auc_score(y_test, y_proba)
# print(f"F1-score (test): {f1:.4f}")
# print(f"ROC-AUC   (test): {auc:.4f}")

# Random Forest

In [20]:
# # Disabled: random-forest baseline with default hyperparameters, fitted on the training split.
# rf_pipe = Pipeline([
#     ("scaler", StandardScaler()),   # likewise not needed for a forest
#     ("clf", RandomForestClassifier(random_state=42))
# ])
# rf_pipe.fit(X_train, y_train)

In [21]:
# # Disabled: evaluation of the random-forest baseline on the test split.
# # NOTE(review): y_proba is not recomputed from rf_pipe in this cell, so the
# # ROC-AUC below would reuse probabilities left over from another cell. Compute
# # rf_pipe.predict_proba(X_test)[:, 1] first if this cell is re-enabled.
# # NOTE(review): `auc = ...` also rebinds the `auc` function imported from sklearn.metrics.
# y_pred = rf_pipe.predict(X_test)
# print("=== Random Forest ===")
# print(classification_report(y_test, y_pred, digits=4))
# print(confusion_matrix(y_test, y_pred))
# # add F1 and ROC-AUC
# f1 = f1_score(y_test, y_pred)
# auc = roc_auc_score(y_test, y_proba)
# print(f"F1-score (test): {f1:.4f}")
# print(f"ROC-AUC   (test): {auc:.4f}")

# Hyperparameter Tuning

In [22]:
# Pipeline ที่ fix ค่า hyperparameters ตาม best params
rf_best = Pipeline([
    ("scaler", StandardScaler()),
    ("clf", RandomForestClassifier(
        random_state=42,
        n_jobs=-1,
        class_weight=None,
        max_depth=10,
        max_features="sqrt",
        min_samples_leaf=6,
        min_samples_split=2,
        n_estimators=600
    ))
])

# เทรนโมเดล
rf_best.fit(X_train, y_train)

# ถ้าอยากดูผล
y_pred = rf_best.predict(X_test)

print("Accuracy:", (y_pred == y_test).mean())

Accuracy: 0.8913196894848271


### - Random Forest