In [1]:
import pandas as pd
# Read processed_rice.csv (path is relative to the working directory) into a DataFrame.
df = pd.read_csv(filepath_or_buffer="processed_rice.csv")

In [2]:
# Positional split: every column but the last goes to X (features),
# and the last column alone becomes y (target).
X, y = df.iloc[:, :-1], df.iloc[:, -1]

In [3]:
from sklearn.model_selection import train_test_split, StratifiedKFold, GridSearchCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, f1_score
# Candidate hyperparameter values; bestRFClassifier feeds both lists to the grid search.
n_estimators = [
    10, 30, 60, 100,
]  # tried as RandomForestClassifier(n_estimators=...)
max_leaf_nodes = [
    6, 12,
]  # tried as RandomForestClassifier(max_leaf_nodes=...)

def bestRFClassifier(
    X,
    y,
    *,
    n_estimators_grid=None,
    max_leaf_nodes_grid=None,
    test_size=0.2,
    n_splits=10,
    random_state=0,
    n_jobs=1,
):
    """Tune a random forest by grid search and score it on a held-out split.

    The data is split into stratified train/test parts, a grid search with
    stratified k-fold cross-validation is run on the training part only, and
    the refitted best model is then evaluated once on the test part.

    Parameters
    ----------
    X : feature table accepted by scikit-learn (e.g. a DataFrame).
    y : class labels aligned with ``X``; also used for stratification.
    n_estimators_grid : sequence of int, optional
        Candidate ``n_estimators`` values. Defaults to the module-level
        ``n_estimators`` list.
    max_leaf_nodes_grid : sequence of int, optional
        Candidate ``max_leaf_nodes`` values. Defaults to the module-level
        ``max_leaf_nodes`` list.
    test_size : float, default 0.2
        Share of the data held out for the final evaluation.
    n_splits : int, default 10
        Number of stratified cross-validation folds.
    random_state : int, default 0
        Seed shared by the train/test split, the fold shuffling and the forest.
    n_jobs : int, default 1
        Parallelism passed to ``GridSearchCV``.

    Returns
    -------
    tuple
        ``(best_n_estimators, best_max_leaf_nodes, best_cv_accuracy,
        test_accuracy, test_macro_f1, test_weighted_f1)`` as plain Python
        ``int``/``float`` values.
    """
    # Fall back to the notebook-level candidate lists so that the existing
    # call ``bestRFClassifier(X, y)`` behaves exactly as before.
    if n_estimators_grid is None:
        n_estimators_grid = n_estimators
    if max_leaf_nodes_grid is None:
        max_leaf_nodes_grid = max_leaf_nodes

    # Stratified hold-out split; the test part is never seen by the search.
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=test_size, stratify=y, random_state=random_state
    )

    # Shuffled stratified folds, seeded for reproducibility.
    cvKFold = StratifiedKFold(
        n_splits=n_splits, shuffle=True, random_state=random_state
    )

    # Base model: only the two grid parameters vary during the search.
    rf = RandomForestClassifier(
        criterion="entropy",   # split quality measured by information gain
        max_features="sqrt",   # features considered at each split
        random_state=random_state,
    )

    param_grid = {
        "n_estimators": n_estimators_grid,
        "max_leaf_nodes": max_leaf_nodes_grid,
    }

    # refit=True retrains the best configuration on the whole training part,
    # which is what makes ``best_estimator_`` available below.
    gscv = GridSearchCV(
        estimator=rf,
        param_grid=param_grid,
        scoring="accuracy",
        cv=cvKFold,
        n_jobs=n_jobs,
        refit=True,
        return_train_score=False,
    )
    gscv.fit(X_train, y_train)

    best_model = gscv.best_estimator_
    best_params = gscv.best_params_
    best_cv_acc = float(gscv.best_score_)  # mean CV accuracy of the winner

    # Single evaluation of the refitted model on the held-out test part.
    y_pred = best_model.predict(X_test)
    test_acc = float(accuracy_score(y_test, y_pred))
    macro_f1 = float(f1_score(y_test, y_pred, average="macro"))
    weighted_f1 = float(f1_score(y_test, y_pred, average="weighted"))

    # Cast to built-in types so callers do not receive NumPy scalars.
    return (
        int(best_params["n_estimators"]),
        int(best_params["max_leaf_nodes"]),
        best_cv_acc,
        test_acc,
        macro_f1,
        weighted_f1,
    )


In [4]:
# Run the tune-and-evaluate pipeline once and unpack its six results.
(best_n, best_leaf, cv_acc, tst_acc, macro_f1, weighted_f1) = bestRFClassifier(X, y)

# Hyperparameters are printed as returned; scores are rounded to four decimals.
for label, value in (
    ("RF best n_estimators:", best_n),
    ("RF best max_leaf_nodes:", best_leaf),
    ("RF cross-validation accuracy:", round(cv_acc, 4)),
    ("RF test set accuracy:", round(tst_acc, 4)),
    ("RF test set macro average F1:", round(macro_f1, 4)),
    ("RF test set weighted average F1:", round(weighted_f1, 4)),
):
    print(label, value)

RF best n_estimators: 30
RF best max_leaf_nodes: 6
RF cross-validation accuracy: 0.9411
RF test set accuracy: 0.9429
RF test set macro average F1: 0.9414
RF test set weighted average F1: 0.9427
