In [2]:
import os, optuna, joblib, pandas as pd, numpy as np
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import cross_val_score,cross_validate, KFold
from sklearn import preprocessing
import gc

In [6]:
# Root directory (absolute local path) that holds one sub-directory per simulation.
AFGRUNDIR = "/media/vsevolod/T7/work/prj_kn_afterglow/"

# Simulation whose collated table is loaded; the commented line is an alternative run.
# sim = {}; sim["name"] = "SFHoTim276_13_14_0025_150mstg_B0_HLLC"
sim = {}; sim["name"] = "SFHoTim276_135_135_45km_150mstg_B0_FUKA"
collated_file_path = AFGRUNDIR + sim["name"] + '/' + "collated.csv"

assert os.path.isfile(collated_file_path), "Collated file not found"
df = pd.read_csv(collated_file_path, index_col=0)
print(f"File loaded: {collated_file_path}")
# DataFrame.info() writes its report itself and returns None, so it must not be
# interpolated into the f-string above (that only appends the text "None").
df.info(memory_usage='deep')

# Every column except the target ("flux") and "text" is a model input.
# "text" is excluded because a later cell filters on it and then drops it from df;
# keeping it here would make the feature selection fail on a top-to-bottom run.
features_names = [col for col in df.columns if col not in ("flux", "text")] # here time is included

<class 'pandas.core.frame.DataFrame'>
Index: 6480000 entries, 0 to 6479999
Data columns (total 10 columns):
 #   Column     Dtype  
---  ------     -----  
 0   eps_e      float64
 1   eps_b      float64
 2   eps_t      float64
 3   p          float64
 4   theta_obs  float64
 5   n_ism      float64
 6   freq       float64
 7   time       float64
 8   flux       float64
 9   text       int64  
dtypes: float64(9), int64(1)
memory usage: 543.8 MB
None
File loaded: /media/vsevolod/T7/work/prj_kn_afterglow/SFHoTim276_135_135_45km_150mstg_B0_FUKA/collated.csv None


# Explore

In [18]:
# Show which "text" values are present, keep only the rows with text == 32 and
# remove the column afterwards (it is constant once the filter is applied).
# The guard makes the cell safe to re-run: once "text" is gone, it is a no-op
# instead of a KeyError.
if "text" in df.columns:
    print(df["text"].unique())
    # Chain filter and drop so that a new frame is produced, rather than
    # dropping in place on the result of a .loc selection.
    df = df.loc[df["text"] == 32].drop(columns=["text"])

[32]


In [19]:
# List the column labels that remain in the frame.
print(df.columns)

Index(['eps_e', 'eps_b', 'eps_t', 'p', 'theta_obs', 'n_ism', 'freq', 'time',
       'flux'],
      dtype='object')


In [7]:
# Build the design matrix and the target directly from df; selecting columns
# with .loc already yields new objects, so no full-frame copy is needed.
y = np.array(df.loc[:, "flux"])
X = np.array(df.loc[:, features_names])

# transform target: the model is trained on log10(flux).
# log10 of a non-positive value gives -inf/NaN, so fail early with a clear message.
assert np.all(y > 0), "flux must be strictly positive before taking log10"
y = np.log10(y)

# transform features: rescale every feature column to [0, 1].
# The scaler has to be applied, not only fitted, for X to actually change;
# `scaler` is kept so the same mapping can be reused on new inputs.
scaler = preprocessing.MinMaxScaler()
X = scaler.fit_transform(X)

In [9]:
# Baseline model: a small forest with fixed hyperparameters.
clf = RandomForestRegressor(
    n_estimators=50, max_depth=10,
    min_samples_split=2, min_samples_leaf=2,
    oob_score=True, n_jobs=4, random_state=42,
)

# 4-fold cross-validation scored with negated MSE (values closer to 0 are better).
scores = cross_validate(
    clf, X, y,
    cv=4, scoring='neg_mean_squared_error',
    n_jobs=1, verbose=1,
)

# Cell output: mean test score over the folds.
scores["test_score"].mean()

[Parallel(n_jobs=4)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=4)]: Done   2 out of   4 | elapsed:  3.9min remaining:  3.9min
[Parallel(n_jobs=4)]: Done   4 out of   4 | elapsed:  3.9min finished


-0.29864852577453

In [10]:
# Dump the full cross_validate result (per-fold fit_time, score_time and test_score arrays).
print(scores)

{'fit_time': array([231.91555905, 231.85501361, 231.95413136, 231.90727544]), 'score_time': array([1.03164864, 1.03530145, 1.04971719, 1.04965711]), 'test_score': array([-0.29581061, -0.30342422, -0.29544974, -0.29990953])}


In [ ]:
class RFObjective():
    """Optuna objective that scores a RandomForestRegressor by cross-validation.

    Reads the module-level arrays ``X`` and ``y``. Calling an instance with a
    trial returns the mean negated MSE over 5 shuffled folds, so the study
    that uses it should maximize.
    """
    def __init__(self):
        # [low, high] pairs handed to trial.suggest_int, keyed by the
        # RandomForestRegressor keyword argument they tune.
        self.rf_int_pars = {
            "n_estimators":[50, 300],
            "max_depth":[10, 100],
            "min_samples_split":[2, 15],
            "min_samples_leaf":[1, 10]
        }
    def __call__(self, trial:optuna.trial.Trial):
        """Sample hyperparameters from ``trial`` and return the mean CV score."""
        # Define the hyperparameters to be tuned
        # (unpack the bounds instead of binding a loop variable named `range`,
        # which shadowed the builtin).
        pars = {
            par: trial.suggest_int(par, low, high)
            for par, (low, high) in self.rf_int_pars.items()
        }

        # random forest regressor with suggested hyperparameters.
        # oob_score is left at its default: cross_val_score discards the fitted
        # estimators, so an out-of-bag score would be computed and never read.
        rf = RandomForestRegressor(
            **pars,
            random_state=23,
            n_jobs=4
        )

        # K-Fold Cross-Validation; the fixed seed gives every trial the same splits.
        kfold = KFold(n_splits=5, shuffle=True, random_state=42)

        # Perform cross-validation and return the average score
        scores = cross_val_score(rf, X, y, cv=kfold, scoring='neg_mean_squared_error')

        return np.mean(scores)

# Create the study; the objective returns negated MSE, hence direction='maximize'.
# NOTE(review): RFObjective.__call__ never calls trial.report() or
# trial.should_prune(), so the MedianPruner appears to have no effect — confirm.
study = optuna.create_study(
    study_name="example-study",
    direction='maximize',
    pruner=optuna.pruners.MedianPruner(n_warmup_steps=5)
)
# gc_after_trial=True runs a garbage collection after every trial, replacing
# the hand-written callback that only called gc.collect().
study.optimize(RFObjective(),
               n_trials=20,
               gc_after_trial=True)

# Print the optimal hyperparameters
print(f"Best trial: {study.best_trial.params}")


# Save the result
outdir = os.getcwd()+"/"+"studies/"
# exist_ok=True replaces the separate isdir check followed by mkdir.
os.makedirs(outdir, exist_ok=True)
fname = outdir+"optuna_rf_study.pkl"
joblib.dump(study, fname)
# Reload from disk and print again, which confirms the saved file round-trips.
study = joblib.load(fname)
print(f"Best trial: {study.best_trial.params}")

In [10]:


def objective(trial:optuna.trial.Trial):
    """Optuna objective: mean negated MSE of a random forest over 5 shuffled folds.

    Reads the module-level arrays ``X`` and ``y``. Larger return values are
    better, so the study that uses it should maximize.
    """
    # Define the hyperparameters to be tuned (max_depth is sampled on a log scale)
    n_estimators = trial.suggest_int('n_estimators', 50, 300)
    max_depth = trial.suggest_int('max_depth', 10, 100, log=True)
    min_samples_split = trial.suggest_int('min_samples_split', 2, 15)
    min_samples_leaf = trial.suggest_int('min_samples_leaf', 1, 10)

    # Create a random forest regressor with suggested hyperparameters.
    # oob_score is left at its default: cross_val_score discards the fitted
    # estimators, so an out-of-bag score would be computed and never read.
    rf = RandomForestRegressor(
        n_estimators=n_estimators,
        max_depth=max_depth,
        min_samples_split=min_samples_split,
        min_samples_leaf=min_samples_leaf,
        random_state=23,
        n_jobs=4
    )

    # K-Fold Cross-Validation; the fixed seed gives every trial the same splits.
    kfold = KFold(n_splits=5, shuffle=True, random_state=42)

    # Perform cross-validation and return the average score
    scores = cross_val_score(rf, X, y, cv=kfold, scoring='neg_mean_squared_error')

    return np.mean(scores)

# In-memory study; 'maximize' because the objective returns negated MSE.
study = optuna.create_study(
    pruner=optuna.pruners.MedianPruner(n_warmup_steps=5),
    direction='maximize',
)

# Run 20 trials, forcing a garbage collection after each one.
study.optimize(
    objective,
    callbacks=[lambda study, trial: gc.collect()],
    n_trials=20,
)

# Report the best hyperparameters, then pickle the study into the working directory.
print(f"Best trial: {study.best_trial.params}")
joblib.dump(study, "optuna_rf_study.pkl")

[I 2024-01-28 16:14:06,057] A new study created in memory with name: no-name-b5c2c12e-526f-4237-86eb-b838919e090f
[W 2024-01-28 16:23:40,167] Trial 0 failed with parameters: {'n_estimators': 87, 'max_depth': 58, 'min_samples_split': 8, 'min_samples_leaf': 5} because of the following error: KeyboardInterrupt().
Traceback (most recent call last):
  File "/home/vsevolod/anaconda3/lib/python3.11/site-packages/optuna/study/_optimize.py", line 200, in _run_trial
    value_or_values = func(trial)
                      ^^^^^^^^^^^
  File "/tmp/ipykernel_12436/2438905381.py", line 23, in objective
    scores = cross_val_score(rf, X, y, cv=kfold, scoring='neg_mean_squared_error')
             ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/home/vsevolod/anaconda3/lib/python3.11/site-packages/sklearn/model_selection/_validation.py", line 562, in cross_val_score
    cv_results = cross_validate(
                 ^^^^^^^^^^^^^^^
  File "/home/vsevolod/anaconda3/lib/pyt

KeyboardInterrupt: 

In [4]:
from imblearn.pipeline import make_pipeline, Pipeline
from imblearn.over_sampling import SMOTE
import optuna
from sklearn.model_selection import cross_validate
from sklearn.metrics import matthews_corrcoef, make_scorer
from sklearn.ensemble import RandomForestClassifier

ImportError: cannot import name '_MissingValues' from 'sklearn.utils._param_validation' (/home/vsevolod/anaconda3/lib/python3.11/site-packages/sklearn/utils/_param_validation.py)