In [1]:
import sys
import os
sys.path.append(os.path.dirname(os.getcwd()))

import numpy as np
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
import xgboost as xgb
from bart_playground import *
import bartz
from stochtree import BARTModel
import time

import itertools
import pandas as pd
from sklearn.model_selection import KFold

## Abalone Data

In [2]:
from ucimlrepo import fetch_ucirepo 
  
# fetch dataset 
abalone = fetch_ucirepo(id=1) 
  
# data (as pandas dataframes) 
X = abalone.data.features 
y = abalone.data.targets 

# variable information 
print(abalone.variables)

             name     role         type demographic  \
0             Sex  Feature  Categorical        None   
1          Length  Feature   Continuous        None   
2        Diameter  Feature   Continuous        None   
3          Height  Feature   Continuous        None   
4    Whole_weight  Feature   Continuous        None   
5  Shucked_weight  Feature   Continuous        None   
6  Viscera_weight  Feature   Continuous        None   
7    Shell_weight  Feature   Continuous        None   
8           Rings   Target      Integer        None   

                   description  units missing_values  
0         M, F, and I (infant)   None             no  
1    Longest shell measurement     mm             no  
2      perpendicular to length     mm             no  
3           with meat in shell     mm             no  
4                whole abalone  grams             no  
5               weight of meat  grams             no  
6  gut weight (after bleeding)  grams             no  
7        

In [3]:
X = X.drop(columns=['Sex'])
X

Unnamed: 0,Length,Diameter,Height,Whole_weight,Shucked_weight,Viscera_weight,Shell_weight
0,0.455,0.365,0.095,0.5140,0.2245,0.1010,0.1500
1,0.350,0.265,0.090,0.2255,0.0995,0.0485,0.0700
2,0.530,0.420,0.135,0.6770,0.2565,0.1415,0.2100
3,0.440,0.365,0.125,0.5160,0.2155,0.1140,0.1550
4,0.330,0.255,0.080,0.2050,0.0895,0.0395,0.0550
...,...,...,...,...,...,...,...
4172,0.565,0.450,0.165,0.8870,0.3700,0.2390,0.2490
4173,0.590,0.440,0.135,0.9660,0.4390,0.2145,0.2605
4174,0.600,0.475,0.205,1.1760,0.5255,0.2875,0.3080
4175,0.625,0.485,0.150,1.0945,0.5310,0.2610,0.2960


In [4]:
X = X.values.astype(float)
y = np.array(y).reshape(-1)

In [5]:
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)

In [6]:
param_grid = {
    'n_estimators': [100, 200, 300],
    'max_depth': [None, 5, 10, 20],
    'max_features': [None, "sqrt", "log2"]
}
param_names = list(param_grid.keys())
param_list = list(itertools.product(*param_grid.values()))

memory_file = "Abalone_tuning_memory_rf.csv"

In [7]:
if os.path.exists(memory_file):
    memory_df = pd.read_csv(memory_file)
else:
    memory_df = pd.DataFrame(columns=param_names + ["cv_mse"])


In [8]:
start_idx = 0
param_list = param_list[start_idx:]

In [9]:
kf = KFold(n_splits=5, shuffle=True, random_state=42)

## Random Forest

In [10]:
from joblib import Parallel, delayed

def evaluate_params(params):
    n_estimators, max_depth, max_features = params
    mses = []
    for train_idx, val_idx in kf.split(X_train):
        X_tr, X_val = X_train[train_idx], X_train[val_idx]
        y_tr, y_val = y_train[train_idx], y_train[val_idx]
        model = RandomForestRegressor(
            n_estimators=n_estimators,
            max_depth=max_depth,
            max_features=max_features,
            random_state=42,
            n_jobs=1
        )
        model.fit(X_tr, y_tr)
        y_pred = model.predict(X_val)
        mse = mean_squared_error(y_val, y_pred)
        mses.append(mse)
    avg_mse = np.mean(mses)
    print(f"Params: {params}, CV MSE: {avg_mse:.4f}")
    return list(params) + [avg_mse]

params_to_run = [params for params in param_list if not ((memory_df[param_names] == params).all(axis=1)).any()]

results = Parallel(n_jobs=3)(delayed(evaluate_params)(params) for params in params_to_run)

In [11]:
for res in results:
    memory_df.loc[len(memory_df)] = res
    memory_df.to_csv(memory_file, index=False)

  memory_df.loc[len(memory_df)] = res
  memory_df.loc[len(memory_df)] = res
  memory_df.loc[len(memory_df)] = res
  memory_df.loc[len(memory_df)] = res
  memory_df.loc[len(memory_df)] = res


In [12]:
import pandas as pd

memory_file = "Abalone_tuning_memory_rf.csv"

memory_df = pd.read_csv(memory_file)

In [13]:
print(memory_df.sort_values("cv_mse").head())

    n_estimators  max_depth max_features    cv_mse
35         300.0       20.0         log2  4.690042
34         300.0       20.0         sqrt  4.690042
14         200.0        NaN         log2  4.695232
13         200.0        NaN         sqrt  4.695232
20         200.0       10.0         log2  4.705427
