In [1]:
import warnings
warnings.filterwarnings('ignore')
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
from sklearn.model_selection import GridSearchCV
import pandas as pd
import pickle

In [2]:
# Load the training and test CSV files (in that order) into DataFrames.
train_slice_df, test_slice_df = (
    pd.read_csv(csv_name)
    for csv_name in ("slice_std_train.csv", "slice_std_test.csv")
)
# Show (rows, columns) for both frames as a quick sanity check.
print(*(frame.shape for frame in (train_slice_df, test_slice_df)))

(42800, 361) (10700, 361)


In [3]:
# Restore the previously saved results object from result1.pickle.
# NOTE(review): pickle.load can execute arbitrary code while deserializing,
# so this file must come from a trusted source.
with open('result1.pickle', mode='rb') as results_file:
    old_results = pickle.load(results_file)

In [4]:
# Separate the target column 'Y' from the remaining (feature) columns of each split.
X_train, y_train = train_slice_df.drop(columns='Y'), train_slice_df['Y']
X_test, y_test = test_slice_df.drop(columns='Y'), test_slice_df['Y']

## Decision Tree Model

In [5]:
def find_optimal_min_samples_leaf(X_train, y_train, X_test, y_test,
                                  leaf_sizes=range(2, 51), max_rmse=4.35, max_mae=1.5):
    """Return the first ``min_samples_leaf`` whose test-set errors meet both limits.

    Candidates are tried in the order given by ``leaf_sizes``. For each one a
    ``DecisionTreeRegressor(random_state=0)`` is fit on the training data and
    scored on the test data; the search stops at the first candidate that
    satisfies ``RMSE <= max_rmse`` and ``MAE <= max_mae``.

    No cross-validation is run: acceptance depends only on the two test-set
    thresholds, so a grid search would add many extra fits without changing
    which candidate is returned.

    NOTE(review): candidates are accepted based on test-set error, so the
    returned metrics are an optimistic estimate of generalization error.

    Parameters
    ----------
    X_train, y_train : training features and target.
    X_test, y_test : held-out features and target used for scoring.
    leaf_sizes : iterable of int, default ``range(2, 51)``
        Candidate ``min_samples_leaf`` values, tried in order (2 through 50).
    max_rmse : float, default 4.35
        Largest acceptable root mean squared error.
    max_mae : float, default 1.5
        Largest acceptable mean absolute error.

    Returns
    -------
    tuple or None
        ``(min_samples_leaf, RMSE, MAE)`` for the first accepted candidate,
        or ``None`` when no candidate meets both limits.
    """
    for leaf_size in leaf_sizes:
        tree = DecisionTreeRegressor(random_state=0, min_samples_leaf=leaf_size)
        tree.fit(X_train, y_train)
        predictions = tree.predict(X_test)

        # Take the square root explicitly: the `squared=False` flag of
        # mean_squared_error is deprecated/removed in recent scikit-learn.
        rmse = mean_squared_error(y_test, predictions) ** 0.5
        mae = mean_absolute_error(y_test, predictions)

        if rmse <= max_rmse and mae <= max_mae:
            return leaf_size, rmse, mae

    # No candidate satisfied both thresholds.
    return None

In [6]:
# Run the leaf-size search; the cell displays (min_samples_leaf, RMSE, MAE) or None.
find_optimal_min_samples_leaf(
    X_train=X_train,
    y_train=y_train,
    X_test=X_test,
    y_test=y_test,
)

(2, 2.3283972634840713, 0.4466481835202493)

In [28]:
# Fit a decision tree with random_state=0 and min_samples_leaf=32
# (every other hyperparameter is left at its default).
# NOTE(review): the search cell above reported 2 as the first passing value,
# while 32 is hard-coded here — confirm that 32 is the intended choice.
dt_model = DecisionTreeRegressor(random_state=0, min_samples_leaf=32)
dt_model.fit(X_train, y_train)

# Score the fitted tree on the test set.
dt_predictions = dt_model.predict(X_test)
# RMSE is computed as sqrt(MSE) explicitly because the `squared=False` flag of
# mean_squared_error is deprecated/removed in recent scikit-learn releases.
dt_RMSE = mean_squared_error(y_test, dt_predictions) ** 0.5
dt_MAE = mean_absolute_error(y_test, dt_predictions)

# Validation metrics for the decision tree; AIC and R2 are not computed here.
validation_decision_tree = {
    'AIC': None,
    'R2': None,
    'RMSE': dt_RMSE,
    'MAE': dt_MAE,
    'num_variables': X_train.shape[1],
    'min_samples_leaf': dt_model.min_samples_leaf,
}

## Random Forest

In [8]:
def find_optimal_n_estimators(X_train, y_train, X_test, y_test,
                              n_estimators_values=range(20, 30), max_rmse=1.4, max_mae=0.41):
    """Return the first ``n_estimators`` whose test-set errors meet both limits.

    For each candidate, in the order given, a
    ``RandomForestRegressor(n_jobs=-1, random_state=0)`` is fit on the training
    data and scored on the test data. The search stops at the first candidate
    with ``RMSE <= max_rmse`` and ``MAE <= max_mae``.

    NOTE(review): candidates are accepted based on test-set error, so the
    returned metrics are an optimistic estimate of generalization error.

    Parameters
    ----------
    X_train, y_train : training features and target. ``X_train`` must expose
        ``.shape`` (its second entry is reported as ``num_variables``).
    X_test, y_test : held-out features and target used for scoring.
    n_estimators_values : iterable of int, default ``range(20, 30)``
        Candidate forest sizes, tried in order (20 through 29).
    max_rmse : float, default 1.4
        Largest acceptable root mean squared error.
    max_mae : float, default 0.41
        Largest acceptable mean absolute error.

    Returns
    -------
    dict
        ``{'n_estimators', 'RMSE', 'MAE', 'num_variables'}`` for the first
        accepted candidate, or an empty dict when none meets both limits.
    """
    for n_estimators in n_estimators_values:
        forest = RandomForestRegressor(n_estimators=n_estimators, n_jobs=-1, random_state=0)
        forest.fit(X_train, y_train)

        predictions = forest.predict(X_test)
        # Take the square root explicitly: the `squared=False` flag of
        # mean_squared_error is deprecated/removed in recent scikit-learn.
        rmse = mean_squared_error(y_test, predictions) ** 0.5
        mae = mean_absolute_error(y_test, predictions)

        # Stop at the first candidate that satisfies both thresholds.
        if rmse <= max_rmse and mae <= max_mae:
            return {
                'n_estimators': n_estimators,
                'RMSE': rmse,
                'MAE': mae,
                'num_variables': X_train.shape[1],
            }

    # No candidate satisfied both thresholds.
    return {}

In [9]:
# Run the forest-size search; the cell displays the resulting metrics dict
# (empty when no candidate meets the thresholds).
find_optimal_n_estimators(
    X_train=X_train,
    y_train=y_train,
    X_test=X_test,
    y_test=y_test,
)

{'n_estimators': 28,
 'RMSE': 1.393080074828867,
 'MAE': 0.4086587568674898,
 'num_variables': 360}

In [31]:
# Fit a random forest with random_state=0 and n_estimators=28
# (every other hyperparameter is left at its default; n_jobs=-1 uses all cores).
rf_model = RandomForestRegressor(random_state=0, n_jobs=-1, n_estimators=28)
rf_model.fit(X_train, y_train)

# Score the fitted forest on the test set.
rf_predictions = rf_model.predict(X_test)
# RMSE is computed as sqrt(MSE) explicitly because the `squared=False` flag of
# mean_squared_error is deprecated/removed in recent scikit-learn releases.
rf_RMSE = mean_squared_error(y_test, rf_predictions) ** 0.5
rf_MAE = mean_absolute_error(y_test, rf_predictions)

# Validation metrics for the random forest; AIC and R2 are not computed here.
# n_estimators is read back from the model so it cannot drift from the value
# the model was actually built with.
validation_rf = {
    'AIC': None,
    'R2': None,
    'RMSE': rf_RMSE,
    'MAE': rf_MAE,
    'num_variables': X_train.shape[1],
    'n_estimators': rf_model.n_estimators,
}

## Dump Answer

In [32]:
# Register both tree-based metric dicts under the existing 'Validation' entry,
# then display the full validation table as the cell output.
old_results['Validation'].update({
    'validation_decision_tree': validation_decision_tree,
    'validation_rf': validation_rf,
})
old_results['Validation']

{'validation_reduced_lm': {'AIC': 302514.48178790975,
  'R2': 0.8638882045172942,
  'RMSE': 8.450981711744108,
  'MAE': 6.249744339096156,
  'num_variables': 245},
 'validation_PCR': {'AIC': 302506.1882273548,
  'R2': 0.8638954982621725,
  'RMSE': 8.447360569858315,
  'MAE': 6.245095210025417,
  'num_variables': 242},
 'validation_ridge': {'AIC': None,
  'R2': 0.858229094231964,
  'RMSE': 8.420103292960205,
  'MAE': 6.213675468649426,
  'num_variables': 360},
 'validation_lasso': {'AIC': None,
  'R2': 0.8580481710035675,
  'RMSE': 8.425474305254362,
  'MAE': 6.2043910454752424,
  'num_variables': 325},
 'validation_decision_tree': {'AIC': None,
  'R2': None,
  'RMSE': 4.268502905218808,
  'MAE': 1.4931511913628603,
  'num_variables': 360,
  'min_samples_leaf': 32},
 'validation_rf': {'AIC': None,
  'R2': None,
  'RMSE': 1.393080074828867,
  'MAE': 0.40865875686748976,
  'num_variables': 360,
  'n_estimators': 28}}

In [33]:
# Bundle the validation metrics together with the two fitted models.
res = dict(
    Validation=old_results['Validation'],
    dt_model=dt_model,
    rf_model=rf_model,
)

In [34]:
# Serialize the bundled results (metrics + fitted models) to result2.pickle.
with open('result2.pickle', mode='wb') as out_file:
    pickle.dump(res, out_file)