# Notebook documenting model training, selection, and saving pre-trained models

In [None]:
import os
from retrofit_cost_tool import load_data, preprocess_data, split_data, train_ridge_model, train_elastic_net_model, train_random_forest_model, train_gradient_boosting_model, evaluate_model, save_model, model_selection


In [None]:
# Load training data
# The path is relative, so it presumably resolves against the notebook's
# working directory -- confirm how load_data opens it.
file_path = '../data/srce_train.csv'
# `data` is the raw dataset handed to preprocess_data in the next cell.
data = load_data(file_path)

In [None]:
# Preprocess data
# The target column and the two feature groups are passed to preprocess_data
# separately; how each group is encoded is decided inside that helper.
target = 'ystruct19'
features_num = ['area', 'bldg_age', 'stories']
features_string = [
    'seismicity_pga050',
    'p_obj_dummy',
    'bldg_group_dummy',
    'sp_dummy',
    'occup_cond',
    'historic_dummy',
]
X, y = preprocess_data(data, features_string, features_num, target)

# Split data into training and validation sets
# split_data is called with its defaults; the split ratio and any random seed
# are whatever that helper chooses.
(X_train, X_valid, y_train, y_valid) = split_data(X, y)


In [None]:
# Train models
# Map each model label to the helper that fits it. The dict keeps insertion
# order, so the models are trained in the order listed here.
trainers = {
    'ridge': train_ridge_model,
    'elastic_net': train_elastic_net_model,
    'random_forest': train_random_forest_model,
    'gradient_boosting': train_gradient_boosting_model,
}
models = {label: fit(X_train, y_train) for label, fit in trainers.items()}

# Model selection
# NOTE(review): selection is run on the training split. Whether
# model_selection cross-validates internally is not visible from this
# notebook -- confirm it is not ranking models on the data they were fit on.
best_model_name, best_model = model_selection(models, X_train, y_train)
print(f'Best model: {best_model_name}')


In [None]:
# Evaluate best model
# Score the selected model on the validation split, which was not passed to
# the training or selection calls above.
# The printed label assumes evaluate_model returns an RMSE -- confirm against
# the helper's implementation.
rmse = evaluate_model(best_model, X_valid, y_valid)
print(f'RMSE: {rmse:.4f}')


In [None]:
# Save best model
# Build the destination path step by step: file name first, then full path.
best_model_filename = f'{best_model_name}_model.pkl'
model_path = os.path.join('../models', best_model_filename)

# Make sure the destination directory exists; exist_ok keeps reruns of this
# cell from failing once the directory is already there.
output_dir = os.path.dirname(model_path)
os.makedirs(output_dir, exist_ok=True)

save_model(best_model, model_path)
print(f'Saved {best_model_name} model to {model_path}')

In [None]:
# Save all models
# Every model file lands in the same directory, so create it once up front
# instead of re-deriving and re-creating it on every loop iteration.
models_dir = '../models'
os.makedirs(models_dir, exist_ok=True)

# One file per trained model, named '<model_name>_model.pkl'. This presumably
# includes the model written by the "Save best model" cell, which is simply
# overwritten at the same path.
for model_name, model in models.items():
    model_path = os.path.join(models_dir, f'{model_name}_model.pkl')
    save_model(model, model_path)
    print(f'Saved {model_name} model to {model_path}')