In [None]:
#!/usr/bin/env python
# coding: utf-8

# In[1]:


import os
import pandas as pd 
import numpy as np
import matplotlib.pyplot as plt # General plotting
import seaborn as sns # Scatterplots and Histograms
import catboost


# Directory that holds the input CSVs. The notebook switches its working
# directory to it, so files opened with relative names below (including result
# files) are resolved against this directory.
# Set the ANAGE_DATA_DIR environment variable to run the notebook somewhere
# other than the original cluster account; the default keeps the old behaviour.
DATA_DIR = os.environ.get('ANAGE_DATA_DIR',
                          '/gpfs/home/djs19ctu/ML_project/anage_all_data_nogen')
os.chdir(DATA_DIR)

# List the directory so a missing or misnamed input file is visible in the
# cell output before pd.read_csv fails on it.
contents = os.listdir(os.getcwd())

print("Contents of the current working directory:")
for item in contents:
    print(item)


# Train / test tables, read relative to DATA_DIR.
train_df_imputed = pd.read_csv('train_mammal_data_imputed_nogen.csv')
test_df_imputed = pd.read_csv('test_mammal_data_imputed_nogen.csv')

# Environment check: print the installed version of each modelling library,
# or print the import error (instead of raising) when it is missing.
try:
    import sklearn
except ImportError as err:
    print("Error importing scikit-learn:", err)
else:
    print(f"scikit-learn version: {sklearn.__version__}")
    print("scikit-learn imported successfully!")

try:
    import catboost
except ImportError as err:
    print("Error importing CatBoost:", err)
    print("To install CatBoost, run: pip install catboost")
else:
    print(f"CatBoost version: {catboost.__version__}")
    print("CatBoost imported successfully!")
    

from catboost import CatBoostRegressor
from sklearn.model_selection import RepeatedKFold, RandomizedSearchCV
from scipy.stats import randint, uniform

# Predictor columns passed to the model.
features = ['order', 'family', 'genus',
            'adult_mass_g', 'adult_brain_mass_g',
            'female_maturity_d', 'gestation_length_d',
            'litter_size_n', 'litters_per_year_n',
            'weaning_age_d',
            'hibernation_torpor',
            'trophic_level', 'activity_cycle',
            'freshwater', 'marine', 'terrestrial_non-volant', 'terrestrial_volant', 'habitat_breadth_n',
            'specimen origin']

# Column the model is trained to predict.
target = 'maximum longevity (yrs)'

# Work on a copy so the frame loaded from CSV (train_df_imputed) is never modified.
data_train = train_df_imputed.copy()

# Columns that are handed to CatBoost as categorical features.
categorical_features = ['order', 'family', 'genus', 'specimen origin']

# Cast the categorical columns while building X. `astype` returns a new frame,
# so X is independent of data_train.
# The previous `X.loc[:, col] = X[col].astype('category')` wrote the values into
# the existing column; on current pandas that keeps the column's original dtype,
# so the cast silently had no effect.
X = data_train[features].astype({col: 'category' for col in categorical_features})
y = data_train[target]

# Base estimator for the search.
# One CatBoost thread per fit: RandomizedSearchCV below is started with
# n_jobs=-1 and therefore already runs many fits in parallel. Giving each of
# those fits 4 threads of its own (the previous setting) multiplies the thread
# count and oversubscribes the CPUs.
model = CatBoostRegressor(thread_count=1, verbose=0)

# Search space for the randomized search.
#   scipy's randint(low, high) draws integers from low .. high - 1.
#   scipy's uniform(loc, scale) draws floats from loc .. loc + scale
#   (the second argument is a width, not an upper bound).
param_dist = {
    'iterations': randint(100, 1000),     # 100 .. 999
    'learning_rate': uniform(0.01, 0.3),  # 0.01 .. 0.31
    'depth': randint(3, 12),              # 3 .. 11
    'l2_leaf_reg': uniform(0, 20),        # 0 .. 20
    # Single-element list: every candidate gets the same cat_features, and the
    # value is carried along in best_params_.
    'cat_features': [categorical_features]
}


# 5 folds repeated 10 times = 50 train/validation splits per candidate.
rkf = RepeatedKFold(n_splits=5, n_repeats=10, random_state=42)

# 100 sampled candidates x 50 splits = 5000 fits, scored by R^2.
random_search = RandomizedSearchCV(model, param_distributions=param_dist, n_iter=100, cv=rkf, scoring='r2', random_state=42, n_jobs=-1)
random_search.fit(X, y)

# Winning estimator and the sampled parameters that produced it.
catboost_best_model = random_search.best_estimator_
catboost_params = random_search.best_params_

# Save the results to a text file (written to the current working directory).
# The mean cross-validated R^2 of the winner is recorded as well, so the file
# says how good the chosen parameters were and not only what they were.
with open("catboost_param_results_nogen.txt", "w") as file:
    file.write(f"Best Model: {catboost_best_model}\n")
    file.write(f"Best Parameters: {catboost_params}\n")
    file.write(f"Best CV R2: {random_search.best_score_}\n")

Contents of the current working directory:
data_exploration_general.ipynb
databases
previous_analysis
mammal_data_filtered.csv
param_search_cbQ.err
correlation_plots.ipynb
manuscript_plots_PathAnalysisData.ipynb
CatBoost_withoutQuality.ipynb
Hyperparameter_search.ipynb
.ipynb_checkpoints
test_mammal_data_imputed_nogen.csv
CatBoost_withQuality.ipynb
1
mammal_data_imputed_nogen.csv
param_search_cbQ.out
2. database_impute.ipynb
figures
hpcadmin.sub
train_mammal_data_imputed_nogen.csv
mammal_data.csv
manuscript_plots.ipynb
catboost_info
1. database_prepare_explore.ipynb
scikit-learn version: 0.22.2
scikit-learn imported successfully!
CatBoost version: 1.2.5
CatBoost imported successfully!

































































































































































In [None]:
### Simpler Parameter Tuning
# Exhaustive search over a small hand-picked grid, scored by 5-fold CV R².
# Uses X, y and categorical_features defined in the previous cell.

from itertools import product

from catboost import CatBoostRegressor, Pool
from sklearn.model_selection import KFold
from sklearn.metrics import r2_score
import numpy as np

# Define parameter ranges (narrow search).
# Every key in this dict is passed to CatBoostRegressor, so adding a value or
# a key here is enough to extend the search.
param_grid = {
    'depth': [3, 4, 5, 6],
    'iterations': [500, 800, 1000],
    'learning_rate': [0.05, 0.07, 0.1],
    'l2_leaf_reg': [2, 4, 6],
    'bootstrap_type': ['Bayesian'],
    'bagging_temperature': [0.5, 1]
}

# Prepare KFold
kf = KFold(n_splits=5, shuffle=True, random_state=42)

# The folds do not depend on the hyperparameters, so split the data and build
# the CatBoost Pools once instead of once per parameter combination.
cv_folds = []
for train_index, val_index in kf.split(X):
    X_train_cv, X_val_cv = X.iloc[train_index], X.iloc[val_index]
    y_train_cv, y_val_cv = y.iloc[train_index], y.iloc[val_index]

    train_pool_cv = Pool(data=X_train_cv, label=y_train_cv, cat_features=categorical_features)
    val_pool_cv = Pool(data=X_val_cv, label=y_val_cv, cat_features=categorical_features)
    cv_folds.append((train_pool_cv, val_pool_cv, y_val_cv))

best_score = -np.inf
best_params = None

# Enumerate every combination of the grid. product() advances the last key
# fastest, which is the same order the nested loops used.
param_names = list(param_grid)
for values in product(*param_grid.values()):
    params = dict(zip(param_names, values))

    cv_scores = []
    for train_pool_cv, val_pool_cv, y_val_cv in cv_folds:
        model = CatBoostRegressor(
            loss_function='RMSE',
            cat_features=categorical_features,
            verbose=False,
            **params
        )

        # NOTE(review): the validation fold is used both to stop training early
        # and to compute the score below, so the reported R² is optimistically
        # biased. Because of early stopping, `iterations` is only an upper
        # bound on the number of trees actually kept.
        model.fit(train_pool_cv, eval_set=val_pool_cv, early_stopping_rounds=50, verbose=False)
        y_val_pred = model.predict(val_pool_cv)
        cv_scores.append(r2_score(y_val_cv, y_val_pred))

    mean_cv_score = np.mean(cv_scores)

    # Strict '>' keeps the first combination encountered when scores tie.
    if mean_cv_score > best_score:
        best_score = mean_cv_score
        best_params = params

print("Best CV R²:", best_score)
print("Best Parameters:", best_params)