In [4]:
import pandas as pd
import numpy as np

from m4_feats_functions import *
from m5_models import *
from m7_utils import *
from m3_model_params import lgb_params_1, xgb_params
from sklearn.metrics import mean_squared_error
from tqdm import tqdm  # Import tqdm

In [5]:
# Paths to the train and test directories
INPUT_DIR = 'kaggle/input/linking-writing-processes-to-writing-quality'
FEATURE_STORE = 'feature_store'
train_scores = pd.read_csv(f'{INPUT_DIR}/train_scores.csv')
train_feats = pd.read_pickle('feature_store/base_feats/train_base_feats_2.pkl')
train_feats = train_feats.merge(train_scores, on='id', how='left')
test_feats = pd.read_pickle('feature_store/base_feats/test_base_feats_2.pkl')
train_dir = 'feature_store/train'
test_dir = 'feature_store/test'
# Usage
seed = 42
n_repeats = 5
n_splits = 10
target_col = 'score'

In [6]:
import numpy as np
import lightgbm as lgb
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import StratifiedKFold
import random

def random_param_search(train_feats, test_feats, n_iterations):
    # Define parameter ranges
    param_ranges = {
        'boosting_type': ['gbdt', 'dart', 'goss'],
        'num_leaves': (15, 150),  # Range for num_leaves
        'learning_rate': (0.01, 0.2),  # Range for learning_rate
        'max_depth': (5, 20),  # Range for max_depth
        'min_child_samples': (5, 50),  # Range for min_child_samples
        'reg_alpha': (0, 1),  # Range for L1 regularization
        'reg_lambda': (0, 1),  # Range for L2 regularization
    }

    best_rmse = float("inf")
    best_params = None
    test_predictions_df = pd.DataFrame()

    for _ in range(n_iterations):

        # Sampling parameters from ranges
        params = {
            'boosting_type': random.choice(param_ranges['boosting_type']),
            'num_leaves': random.randint(*param_ranges['num_leaves']),
            'learning_rate': random.uniform(*param_ranges['learning_rate']),
            'max_depth': random.randint(*param_ranges['max_depth']),
            'min_child_samples': random.randint(*param_ranges['min_child_samples']),
            'reg_alpha': random.uniform(*param_ranges['reg_alpha']),
            'reg_lambda': random.uniform(*param_ranges['reg_lambda']),
            'n_estimators': 300
        }
        
        # Train and evaluate model
        boosting_type = params['boosting_type']
        _, _, rmse, model = cv_pipeline(train_feats, test_feats, params, boosting_type)
                                        
        new_iter = {'rmse': [rmse],
                    'params': str(model.get_params())}

        test_predictions_df = pd.concat([test_predictions_df, pd.DataFrame(data=new_iter)])
        # Update best parameters if current model is better
        if rmse < best_rmse:
            print(f"New best RMSE: {rmse:.6f}")
            best_rmse = rmse
            best_params = model.get_params()
    
    print(f"Best RMSE: {best_rmse:.6f}")
    print("Best Parameters:", best_params)
    return test_predictions_df

# Example usage
test_predictions_df = random_param_search(train_feats, test_feats, 200)

LGBM Average RMSE over 50 folds: 0.622802
New best RMSE: 0.622802
LGBM Average RMSE over 50 folds: 0.606161
New best RMSE: 0.606161
LGBM Average RMSE over 50 folds: 0.611611
LGBM Average RMSE over 50 folds: 0.612280
LGBM Average RMSE over 50 folds: 0.619868
LGBM Average RMSE over 50 folds: 0.607789
LGBM Average RMSE over 50 folds: 0.607032
LGBM Average RMSE over 50 folds: 0.612013
LGBM Average RMSE over 50 folds: 0.618102
LGBM Average RMSE over 50 folds: 0.620845
LGBM Average RMSE over 50 folds: 0.615602
LGBM Average RMSE over 50 folds: 0.606533
LGBM Average RMSE over 50 folds: 0.620912
LGBM Average RMSE over 50 folds: 0.625116
LGBM Average RMSE over 50 folds: 0.611155
LGBM Average RMSE over 50 folds: 0.622289
LGBM Average RMSE over 50 folds: 0.609465
LGBM Average RMSE over 50 folds: 0.603816
New best RMSE: 0.603816
LGBM Average RMSE over 50 folds: 0.618701
LGBM Average RMSE over 50 folds: 0.614396
LGBM Average RMSE over 50 folds: 0.621933
LGBM Average RMSE over 50 folds: 0.608788
LGBM

KeyboardInterrupt: 

In [7]:
test_predictions_df.sort_values('rmse')

Unnamed: 0,rmse,params
0,0.603186,"boosting_type: gbdt, num_leaves: 16, learning_..."
0,0.603505,"boosting_type: gbdt, num_leaves: 16, learning_..."
0,0.603551,"boosting_type: gbdt, num_leaves: 17, learning_..."
0,0.604466,"boosting_type: goss, num_leaves: 88, learning_..."
0,0.604790,"boosting_type: gbdt, num_leaves: 16, learning_..."
...,...,...
0,2.038624,"boosting_type: dart, num_leaves: 35, learning_..."
0,2.086226,"boosting_type: dart, num_leaves: 76, learning_..."
0,2.087839,"boosting_type: dart, num_leaves: 100, learning..."
0,2.121359,"boosting_type: dart, num_leaves: 120, learning..."
