```
Score = 20.467
Key points:
    (1) Model: XGBRegressor.
    (2) Hyperparameter search: Optuna minimizes an objective function (the validation RMSE).
    (3) K-fold cross-validation.
    (4) Uses ONLY the tabular metadata columns
        useful_features = ['Subject Focus', 'Eyes', 'Face', 'Near', 'Action', 'Accessory',
                           'Group', 'Collage', 'Human', 'Occlusion', 'Info', 'Blur'];
        the images in the image dataset are NOT used.
    
```

In [1]:
import pandas as pd
import numpy as np
import os
from sklearn import model_selection
from sklearn.metrics import mean_squared_error
from xgboost import XGBRegressor
from functools import partial
import optuna
import random

# Directory the notebook reads train.csv, test.csv and sample_submission.csv from.
path_input = r'../input/petfinder-pawpularity-score/'

## Using Optuna: define the objective function to minimize

In [2]:
def objective(trial, fold, df, useful_features):
    '''
    Run one Optuna trial: sample XGBoost hyperparameters, train on every fold
    except `fold`, and return the RMSE measured on `fold`.

    :param trial: optuna trial object used to sample the hyperparameters.
    :param fold: value of the `kfold` column to hold out for validation.
                 Ex. with 10 folds (0, 1, .., 9) and fold = 2
                 -> rows with kfold == 2 are used for validation, all others for training.
    :param df: a dataframe from train.csv with an added kfold column.
        Ex.
        Id                              Subject Focus  Eyes Face  Near Action Accessory Group Collage Human Occlusion Info Blur Pawpularity  kfold
        0007de18844b0dbbb5e1f607da0606e0   0            1     1    1     0    0      1         0      0      0     0         0     63         2
        0009c66b9439883ba2750fb825e1d7db   0            1     1    0     0    0      0         0      0      0     0         0     4          9
    :param useful_features: column names of df used as model inputs (X_train and X_valid).
        Ex. useful_features = ['Subject Focus', 'Eyes', 'Face', 'Near', 'Action', 'Accessory',
                               'Group', 'Collage', 'Human', 'Occlusion', 'Info', 'Blur']
    :return: RMSE of the model's predictions on the validation fold (the value Optuna minimizes).
    '''
    # --- Search space -------------------------------------------------------
    # The first argument of every suggest_* call is the name the value is
    # recorded under in the study (e.g. in study.best_params). The Greek-letter
    # names are not XGBRegressor keyword names, so each sampled value is bound
    # to an explicit keyword when the model is built below.
    # All log-scaled ranges use suggest_float(..., log=True); the older
    # suggest_loguniform spelling is deprecated in recent Optuna releases.

    # η -> learning_rate: boosting learning rate (xgb's "eta").  alt: 1e-2, 0.25
    learning_rate = trial.suggest_float('η', 1e-4, 0.5, log=True)

    # λ -> reg_lambda: L2 regularization term on weights.  alt: 1e-8, 100.0
    reg_lambda = trial.suggest_float('λ', 1e-9, 1000.0, log=True)

    # α -> reg_alpha: L1 regularization term on weights.  alt: 1e-8, 100.0
    reg_alpha = trial.suggest_float('α', 1e-9, 1000.0, log=True)

    # subsample: subsample ratio of the training instances.  alt: 0.1, 1.0
    subsample = trial.suggest_float('subsample', 0.01, 1.0)

    # colsample_bytree: subsample ratio of columns when constructing each tree.  alt: 0.1, 1.0
    colsample_bytree = trial.suggest_float('colsample_bytree', 0.01, 1.0)

    # max_depth: maximum tree depth for base learners.  alt: 1, 7
    max_depth = trial.suggest_int('max_depth', 1, 20)

    # --- Train / validation split --------------------------------------------
    train_rows = df[df.kfold != fold].reset_index(drop=True)
    valid_rows = df[df.kfold == fold].reset_index(drop=True)

    Y_train = train_rows.Pawpularity
    Y_valid = valid_rows.Pawpularity

    X_train = train_rows[useful_features]
    X_valid = valid_rows[useful_features]

    # --- Model ---------------------------------------------------------------
    model = XGBRegressor(
        n_estimators=50000,  # upper bound on boosting rounds; early stopping below ends training sooner.  alt: 10000
        tree_method='gpu_hist',  # GPU histogram tree construction
        random_state=42,  # random number seed
        gpu_id=0,  # device ordinal. 0 or 1, ..
        predictor='gpu_predictor',

        # Values sampled by this trial
        learning_rate=learning_rate,
        reg_lambda=reg_lambda,
        reg_alpha=reg_alpha,
        subsample=subsample,
        colsample_bytree=colsample_bytree,
        max_depth=max_depth,
    )

    # --- Train ---------------------------------------------------------------
    # The validation fold doubles as the early-stopping evaluation set.
    # NOTE(review): newer XGBoost releases expect early_stopping_rounds on the
    # constructor instead of fit() -- confirm against the installed version.
    model.fit(X_train, Y_train,
              early_stopping_rounds=1000,  # alt: 300
              eval_set=[(X_valid, Y_valid)],
              verbose=1000)

    # --- Score ---------------------------------------------------------------
    Y_valid_pred = model.predict(X_valid)

    # RMSE computed as sqrt(MSE): unlike the `squared=False` keyword, this form
    # works on both older and newer scikit-learn releases.
    rmse = float(np.sqrt(mean_squared_error(Y_valid, Y_valid_pred)))
    return rmse

## Search for the best parameters

In [3]:
KFold = 10  # number of cross-validation folds; fold labels are 0 .. KFold-1
df = pd.read_csv(os.path.join(path_input, 'train.csv')).reset_index(drop=True)

# Create the kfold column used to split train and validation data.
# Labels must be exactly 0 .. KFold-1 because the prediction loop iterates
# range(KFold). (random.randint(0, KFold) is inclusive at both ends and would
# create an extra label whose rows are never held out for validation.)
# A seeded, shuffled sklearn splitter gives balanced folds that are
# reproducible after Restart & Run All.
df["kfold"] = -1
kfold_col = df.columns.get_loc("kfold")
fold_splitter = model_selection.KFold(n_splits=KFold, shuffle=True, random_state=42)
for fold_id, (_, valid_idx) in enumerate(fold_splitter.split(df)):
    df.iloc[valid_idx, kfold_col] = fold_id
assert df["kfold"].between(0, KFold - 1).all(), "every row must belong to a fold"

# Get df_test
df_test = pd.read_csv(os.path.join(path_input, 'test.csv')).reset_index(drop=True)
sample_submission = pd.read_csv(os.path.join(path_input, 'sample_submission.csv'))

# Get feature columns (for X)
useful_features = [
    'Subject Focus', 'Eyes', 'Face', 'Near', 'Action', 'Accessory',
    'Group', 'Collage', 'Human', 'Occlusion', 'Info', 'Blur'
]

# Bind everything except `trial`, so Optuna can call the objective with a
# single argument. The search is scored on one split only (fold 0 held out).
opt_fun = partial(
    objective,
    fold=0,
    df=df,
    useful_features=useful_features,
)

# Create a new study; the sampler is seeded so repeated runs propose the same trials.
study = optuna.create_study(
    direction="minimize",
    sampler=optuna.samplers.TPESampler(seed=42),
)

# Invoke optimization of the objective function.
study.optimize(opt_fun, n_trials=10)  # alt: 200, 1000
print(f'study.best_params \n {study.best_params}')
print(f'study.best_value \n {study.best_value}')

In [4]:
def generate_predictions(params, fold, df, df_test, useful_features):
    '''
    Train an XGBoost regressor with fixed hyperparameters on every fold except
    `fold`, print the RMSE on `fold`, and return predictions for `df_test`.

    Mirrors objective(trial, fold, df, useful_features), but takes an already
    chosen set of hyperparameters (`params`) instead of an optuna trial.

    :param params: dict of hyperparameters, typically study.best_params.
                   The keys 'η', 'λ' and 'α' (the names objective() registers
                   with Optuna) are translated to the XGBRegressor keywords
                   learning_rate, reg_lambda and reg_alpha; every other key is
                   passed through unchanged.
    :param fold: value of the `kfold` column to hold out for validation.
                 Ex. with 10 folds (0, 1, .., 9) and fold = 2
                 -> rows with kfold == 2 are used for validation, all others for training.
    :param df: a dataframe from train.csv with an added kfold column.
        Ex.
        Id                              Subject Focus  Eyes Face  Near Action Accessory Group Collage Human Occlusion Info Blur Pawpularity  kfold
        0007de18844b0dbbb5e1f607da0606e0   0            1     1    1     0    0      1         0      0      0     0         0     63         2
        0009c66b9439883ba2750fb825e1d7db   0            1     1    0     0    0      0         0      0      0     0         0     4          9

    :param df_test: test dataframe; must contain the columns in useful_features.

    :param useful_features: column names used as model inputs (X_train, X_valid and X_test).
        Ex. useful_features = ['Subject Focus', 'Eyes', 'Face', 'Near', 'Action', 'Accessory',
                               'Group', 'Collage', 'Human', 'Occlusion', 'Info', 'Blur']
    :return: the model's predictions for the rows of df_test (output of model.predict).
    '''
    # study.best_params is keyed by the names given to trial.suggest_*, and
    # objective() uses Greek letters for three of them. Those are not
    # XGBRegressor hyperparameter names, so unpacking them as-is would leave the
    # tuned learning rate and L1/L2 terms unused. Translate them first.
    param_aliases = {'η': 'learning_rate', 'λ': 'reg_lambda', 'α': 'reg_alpha'}
    xgb_params = {param_aliases.get(name, name): value for name, value in params.items()}

    # Get train, validation and test datasets
    train_rows = df[df.kfold != fold].reset_index(drop=True)
    valid_rows = df[df.kfold == fold].reset_index(drop=True)

    Y_train = train_rows.Pawpularity
    Y_valid = valid_rows.Pawpularity  # Note: there is no Y_test

    X_train = train_rows[useful_features]
    X_valid = valid_rows[useful_features]
    # Column selection already yields a new frame, so df_test itself is not modified.
    X_test = df_test[useful_features]

    # Model
    model = XGBRegressor(
        n_estimators=50000,  # upper bound on boosting rounds; early stopping below ends training sooner.  alt: 10000
        tree_method='gpu_hist',  # GPU histogram tree construction
        random_state=42,  # random number seed
        gpu_id=0,  # device ordinal
        predictor='gpu_predictor',
        # Tuned hyperparameters, under XGBRegressor keyword names
        **xgb_params,
    )

    # Train; the validation fold doubles as the early-stopping evaluation set.
    # NOTE(review): newer XGBoost releases expect early_stopping_rounds on the
    # constructor instead of fit() -- confirm against the installed version.
    model.fit(X_train, Y_train,
              early_stopping_rounds=1000,  # alt: 300
              eval_set=[(X_valid, Y_valid)],
              verbose=1000)

    # Prediction
    Y_valid_pred = model.predict(X_valid)
    Y_test_pred = model.predict(X_test)

    # rmse: printed for information only. Computed as sqrt(MSE) so it does not
    # depend on the `squared=` keyword, which newer scikit-learn no longer accepts.
    rmse = np.sqrt(mean_squared_error(Y_valid, Y_valid_pred))
    print(f'rmse = {rmse}')

    return Y_test_pred

In [5]:
# Fit one model per validation fold with the tuned hyperparameters and keep
# each model's predictions for the test set.
final_predictions = [
    generate_predictions(
        params=study.best_params,
        fold=fold,
        df=df,
        df_test=df_test,
        useful_features=useful_features,
    )
    for fold in range(KFold)
]

# One column per fold; averaging across columns gives one value per test row.
final_predictions = np.column_stack(final_predictions).mean(axis=1)

# Write the averaged predictions into the submission template and save it.
sample_submission.Pawpularity = final_predictions
sample_submission.to_csv("submission.csv", index=False)

In [6]:
!ls /kaggle/working/
df = pd.read_csv('//kaggle/working/submission.csv')
df