https://assessment.hackerearth.com/challenges/hiring/eli-lilly-data-scientist-hiring-challenge-2021/problems/e0ba253fe5e44ea0b9965e78d6915d5e/

In [1]:
import datatable as dt
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder
import warnings

# Silence every warning for the rest of the session. Both calls install a
# blanket 'ignore' filter, so the second one adds nothing over the first.
# NOTE(review): this also hides the warnings.warn(...) emitted on purpose by
# get_feature_names further down -- confirm that is intended.
warnings.filterwarnings('ignore')
warnings.simplefilter('ignore')


def clean_data(url, train=True):
    """Load a CSV split and turn its non-numeric feature columns into integer codes.

    The frame is indexed by 'instance_id' and the 'track_name' column is dropped.

    Parameters
    ----------
    url : str
        Location handed to ``datatable.fread``.
    train : bool, default True
        When True the last column is treated as the target: it is split off
        and label-encoded. When False the whole frame is returned as features.

    Returns
    -------
    (X, y, oe) when ``train`` is True
        Feature frame, encoded target Series (same index as ``X``) and the
        fitted ``LabelEncoder`` needed to decode predictions.
    df when ``train`` is False
        The encoded feature frame.
    """
    df = dt.fread(url).to_pandas()
    df.set_index('instance_id', inplace=True, drop=True)
    df.drop(columns='track_name', inplace=True)

    # Only a training frame ends with the target column. A test frame's last
    # column is an ordinary feature and must be encoded like the others, so
    # the "skip the last column" rule applies in training mode only.
    candidates = df.iloc[:, :-1] if train else df
    # NOTE(review): the codes are derived from each file independently, so the
    # same string may map to different integers in train and test -- confirm
    # this is acceptable or share the categories between the two calls.
    for col in candidates.select_dtypes(exclude=[np.number]).columns:
        if col != 'music_genre':
            df[col] = pd.Categorical(df[col]).codes

    if train:
        oe = LabelEncoder()
        target_col = df.columns[-1]
        y = df[target_col]
        X = df.drop(columns=[target_col])
        oe.fit(y)
        # Rebuild a Series so the encoded target keeps the instance_id index.
        y = pd.Series(index=y.index, data=oe.transform(y))
        return X, y, oe
    else:
        return df

In [2]:
# Load and encode both splits. The training call also hands back the fitted
# LabelEncoder, which the prediction cell uses to map class ids back to genres.
X_train, y_train, label_encoder = clean_data('train.csv', train=True)
X_test = clean_data('test.csv', train=False)

In [3]:
# Credit: https://johaupt.github.io/scikit-learn/tutorial/python/data%20processing/ml%20pipeline/model%20interpretation/columnTransformer_feature_names.html
def get_feature_names(column_transformer):
    """Collect output feature names from a fitted ColumnTransformer or Pipeline.

    The transformers are walked in the order returned by the object's private
    ``_iter`` method. Nested ``Pipeline`` objects are handled recursively; when
    a pipeline yields no names, the input columns assigned to it are used.

    Notes
    -----
    * ``Pipeline`` is not imported in this cell; it has to be in scope by the
      time the function is called.
    * Relies on private scikit-learn members (``_iter``, ``_df_columns``,
      ``_n_features``) and on the ``get_feature_names`` transformer method.
      NOTE(review): these may not exist in every scikit-learn release --
      confirm against the installed version.

    Parameters
    ----------
    column_transformer : ColumnTransformer or Pipeline
        The fitted object to inspect.

    Returns
    -------
    feature_names : list of strings
        Names of the features produced by transform.
    """

    # The upstream fitted-check is intentionally left out
    #check_is_fitted(column_transformer)

    # Turn lookup into a function for better handling with pipelines later.
    # `name` and `column` are not parameters: they are read from the enclosing
    # function's loop variables at the time get_names is called.
    def get_names(trans):
        # >> Original get_feature_names() method
        # Dropped transformers and empty column selections contribute nothing.
        if trans == 'drop' or (
                hasattr(column, '__len__') and not len(column)):
            return []
        if trans == 'passthrough':
            if hasattr(column_transformer, '_df_columns'):
                # A list of string column names can be returned as-is;
                # anything else is resolved through the stored columns.
                if ((not isinstance(column, slice))
                        and all(isinstance(col, str) for col in column)):
                    return column
                else:
                    return column_transformer._df_columns[column]
            else:
                # No stored column names: fall back to positional 'x<i>' names.
                indices = np.arange(column_transformer._n_features)
                return ['x%d' % i for i in indices[column]]
        if not hasattr(trans, 'get_feature_names'):
            # >>> Change: Return input column names if no method available
            # Turn error into a warning
            warnings.warn("Transformer %s (type %s) does not "
                          "provide get_feature_names. "
                          "Will return input column names if available"
                          % (str(name), type(trans).__name__))
            # For transformers without a get_feature_names method, use the input
            # names to the column transformer
            if column is None:
                return []
            else:
                return [f for f in column]

        return [f for f in trans.get_feature_names()]

    ### Start of processing
    feature_names = []

    # Allow transformers to be pipelines. Pipeline steps are named differently, so preprocessing is needed
    if type(column_transformer) == Pipeline:
        # Pipeline steps carry no column assignment, so column is set to None.
        l_transformers = [(name, trans, None, None) for step, name, trans in column_transformer._iter()]
    else:
        # For column transformers, follow the original method
        l_transformers = list(column_transformer._iter(fitted=True))

    for name, trans, column, _ in l_transformers:
        if type(trans) == Pipeline:
            # Recursive call on pipeline
            _names = get_feature_names(trans)
            # if pipeline has no transformer that returns names
            if len(_names) == 0:
                _names = [f for f in column]
            feature_names.extend(_names)
        else:
            feature_names.extend(get_names(trans))

    return feature_names

In [4]:
from typing import Union
from sklearn.preprocessing import StandardScaler
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
import pickle


# Credit: https://stackoverflow.com/questions/60273501/column-specific-processing-in-an-sklearn-pipeline
def process_data(X: pd.DataFrame, y: Union[None, pd.Series], train=True):
    """Impute and standardise features with a ColumnTransformer kept in 'processor.pkl'.

    Parameters
    ----------
    X : pd.DataFrame
        Feature frame; its index is preserved on the result.
    y : pd.Series or None
        Target. It is returned untouched in training mode and ignored otherwise.
    train : bool, default True
        True: choose the columns, fit a new processor on ``X`` and write it to
        'processor.pkl' in the current working directory.
        False: load the processor from 'processor.pkl' and only transform.

    Returns
    -------
    (X, y) when ``train`` is True, otherwise X
        ``X`` is a new DataFrame whose columns come from ``get_feature_names``.
    """
    if train:
        # Column selection and pipeline construction are only needed for
        # fitting; at inference time the pickled processor already carries them.
        impute_transformer = Pipeline([('impute', IterativeImputer())])
        scale_transformer = Pipeline([('standard_scale', StandardScaler())])

        null_cols = X.columns[X.isnull().any()]
        # Columns with fewer than 3 distinct values are left unscaled.
        to_be_scaled_cols = [col for col in X.columns if X[col].nunique() >= 3]

        # NOTE(review): a column listed in both null_cols and to_be_scaled_cols
        # is handed to both transformers, so it appears twice in the output
        # (once imputed, once scaled) -- confirm this is intended.
        processor = ColumnTransformer([
            ('imputed', impute_transformer, null_cols),
            ('scaled', scale_transformer, to_be_scaled_cols),
        ], remainder='passthrough')
        processor.fit(X)
        # Save to file in the current working directory
        with open('processor.pkl', 'wb') as file:
            pickle.dump(processor, file)
    else:
        # Load from file.
        # NOTE: pickle.load can execute arbitrary code; only load a
        # 'processor.pkl' that this notebook wrote itself.
        with open('processor.pkl', 'rb') as file:
            processor = pickle.load(file)

    X = pd.DataFrame(processor.transform(X), columns=get_feature_names(processor), index=X.index)
    if train:
        return X, y
    return X

In [5]:
# Fit the preprocessing on the training split (this writes 'processor.pkl'),
# then apply that saved processor to the test split. The order matters: the
# second call loads the file written by the first.
# NOTE(review): both names are rebound to their processed versions, so
# re-running this cell on its own would process already-processed frames.
X_train, y_train = process_data(X_train, y_train, train=True)
X_test = process_data(X_test, None, train=False)

In [6]:
# Persist the processed splits. The index is written as the first column so
# the next cell can restore it with set_index('instance_id').
X_train.to_csv('X_train.csv', header=True)
y_train.to_csv('y_train.csv', header=True)
X_test.to_csv('X_test.csv', header=True)

In [7]:
import pandas as pd

# Reload the processed splits written by the previous cell, restoring the
# 'instance_id' index on each of them.
X_train = dt.fread("X_train.csv").to_pandas().set_index(keys='instance_id', drop=True)
# y_train.csv holds a single value column next to the index. Take it as a
# Series so the target has the same 1-D shape it had before it was saved,
# instead of handing the estimators a one-column DataFrame.
y_train = (
    dt.fread("y_train.csv", header=True)
    .to_pandas()
    .set_index(keys='instance_id', drop=True)
    .iloc[:, 0]
)
X_test = dt.fread("X_test.csv").to_pandas().set_index(keys='instance_id', drop=True)

In [8]:
import optuna
from lightgbm import LGBMClassifier
from catboost import CatBoostClassifier
from xgboost import XGBClassifier
from sklearn.model_selection import cross_val_score, StratifiedKFold


# Credit: https://medium.com/analytics-vidhya/hyperparameters-optimization-for-lightgbm-catboost-and-xgboost-regressors-using-bayesian-6e7c495947a9
# Define an objective function to be maximized.
def objective(trial, X_train, y_train, cv, scoring):
    """Optuna objective: pick a booster and its hyperparameters, return the mean CV score.

    Parameters
    ----------
    trial : optuna.trial.Trial
        Trial used to sample the classifier family and its hyperparameters.
    X_train, y_train
        Training features and target, passed straight to ``cross_val_score``.
    cv
        Cross-validation splitter (or fold count) forwarded to ``cross_val_score``.
    scoring : str
        Scoring name forwarded to ``cross_val_score``.

    Returns
    -------
    float
        Mean of the per-fold scores.

    Notes
    -----
    The sampled parameter names are the estimator constructor keyword names,
    because the sampled dictionary is unpacked directly into the constructor.
    """
    classifier = trial.suggest_categorical('classifier', ['lightgbm', 'catboost', 'xgboost'])
    # Setup values for the hyperparameters:
    if classifier == 'lightgbm':
        params = {
            "num_leaves": trial.suggest_int('num_leaves', 45, 60),
            # min_child_samples is a sample count: LightGBM expects an integer,
            # so it is sampled as an int rather than as a float.
            'min_child_samples': trial.suggest_int('min_child_samples', 100, 500),
            'min_child_weight': trial.suggest_int('min_child_weight', 30, 50, 5),
            'subsample': trial.suggest_uniform('subsample', 0.2, 0.8),
            'colsample_bytree': trial.suggest_uniform('colsample_bytree', 0.4, 0.6),
            'reg_alpha': trial.suggest_categorical('reg_alpha', [0, 1e-1, 1, 2, 5, 7, 10, 50, 100]),
            'reg_lambda': trial.suggest_categorical('reg_lambda', [0, 1e-1, 1, 5, 10, 20, 50, 100]),
        }
        model = LGBMClassifier(**params)

    elif classifier == 'catboost':
        params = {
            "depth": trial.suggest_int('depth', 1, 10),
            'iterations': trial.suggest_categorical('iterations', [250, 100, 500, 1000]),
            'learning_rate': trial.suggest_categorical('learning_rate', [0.03, 0.001, 0.01, 0.1, 0.2, 0.3]),
            'l2_leaf_reg': trial.suggest_categorical('l2_leaf_reg', [3, 1, 5, 10, 100]),
            'border_count': trial.suggest_categorical('border_count', [32, 5, 10, 20, 50, 100, 200]),
            'bagging_temperature': trial.suggest_categorical('bagging_temperature', [0.03, 0.09, 0.25, 0.75]),
            'random_strength': trial.suggest_categorical('random_strength', [0.2, 0.5, 0.8]),
            'max_ctr_complexity': trial.suggest_categorical('max_ctr_complexity', [1, 2, 3, 4, 5])
        }
        model = CatBoostClassifier(**params)

    else:
        params = {
            "min_child_weight": trial.suggest_int('min_child_weight', 14, 20),
            'gamma': trial.suggest_int('gamma', 0, 5),
            "max_depth": trial.suggest_int('max_depth', 5, 10),
            'subsample': trial.suggest_uniform('subsample', 0.5, 1.0),
            "colsample_bytree": trial.suggest_uniform('colsample_bytree', 0.1, 1.0),

        }
        model = XGBClassifier(**params)

    # Scoring method:
    score = cross_val_score(model, X_train, y_train, n_jobs=-1, cv=cv, scoring=scoring)
    return score.mean()

In [9]:
# Stratified 5-fold splitter shared by every trial.
ss = StratifiedKFold(n_splits=5)


def run_trial(trial):
    """Score one Optuna trial with macro-F1 on the stratified folds."""
    return objective(trial, X_train, y_train, cv=ss, scoring='f1_macro')


# Create a study that maximizes the objective (higher macro-F1 is better).
study = optuna.create_study(direction='maximize')
study.optimize(run_trial, n_trials=100)

[32m[I 2021-12-14 16:01:45,382][0m A new study created in memory with name: no-name-5907870d-e5d1-4b15-b7f2-5dc3b8c173b1[0m
[32m[I 2021-12-14 16:01:56,583][0m Trial 0 finished with value: 0.6032451473334677 and parameters: {'classifier': 'xgboost', 'min_child_weight': 18, 'gamma': 0, 'max_depth': 5, 'subsample': 0.566482012209103, 'colsample_bytree': 0.1096541505413382}. Best is trial 0 with value: 0.6032451473334677.[0m
[32m[I 2021-12-14 16:02:22,959][0m Trial 1 finished with value: 0.6123840334489298 and parameters: {'classifier': 'xgboost', 'min_child_weight': 19, 'gamma': 2, 'max_depth': 7, 'subsample': 0.8368097305642075, 'colsample_bytree': 0.46773610748539207}. Best is trial 1 with value: 0.6123840334489298.[0m
[33m[W 2021-12-14 16:02:23,428][0m Trial 2 failed, because the objective function returned nan.[0m
[32m[I 2021-12-14 16:02:54,248][0m Trial 3 finished with value: 0.6169462896869758 and parameters: {'classifier': 'xgboost', 'min_child_weight': 17, 'gamma': 2

In [10]:
print('Five best values')
# Global pandas display option: floats render with two decimals from here on,
# in this cell and in every later one.
pd.options.display.float_format = '{:,.2f}'.format
# Highest value first, because the study maximizes its objective.
study.trials_dataframe().sort_values('value', ascending=False).head(5)

Five best values


Unnamed: 0,number,value,datetime_start,datetime_complete,duration,params_bagging_temperature,params_border_count,params_classifier,params_colsample_bytree,params_depth,...,params_max_ctr_complexity,params_max_depth,params_min_child_samples,params_min_child_weight,params_num_leaves,params_random_strength,params_reg_alpha,params_reg_lambda,params_subsample,state
4,4,0.62,2021-12-14 16:02:54.248519,2021-12-14 16:03:32.340402,0 days 00:00:38.091883,,,xgboost,0.95,,...,,5.0,,18.0,,,,,0.92,COMPLETE
6,6,0.62,2021-12-14 16:03:32.543867,2021-12-14 16:15:53.883322,0 days 00:12:21.339455,0.09,200.0,catboost,,9.0,...,1.0,,,,,0.5,,,,COMPLETE
3,3,0.62,2021-12-14 16:02:23.428408,2021-12-14 16:02:54.248519,0 days 00:00:30.820111,,,xgboost,0.45,,...,,10.0,,17.0,,,,,0.77,COMPLETE
12,12,0.62,2021-12-14 16:17:03.763490,2021-12-14 16:17:23.072920,0 days 00:00:19.309430,,,xgboost,0.58,,...,,5.0,,20.0,,,,,0.51,COMPLETE
14,14,0.61,2021-12-14 16:22:08.308988,2021-12-14 16:29:10.514406,0 days 00:07:02.205418,0.09,50.0,catboost,,10.0,...,1.0,,,,,0.2,,,,COMPLETE


In [11]:
# Best trial of the study.
# NOTE(review): the first line is labelled 'Loss' but prints the whole
# FrozenTrial object, and the study maximizes f1_macro rather than
# minimizing a loss -- the label is misleading.
trial = study.best_trial
print(f'Loss : {trial}')
print(f"Best hyperparameters: {trial.params}")

Loss : FrozenTrial(number=4, values=[0.6181791788398555], datetime_start=datetime.datetime(2021, 12, 14, 16, 2, 54, 248519), datetime_complete=datetime.datetime(2021, 12, 14, 16, 3, 32, 340402), params={'classifier': 'xgboost', 'min_child_weight': 18, 'gamma': 0, 'max_depth': 5, 'subsample': 0.9204538551754038, 'colsample_bytree': 0.9532056047734623}, distributions={'classifier': CategoricalDistribution(choices=('lightgbm', 'catboost', 'xgboost')), 'min_child_weight': IntUniformDistribution(high=20, low=14, step=1), 'gamma': IntUniformDistribution(high=5, low=0, step=1), 'max_depth': IntUniformDistribution(high=10, low=5, step=1), 'subsample': UniformDistribution(high=1.0, low=0.5), 'colsample_bytree': UniformDistribution(high=1.0, low=0.1)}, user_attrs={}, system_attrs={}, intermediate_values={}, trial_id=4, state=TrialState.COMPLETE, value=None)
Best hyperparameters: {'classifier': 'xgboost', 'min_child_weight': 18, 'gamma': 0, 'max_depth': 5, 'subsample': 0.9204538551754038, 'colsam

In [12]:
# Parameters of the best trial, including the 'classifier' selector key.
study.best_params

{'classifier': 'xgboost',
 'min_child_weight': 18,
 'gamma': 0,
 'max_depth': 5,
 'subsample': 0.9204538551754038,
 'colsample_bytree': 0.9532056047734623}

In [13]:
# Everything except the 'classifier' selector is a constructor keyword.
best_params = {name: setting for name, setting in study.best_params.items() if name != 'classifier'}

# Map the selector to its estimator class; any other value means XGBoost.
model_classes = {'lightgbm': LGBMClassifier, 'catboost': CatBoostClassifier}
best_model = model_classes.get(study.best_params['classifier'], XGBClassifier)(**best_params)

# Refit the winning configuration on the full training split.
best_model.fit(X_train, y_train)



XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
              colsample_bynode=1, colsample_bytree=0.9532056047734623,
              enable_categorical=False, gamma=0, gpu_id=-1,
              importance_type=None, interaction_constraints='',
              learning_rate=0.300000012, max_delta_step=0, max_depth=5,
              min_child_weight=18, missing=nan, monotone_constraints='()',
              n_estimators=100, n_jobs=4, num_parallel_tree=1,
              objective='multi:softprob', predictor='auto', random_state=0,
              reg_alpha=0, reg_lambda=1, scale_pos_weight=None,
              subsample=0.9204538551754038, tree_method='exact',
              validate_parameters=1, verbosity=None)

In [14]:
# Predict encoded class ids, then map them back to genre names with the
# LabelEncoder fitted on the training target.
encoded_pred = best_model.predict(X_test)
y_pred = pd.DataFrame(
    data=label_encoder.inverse_transform(encoded_pred),
    columns=['music_genre'],
    index=X_test.index,  # the constructor sets the index; no reassignment needed
)
y_pred.to_csv('Predicted value from Optuna.csv', index=True)

In [15]:
import webbrowser
# Opens the link in the default browser as a side effect of running this cell;
# the value displayed below is the return value of webbrowser.open.
webbrowser.open('https://youtu.be/5dwxGvmUG90?t=53')

True