In [1]:
import datatable as dt
import pandas as pd
import numpy as np

df_train = dt.fread('train.csv').to_pandas()

In [2]:
for col in df_train.columns:
    if df_train[col].nunique() < 5:
        print(f'{col}: {df_train[col].unique()}')

gender: ['F' 'M' 'XNA']
owns_car: ['N' 'Y' '']
owns_house: ['Y' 'N']
migrant_worker: [ 1.  0. nan]
prev_defaults: [2 0 1]
default_in_last_6months: [ True False]
credit_card_default: [ True False]


In [26]:
import datatable as dt
import pandas as pd
import numpy as np
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer
import pickle
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer

# Problem: https://www.hackerearth.com/challenges/competitive/amexpert-code-lab/machine-learning/credit-card-default-risk-5-95cbc85f/
# Credit: https://johaupt.github.io/scikit-learn/tutorial/python/data%20processing/ml%20pipeline/model%20interpretation/columnTransformer_feature_names.html

import warnings

warnings.filterwarnings('ignore')
warnings.simplefilter('ignore')


def get_feature_names(column_transformer):
    """Get feature names from all transformers.
    Returns
    -------
    feature_names : list of strings
        Names of the features produced by transform.
    """

    # Remove the internal helper function
    #check_is_fitted(column_transformer)

    # Turn loopkup into function for better handling with pipeline later
    def get_names(trans):
        # >> Original get_feature_names() method
        if trans == 'drop' or (
                hasattr(column, '__len__') and not len(column)):
            return []
        if trans == 'passthrough':
            if hasattr(column_transformer, '_df_columns'):
                if ((not isinstance(column, slice))
                        and all(isinstance(col, str) for col in column)):
                    return column
                else:
                    return column_transformer._df_columns[column]
            else:
                indices = np.arange(column_transformer._n_features)
                return ['x%d' % i for i in indices[column]]
        if not hasattr(trans, 'get_feature_names'):
            # >>> Change: Return input column names if no method avaiable
            # Turn error into a warning
            warnings.warn("Transformer %s (type %s) does not "
                          "provide get_feature_names. "
                          "Will return input column names if available"
                          % (str(name), type(trans).__name__))
            # For transformers without a get_features_names method, use the input
            # names to the column transformer
            if column is None:
                return []
            else:
                return [f for f in column]

        return [f for f in trans.get_feature_names()]

    ### Start of processing
    feature_names = []

    # Allow transformers to be pipelines. Pipeline steps are named differently, so preprocessing is needed
    if type(column_transformer) == Pipeline:
        l_transformers = [(name, trans, None, None) for step, name, trans in column_transformer._iter()]
    else:
        # For column transformers, follow the original method
        l_transformers = list(column_transformer._iter(fitted=True))

    for name, trans, column, _ in l_transformers:
        if type(trans) == Pipeline:
            # Recursive call on pipeline
            _names = get_feature_names(trans)
            # if pipeline has no transformer that returns names
            if len(_names) == 0:
                _names = [f for f in column]
            feature_names.extend(_names)
        else:
            feature_names.extend(get_names(trans))

    return feature_names


# Credit: https://stackoverflow.com/questions/60273501/column-specific-processing-in-an-sklearn-pipeline
def clean_data(url, y=None, train=True):
    df = dt.fread(url).to_pandas()
    df.set_index('customer_id', inplace=True)
    imputer = ColumnTransformer([
        ('gender', SimpleImputer(missing_values='XNA', strategy='most_frequent'), ['gender']),
        ('owns_car', SimpleImputer(missing_values='', strategy='most_frequent'), ['owns_car']),
        ('migrant_worker', SimpleImputer(missing_values=np.nan, strategy='most_frequent'), ['migrant_worker']),
    ], remainder='passthrough')
    df = pd.DataFrame(data=imputer.fit_transform(df), columns=df.columns, index=df.index)
    print(df.head())
    if train:
        target_col = df.columns[-1]
        y = df[target_col]
        y = pd.Series(map(int, y), index=y.index)
        X = df.drop(columns=['name', target_col])
        return X, y
    else:
        X = df.drop(columns=['name'])
        return X


def process_data(X, y=None, train=True):
    impute_transformer = Pipeline([('impute', IterativeImputer())])
    one_hot_transformer = Pipeline([('one_hot', OneHotEncoder(drop='first'))])
    scale_transformer = Pipeline([('standard_scale', StandardScaler())])

    null_cols = X.columns[X.isnull().any()]
    oh_cols, to_be_scaled_cols = [], []
    for col in X.columns:
        if np.issubdtype(X[col].dtype, np.number) and X[col].nunique() < 3:
            continue
        if not np.issubdtype(X[col].dtype, np.number):
            oh_cols.append(col)
        else:
            to_be_scaled_cols.append(col)

    if train:
        processor = ColumnTransformer([
            ('imputed', impute_transformer, null_cols),
            ('encoded', one_hot_transformer, oh_cols),
            ('scaled', scale_transformer, to_be_scaled_cols),
        ], remainder='passthrough')
        processor.fit(X)
        # Save to file in the current working directory
        with open('processor.pkl', 'wb') as file:
            pickle.dump(processor, file)
        X = pd.DataFrame(processor.transform(X), columns=get_feature_names(processor), index=y.index)
        return X, y
    else:
        # Load from file
        with open('processor.pkl', 'rb') as file:
            processor = pickle.load(file)
        X = pd.DataFrame(processor.transform(X), columns=get_feature_names(processor), index=X.index)
        return X

In [27]:
X_train, y_train = clean_data('train.csv', train=True)
X_test = clean_data('test.csv', train=False)
# X_train, y_train = process_data(X_train, y_train, train=True)
# X_test = process_data(X_test, train=False)

ValueError: 1D data passed to a transformer that expects 2D data. Try to specify the column selection as a list of one item instead of a scalar.

In [None]:
X_train

In [23]:
y_train

customer_id
CST_115179    1
CST_121920    0
CST_109330    0
CST_128288    0
CST_151355    0
             ..
CST_130421    0
CST_136670    0
CST_145435    0
CST_130913    0
CST_160078    0
Length: 45528, dtype: int64

In [None]:
break

In [None]:
X_train.to_csv('X_train.csv')
y_train.to_csv('y_train.csv', header=False)
X_test.to_csv('X_test.csv')

In [None]:
# Import cleaned dataset
import datatable as dt
import pandas as pd
import numpy as np
import warnings

warnings.filterwarnings('ignore')
warnings.simplefilter('ignore')

X_train = dt.fread("X_train.csv").to_pandas().set_index(keys='customer_id', drop=True)
y_train = dt.fread("y_train.csv").to_pandas().iloc[:, 1]
y_train.index = X_train.index
X_test = dt.fread("X_test.csv").to_pandas().set_index(keys='customer_id', drop=True)

Credit:
https://medium.com/analytics-vidhya/hyperparameters-optimization-for-lightgbm-catboost-and-xgboost-regressors-using-bayesian-6e7c495947a9

In [None]:
import optuna
from lightgbm import LGBMClassifier
from catboost import CatBoostClassifier
from xgboost import XGBClassifier
from sklearn.model_selection import cross_val_score, StratifiedKFold


# Define an objective function to be maximized.
def objective(trial, X_train, y_train, cv, scoring):
    classifier = trial.suggest_categorical('classifier', ['lightgbm', 'catboost', 'xgboost'])
    model = LGBMClassifier()
    # Setup values for the hyperparameters:
    if classifier == 'lightgbm':
        params = {
            "num_leaves": trial.suggest_int('num_leaves', 45, 60),
            'min_child_samples': trial.suggest_uniform('min_child_samples', 100, 500),
            'min_child_weight': trial.suggest_int('min_child_weight', 30, 50, 5),
            'subsample': trial.suggest_uniform('subsample', 0.2, 0.8),
            'colsample_bytree': trial.suggest_uniform('colsample_bytree', 0.4, 0.6),
            'reg_alpha': trial.suggest_categorical('reg_alpha', [0, 1e-1, 1, 2, 5, 7, 10, 50, 100]),
            'reg_lambda': trial.suggest_categorical('reg_lambda', [0, 1e-1, 1, 5, 10, 20, 50, 100]),
        }
        model = LGBMClassifier(**params)

    elif classifier == 'catboost':
        params = {
            "depth": trial.suggest_int('depth', 1, 10),
            'iterations': trial.suggest_categorical('iterations', [250, 100, 500, 1000]),
            'learning_rate': trial.suggest_categorical('learning_rate', [0.03, 0.001, 0.01, 0.1, 0.2, 0.3]),
            'l2_leaf_reg': trial.suggest_categorical('l2_leaf_reg', [3, 1, 5, 10, 100]),
            'border_count': trial.suggest_categorical('border_count', [32, 5, 10, 20, 50, 100, 200]),
            'bagging_temperature': trial.suggest_categorical('bagging_temperature', [0.03, 0.09, 0.25, 0.75]),
            'random_strength': trial.suggest_categorical('random_strength', [0.2, 0.5, 0.8]),
            'max_ctr_complexity': trial.suggest_categorical('max_ctr_complexity', [1, 2, 3, 4, 5])
        }
        model = CatBoostClassifier(**params)

    else:
        params = {
            "min_child_weight": trial.suggest_int('min_child_weight', 14, 20),
            'gamma': trial.suggest_int('gamma', 0, 5),
            "max_depth": trial.suggest_int('max_depth', 5, 10),
            'subsample': trial.suggest_uniform('subsample', 0.5, 1.0),
            "colsample_bytree": trial.suggest_uniform('colsample_bytree', 0.1, 1.0),

        }
        model = XGBClassifier(**params)

    # Scoring method:
    score = cross_val_score(model, X_train, y_train, n_jobs=-1, cv=cv, scoring=scoring)
    return score.mean()

In [None]:
ss = StratifiedKFold(n_splits=5)

# Create study that minimizes
study = optuna.create_study(direction='maximize')
study.optimize(lambda trial: objective(trial, X_train, y_train, cv=ss, scoring='f1_macro'),
               n_trials=200)

In [None]:
print('Five best values')
pd.options.display.float_format = '{:,.2f}'.format
study.trials_dataframe().sort_values('value', ascending=True).head(5)

In [None]:
trial = study.best_trial
print(f'Loss : {trial}')
print(f"Best hyperparameters: {trial.params}")

In [None]:
# plot the optimization history of the study
from optuna.visualization.matplotlib import plot_optimization_history

plot_optimization_history(study);

In [None]:
study.best_params

In [None]:
best_params = {key: value for key, value in study.best_params.items() if key != 'classifier'}
if study.best_params['classifier'] == 'lightgbm':
    best_model = LGBMClassifier(**best_params)
elif study.best_params['classifier'] == 'catboost':
    best_model = CatBoostClassifier(**best_params)
else:
    best_model = XGBClassifier(**best_params)
best_model.fit(X_train, y_train)


In [None]:
y_pred = best_model.predict(X_test)
y_pred = pd.DataFrame(data=y_pred, columns=['predicted'])
y_pred.index = X_test.index
y_pred['predicted'] = np.where(y_pred['predicted'] == True, 1, 0)

In [None]:
y_pred.to_csv('Predicted value from Optuna.csv', index=True)