In [None]:
import datatable as dt
import pandas as pd
import numpy as np
import warnings

# Silence every warning for the rest of the session. This also hides the
# warnings.warn(...) message that get_feature_names emits for transformers
# without a get_feature_names method.
warnings.filterwarnings('ignore')
# Installs the same catch-all "ignore" filter again; kept as in the original.
warnings.simplefilter('ignore')


# Problem: https://www.hackerearth.com/challenges/competitive/amexpert-code-lab/machine-learning/credit-card-default-risk-5-95cbc85f/
def clean_data(url, y=None, train=True):
    """Read a raw CSV and return a cleaned frame indexed by customer id.

    The file is read with ``datatable.fread`` and converted to pandas. The
    ``customer_id`` column becomes the index, the ``name`` column is dropped,
    ``'XNA'`` in ``gender`` is replaced by ``'F'`` and the empty string in
    ``owns_car`` is replaced by ``'N'`` (per the original author, these are
    the most frequent values of those columns).

    Parameters
    ----------
    url : str
        Path or URL handed to ``datatable.fread``.
    y : ignored
        Accepted for signature compatibility; the value passed in is never read.
    train : bool, default True
        When true, the last column of the file is treated as the target.

    Returns
    -------
    (X, y) when ``train`` is true: the feature frame without the target
    column and the target as an integer Series sharing the same index.
    Otherwise the cleaned frame alone.
    """
    frame = (
        dt.fread(url)
        .to_pandas()
        .set_index('customer_id')
        .drop(columns='name')
        .replace({'gender': 'XNA'}, 'F')
        .replace({'owns_car': ''}, 'N')
    )

    if not train:
        return frame

    # The target is taken to be whatever column comes last in the file.
    label_name = frame.columns[-1]
    raw_labels = frame[label_name]
    target = pd.Series([int(value) for value in raw_labels], index=raw_labels.index)
    features = frame.drop(columns=[label_name])
    return features, target

In [None]:
# Clean both splits; only the training file yields a target series.
X_train, y_train = clean_data(url='train.csv')
X_test = clean_data(url='test.csv', train=False)

In [None]:
# Credit: https://johaupt.github.io/scikit-learn/tutorial/python/data%20processing/ml%20pipeline/model%20interpretation/columnTransformer_feature_names.html
def get_feature_names(column_transformer):
    """Get feature names from all transformers.

    Walks the entries yielded by the private ``_iter`` method of
    ``column_transformer`` and concatenates, in iteration order, the names
    each entry contributes. ``Pipeline`` entries are resolved recursively; if
    a pipeline contributes no names, the input columns assigned to it are
    used instead (when there are any).

    Parameters
    ----------
    column_transformer : ColumnTransformer or Pipeline
        Object exposing ``_iter``. For a non-pipeline object the private
        attributes ``_df_columns`` or ``_n_features`` are read when a
        ``'passthrough'`` entry is encountered.

    Returns
    -------
    feature_names : list of strings
        Names of the features produced by transform.
    """

    # Remove the internal helper function
    #check_is_fitted(column_transformer)

    # Turn lookup into a function for better handling with pipelines later.
    # name/column are passed explicitly instead of being read from the
    # enclosing loop through the closure.
    def get_names(name, trans, column):
        # >> Original get_feature_names() method
        if trans == 'drop' or (
                hasattr(column, '__len__') and not len(column)):
            return []
        if trans == 'passthrough':
            if hasattr(column_transformer, '_df_columns'):
                if ((not isinstance(column, slice))
                        and all(isinstance(col, str) for col in column)):
                    return column
                else:
                    return column_transformer._df_columns[column]
            else:
                indices = np.arange(column_transformer._n_features)
                return ['x%d' % i for i in indices[column]]
        if not hasattr(trans, 'get_feature_names'):
            # >>> Change: Return input column names if no method available
            # Turn error into a warning
            warnings.warn("Transformer %s (type %s) does not "
                          "provide get_feature_names. "
                          "Will return input column names if available"
                          % (str(name), type(trans).__name__))
            # For transformers without a get_feature_names method, use the
            # input names to the column transformer
            if column is None:
                return []
            return list(column)

        return list(trans.get_feature_names())

    ### Start of processing
    # Allow transformers to be pipelines. Pipeline steps carry no column
    # assignment, so they are normalised to the 4-tuple shape with column=None.
    if isinstance(column_transformer, Pipeline):
        l_transformers = [(name, trans, None, None)
                          for _, name, trans in column_transformer._iter()]
    else:
        # For column transformers, follow the original method
        l_transformers = list(column_transformer._iter(fitted=True))

    feature_names = []
    for name, trans, column, _ in l_transformers:
        if isinstance(trans, Pipeline):
            # Recursive call on pipeline
            _names = get_feature_names(trans)
            # If the pipeline has no transformer that returns names, fall back
            # to its input columns. A pipeline nested directly inside another
            # pipeline has column=None, in which case there is nothing to add.
            if not _names and column is not None:
                _names = list(column)
            feature_names.extend(_names)
        else:
            feature_names.extend(get_names(name, trans, column))

    return feature_names

In [None]:
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
import pickle


# Credit: https://stackoverflow.com/questions/60273501/column-specific-processing-in-an-sklearn-pipeline
def process_data(X: pd.DataFrame, y=None, train=True, processor_path='processor.pkl'):
    """Impute, one-hot encode and scale ``X`` with a persisted ColumnTransformer.

    With ``train=True`` a ColumnTransformer is built from the columns of
    ``X``, fitted on ``X`` and pickled to ``processor_path``. With
    ``train=False`` the previously pickled transformer is loaded from
    ``processor_path`` and only applied.

    Parameters
    ----------
    X : pd.DataFrame
        Feature frame to transform.
    y : optional
        Returned untouched alongside the transformed frame when ``train`` is
        true; never read otherwise.
    train : bool, default True
        Fit-and-save (true) or load-and-apply (false).
    processor_path : str, default 'processor.pkl'
        File the fitted transformer is written to / read from.

    Returns
    -------
    ``(X_transformed, y)`` when ``train`` is true, else ``X_transformed``.
    The transformed frame keeps the index of ``X``; its column names come
    from ``get_feature_names``.
    """
    if train:
        # Column selection only matters when fitting; the loaded processor
        # already carries its own column assignment.
        null_cols = X.columns[X.isnull().any()]
        oh_cols, to_be_scaled_cols = [], []
        for col in X.columns:
            if not np.issubdtype(X[col].dtype, np.number):
                oh_cols.append(col)
            elif X[col].nunique() >= 3:
                to_be_scaled_cols.append(col)
            # Numeric columns with fewer than 3 distinct values are left to
            # the 'passthrough' remainder (unless they contain nulls).

        # NOTE(review): a numeric column with missing values and >= 3 distinct
        # values is in both null_cols and to_be_scaled_cols, so it is handed
        # to both the 'imputed' and the 'scaled' transformer - confirm that
        # the duplicated output column is intended.
        # NOTE(review): null_cols is not restricted to numeric columns;
        # presumably no categorical column has missing values here - verify.
        processor = ColumnTransformer([
            ('imputed', Pipeline([('impute', IterativeImputer())]), null_cols),
            ('encoded', Pipeline([('one_hot', OneHotEncoder(drop='first'))]), oh_cols),
            ('scaled', Pipeline([('standard_scale', StandardScaler())]), to_be_scaled_cols),
        ], remainder='passthrough')
        processor.fit(X)
        # Save the fitted processor so the test split reuses the same fit
        with open(processor_path, 'wb') as file:
            pickle.dump(processor, file)
    else:
        # SECURITY: pickle.load can execute arbitrary code contained in the
        # file; only load a processor file that this notebook wrote itself.
        with open(processor_path, 'rb') as file:
            processor = pickle.load(file)

    transformed = pd.DataFrame(
        processor.transform(X),
        columns=get_feature_names(processor),
        index=X.index,
    )
    if train:
        return transformed, y
    return transformed

In [None]:
# Fit the processor on the training split first; the test split then reuses
# the processor that the training call saved to disk.
X_train, y_train = process_data(X=X_train, y=y_train, train=True)
X_test = process_data(X=X_test, train=False)

In [None]:
# Persist the processed splits; the target is written without a header row.
X_train.to_csv(path_or_buf='X_train.csv')
y_train.to_csv(path_or_buf='y_train.csv', header=False)
X_test.to_csv(path_or_buf='X_test.csv')