In [1]:
import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)

import numpy as np
import pandas as pd
import xgboost 
from hyperopt import STATUS_OK, Trials, fmin, hp, tpe, space_eval
from functools import partial

from sklearn.datasets import make_regression
from sklearn.metrics import mean_squared_error
from sklearn.feature_selection import SelectKBest, f_regression, mutual_info_regression, RFE
from sklearn.model_selection import cross_val_score, KFold, train_test_split
from sklearn.pipeline import Pipeline
from sklearn.metrics import r2_score

import sys
sys.path.insert(0,'..')

from vf_portalytics.model import PredictionModel
from vf_portalytics.tool import squared_error_objective_with_weighting, get_categorical_features
from vf_portalytics.transformers import get_transformer
from vf_portalytics.multi_model import MultiModel

# Generate data 

In [2]:
def make_dataset(n_samples, n_features, n_informative, **kwargs):
    """Build a synthetic regression dataset as a (features, target) pair.

    The feature matrix comes from ``make_regression`` with ``noise=0.5`` and a
    fixed ``random_state=0``; its columns are named ``feature_0`` ..
    ``feature_<n_features - 1>``. Every extra keyword argument is forwarded to
    ``DataFrame.assign`` and therefore becomes an additional column
    (e.g. ``category='A'`` adds a constant ``category`` column).

    Returns:
        tuple: ``(pd.DataFrame of features, pd.Series named 'target')``.
    """
    features, target_values = make_regression(
        n_samples=n_samples,
        n_features=n_features,
        n_informative=n_informative,
        noise=0.5,
        random_state=0,
    )
    column_names = ['feature_' + str(idx) for idx in range(n_features)]
    frame = pd.DataFrame(features, columns=column_names).assign(**kwargs)
    return frame, pd.Series(target_values, name='target')


# Build one synthetic dataset per category.
# The number of samples differs per category, but the number of features is
# shared because all categories end up in the same dataset.
n_features = 20
x1, y1 = make_dataset(n_samples=100, n_features=n_features, n_informative=10, category='A')
x2, y2 = make_dataset(n_samples=150, n_features=n_features, n_informative=8, category='B')
x3, y3 = make_dataset(n_samples=80, n_features=n_features, n_informative=7, category='C')
x4, y4 = make_dataset(n_samples=120, n_features=n_features, n_informative=12, category='D')

# Stack the four categories; ignore_index=True already gives a fresh 0..N-1 index
total_x = pd.concat([x1, x2, x3, x4], axis=0, ignore_index=True)
total_y = pd.concat([y1, y2, y3, y4], axis=0, ignore_index=True)

# Turn the first two features into categoricals with three levels.
# Bin edges per feature: -inf, mean - std, mean + std, max
labels = ['g1', 'g2', 'g3']
bins = [
    [-np.inf, column.mean() - column.std(), column.mean() + column.std(), column.max()]
    for column in (total_x['feature_' + str(i)] for i in range(2))
]
for i, edges in enumerate(bins):
    col_name = 'feature_' + str(i)
    total_x[col_name] = pd.cut(total_x[col_name], bins=edges, labels=labels).astype('object')

### Declare group parameters

In [3]:
# Names shared by the rest of the notebook
target, cat_feature = 'target', 'category'

# Every column except the grouping column is a candidate feature
feature_col_list = total_x.columns.drop(cat_feature)

# Distinct values of the grouping column (one group per value)
clusters = total_x[cat_feature].unique()
clusters

array(['A', 'B', 'C', 'D'], dtype=object)

# Filtering

### ...

In [4]:
# Hold out 33% of the rows; splitting the index labels keeps X and y aligned
train_index, test_index = train_test_split(total_x.index, test_size=0.33, random_state=5)
train_x, test_x = total_x.loc[train_index, :], total_x.loc[test_index, :]
train_y, test_y = total_y.loc[train_index], total_y.loc[test_index]

# Remove the intermediate objects so only the train/test splits stay around
del x1, x2, x3, x4, y1, y2, y3, y4
del total_x, total_y

# Feature selection

In [5]:
def feature_selection(df_X, df_Y, features_len=1, step=1, max_features=20):
    """Rank the columns of ``df_X`` with recursive feature elimination (RFE).

    Categorical columns (as reported by ``get_categorical_features``) are
    ordinal-encoded first, then an RFE wrapper around a small XGBoost
    regressor is fitted.

    Args:
        df_X: DataFrame with the candidate features of one group.
        df_Y: Target values aligned with ``df_X``.
        features_len: Number of features RFE should keep, or the string
            ``'all'`` to keep ``max_features``. Values above ``max_features``
            are clamped to it.
        step: Number of features RFE removes per iteration.
        max_features: Upper bound for ``features_len``.

    Returns:
        The fitted ``RFE`` object (its ``ranking_`` holds the feature ranks).
    """
    # 'all' has to be resolved explicitly: comparing a str with an int raises
    # TypeError on Python 3, so the numeric clamp below cannot handle it.
    if features_len == 'all':
        features_len = max_features
    features_len = min(features_len, max_features)

    # we fit with a lighter but representative model
    model = xgboost.XGBRegressor(
        max_depth=5,
        n_estimators=50,
        learning_rate=0.1,
        subsample=0.8,
        n_jobs=6,
        objective=partial(squared_error_objective_with_weighting, under_predict_weight=2.0),
        seed=6789,
        silent=True
    )

    # Turn textual columns and booleans into classes
    transformer = get_transformer('OrdinalEncoder')
    transformer.cols = get_categorical_features(data=df_X)
    df_X = transformer.fit_transform(df_X)

    # create the RFE model and select the attributes
    rfe = RFE(model, n_features_to_select=features_len, step=step)
    return rfe.fit(df_X, df_Y)

In [6]:
# Feature-ranking settings; RFE refits the model after every elimination step,
# so a small `step` on a wide dataset can be resource intensive
max_features = train_x.shape[1] - 1  # the grouping column is dropped before ranking
features_len = 1  # number of features RFE keeps
step = 1  # x features to be dropped each step

groups = train_x.groupby(cat_feature)
feature_importances = {}
for gp_key, x_group in groups:
    print('Searching for Feature Ranking in ' + gp_key + '...')
    x_group = x_group.drop(cat_feature, axis=1)
    y_group = train_y.loc[x_group.index]
    # fit one RFE per group, using the settings declared above
    # (they used to be ignored in favour of hard-coded literals)
    feature_importances[gp_key] = feature_selection(
        x_group, y_group,
        features_len=features_len, step=step, max_features=max_features)

Searching for Feature Ranking in A...
Searching for Feature Ranking in B...
Searching for Feature Ranking in C...
Searching for Feature Ranking in D...


#### Check The Ranking and manually decide which Features to use

In [7]:
print('The features ordered by importance for each group: \n')
# One row per cluster: feature names sorted by their RFE rank (best first)
ordered_feautures = [
    tuple(
        feature_name
        for rank, feature_name in sorted(
            zip(feature_importances[cluster].ranking_, feature_col_list)
        )
    )
    for cluster in clusters
]
ordered_feautures_df = pd.DataFrame(ordered_feautures, index=clusters)
ordered_feautures_df

The features ordered by importance for each group: 



Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19
A,feature_11,feature_13,feature_17,feature_8,feature_7,feature_18,feature_10,feature_12,feature_6,feature_14,feature_16,feature_19,feature_15,feature_9,feature_5,feature_4,feature_3,feature_2,feature_1,feature_0
B,feature_16,feature_19,feature_11,feature_2,feature_12,feature_6,feature_15,feature_7,feature_13,feature_17,feature_9,feature_3,feature_8,feature_10,feature_14,feature_4,feature_5,feature_0,feature_1,feature_18
C,feature_8,feature_18,feature_13,feature_12,feature_6,feature_4,feature_3,feature_17,feature_16,feature_15,feature_11,feature_10,feature_9,feature_5,feature_7,feature_2,feature_19,feature_0,feature_14,feature_1
D,feature_18,feature_14,feature_17,feature_10,feature_5,feature_13,feature_8,feature_3,feature_9,feature_4,feature_11,feature_16,feature_19,feature_12,feature_2,feature_7,feature_15,feature_6,feature_1,feature_0


In [8]:
# Manually select features.
# Default: the 10 top-ranked features of every group
# (columns 0..9 of the ranking table; label slicing with .loc is inclusive)
selected_features = {
    group_key: set(ordered_feautures_df.loc[group_key, 0:9].values)
    for group_key in ordered_feautures_df.index
}

# Adjust the selection manually.
# In this example both categorical features are added for each category.
for group_key in ('A', 'B', 'C', 'D'):
    selected_features[group_key].update(['feature_0', 'feature_1'])
# Discard features that are not going to be available in the future,
# and features that are not important from the "business perspective".

# In the end selected_features is a dictionary: group name -> set of features to use

#### Manually identify which categorical features are nominal and which are ordinal

In [9]:
# Check each group separately and add to the two lists below which features are ordinal and which are nominal.
# selected_features holds sets; a set is not a valid DataFrame indexer
# (pandas >= 2.0 raises TypeError), so it is converted to a sorted list,
# which also makes the column order reproducible between sessions.
group1 = train_x.loc[train_x[cat_feature] == clusters[0], sorted(selected_features[clusters[0]])]
group1[list(get_categorical_features(group1))].head()

Unnamed: 0,feature_1,feature_0
6,g1,g2
10,g2,g2
81,g2,g2
83,g2,g2
92,g2,g3


In [10]:
# Manual classification of the categorical features inspected above
ordinal_features, nominal_features = ['feature_1'], ['feature_0']

# Hyper Parameter tuning

In [11]:
# The search space is shared by all groups; it could be made group-specific
# in the future if that is needed.
space = {
    'n_estimators': hp.choice('n_estimators', np.arange(10, 150, 20, dtype=int)),
    # np.arange(1, 3) -> candidate depths 1 and 2
    'max_depth': hp.choice('max_depth', np.arange(1, 3, dtype=int)),
    'subsample': hp.quniform('subsample', 0.5, 0.7, 0.05),
    'min_child_weight': hp.quniform('min_child_weight', 1, 20, 1),
    'gamma': hp.quniform('gamma', 0.7, 1, 0.05),
    'colsample_bytree': hp.quniform('colsample_bytree', 0.5, 0.7, 0.05),
    # Lower bound is 0.01 (one full quantisation step): quniform rounds each
    # draw to a multiple of q, so with a lower bound of 0.001 every draw below
    # 0.005 became a learning rate of exactly 0.0 and wasted the trial.
    'learning_rate': hp.quniform('learning_rate', 0.01, 0.1, 0.01),
    'transformer_nominal': hp.choice('transformer_nominal', ['TargetEncoder', 'JamesSteinEncoder']),
    'transformer_ordinal': hp.choice('transformer_ordinal', ['OneHotEncoder', 'OrdinalEncoder'])
}

def score(params, x=None, y=None):
    """Cross-validated loss of one hyper-parameter sample (hyperopt objective).

    Builds a pipeline of ordinal-feature transformer -> nominal-feature
    transformer -> XGBoost regressor and evaluates it with 10-fold
    cross-validation on negative mean squared error.

    Args:
        params: One sample of the search space. Must contain 'n_estimators',
            'max_depth', 'subsample', 'min_child_weight', 'gamma',
            'colsample_bytree', 'learning_rate', 'transformer_nominal' and
            'transformer_ordinal'. The dictionary is not modified.
        x: Feature DataFrame of the group. When omitted, the notebook-level
            ``x_group`` is used (legacy behaviour of ``score(params)``).
        y: Target values of the group. When omitted, the notebook-level
            ``y_group`` is used.

    Returns:
        dict: ``{'loss': <absolute mean CV score>, 'status': STATUS_OK}``.

    Note:
        ``nominal_features`` and ``ordinal_features`` are read from the
        notebook namespace.
    """
    # Backward-compatible fallback to the notebook globals
    if x is None:
        x = x_group
    if y is None:
        y = y_group

    # read without deleting: the caller's dictionary must stay intact
    num_round = int(params['n_estimators'])

    # preprocessing
    categorical_features = get_categorical_features(data=x)

    # preprocess nominals
    transformer_nominal = get_transformer(params['transformer_nominal'])
    transformer_nominal.cols = [
        feature for feature in categorical_features if feature in nominal_features
    ]

    # preprocess ordinals
    transformer_ordinal = get_transformer(params['transformer_ordinal'])
    transformer_ordinal.cols = [
        feature for feature in categorical_features if feature in ordinal_features
    ]

    gbm_model = xgboost.XGBRegressor(
        n_estimators=num_round,
        objective=partial(squared_error_objective_with_weighting, under_predict_weight=2.0),
        max_depth=params['max_depth'],
        subsample=params['subsample'],
        min_child_weight=params['min_child_weight'],
        gamma=params['gamma'],
        colsample_bytree=params['colsample_bytree'],
        learning_rate=params['learning_rate'],
        n_jobs=8,
        seed=1234,
        silent=True
    )

    pipeline = Pipeline([('transformer_ordinal', transformer_ordinal),
                         ('transformer_nominal', transformer_nominal),
                         ('estimator', gbm_model)])

    # random_state is only meaningful together with shuffle=True; passing it
    # to a non-shuffling KFold raises ValueError on scikit-learn >= 0.24.
    # The folds were never shuffled, so dropping it keeps the same splits.
    cv_scores = cross_val_score(pipeline, x, y, cv=KFold(n_splits=10),
                                scoring='neg_mean_squared_error')
    loss = np.abs(np.mean(cv_scores))
    return {'loss': loss, 'status': STATUS_OK}


def optimize(space, x_group, y_group, gp_key, max_evals=20):
    """Search ``space`` for the hyper-parameters with the lowest CV loss.

    Args:
        space: hyperopt search space.
        x_group: Feature DataFrame of the group being tuned.
        y_group: Target values of the group being tuned.
        gp_key: Name of the group; not used by the search itself, kept so
            existing callers do not break.
        max_evals: Number of hyperopt evaluations to run.

    Returns:
        tuple: ``(best parameters resolved with space_eval, Trials object)``.
    """
    trials = Trials()
    # bind the group data explicitly instead of relying on notebook globals
    objective = partial(score, x=x_group, y=y_group)
    best = fmin(fn=objective,
                space=space,
                algo=tpe.suggest,
                max_evals=max_evals,
                trials=trials)
    return space_eval(space, best), trials


In [12]:
groups = train_x.groupby(cat_feature)
params = {}
for gp_key, group in groups:
    print('Checking ' + gp_key + ' ...')
    # Keep only the selected features. selected_features holds sets, which are
    # not valid DataFrame indexers (TypeError on pandas >= 2.0); sorting also
    # makes the column order reproducible between kernel sessions.
    # The names x_group / y_group are kept because score() may read them from
    # the notebook namespace.
    x_group = group[sorted(selected_features[gp_key])]
    y_group = train_y.loc[x_group.index]
    # find best parameters for each model-group
    best_params, trials = optimize(space, x_group, y_group, gp_key)
    params[gp_key] = best_params

# in the end we keep params; a dictionary with keys the group names and values dictionaries of the selected hyperparameters

Checking A ...
100%|██████████| 20/20 [00:21<00:00,  1.08s/trial, best loss: 6873.547324469885]
Checking B ...
100%|██████████| 20/20 [00:43<00:00,  2.18s/trial, best loss: 9385.307973852461] 
Checking C ...
100%|██████████| 20/20 [00:23<00:00,  1.18s/trial, best loss: 4954.1840644192725]
Checking D ...
100%|██████████| 20/20 [00:29<00:00,  1.46s/trial, best loss: 24768.691316215692]


# Train and validate the model

In [13]:
# Initialize one multi-group model with the tuned parameters, fit it on the
# training split and predict both splits
model = MultiModel(
    group_col=cat_feature,
    clusters=clusters,
    params=params,
    selected_features=selected_features,
    nominals=nominal_features,
    ordinals=ordinal_features,
)
model.fit(train_x, train_y)
pred_train_y, pred_test_y = (model.predict(part) for part in (train_x, test_x))

Model for A trained
Model for B trained
Model for C trained
Model for D trained


In [14]:
# R^2 on both splits, rounded to two decimals
train_r2 = round(r2_score(train_y, pred_train_y), 2)
validation_r2 = round(r2_score(test_y, pred_test_y), 2)
print('Train performance {}'.format(train_r2))
print('Validation performance {}'.format(validation_r2))

Train performance 0.96
Validation performance 0.52


# Train final model and save

In [15]:
# Train with the whole dataset: stitch the train and test splits back together
# (ignore_index=True already yields a fresh 0..N-1 index)
total_x = pd.concat([train_x, test_x], axis=0, ignore_index=True)
total_y = pd.concat([train_y, test_y], axis=0, ignore_index=True)
del train_x, train_y, test_x, test_y

# Initialize a fresh, unfitted model with the tuned parameters
model = MultiModel(
    group_col=cat_feature,
    clusters=clusters,
    params=params,
    selected_features=selected_features,
    nominals=nominal_features,
    ordinals=ordinal_features,
)

In [16]:
# Wrap the multi-group model in a PredictionModel.
# Note: one_hot_encode=False is required to prevent one-hot encoding of
# categorical features in the input data.
prediction_model = PredictionModel("multi_model", path='./exported_models', one_hot_encode=False)
prediction_model.model = model

# Save the feature names (not strictly needed, since all the preprocessing is
# done inside the pipeline)
column_names = list(total_x.columns)
prediction_model.features = {name: [] for name in column_names}
prediction_model.target = {target: []}

prediction_model.ordered_column_list = sorted(column_names)

In [17]:
# Fit the wrapped MultiModel on the recombined dataset (total_x / total_y);
# the value returned by fit() is displayed as the cell output
prediction_model.model.fit(total_x, total_y)

Model for A trained
Model for B trained
Model for C trained
Model for D trained


MultiModel(clusters=array(['A', 'B', 'C', 'D'], dtype=object),
           group_col='category', nominals=['feature_0'], ordinals=['feature_1'],
           params={'A': {'colsample_bytree': 0.65, 'gamma': 0.75,
                         'learning_rate': 0.07, 'max_depth': 1,
                         'min_child_weight': 2.0, 'n_estimators': 70,
                         'subsample': 0.65,
                         'transformer_nominal': 'TargetEncoder',
                         'transformer_ordinal': 'OrdinalEncoder'},
                   'B':...
                                    'feature_16', 'feature_17', 'feature_19',
                                    'feature_2', 'feature_6', 'feature_7'},
                              'C': {'feature_0', 'feature_1', 'feature_12',
                                    'feature_13', 'feature_15', 'feature_16',
                                    'feature_17', 'feature_18', 'feature_3',
                                    'feature_4', 'feature_6', 'featu