In [None]:
import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)

import numpy as np
import pandas as pd
import xgboost 
from hyperopt import STATUS_OK, Trials, fmin, hp, tpe, space_eval
from functools import partial
from vf_portalytics.tool import squared_error_objective_with_weighting

from sklearn.datasets import make_regression
from sklearn.metrics import mean_squared_error
from sklearn.feature_selection import SelectKBest, f_regression, mutual_info_regression, RFE
from sklearn.model_selection import cross_val_score, KFold, train_test_split
from sklearn.pipeline import Pipeline

import sys
sys.path.insert(0,'..')

from models.transformers import OneHotEncoder, potential_transformers
from models.grouped_model import Grouped_Model

In [None]:
def make_dataset(n_samples, n_features, n_informative, **kwargs):
    """Build a synthetic regression dataset as a (features, target) pair.

    The feature matrix comes from ``make_regression`` (noise 0.5, fixed
    ``random_state=0``) and its columns are named ``feature_0`` ...
    ``feature_<n_features - 1>``. Every extra keyword argument is attached to
    the feature frame as an additional column through ``DataFrame.assign``.

    Returns a tuple of the feature DataFrame and a Series named ``'target'``.
    """
    features, target = make_regression(
        n_samples=n_samples,
        n_features=n_features,
        n_informative=n_informative,
        noise=0.5,
        random_state=0,
    )
    column_names = ['feature_' + str(idx) for idx in range(n_features)]
    frame = pd.DataFrame(features, columns=column_names).assign(**kwargs)
    return frame, pd.Series(target, name='target')

# Generate data 

In [None]:
# Generate data for 4 different categories.
# Each category has its own number of samples, but the number of features is
# shared because all categories belong to the same dataset.
n_features = 20
x1, y1 = make_dataset(n_samples=100, n_features=n_features, n_informative=5, category='A')
x2, y2 = make_dataset(n_samples=150, n_features=n_features, n_informative=8, category='B')
x3, y3 = make_dataset(n_samples=80, n_features=n_features, n_informative=7, category='C')
x4, y4 = make_dataset(n_samples=120, n_features=n_features, n_informative=6, category='D')

# Stack the four categories into one dataset; ignore_index already yields a
# fresh 0..n-1 index for both the features and the target
total_x = pd.concat([x1, x2, x3, x4], axis=0, ignore_index=True)
total_y = pd.concat([y1, y2, y3, y4], axis=0, ignore_index=True)

# Turn the first two features into categoricals with three levels:
# below mean - std, within one std of the mean, and above mean + std
labels = [0, 1, 2]
bins = []
for i in range(2):
    column = total_x['feature_' + str(i)]
    center, spread = column.mean(), column.std()
    bins.append([-np.inf, center - spread, center + spread, column.max()])
total_x['feature_0'] = pd.cut(total_x['feature_0'], bins=bins[0], labels=labels)
total_x['feature_1'] = pd.cut(total_x['feature_1'], bins=bins[1], labels=labels)

### Declare group parameters

In [None]:
# Basic parameters shared by the rest of the notebook
target = 'target'
cat_feature = 'category'

# Every column except the grouping column is a candidate feature
feature_col_list = total_x.columns.drop([cat_feature])

# The distinct values of the grouping column, in order of first appearance
clusters = pd.unique(total_x[cat_feature])
clusters

In [None]:
# Hold out 33% of the rows (selected by index label) as the test set
train_index, test_index = train_test_split(total_x.index, test_size=0.33, random_state=5)
train_x = total_x.loc[train_index, :]
train_y = total_y.loc[train_index]
test_x = total_x.loc[test_index, :]
test_y = total_y.loc[test_index]

# The intermediate frames are not needed any more; remove their names
del x1, x2, x3, x4, y1, y2, y3, y4
del total_x, total_y

# Feature selection

In [None]:
def feature_selection(df_X, df_Y, features_len=1, step=1, max_features=20):
    """Fit a recursive feature elimination (RFE) on ``df_X`` / ``df_Y``.

    A light XGBoost regressor (50 trees, depth 5) with the weighted squared
    error objective is used as the estimator inside RFE.

    Parameters
    ----------
    df_X : feature data, passed unchanged to ``RFE.fit``.
    df_Y : target values, passed unchanged to ``RFE.fit``.
    features_len : int or 'all'
        Number of features RFE should keep. ``'all'`` and any number larger
        than ``max_features`` are replaced by ``max_features``.
    step : forwarded to the ``step`` argument of RFE.
    max_features : int
        Upper bound applied to ``features_len``.

    Returns
    -------
    The fitted RFE object.

    Raises
    ------
    ValueError
        If ``features_len`` is a string other than ``'all'``.
    """
    # 'all' used to reach the numeric comparison below and fail with a
    # TypeError on Python 3, so strings are resolved first.
    if isinstance(features_len, str):
        if features_len.lower() != 'all':
            raise ValueError(
                "features_len must be an integer or 'all', got %r" % (features_len,)
            )
        features_len = max_features
    elif features_len > max_features:
        features_len = max_features

    # we fit with a lighter but representative model
    model = xgboost.XGBRegressor(
        max_depth=5,
        n_estimators=50,
        learning_rate=0.1,
        subsample=0.8,
        n_jobs=6,
        objective=partial(squared_error_objective_with_weighting, under_predict_weight=2.0),
        seed=6789,
        silent=True
    )

    # create the RFE model and select the attributes
    rfe = RFE(model, n_features_to_select=features_len, step=step)
    rfe = rfe.fit(df_X, df_Y)
    return rfe

In [None]:
# Set your parameters; please be aware that the ranking is computed with RFE,
# which can be resource intensive when there are many features
max_features = train_x.shape[1] - 1  # the grouping column is dropped before fitting
features_len = 1  # number of features RFE keeps at the end of the elimination
step = 1  # x features to be dropped each step

groups = train_x.groupby(cat_feature)
feature_importances = {}
for gp_key, x_group in groups:
    print('Searching for Feature Ranking in ' + gp_key + '...')
    x_group = x_group.drop(cat_feature, axis=1)
    y_group = train_y.loc[x_group.index]
    # rank the features of this group; the settings above are passed through
    # instead of being silently replaced by hard-coded literals
    feature_importances[gp_key] = feature_selection(
        x_group, y_group, features_len=features_len, step=step, max_features=max_features
    )

### Check the ranking and manually decide which features to use

In [None]:
print('The features ordered by importance for each group: \n')
ordered_feautures = []
for cluster in clusters:
    # pair every feature with its RFE rank and sort by rank (best first)
    ranking = feature_importances[cluster].ranking_
    ranked_pairs = sorted(zip(ranking, feature_col_list))
    ordered_feautures.append(tuple(name for _rank, name in ranked_pairs))
# one row per group, columns are positions in the ranking
ordered_feautures_df = pd.DataFrame(ordered_feautures, index=clusters)
ordered_feautures_df

In [None]:
# Manually select features
# By default keep the 10 most important features of every group
n_selected = 10
selected_features = {}
for group_key, ranked_row in ordered_feautures_df.iterrows():
    # Positional slice on purpose: the label slice .loc[group_key, 0:10]
    # includes both ends and therefore returned 11 features instead of 10
    selected_features[group_key] = list(ranked_row.iloc[:n_selected].values)

# Change the features manually here:
# - discard features that are not going to be available in the future
# - discard features that are not important from the "business perspective"

# In the end selected_features is a dictionary whose keys are the group names
# and whose values are the lists of features that are going to be used

# Hyperparameter tuning

In [None]:
# A single search space shared by all groups; it can be specialised per group
# in the future if that turns out to be needed
n_estimators_grid = np.arange(100, 500, 50, dtype=int)
max_depth_grid = np.arange(1, 10, dtype=int)
space = dict(
    n_estimators=hp.choice('n_estimators', n_estimators_grid),
    max_depth=hp.choice('max_depth', max_depth_grid),
    subsample=hp.quniform('subsample', 0.8, 1.0, 0.05),
    min_child_weight=hp.quniform('min_child_weight', 1, 20, 1),
    gamma=hp.quniform('gamma', 0.5, 1, 0.05),
    colsample_bytree=hp.quniform('colsample_bytree', 0.5, 1, 0.05),
    learning_rate=hp.quniform('learning_rate', 0.025, 0.5, 0.025),
    transformer=hp.choice('transformer', ['OneHotEncoder']),
)

def _score_group(params, x_group, y_group):
    """Return the hyperopt result for one hyper-parameter sample on one group.

    Builds a pipeline of the transformer named by ``params['transformer']``
    followed by an XGBoost regressor configured from ``params`` and evaluates
    it with 5-fold cross-validation.

    Parameters
    ----------
    params : dict
        One sample of the search space. It is only read, never modified.
    x_group, y_group :
        Features and target of the group, passed to ``cross_val_score``.

    Returns
    -------
    dict
        ``{'loss': <mean squared error over the folds>, 'status': STATUS_OK}``.
    """
    # preprocess (the transformation is being done in the cross_val)
    transformer = potential_transformers.get(params['transformer'])

    gbm_model = xgboost.XGBRegressor(
        # read without deleting the key so the caller's dict stays intact
        n_estimators=int(params['n_estimators']),
        objective=partial(squared_error_objective_with_weighting, under_predict_weight=2.0),
        max_depth=params['max_depth'],
        subsample=params['subsample'],
        min_child_weight=params['min_child_weight'],
        gamma=params['gamma'],
        colsample_bytree=params['colsample_bytree'],
        learning_rate=params['learning_rate'],
        n_jobs=8,
        seed=1234,
        silent=True,
    )

    pipeline = Pipeline([('transformer', transformer), ('estimator', gbm_model)])

    # random_state only takes effect together with shuffle=True; recent
    # scikit-learn versions raise a ValueError when it is set without shuffling
    folds = KFold(n_splits=5, shuffle=True, random_state=9876)
    cv_scores = cross_val_score(pipeline, x_group, y_group, cv=folds,
                                scoring='neg_mean_squared_error')
    # the scorer returns negated errors; hyperopt minimises a positive loss
    loss = np.abs(np.mean(cv_scores))
    return {'loss': loss, 'status': STATUS_OK}


def score(params):
    """Score ``params`` against the notebook-level ``x_group`` / ``y_group``.

    Kept for callers that rely on the original one-argument form; ``optimize``
    binds the group data explicitly instead of going through these globals.
    """
    return _score_group(params, x_group, y_group)


def optimize(space, x_group, y_group, gp_key, max_evals=20):
    """Search ``space`` with TPE for the best hyper-parameters of one group.

    Parameters
    ----------
    space : hyperopt search space.
    x_group, y_group :
        Features and target of the group; they are bound to the objective so
        the search does not depend on notebook globals.
    gp_key :
        Name of the group. Currently unused, kept for the existing call sites.
    max_evals : int
        Number of hyperopt evaluations (default 20, the previous fixed value).

    Returns
    -------
    tuple
        The best parameters resolved through ``space_eval`` and the ``Trials``
        object that recorded the search.
    """
    trials = Trials()
    objective = partial(_score_group, x_group=x_group, y_group=y_group)
    best = fmin(fn=objective,
                space=space,
                algo=tpe.suggest,
                max_evals=max_evals,
                trials=trials)
    return space_eval(space, best), trials


In [None]:
groups = train_x.groupby(cat_feature)
params = {}
for gp_key, group in groups:
    print('Checking ' + gp_key + ' ...')
    # restrict the group to its most important features
    x_group = group.loc[:, selected_features[gp_key]]
    y_group = train_y.loc[x_group.index]
    # search the best hyper-parameters for this group's model
    params[gp_key], trials = optimize(space, x_group, y_group, gp_key)

# In the end params is a dictionary whose keys are the group names and whose
# values are dictionaries with the selected hyper-parameters

In [None]:
# Display the hyper-parameters that were selected for every group
params
# The features to be treated as categorical can be specified per group here, e.g.:
# params['A']['potential_cat_feat'] = set(['feature_1', 'feature_2'])

# Train the model and validate

In [None]:
# Initialize the grouped model with the tuned parameters and the selected
# features of every cluster
model = Grouped_Model(
    group_col=cat_feature,
    clusters=clusters,
    params=params,
    selected_features=selected_features,
)

# Fit on the training split, then predict the held-out split
model.fit(train_x, train_y)
pred_test_y = model.predict(test_x)

In [None]:
# Predictions next to the actual targets of the test split
comparison_columns = {'predicted': pred_test_y, 'actuals': test_y}
pd.DataFrame(comparison_columns)