In [1]:
from random import random
from sklearn.tree import DecisionTreeClassifier
import random
from functools import partial
import numpy as np
import itertools
from sklearn.metrics import roc_auc_score

random.seed(10)


parameters_range = {
    'criterion' : ("gini", "entropy", "log_loss"),
    'splitter' : ("best", "random"),
    'max_depth' : list(range(1, 15)) + list(range(15,25,2)) + list(range(25,51,5)),
    'min_samples_split' : list(np.arange(0.01,1,0.05)),
    'min_samples_leaf' : list(np.arange(0.01,1,0.01)),
    'min_weight_fraction_leaf' : list(np.arange(0.01,0.5,0.01)),
    'max_features' : ['sqrt', 'log2', None],
    'max_leaf_nodes' : list(range(20, 500, 10)),
    'min_impurity_decrease' : list(np.arange(0.02,5,0.02)),
    'class_weight' : [{0: 1, 1:w} for w in range(1,51)],
    'ccp_alpha' : list(np.arange(0.05,5,0.05))
}


In [2]:
class PopulationBasedTraining(object):
    """
    """


    def __init__(self, parameters_range, initial_population_size = 100, model = None):
        """
        """
        
        self.parameters_range = dict(parameters_range).copy(),
        self.parameters_range = self.parameters_range[0]
        self.population_size = initial_population_size
        self.model = model

    def spawn(self, ):
        """
        """
        
        population_parameters = []
        population = []
        
        for i in range(self.population_size):
            population_parameters.append({k: random.choice(v) for k,v in self.parameters_range.items()})

        for params in population_parameters:
            model_ = self.model
            population.append(model_(**params))

        return population, population_parameters

    @staticmethod
    def fit_single(model, X, y, sample_weight=None):
        """
        fit single model with provided X and y data
        """

        model.fit(X, y, sample_weight)

        return model


    def fit_many(self, models, X, y, sample_weight=None):
        """
        """

        fit_single_part = partial(self.fit_single, X=X, y=y, sample_weight=sample_weight)

        models_fitted = list(map(fit_single_part, models))

        return models_fitted


    @staticmethod
    def score_single(model, X, y, scoring_fcn, sample_weight = None):
        """
        score single model with provided model, X and y data
        scoring_fcn - sklearn.metrics.roc_auc_score
        """

        y_hat = model.predict(X)
        if sample_weight:
            score = scoring_fcn(
                y, 
                y_hat, 
                average = 'weighted',
                sample_weight=sample_weight
                )
        else:
            score = scoring_fcn(y, y_hat)

        return score


    def score_many(self, models, X, y, scoring_fcn, sample_weight = None):
        """
        
        """

        score_single_part = partial(self.score_single, X=X, y=y, scoring_fcn=scoring_fcn, sample_weight=sample_weight)

        scores = list(map(score_single_part, models))

        return scores


    @staticmethod
    def fitness(scores, top_k = 100):
        """
        """

        return sorted(range(len(scores)), key=lambda i: scores[i], reverse=True)[:top_k]


    def cross(self, parent1, parent2, dominance=0.5):
        """
        """

        dominance_size = int(np.ceil(dominance*len(self.parameters_range.keys())))
        params_parent1 = {k: parent1[k] for k in list(parent1)[:dominance_size]}
        params_parent2 = {k: parent2[k] for k in list(parent2)[dominance_size:]}
        
        child = params_parent1.copy()
        child.update(params_parent2)

        return child

    def mutate_single(self, model_params_dict):
        """
        """

        list_params = list(model_params_dict.keys())
        params_to_mutate = random.choices(list_params, k = random.randint(1, len(list_params)))
        
        model_params_dict = model_params_dict.copy()
        for param in params_to_mutate:
            model_params_dict[param] = random.choice(self.parameters_range[param])
        
        return model_params_dict


    def mutate_many(self, population):
        """
        """

        return list(map(self.mutate_single, population))


    def run(self, X, y, X_valid, y_valid, scoring_fcn, top_k, dominance, max_epochs, mutate_condition, max_score_th=0.95, sample_weight=None):
        """
        """

        epoch = 1
        best_scores = []
        best_models = {}

        # 1st spawn intial population and set epoch to 1
        population, population_parameters = self.spawn()

        # fit initial population
        fitted_models = self.fit_many(population, X, y, sample_weight=sample_weight)

        # score initial population
        population_scores = self.score_many(fitted_models, X_valid, y_valid, scoring_fcn, sample_weight=sample_weight)

        # best score from prev population
        best_model_idx = self.fitness(population_scores, top_k = 1)
        best_scores.append([population_scores[x] for x in best_model_idx])
        
        print(epoch)
        print('best score: ', best_scores[-1])
        
        best_models[epoch] = {
            'model': fitted_models[best_model_idx[-1]],
            'parameters': population_parameters[best_model_idx[-1]]
        }

        # fitness
        fitness_idx = self.fitness(population_scores, top_k = top_k)


        # select children and cross for children
        parents_models = [fitted_models[x] for x in fitness_idx]
        parents_params = [population_parameters[x] for x in fitness_idx]

        children = []

        for p1, p2 in list(itertools.product(parents_params, parents_params)):
            children.append(self.cross(p1, p2, dominance))
            children.append(self.cross(p2, p1, 1.0-dominance))
        
        
        population = []
        for child in children:
            population.append(self.model(**child))
        
        population_parameters = children.copy()
        fitted_models = self.fit_many(population, X, y, sample_weight=sample_weight)
        population_scores = self.score_many(fitted_models, X_valid, y_valid, scoring_fcn, sample_weight=sample_weight)
        
        # best score from prev population
        best_model_idx = self.fitness(population_scores, top_k = 1)
        best_scores.append([population_scores[x] for x in best_model_idx])

        epoch = 2
        print(epoch)
        print('best score: ', best_scores[-1])

        best_models[epoch] = {
            'model': fitted_models[best_model_idx[-1]],
            'parameters': population_parameters[best_model_idx[-1]]
        }

        # fitness
        fitness_idx = self.fitness(population_scores, top_k = top_k)

        not_stop = True

        while(not_stop):

            epoch += 1
            print(epoch)

            # select children and cross for children
            parents_models = [fitted_models[x] for x in fitness_idx]
            parents_params = [population_parameters[x] for x in fitness_idx]

            children = []

            for p1, p2 in list(itertools.product(parents_params, parents_params)):
                children.append(self.cross(p1, p2, dominance))
                children.append(self.cross(p2, p1, 1.0-dominance))
            
            # mutate children's population
            best_score_diff = best_scores[-1][-1] - best_scores[-2][-1]
            if best_score_diff < mutate_condition:
                children = self.mutate_many(children)
                
            population_parameters = []
            for child in children:
                population.append(self.model(**child))
                
            population_parameters = children.copy()
            fitted_models = self.fit_many(population, X, y, sample_weight=sample_weight)
            population_scores = self.score_many(fitted_models, X_valid, y_valid, scoring_fcn, sample_weight=sample_weight)
            
            # best score from prev population
            best_model_idx = self.fitness(population_scores, top_k = 1)
            best_scores.append([population_scores[x] for x in best_model_idx])
            
            best_scores_ = [x[-1] for x in best_scores]
            best_score = max(best_scores_)

            print('best score: ', best_scores[-1])

            best_models[epoch] = {
                'model': fitted_models[best_model_idx[-1]],
                'parameters': population_parameters[best_model_idx[-1]]
            }

            if epoch == max_epochs:
                not_stop = False
                return best_models

            if best_score >= max_score_th:
                non_stop = False
                return best_models

            

In [3]:
pbt = PopulationBasedTraining(parameters_range, initial_population_size = 100, model = DecisionTreeClassifier)

In [4]:
from sklearn.model_selection import train_test_split
import pandas as pd

df_ = pd.read_csv('SC_data.csv')

<jemalloc>: MADV_DONTNEED does not work (memset will be used instead)
<jemalloc>: (This is the expected behaviour if you are running under QEMU)


In [5]:
df_.head()

Unnamed: 0,id,FIRST_MOB,default_flag,ACC_ST_A_Dt_SUM,working_months_new,POSIT_INT_INST_MIN,TRX_CH_CD_PAYMENT_ORDER_MIN3,NKP_first_cd1,not_work_months_new,average_salary_last_3months,...,NKP_Code_Curr_cd1,Miscellaneous Stores_SUM12,ACC_ONE_CUST_CNT12,POSIT_PRINC_INST_MIN3,RUB_CD_IIN_TRX_CH_PAYMENT_ORDER_MAX12,CURR_CD_OTH_MAX12,CNT_ACC_ONE_CUST_CH_12,TRX_CH_CD_PAYMENT_ORDER_CNT_CH_12,TRX_CH_CD_E_CHANNEL_MIN_CH_6,Miscellaneous Stores_MIN_CH_6
0,0.0,2017-08-31,0.0,-0.261387,240.0,-0.476122,0.043286,5.0,-0.820406,-0.393088,...,4.0,-0.323575,-0.270938,-0.118119,-0.15012,999999.0,1.332474,0.69235,0.310368,0.454701
1,1.0,2017-08-31,0.0,0.329268,180.0,-0.672053,-0.144485,2.0,3.667009,1.916147,...,999999.0,0.539598,1.013528,-0.194047,0.113804,999999.0,1.332474,0.69235,0.603401,-2.384447
2,2.0,2017-08-31,0.0,0.090022,120.0,999999.0,0.173523,8.0,-0.820406,-0.139806,...,8.0,0.184667,0.628188,0.114569,-0.030529,-0.152129,1.332474,0.69235,0.603401,0.099843
3,3.0,2017-08-31,0.0,0.088147,180.0,3.395171,-0.150696,9.0,-0.597429,-0.861823,...,9.0,-0.024104,-0.270938,-0.076462,-0.212531,-0.104904,1.332474,0.952748,0.603401,0.454701
4,4.0,2017-08-31,0.0,-0.175203,240.0,2.342245,0.457788,1.0,-0.820406,0.617904,...,1.0,-0.323575,-0.270938,0.265457,999999.0,999999.0,1.332474,0.69235,0.603372,0.454701


In [6]:
x_columns = [x for x in df_.columns if 'FIRST_MOB' not in x]
y_col = ['default_flag']


In [7]:
X, X_valid, y, y_valid  = train_test_split(df_[x_columns], df_[y_col], test_size=0.33)



In [None]:
pbt.run(
    X = X,
    y = y,
    X_valid = X_valid,
    y_valid = y_valid,
    scoring_fcn = roc_auc_score,
    top_k = 100,
    dominance = 0.5,
    max_epochs = 10,
    mutate_condition = .05
)

1
best score:  [0.5]
