In [37]:
from sklearn.model_selection import train_test_split, KFold
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
import pandas as pd
import numpy as np
import os
from imblearn.over_sampling import SMOTE
from sample import create_sampler
from tree_extractor import path_extractor

# Single seed shared by the random-forest parameters, the KFold splitter and SMOTE in ExpModel.
random_state = 126

class ExpModel:
    """Train and evaluate a random forest on one fold of a K-fold split.

    Typical use: call ``init()`` to load the data and build the splits, then
    ``train()`` and ``evaluate()`` for the current fold and ``next_fold()`` to
    advance. ``summary()`` averages every metric recorded by ``evaluate()``.
    """

    def __init__(self, dataset, model):
        """Store the experiment configuration; no data is loaded here.

        Args:
            dataset: Dataset identifier. ``init`` only implements 'german_credit'.
            model: Model identifier. ``init`` only implements 'RF'.
        """
        self.dataset = dataset
        self.model = model
        self.n_splits = 4
        # Metric history; evaluate() appends one value to each list per call.
        self._accuracy = []
        self._precision = []
        self._recall = []
        self._f1_score = []

    def init(self):
        """Load the dataset, one-hot encode it and prepare the K-fold splits.

        Sets ``target``, ``data_table``, ``X``, ``y``, ``parameters``,
        ``splits`` and resets ``fold`` to 0. Calling it again rebuilds all of
        these (including a fresh ``parameters`` dict) but leaves the metric
        history untouched.

        Raises:
            ValueError: if ``model`` is 'RF' but ``dataset`` is not
                'german_credit' (previously this surfaced as an
                UnboundLocalError on an unassigned local).
        """
        if self.model != 'RF':
            # Only the random-forest setup is implemented; any other model
            # name leaves the instance untouched, exactly as before.
            return
        if self.dataset != 'german_credit':
            raise ValueError(
                f"Unsupported dataset {self.dataset!r} for model 'RF'; "
                "only 'german_credit' is implemented"
            )

        self.target = 'credit_risk'
        parameters = {
            'random_state': random_state,
            'max_depth': 12,
            'n_estimators': 100,
            'max_leaf_nodes': 150,
            'min_samples_split': 6,
            'min_samples_leaf': 3,
            'bootstrap': True,
        }

        data_table = pd.read_csv('data/german.csv')
        qualitative_features = [
            'credit_history', 'purpose',
            'personal_status_sex', 'other_debtors',
            'property', 'other_installment_plans',
            'housing', 'job', 'people_liable', 'telephone',
            'foreign_worker'
        ]
        # Expand every qualitative column into one boolean indicator column
        # per distinct code, named '<feature> - <k>'. When the smallest code
        # is not 0 the suffix is the code minus one, so suffixes start at 0
        # for columns whose codes start at 1.
        for feature in qualitative_features:
            codes = data_table[feature].values
            unique_values = np.unique(codes)  # np.unique returns sorted values
            zero_based = int(unique_values[0]) == 0
            for value in unique_values:
                suffix = str(value) if zero_based else str(int(value) - 1)
                data_table[feature + ' - ' + suffix] = codes == value

        # The raw coded columns are replaced by their indicator columns.
        data_table = data_table.drop(columns=qualitative_features)

        X = data_table.drop(self.target, axis=1).values
        y = data_table[self.target].values

        self.data_table = data_table
        self.X = X
        self.y = y
        self.parameters = parameters

        kf = KFold(n_splits=self.n_splits, random_state=random_state, shuffle=True)
        # Materialise the (train_index, test_index) pairs so folds can be
        # addressed by position through self.fold.
        self.splits = list(kf.split(X))
        self.fold = 0

    def has_next_fold(self):
        """Return True while ``fold`` still indexes an entry of ``splits``."""
        return self.fold < len(self.splits)

    def next_fold(self):
        """Advance to the next fold (no bounds check; see ``has_next_fold``)."""
        self.fold += 1

    def train(self):
        """Fit a random forest on the current fold and predict its test part.

        The training part is resampled with SMOTE before fitting; the test
        part is left as is. Stores ``X_train``, ``y_train`` (after
        resampling), ``X_test``, ``y_test``, ``clf``, ``y_pred`` and
        ``features`` (the column names of ``data_table`` without the target).

        Raises:
            IndexError: if ``fold`` is past the last prepared split.
        """
        train_index, test_index = self.splits[self.fold]
        X_test = self.X[test_index]
        y_test = self.y[test_index]

        sm = SMOTE(random_state=random_state)
        X_train, y_train = sm.fit_resample(self.X[train_index], self.y[train_index])

        self.X_train = X_train
        self.X_test = X_test
        self.y_train = y_train
        self.y_test = y_test

        clf = RandomForestClassifier(**self.parameters)
        clf.fit(X_train, y_train)
        self.clf = clf
        self.y_pred = clf.predict(X_test)
        self.features = self.data_table.drop(self.target, axis=1).columns.to_list()

    def evaluate(self):
        """Print accuracy, precision, recall and F1 of the last ``train()``
        call and append each of them to the metric history."""
        _accuracy = accuracy_score(self.y_test, self.y_pred)
        _precision = precision_score(self.y_test, self.y_pred)
        _recall = recall_score(self.y_test, self.y_pred)
        _f1_score = f1_score(self.y_test, self.y_pred)
        print('Accuracy Score is', _accuracy)
        print('Precision is', _precision)
        print('Recall is', _recall)
        print('F1 Score is', _f1_score)
        self._accuracy.append(_accuracy)
        self._precision.append(_precision)
        self._recall.append(_recall)
        self._f1_score.append(_f1_score)

    def summary(self):
        """Return the mean (accuracy, precision, recall, f1) over all
        ``evaluate()`` calls so far, as a tuple of Python floats."""
        return (
            float(np.mean(self._accuracy)),
            float(np.mean(self._precision)),
            float(np.mean(self._recall)),
            float(np.mean(self._f1_score)),
        )

    def oversampling(self, rate = 2):
        """Draw ``len(X_train) * rate`` samples from a sampler built on ``X_train``.

        Every non-target column of ``data_table`` is flagged categorical when
        its dtype is object and continuous otherwise; no column is flagged
        integer. The flags are handed to the project's ``create_sampler``.

        Args:
            rate: Multiplier applied to the number of training rows.

        Returns:
            Whatever the sampler returned by ``create_sampler`` produces.
        """
        is_continuous = []
        is_categorical = []
        is_integer = []

        for feature in self.data_table.columns:
            if feature == self.target:
                continue
            categorical = self.data_table[feature].dtype == 'O'
            is_continuous.append(not categorical)
            is_categorical.append(categorical)
            is_integer.append(False)

        sampler = create_sampler(self.X_train, is_continuous, is_categorical, is_integer)
        return sampler(len(self.X_train) * rate)

    def generate_paths(self):
        """Extract paths from the fitted classifier, print their count and
        return them.

        For 'RF' the training data is passed along to ``path_extractor``; any
        other model name is handled through its 'lightgbm' mode.
        """
        if self.model == 'RF':
            paths = path_extractor(self.clf, 'random forest', (self.X_train, self.y_train))
        else:
            paths = path_extractor(self.clf, 'lightgbm')
        print('num of paths', len(paths))
        return paths


In [40]:
dataset = 'german_credit'
model = 'RF'
exp = ExpModel(dataset, model)
# Loading the CSV, encoding it and building the seeded splits does not depend
# on the swept hyper-parameter, so it is done once instead of per iteration.
exp.init()
# Sweep max_leaf_nodes over 80, 90, ..., 190; each run trains and scores the
# first fold only (the fold index is never advanced here).
for max_leaf_nodes in range(80, 200, 10):
    exp.parameters['max_leaf_nodes'] = max_leaf_nodes
    exp.train()
    paths = exp.generate_paths()
    exp.evaluate()


num of paths 8000
Accuracy Score is 0.788
Precision is 0.8291457286432161
Recall is 0.8967391304347826
F1 Score is 0.8616187989556136
num of paths 8985
Accuracy Score is 0.788
Precision is 0.8291457286432161
Recall is 0.8967391304347826
F1 Score is 0.8616187989556136
num of paths 9774
Accuracy Score is 0.792
Precision is 0.83
Recall is 0.9021739130434783
F1 Score is 0.8645833333333334
num of paths 10083
Accuracy Score is 0.796
Precision is 0.8308457711442786
Recall is 0.907608695652174
F1 Score is 0.8675324675324676
num of paths 10095
Accuracy Score is 0.796
Precision is 0.8308457711442786
Recall is 0.907608695652174
F1 Score is 0.8675324675324676
num of paths 10095
Accuracy Score is 0.796
Precision is 0.8308457711442786
Recall is 0.907608695652174
F1 Score is 0.8675324675324676
num of paths 10095
Accuracy Score is 0.796
Precision is 0.8308457711442786
Recall is 0.907608695652174
F1 Score is 0.8675324675324676
num of paths 10095
Accuracy Score is 0.796
Precision is 0.8308457711442786
R

In [1]:
import pickle

# NOTE: unpickling can execute arbitrary code; only load files this project wrote itself.
# The context manager closes the file handle, which the bare open() call never did.
with open('output/german1211.pkl', 'rb') as pkl_file:
    output_data = pickle.load(pkl_file)

[{'name': 'Duration of Credit (month)', 'range': [4, 72], 'importance': 0.05633576975957183, 'dtype': 'numeric'}, {'name': 'Credit Amount', 'range': [250, 18424], 'importance': 0.056286726533428595, 'dtype': 'numeric'}, {'name': 'Instalment per cent', 'range': [1, 4], 'importance': 0.023734298699895035, 'dtype': 'numeric'}, {'name': 'Duration in Current address', 'range': [1, 4], 'importance': 0.031969786276545294, 'dtype': 'numeric'}, {'name': 'Age (years)', 'range': [19, 75], 'importance': 0.03890911106561411, 'dtype': 'numeric'}, {'name': 'No of Credits at this Bank', 'range': [1, 4], 'importance': 0.012889319099541809, 'dtype': 'numeric'}, {'name': 'No of dependents', 'range': [1, 2], 'importance': 0.005498524800822959, 'dtype': 'numeric'}, {'name': 'Account Balance', 'range': [0, 4], 'importance': 0.23189206974256948, 'dtype': 'categoric'}, {'name': 'Payment Status of Previous Credit', 'range': [0, 5], 'importance': 0.0812119819629154, 'dtype': 'categoric'}, {'name': 'Purpose', 'r

In [5]:
# Short identifiers, assigned by position to the entries of output_data['features'].
names = [
    'duration', 'amount', 'installment_rate', 'present_residence', 'age',
    'number_credits', 'people_liable', 'status', 'credit_history', 'purpose',
    'savings', 'employment_duration', 'personal_status_sex', 'other_debtors',
    'property', 'other_installment_plans', 'housing', 'job', 'telephone',
    'foreign_worker',
]
feature_entries = output_data['features']
i = 0
while i < len(feature_entries):
    # Overwrite the stored display name with the short identifier.
    feature_entries[i]['name'] = names[i]
    i += 1

In [7]:
# Overwrite the pickle in place; the context manager guarantees the data is
# flushed and the handle closed, which the bare open() call left to chance.
with open('output/german1211.pkl', 'wb') as pkl_file:
    pickle.dump(output_data, pkl_file)