# In [2]:
import os
import pandas as pd
import shap
from catboost import CatBoostClassifier
from catboost import Pool
from catboost.utils import create_cd
from pathlib import Path

class Application:
    """Train a CatBoostClassifier from a CSV file and save the model.

    Typical use: construct the object, optionally call create_cd_file()
    to generate the CatBoost column-description file from the CSV header,
    then call learn() to fit the model and write it to the output
    directory.

    The CSV must have a header row. Column 0 is used as the label and
    columns 1..number_cat_futures are declared as categorical features.
    """

    # Class-level defaults; every one of them is overwritten in __init__.
    thread_count = 1
    number_cat_futures = 0
    csv_file_path = ""
    out_directory_path = ""
    model_file_name = ""
    cd_file_name = ""
    iterations = 0
    learning_rate = 0
    early_stopping_rounds = 0

    def __init__(
            self, thread_count, number_cat_futures, csv_file_path,
            out_directory_path, model_file_name, cd_file_name,
            iterations, learning_rate, early_stopping_rounds):
        """Store the settings and prepare the output directory.

        Args:
            thread_count: thread count passed to the CatBoost Pool loader.
            number_cat_futures: number of categorical feature columns,
                located right after the label column.
            csv_file_path: path to the training CSV file (must exist).
            out_directory_path: directory for the cd file, the model file
                and the "train" sub-directory; created if missing.
            model_file_name: file name of the saved model.
            cd_file_name: file name of the column-description file.
            iterations: value passed to CatBoostClassifier(iterations).
            learning_rate: value passed to CatBoostClassifier(learning_rate).
            early_stopping_rounds: value passed to
                CatBoostClassifier(early_stopping_rounds).

        Raises:
            Exception: if csv_file_path is not an existing file, or if
                out_directory_path cannot be created (the original
                OSError is attached as __cause__).
        """
        self.thread_count = thread_count
        self.number_cat_futures = number_cat_futures
        self.csv_file_path = csv_file_path
        self.out_directory_path = out_directory_path
        self.model_file_name = model_file_name
        self.cd_file_name = cd_file_name
        self.iterations = iterations
        self.learning_rate = learning_rate
        self.early_stopping_rounds = early_stopping_rounds

        if not Path(self.csv_file_path).is_file():
            raise Exception(
                "Not exiting csv file: {}"
                .format(self.csv_file_path))
        try:
            os.makedirs(out_directory_path, exist_ok = True)
        except OSError as error:
            # Chain the OS error so the real reason (permissions, a file
            # in the way, ...) stays visible to whoever handles this.
            raise Exception(
                "Can't create directory: {}"
                .format(out_directory_path)) from error

    def create_cd_file(self):
        """Write the CatBoost column-description file for the CSV.

        Only the header and the first data row of the CSV are read. The
        file is written to out_directory_path/cd_file_name.

        Raises:
            Exception: if number_cat_futures exceeds the number of
                feature columns (all columns except the label).
        """
        print("Start creating cd file")
        header_df = pd.read_csv(
            self.csv_file_path,
            nrows = 1)

        # Column 0 is the label, everything after it is a feature.
        number_futures = len(header_df.columns) - 1
        if number_futures < self.number_cat_futures:
            raise Exception(
                "number_cat_futures must be less than number futures: {}"
                .format(number_futures))

        # Map column index -> column name for every non-label column.
        feature_names = {
            index: name
            for index, name in enumerate(header_df.columns)
            if index != 0}

        create_cd(
            label = 0,
            cat_features = list(range(1, self.number_cat_futures + 1)),
            feature_names = feature_names,
            output_path = os.path.join(
                self.out_directory_path, self.cd_file_name))
        print("cd file successfully created")

    def learn(self):
        """Fit the classifier on the CSV data and save it to disk.

        Loads the CSV into a Pool using the column-description file at
        out_directory_path/cd_file_name, splits it into train and eval
        pools (eval_fraction = 0.2), fits a CatBoostClassifier, prints
        model information and feature importance, and saves the model in
        "cbm" format to out_directory_path/model_file_name.
        """
        print("Start learning...")
        pool = Pool(
            data = self.csv_file_path,
            delimiter = ',',
            column_description = os.path.join(
                self.out_directory_path,
                self.cd_file_name),
            has_header = True,
            thread_count = self.thread_count)
        print('Dataset shape: {}'.format(str(pool.shape)))
        print('Column names: {}\n'.format(pool.get_feature_names()))
        train_pool, eval_pool = pool.train_eval_split(
            has_time = None,
            is_classification = True,
            eval_fraction = 0.2,
            save_eval_pool = True)

        # Built with os.path.join like every other output path here, so
        # a trailing separator in out_directory_path does not matter.
        train_dir = os.path.join(self.out_directory_path, "train")
        model = CatBoostClassifier(
            #eval_metric = 'AUC',
            auto_class_weights='Balanced',
            iterations = self.iterations,
            learning_rate = self.learning_rate,
            early_stopping_rounds = self.early_stopping_rounds,
            loss_function = "Logloss",
            random_seed = 777,
            custom_loss = ['AUC', 'Accuracy', 'Precision'],
            train_dir = train_dir)
        model.fit(
            X = train_pool,
            eval_set = eval_pool,
            verbose = False,
            plot = True)

        print('Model is fitted: {}'.format(model.is_fitted()))
        print('Model params: {}'.format(model.get_params()))
        print('Tree count: {}'.format(model.tree_count_))
        print('Classes: {}'.format(model.classes_))

        feature_importance = model.get_feature_importance(prettified = True)
        print("\nFeature importance:\n")
        # Lift the pandas display limits so the whole table is printed.
        with pd.option_context('display.max_rows', None, 'display.max_columns', None):
            print(feature_importance)

        print("\nLearning is successfully done")

        model_file_path = os.path.join(
            self.out_directory_path, self.model_file_name)
        model.save_model(model_file_path, format = "cbm", pool = pool)
        print("Model save to file=" + model_file_path)

# Script entry point: configure the run, optionally generate the
# column-description file, then train and save the model.
if __name__ == "__main__":
    try:
        # Input data and output locations.
        csv_file_path = "/u03/ml/process_model_data.csv"
        out_directory_path = "/u03/ml"
        model_file_name = "model.bin"
        cd_file_name = "train.cd"

        # Dataset layout and runtime settings.
        number_cat_futures = 9
        thread_count = 15
        need_create_cd_file = True

        # Training hyper-parameters.
        iterations = 10000
        learning_rate = 0.01
        early_stopping_rounds = 20

        application = Application(
            thread_count = thread_count,
            number_cat_futures = number_cat_futures,
            csv_file_path = csv_file_path,
            out_directory_path = out_directory_path,
            model_file_name = model_file_name,
            cd_file_name = cd_file_name,
            iterations = iterations,
            learning_rate = learning_rate,
            early_stopping_rounds = early_stopping_rounds)

        # The cd file only needs regenerating when the CSV layout changes.
        if need_create_cd_file:
            application.create_cd_file()

        application.learn()

    except Exception as error:
        # Top-level boundary: report the failure instead of a traceback.
        print("ML application is failed. Reason: " + str(error))