In [1]:
from sklearn.preprocessing import LabelEncoder
import pandas as pd
import lightgbm as lgb
from sklearn.model_selection import train_test_split
from copy import deepcopy
from autogbm.metrics import autodl_auc, accuracy
from autogbm.utils.util import get_solution
from autogbm.auto_ingestion.dataset import AutoDLDataset
from autogbm.auto_ingestion import data_io
import os
from autogbm.auto_models.auto_tabular.utils.eda import AutoEDA

class Model(object):
    """Incremental LightGBM trainer that narrows its feature set after round one.

    The first call to :meth:`simple_lgb` trains on every column and records
    which features had positive gain; later calls train only on those columns
    with a growing (capped) number of boosting rounds.
    """

    def __init__(self, metadata):
        """Store dataset metadata and initialise training state.

        Args:
            metadata: Object exposing a ``metadata_`` attribute (in this file
                it is the value returned by ``AutoDLDataset.get_metadata()``).
        """
        self.done_training = False
        self.metadata = metadata

        self.metadata_info = metadata.metadata_
        # Number of times simple_lgb has been called so far.
        self.train_loop_num = 0

        self.auto_eda = AutoEDA()

        self.X = []
        self.Y = []

        self.pre_increament_preds = True

        self.X_test = None

        self.next_element = None

        # Bookkeeping dict; log_feat_importances stores 'imp_cols' in it.
        self.lgb_info = {}

        # Column labels selected by log_feat_importances (None until round one).
        self.imp_cols = None

        self.models = {}

        self.sample_cols = None

        # Optional list of column labels that is prepended to the first
        # importance-based selection (see log_feat_importances).
        self.unknow_cols = None

        self.first_preds = False

        # Booster trained in the first round; later rounds do not overwrite it.
        self.model = None

        self.keep_training_booster = False

    def simple_lgb(self, X, y, test_x):
        """Train a LightGBM model on ``(X, y)`` and predict on ``test_x``.

        Round one trains for 10 boosting rounds on all columns, keeps the
        booster on ``self.model`` and records the important columns. Every
        later round trains a fresh booster on ``self.imp_cols`` only, using
        ``min(40, 10 + 5 * train_loop_num)`` boosting rounds; that booster is
        local and is not stored on the instance.

        Args:
            X: Training features. Must support column selection with the
                labels in ``self.imp_cols`` (``X[self.imp_cols]``).
            y: Training labels, passed straight to ``lgb.Dataset``.
            test_x: Features to predict on; indexed the same way as ``X``.

        Returns:
            Whatever ``Booster.predict`` returns for the test features
            (presumably per-class probabilities for the ``multiclass``
            objective -- confirm against the LightGBM documentation).
        """
        self.params = {
            "boosting_type": "gbdt",
            "objective": "multiclass",
            'num_class': 2,
            "metric": "multi_logloss",
            "verbosity": -1,
            "seed": 2020,
            "num_threads": 4,
        }

        self.hyperparams = {
            'num_leaves': 31,
            'max_depth': -1,
            'min_child_samples': 20,
            'max_bin': 110,
            'subsample': 1,
            'subsample_freq': 1,
            'colsample_bytree': 0.8,
            'min_child_weight': 0.001,
            'min_split_gain': 0.02,
            'reg_alpha': 0.1,
            'reg_lambda': 0.1,
            "learning_rate": 0.1
        }

        train_params = {**self.params, **self.hyperparams}
        self.train_loop_num += 1

        if self.train_loop_num == 1:
            # First round: use every column so feature importances can be
            # measured, then remember which columns mattered.
            lgb_train = lgb.Dataset(X, y)
            self.model = lgb.train(train_params, train_set=lgb_train, num_boost_round=10)
            preds = self.model.predict(test_x)
            self.log_feat_importances()
        else:
            # Later rounds: restrict to the selected columns and grow the
            # number of boosting rounds with the round counter, capped at 40.
            lgb_train = lgb.Dataset(X[self.imp_cols], y)

            num_boost_round = min(40, 10 + self.train_loop_num * 5)

            model = lgb.train(train_params, train_set=lgb_train, num_boost_round=num_boost_round)
            preds = model.predict(test_x[self.imp_cols])

        return preds

    def log_feat_importances(self):
        """Select the features of ``self.model`` that have positive gain.

        Feature names reported by the booster are converted with ``int`` and
        stored, ordered by descending gain, in ``self.imp_cols`` and in
        ``self.lgb_info['imp_cols']``. On the first selection (``imp_cols`` is
        still ``None``) the entries of ``self.unknow_cols`` are prepended when
        that list is set.

        If the resulting selection would be empty (no feature has positive
        gain and nothing was prepended), all of the booster's features are
        kept instead, so that a later training round is never asked to train
        on zero columns.

        Raises:
            ValueError: If a feature name cannot be converted with ``int``.
        """
        feature_names = list(self.model.feature_name())
        importances = pd.DataFrame({
            'features': feature_names,
            'importances': self.model.feature_importance("gain"),
        })
        importances = importances.sort_values('importances', ascending=False)
        importances = importances[importances['importances'] > 0]

        selected = [int(col) for col in importances['features'].values]
        if self.imp_cols is None and self.unknow_cols is not None:
            selected = self.unknow_cols + selected

        if not selected:
            # Nothing scored above zero: fall back to every feature rather
            # than handing an empty column list to the next round.
            selected = [int(col) for col in feature_names]

        self.imp_cols = selected
        self.lgb_info['imp_cols'] = self.imp_cols

# ========
# Load the semicolon-delimited bank-marketing CSV.
df = pd.read_csv("example_data/bank-additional-full.csv", sep=";")

# Columns that are integer-encoded below; the target "y" is one of them.
trans_cols = [
    "job",
    "marital",
    "education",
    "default",
    "housing",
    "loan",
    "contact",
    "month",
    "day_of_week",
    "poutcome",
    "y",
]

# Replace each listed column with its label-encoded integer codes.
for col in trans_cols:
    lbe = LabelEncoder()
    encoded_values = lbe.fit_transform(df[col])
    df[col] = encoded_values

# Independent copies of the target column and of the remaining columns.
label = df["y"].copy(deep=True)
data = df.drop(columns="y").copy(deep=True)

# Split on plain arrays re-wrapped in pandas objects, so the resulting
# frames carry positional integer column labels instead of the CSV names.
train_data, test_data, train_label, test_label = train_test_split(
    pd.DataFrame(data.values),
    pd.Series(label.values),
    test_size=0.3,
    random_state=1024,
)
print(train_data.shape, test_data.shape)
# =====
# Locate the formatted dataset and build the model from its metadata.
formatted_dir = "formatted_data"
new_dataset_dir = formatted_dir + "/" + os.path.basename(formatted_dir)
datanames = data_io.inventory_data(new_dataset_dir)
basename = datanames[0]

D_train = AutoDLDataset(os.path.join(new_dataset_dir, basename, "train"))

model = Model(D_train.get_metadata())
# NOTE(review): the solution is read from the formatted dataset directory,
# while predictions are made on the CSV-derived test split -- confirm both
# describe the same rows in the same order.
solution = get_solution(solution_dir=new_dataset_dir)

# Give the training split a clean 0..n-1 index once, so the positional
# slices taken in the loop select the first `sample_num` rows.
train_data = train_data.reset_index(drop=True)
train_label = train_label.reset_index(drop=True)
n_train_rows = train_data.shape[0]

# Sample sizes for the first rounds; afterwards the size grows by
# 500 * 2**(i - 2) per round.
warmup_sample_sizes = (500, 1000, 2000, 3000)

for i in range(50):
    if i < len(warmup_sample_sizes):
        sample_num = warmup_sample_sizes[i]
    else:
        sample_num += 500 * 2 ** (i - 2)

    # Once the schedule asks for more rows than exist, train one last time
    # on the full training split and stop.
    final_round = sample_num > n_train_rows
    if final_round:
        lgb_data = train_data.copy()
        lgb_label = train_label.copy()
    else:
        lgb_data = train_data.iloc[:sample_num].copy()
        lgb_label = train_label.iloc[:sample_num].copy()

    y_pred = model.simple_lgb(lgb_data, lgb_label, test_data)
    nauc_score = autodl_auc(solution=solution, prediction=y_pred)
    acc_score = accuracy(solution=solution, prediction=y_pred)
    print("Epoch={}, evaluation: nauc_score={}, acc_score={}".format(i, nauc_score, acc_score))

    if final_round:
        break

Using TensorFlow backend.


(28831, 20) (12357, 20)
Epoch=0, evaluation: nauc_score=0.8315221260669037, acc_score=0.8544452880443759
Epoch=1, evaluation: nauc_score=0.8528100243330181, acc_score=0.8706310375066253
Epoch=2, evaluation: nauc_score=0.8613518352393794, acc_score=0.8755758319352994
Epoch=3, evaluation: nauc_score=0.8698446147245253, acc_score=0.8811991538658159
Epoch=4, evaluation: nauc_score=0.8746073792154596, acc_score=0.8820709692433889
Epoch=5, evaluation: nauc_score=0.8810370776950558, acc_score=0.8827393702387096
Epoch=6, evaluation: nauc_score=0.887419558244358, acc_score=0.8833887290043773
Epoch=7, evaluation: nauc_score=0.8917304948304763, acc_score=0.8839226411610627
