## 0. Import lib and param

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
import sklearn
import scipy.sparse 

for p in [np, pd, sklearn, scipy]:
    print (p.__name__, p.__version__)

numpy 1.21.4
pandas 1.3.4
sklearn 1.0.1
scipy 1.7.2


In [3]:
##load dataset
file = "data/processed_data"

train_info_path = file + "/train_info.csv"
train_work_path = file + "/train_work.csv"
train_label_path = "data/label_train.csv"

test_info_path = file + "/test_info.csv"
test_work_path = file + "/test_work.csv"
test_label_path =  "data/label_test.csv"


## data frame
train_info = pd.read_csv(train_info_path)
train_work = pd.read_csv(train_work_path)
train_label = pd.read_csv(train_label_path)

test_info = pd.read_csv(test_info_path)
test_work = pd.read_csv(test_work_path)
test_label = pd.read_csv(test_label_path)

In [4]:
train_info.shape, train_work.shape, train_label.shape

((27502, 5), (247559, 19), (27502, 2))

In [5]:
# info: id_bh,	bithYear,	gender,	new_province_id
# label: id_bd, label
# work: main:id_bh , new_work_province_id ,  employee_lv , 
## year_from_date , year_to_date ,  month_from_date, month_to_date 
## num_year_contract, num_month_contract   
## company_type,   id_management ,id_office ,  job_role_encode_knn                                                                                       

## cach xu ly:
## 1. group by thep id_bh o work 
## 2. merge voi info va label

## 1. Segment text - job/role filled None (in train_work)

In [6]:
train_work.shape, test_work.shape

((247559, 19), (162283, 19))

In [7]:
train_work = pd.read_csv("data/processed_data/segment_train_work.csv")
test_work = pd.read_csv("data/processed_data/segment_test_work.csv")
train_work.shape, test_work.shape

((247559, 19), (162283, 19))

## 2. Extract text features (phoBERT)

## 3. Extract text features (TFIDF)

In [8]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.pipeline import Pipeline
from sklearn.decomposition import TruncatedSVD

vectorizer = TfidfVectorizer(
    ngram_range = (1,2),
        min_df=5, 
    max_df= 0.8, 
    max_features=1000,
    sublinear_tf=True
)
## TF-IDF + SVD
clf = Pipeline([
                ('tfidf', vectorizer ),
                ('svd', TruncatedSVD(n_components = 128, random_state=42)),
                ])
job_role_train = train_work['job_role_fillNan'].values
job_role_test = test_work['job_role_fillNan'].values
# clf.fit_transform(job_role_train)

In [9]:
# train_job_role_tfidf = clf.transform(job_role_train)
# test_job_role_tfidf = clf.transform(job_role_test)

- Group by thanh list => chuyen thanh text khong => tf-idf

## 4. Tao DF de co the train, test

In [10]:
def concat_data(df, df_add, col='id_bh', how='left'):
    df = pd.merge(df, df_add, on=col, how=how)
    return df

In [11]:
use_cosl_train  = ["id_bh" , "new_work_province_id" ,  "employee_lv" , 
"year_from_date" , "year_to_date" ,  "month_from_date", "month_to_date" ,
"num_year_contract", "num_month_contract"   , 
# "job_n_word",
"company_type",   "id_management" ,"id_office" ,  "job_role_encode_knn", 'job_role_fillNan',
"bithYear", "gender", "new_province_id", "label"  ] 
# train
group_train_work = train_work.groupby('id_bh').agg(lambda x: x.tolist())
train = concat_data(group_train_work, train_info, col='id_bh', how='left')
train = concat_data(train, train_label, col='id_bh', how='left')
train = train[use_cosl_train]
# test
group_test_work = test_work.groupby('id_bh').agg(lambda x: x.tolist())
test = concat_data(group_test_work, test_info, col='id_bh', how='left')
test = test[use_cosl_train[:-1]]

In [12]:
def  list2str(list):
  str =""
  for i in list:
    str += (i +" ")
  return str
train['job_role_fillNan_sentence'] = train.apply(lambda row: list2str(row.job_role_fillNan), axis = 1 )
test['job_role_fillNan_sentence'] = test.apply(lambda row: list2str(row.job_role_fillNan), axis = 1 )



In [13]:
clf.fit(train['job_role_fillNan_sentence'].values)
train['job_role_TFIDF'] = list(clf.transform(train['job_role_fillNan_sentence'].values))
test['job_role_TFIDF'] = list(clf.transform(test['job_role_fillNan_sentence'].values))


In [14]:
## create col form numpy arr in jobrole tfidf
for i in range(128):
  train['job_role_TFIDF_' + str(i)] = train['job_role_TFIDF'].apply(lambda x: float(x[i]))
  test['job_role_TFIDF_' + str(i)] = test['job_role_TFIDF'].apply(lambda x: float(x[i]))

  train['job_role_TFIDF_' + str(i)] = train['job_role_TFIDF'].apply(lambda x: float(x[i]))
  test['job_role_TFIDF_' + str(i)] = test['job_role_TFIDF'].apply(lambda x: float(x[i]))


In [15]:
## export 
# train.to_csv(r"data/processed_data/train_df_addTFIDFjob.csv")
# test.to_csv(r"data/processed_data/test_df_addTFIDFjob.csv")

## load lai train, test

In [16]:
# ## reload train, test
# train = pd.read_csv('data/processed_data/train_df_addTFIDFjob.csv')
# test = pd.read_csv('data/processed_data/test_df_addTFIDFjob.csv')

In [17]:
train.shape, test.shape

((27502, 148), (18134, 147))

In [18]:
train.select_dtypes(exclude=[np.number]).columns

Index(['new_work_province_id', 'employee_lv', 'year_from_date', 'year_to_date',
       'month_from_date', 'month_to_date', 'num_year_contract',
       'num_month_contract', 'company_type', 'id_management', 'id_office',
       'job_role_encode_knn', 'job_role_fillNan', 'job_role_fillNan_sentence',
       'job_role_TFIDF'],
      dtype='object')

- **NOTE: con feature job_role_TFIDF: khi group by co the dung lay mean**

## 5. Feature engineering

In [19]:
# 2. Feature engineering
## category list: company_type,	id_management,	id_office,	job_role_encode_knn ==> max frequent, last, count unique
### new_work_province_id: max frequnt, last , count unique

### new_work_province_id, id_office, company_type, id_management


## number:
### employee_lv bo gia tri -1: max, min (bo -1), mode(bo -1), std, mean, gia tri (max-min), count unique
### year_from_date: max, min, std, mode, mean, count unique
### year_to_date: max, min, std, mode, mean, count unique
### month_from_date, month_to_date:  max, min, std, mode, mean
### num_year_contract: max, min, mode, mean, std, q(75), q(25), count unique
### num_month_contract: max, min, mode, mean, std, q(75), q(25), count unique
## create new fea:



In [20]:
from scipy.stats import skew, kurtosis
data = np.random.normal(0, 1, 100)

print("mean : ", np.mean(data))
print("var  : ", np.var(data))
print("skew : ",skew(data))
print("kurt : ",kurtosis(data))

mean :  0.10379776654016885
var  :  1.160246445012187
skew :  -0.15389336944287954
kurt :  0.0769915069507845


In [21]:
from scipy.stats import skew, kurtosis
def most_frequent(List):
    return max(set(List), key = List.count)
def mean_out_nan(list):
    if (-1 in list):
        list.remove(-1)
        return sum(list) / float(len(list))
    return sum(list) / float(len(list))
def bining_olds(x):
    if (x < 0):
        return -999
    elif(0< x < 20 ):
        return 0
    elif( 20<= x < 30 ):
        return 1
    elif (30<= x < 40):
        return 2
    elif (40<= x < 50):
        return 3
    elif (50<= x < 60):
        return 4
    return 5

List = [2, 1, 2, 1, 3, - 1]
# print(count_num_jobs_outNan( List))

In [133]:
import statistics
from statistics import mode
category_cols = [
    "company_type",	"id_management",	"id_office",	"job_role_encode_knn" ,"new_work_province_id"
]
numeric_cols = [
        "employee_lv" , 
        "year_from_date",
        "year_to_date",
        "month_from_date",
         "month_to_date",
        "num_year_contract",
        "num_month_contract",
        # "job_n_word"
]
## utils func for category var
def max_frequent(list):
    if (len(list) == 0):
        return -999
    return(most_frequent(list))
def max_frequent_without_filled_var(list):
    if( -999 in list):
        list.remove(-999)
    if(-1 in list):
        list.remove(-1)
    if(-1.0 in list):
        list.remove(-1.0)
    a_set = set(list)
    if ((len(list) == 0) or (len(a_set) == len(list)) ):
        return -999
    return(most_frequent(list))
def last_value(list):
    if (len(list) == 0):
        return -999
    return list[-1]
def count_unique(list):
    return len(set(list))
def count_unique_without_filled_var(list):
    if( -999 in list):
        list.remove(-999)
    if(-1 in list):
        list.remove(-1)
    return len(set(list))

## utils fun for numeric features: max, min, mode, mean, std, q(75), q(25), count unique
def percentile_25(x):
    return x.quantile(.25)
def percentile_75(x):
    return x.quantile(.75)  
def min_without_filled_var(list):
    if(-1 in list):
        list.remove(-1)
        if (len(list) == 0):
            return -1
        else: 
            return min(list)
    return min(list)
def std(list):
    if (len(list) < 2):
            return -999
    if(-1 in list):
        list.remove(-1)
        if (len(list) < 2):
            return -999
        else: 
            return statistics.stdev(list)
    return statistics.stdev(list)
def mean_out_nan(list):
    if (-1 in list):
        list.remove(-1)
        return sum(list) / float(len(list))

    return (sum(list) / float(len(list)))
def count_num_jobs_outNan(list):
    if (-1 in list):
        list.remove(-1)
        return len(list)
    return len(list)
######################
def features_engineering(df):
    ## categorical cols
    for col in category_cols:
        df['category_max_fre_' + col] = df[col].apply(lambda x: max_frequent(x) ) ## category
        df['category_max_fre_out_filled_var_' + col] = df[col].apply(lambda x: max_frequent_without_filled_var(x) ) ## category
        df['category_last_value_' + col] = df[col].apply(lambda x: last_value(x) ) ## category
        df['numeric_count_unique_' + col] = df[col].apply(lambda x: count_unique(x) ) ## numeric
        df['numeric_count_unique_without_filled_var' + col] = df[col].apply(lambda x: count_unique_without_filled_var(x) ) ## numeric
    ## count features support cate cols:


    ## numeric cols:
    for col in numeric_cols:
        ## chi co employee_lv la co fill nan
        if (col == 'employee_lv'):
            df['numeric_max_lv_' + 'of_'+ col] = df[col].apply(lambda x: max(x) )
            df['numeric_min_lv_' + 'of_'+ col] = df[col].apply(lambda x: min_without_filled_var(x) )
            df['numeric_mode_' + 'of_'+ col] = df[col].apply(lambda x: max_frequent(x) ) 
            df['numeric_mode_out_filled_var_' + 'of_'+ col] = df[col].apply(lambda x: max_frequent_without_filled_var(x) ) 
            df['numeric_mean_' + 'of_'+ col] = df[col].apply(lambda x: sum(x) / float(len(x)) ) 
            df['numeric_mean_out_nan_' + 'of_'+ col] = df[col].apply(lambda x:  mean_out_nan(x))
            df['numeric_std_' + 'of_'+ col] = df[col].apply(lambda x: std(x) )
            df['numeric_count_unique_' + 'of_'+ col]  = df[col].apply(lambda x: count_unique(x) ) ## numeric
            df['numeric_count_unique_out_fillNan_' + 'of_'+ col]  = df[col].apply(lambda x: count_unique_without_filled_var(x) ) ## numeric
            df['numeric_Q25_' + 'of_'+ col ] = df[col].apply(lambda x: np.quantile(x, .25) )
            df['numeric_Q75_' + 'of_'+ col ] = df[col].apply(lambda x: np.quantile(x, .75) )

            df['numeric_skew_' + 'of_'+ col ] = df[col].apply(lambda x: skew(x) )
            df['numeric_kurtosis_' + 'of_'+ col ] = df[col].apply(lambda x: kurtosis(x) )
        else:
            df['numeric_max_' + 'of_'+ col] = df[col].apply(lambda x: max(x) )
            df['numeric_min_' + 'of_'+ col] = df[col].apply(lambda x: min_without_filled_var(x) )
            df['numeric_mode_' + 'of_'+ col] = df[col].apply(lambda x: max_frequent(x) )  
            df['numeric_mean_' + 'of_'+ col] = df[col].apply(lambda x: sum(x) / float(len(x)) ) 
            df['numeric_std_' + 'of_'+ col] = df[col].apply(lambda x: std(x) )
            df['numeric_count_unique_' + 'of_'+ col]  = df[col].apply(lambda x: count_unique(x) ) ## numeric
            df['numeric_Q25_' + 'of_'+ col ] = df[col].apply(lambda x: np.quantile(x, .25) )
            df['numeric_Q75_' + 'of_'+ col ] = df[col].apply(lambda x: np.quantile(x, .75) )
            df['numeric_skew_' + 'of_'+ col ] = df[col].apply(lambda x: skew(x) )
            df['numeric_kurtosis_' + 'of_'+ col ] = df[col].apply(lambda x: kurtosis(x) )
            
    ## create_new_fea between 2 features:(year_from_date, year_to_date) with bithYear
    df['numeric_year_diff_max_year_from_vs_bithYear'] = df['numeric_max_of_year_from_date'] - df['bithYear']
    df['numeric_year_diff_min_year_from_vs_bithYear'] = df['numeric_min_of_year_from_date'] - df['bithYear']
    df['numeric_year_diff_max_year_to_vs_bithYear'] = df['numeric_max_of_year_to_date'] - df['bithYear']
    df['numeric_year_diff_min_year_to_vs_bithYear'] = df['numeric_min_of_year_to_date'] - df['bithYear']

    ## old:
    df['numeric_olds'] = 2022 - df['bithYear']
    ## ratio: num_old vs diff_min_year_from_vs_bithYear
    df['numeric_ratio_olds_vs_(diff_min_year_from_vs_bithYear)'] = df['numeric_olds'] / df['numeric_year_diff_min_year_from_vs_bithYear']
    df['numeric_ratio_olds_vs_(diff_min_year_from_vs_bithYear)'] = df['numeric_ratio_olds_vs_(diff_min_year_from_vs_bithYear)'].apply(lambda x: x if(x> 0) else -999)

    ## dem so lan xuat hien trong bang: 
    df['numeric_count_job'] = df['employee_lv'].apply(lambda x: len(x))
    df['numeric_count_job_withoutNan'] = df['employee_lv'].apply(lambda x: count_num_jobs_outNan(x))

    ## fix value in create_new_fea cols
    new_cols = [col for col in df.columns if 'numeric_year_diff_' in col]
    for col in new_cols:
        df[col] = df[col].apply(lambda x: x if(x> 0) else -999)


    # new_category_cols.append('new_province_id')
      #### phase 2:
      #### category_last_value_ se lay lam dai hien cho category list: =>>  "category_last_value_company_type",	
      ####  "category_last_value_id_management",	"category_last_value_id_office",	"category_last_value_job_role_encode_knn" ,"category_last_value_new_work_province_id"
  
    for col in [
                'numeric_mean_of_num_month_contract' ,
                'numeric_count_job',
                'numeric_count_job_withoutNan',
                'numeric_mean_out_nan_of_employee_lv', 
                 'numeric_mean_of_employee_lv', 
                'numeric_olds'
    ]:  
      ### wordAdd - companyType
      temp =  df.groupby(['category_last_value_new_work_province_id', 
                          'category_last_value_company_type'])[col].agg(['min', 'max', 'mean', 'median','var', percentile_25, percentile_75 ]).rename(
                          {
                              'min': 'agg_'+ col +'_'+ 'workAddress_companyTye' +'_min',
                              'max': 'agg_'+ col + '_'+ 'workAddress_companyTye' +'_max',
                              'mean': 'agg_'+ col + '_'+ 'workAddress_companyTye' +'_mean',
                              'median': 'agg_'+ col + '_'+ 'workAddress_companyTye' +'_median',
                              'var': 'agg_'+ col + '_'+ 'workAddress_companyTye' + '_var',
                              'percentile_25': 'agg_'+ col + '_'+ 'workAddress_companyTye' + '_P25',
                              'percentile_75' : 'agg_'+ col + '_'+ 'workAddress_companyTye' + '_P75',
                          }, axis = 1
                      )
      df = pd.merge(df,temp,on= ['category_last_value_new_work_province_id',
                          'category_last_value_company_type'] ,how='left')
      del temp
      ## companyType - id_management
      temp =  df.groupby(['category_last_value_company_type',
                          'category_last_value_id_management'])[col].agg(['min', 'max', 'mean', 'median','var', percentile_25, percentile_75 ]).rename(
                          {
                              'min': 'agg_'+ col +'_'+ 'companyTye_idManagement' +'_min',
                              'max': 'agg_'+ col + '_'+ 'companyTye_idManagement' +'_max',
                              'mean': 'agg_'+ col + '_'+ 'companyTye_idManagement' +'_mean',
                              'median': 'agg_'+ col + '_'+ 'companyTye_idManagement' +'_median',
                              'var': 'agg_'+ col + '_'+ 'companyTye_idManagement' + '_var',
                              'percentile_25': 'agg_'+ col + '_'+ 'companyTye_idManagement' + '_P25',
                              'percentile_75' : 'agg_'+ col + '_'+ 'companyTye_idManagement' + '_P75',
                          }, axis = 1
                      )
      df = pd.merge(df,temp,on= ['category_last_value_company_type',
                          'category_last_value_id_management'] ,how='left')
      del temp
      ## companyType - job_role
      temp =  df.groupby(['category_last_value_company_type',
                          'category_last_value_job_role_encode_knn'])[col].agg(['min', 'max', 'mean', 'median','var', percentile_25, percentile_75 ]).rename(
                          {
                              'min': 'agg_'+ col +'_'+ 'companyTye_jobRole' +'_min',
                              'max': 'agg_'+ col + '_'+ 'companyTye_jobRole' +'_max',
                              'mean': 'agg_'+ col + '_'+ 'companyTye_jobRole' +'_mean',
                              'median': 'agg_'+ col + '_'+ 'companyTye_jobRole' +'_median',
                              'var': 'agg_'+ col + '_'+ 'companyTye_jobRole' + '_var',
                              'percentile_25': 'agg_'+ col + '_'+ 'companyTye_jobRole' + '_P25',
                              'percentile_75' : 'agg_'+ col + '_'+ 'companyTye_jobRole' + '_P75',
                          }, axis = 1
                      )
      df = pd.merge(df,temp,on= ['category_last_value_company_type',
                          'category_last_value_job_role_encode_knn'] ,how='left')
      del temp
      ## companyType - job_role - id_management
      temp =  df.groupby(['category_last_value_company_type',
                          'category_last_value_job_role_encode_knn',
                          'category_last_value_id_management'])[col].agg(['min', 'max', 'mean', 'median','var', percentile_25, percentile_75 ]).rename(
                          {
                              'min': 'agg_'+ col +'_'+ 'companyTye_jobRole_idManagement' +'_min',
                              'max': 'agg_'+ col + '_'+ 'companyTye_jobRole_idManagement' +'_max',
                              'mean': 'agg_'+ col + '_'+ 'companyTye_jobRole_idManagement' +'_mean',
                              'median': 'agg_'+ col + '_'+ 'companyTye_jobRole_idManagement' +'_median',
                              'var': 'agg_'+ col + '_'+ 'companyTye_jobRole_idManagement' + '_var',
                              'percentile_25': 'agg_'+ col + '_'+ 'companyTye_jobRole_idManagement' + '_P25',
                              'percentile_75' : 'agg_'+ col + '_'+ 'companyTye_jobRole_idManagement' + '_P75',
                          }, axis = 1
                      )
      df = pd.merge(df,temp,on= ['category_last_value_company_type',
                          'category_last_value_job_role_encode_knn',
                          'category_last_value_id_management'] ,how='left')       

      del temp
    ## create bining features: olds: numeric_olds
    df['category_bining_olds'] = df['numeric_olds'].apply(lambda row: bining_olds(row))
    ## check HN - 24, TPHCM - 58, Da Nang - 15, HAI PHONG - 27, CAN THO - 13
    df['numeric_is_in_bigCity'] = df['category_max_fre_new_work_province_id'].apply(lambda x: 1 if (x in [24, 58, 15, 13, 27]) else 0)
    df['numeric_is_in_bigCity_upToDate'] = df['category_last_value_new_work_province_id'].apply(lambda x: 1 if (x in [24, 58, 15, 13, 27]) else 0)
    

    ## count features
    ## count encoding
    for col in [
        "category_last_value_new_work_province_id",
        "category_last_value_company_type",
        "category_last_value_id_management",
        "category_last_value_job_role_encode_knn",
 

        "category_max_fre_new_work_province_id",
        "category_max_fre_company_type",
        "category_max_fre_id_management",
        "category_max_fre_job_role_encode_knn"]:
        temp = df[col].value_counts().to_dict()
        df['numeric_ce_'+col ] = df[col].map(temp)
        del temp
    df.fillna(-999, inplace=True)
    return df
  

In [131]:
tf_idf_cols = [col for col in train.columns if 'job_role_TFIDF_' in col]
col_with_out_tf_idf_cols = [col for col in train.columns if col not in tf_idf_cols]
## remove cols won't be used
col_with_out_tf_idf_cols.remove('job_role_fillNan')
col_with_out_tf_idf_cols.remove('job_role_fillNan_sentence')
col_with_out_tf_idf_cols.remove('job_role_TFIDF')
col_with_out_tf_idf_cols.remove('label')

label_train = train['label'].values
len(tf_idf_cols), len(col_with_out_tf_idf_cols), len(train.columns)

(128, 16, 148)

In [134]:
%%time
fe_train_with_out_tfidf = features_engineering(train[col_with_out_tf_idf_cols])
fe_test_with_out_tfidf = features_engineering(test[col_with_out_tf_idf_cols])

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['category_max_fre_' + col] = df[col].apply(lambda x: max_frequent(x) ) ## category
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['category_max_fre_out_filled_var_' + col] = df[col].apply(lambda x: max_frequent_without_filled_var(x) ) ## category
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
 

Wall time: 4min 41s


In [135]:
fe_train = pd.concat([fe_train_with_out_tfidf, train[tf_idf_cols], train['label'] ], axis = 1)
fe_test = pd.concat([fe_test_with_out_tfidf, test[tf_idf_cols] ], axis = 1)


In [136]:
numeric_ml_useful_cols = [col for col in fe_train.columns if 'numeric_' in col] + tf_idf_cols
category_ml_useful_cols = [col for col in fe_train.columns if (('category_' in col) and ('numeric_ce_' not in col) )]
old_cols = ["bithYear",	"gender","new_province_id", "id_bh", "label"]
# agg_cols = 
ml_useful_cols_train = numeric_ml_useful_cols + category_ml_useful_cols + old_cols
ml_useful_cols_test = numeric_ml_useful_cols + category_ml_useful_cols + old_cols[:-1]
len(ml_useful_cols_train), len(ml_useful_cols_test)

(418, 417)

In [137]:
len(set(ml_useful_cols_train))

418

## 6. Test baseline

In [138]:
category_cols = category_ml_useful_cols + [ "new_province_id"]

In [139]:
category_cols 

['category_max_fre_company_type',
 'category_max_fre_out_filled_var_company_type',
 'category_last_value_company_type',
 'category_max_fre_id_management',
 'category_max_fre_out_filled_var_id_management',
 'category_last_value_id_management',
 'category_max_fre_id_office',
 'category_max_fre_out_filled_var_id_office',
 'category_last_value_id_office',
 'category_max_fre_job_role_encode_knn',
 'category_max_fre_out_filled_var_job_role_encode_knn',
 'category_last_value_job_role_encode_knn',
 'category_max_fre_new_work_province_id',
 'category_max_fre_out_filled_var_new_work_province_id',
 'category_last_value_new_work_province_id',
 'category_bining_olds',
 'new_province_id']

In [140]:
df_train = fe_train[ml_useful_cols_train]
df_test = fe_test[ml_useful_cols_test]

In [141]:
df_train.shape, df_test.shape

((27502, 418), (18134, 417))

In [105]:
from lightgbm import LGBMClassifier as lgb
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import f1_score

In [154]:
skf = StratifiedKFold(n_splits= 5, random_state= 42, shuffle= True)
X_train = df_train.drop(columns = ['label', 'id_bh']) 
X_train[category_cols] = X_train[category_cols].astype('category')
y_train = df_train['label']
params_k = {
            'boosting_type': 'gbdt',
            'objective': 'multiclass',
            'num_class':7,
            'categorical_feature': category_cols,
            'verbose' : -1,
            "random_seed": 42,
            # "bagging_fraction": 0.7
}


In [155]:
X_train.shape

(27502, 416)

## Features selection

In [88]:
from sklearn.model_selection import GroupKFold, cross_val_score
from sklearn.ensemble import HistGradientBoostingRegressor, HistGradientBoostingClassifier
from sklearn.feature_selection import SequentialFeatureSelector
from matplotlib.ticker import MaxNLocator

## CROSS VALIDATE

In [157]:
## Kfold
cv_score = []
training_score = []
feature_importance_df =  pd.DataFrame()
for n_fold, (train_idx, valid_idx) in enumerate(skf.split(X_train, y_train)):
    print("Fold %s" % (n_fold))
    train_x, train_y = X_train.iloc[train_idx], y_train.iloc[train_idx]
    valid_x, valid_y = X_train.iloc[valid_idx], y_train.iloc[valid_idx]

    clf = lgb(
        **params_k
    )
    clf.fit(train_x, train_y, verbose= False)
    score_cv = f1_score(valid_y, clf.predict(valid_x),  average='macro')
    score_training = f1_score(train_y, clf.predict(train_x),  average='macro')

    # For create feature importances
    fold_importance_df = pd.DataFrame()
    fold_importance_df["feature"] = X_train.columns
    fold_importance_df["importance"] = clf.feature_importances_
    fold_importance_df["fold"] = i + 1
    feature_importance_df = pd.concat([feature_importance_df, fold_importance_df], axis=0)

    print('F1_SCORE_CV: ', round(score_cv, 4), '\n')
    print('F1_SCORE_TRAINING: ', round(score_training, 4), '\n')

    cv_score.append(score_cv)
    training_score.append(score_training)

Fold 0


Please use categorical_feature argument of the Dataset constructor to pass this parameter.


F1_SCORE_CV:  0.7872 

F1_SCORE_TRAINING:  1.0 

Fold 1


Please use categorical_feature argument of the Dataset constructor to pass this parameter.


F1_SCORE_CV:  0.7471 

F1_SCORE_TRAINING:  1.0 

Fold 2


Please use categorical_feature argument of the Dataset constructor to pass this parameter.


F1_SCORE_CV:  0.8122 

F1_SCORE_TRAINING:  1.0 

Fold 3


Please use categorical_feature argument of the Dataset constructor to pass this parameter.


F1_SCORE_CV:  0.8012 

F1_SCORE_TRAINING:  1.0 

Fold 4


Please use categorical_feature argument of the Dataset constructor to pass this parameter.


F1_SCORE_CV:  0.7898 

F1_SCORE_TRAINING:  1.0 



In [158]:
##  - 0.8049874090219371 
print("CV SCORE: {} - std: {}".format(sum(cv_score)/ 5,std(cv_score) ))
print("TRAINING SCORE: {} - std: {}".format(sum(training_score)/ 5,std(training_score) ))

CV SCORE: 0.7875084698209586 - std: 0.024681535781621092
TRAINING SCORE: 0.9999959189404791 - std: 9.125526508854801e-06


In [159]:
def display_importances(feature_importance_df_, top_k):

    cols = feature_importance_df_[["feature", "importance"]].groupby("feature").mean().sort_values(
        by="importance", ascending=False)[:top_k].index
    
    # best_features = feature_importance_df_.loc[feature_importance_df_.feature.isin(cols)]
    
    # plt.figure(figsize=(12, 8))
    # sns.barplot(x="importance", y="feature", 
    #             data=best_features.sort_values(by="importance", ascending=False))
    # plt.title('LightGBM Features (avg over folds)')
    # plt.tight_layout()
    return cols
# top_fea_50 = display_importances(feature_importance_df_=feature_importance_df, top_k = 20)
# top_fea_100 = display_importances(feature_importance_df_=feature_importance_df, top_k = 100)
# top_fea_150 = display_importances(feature_importance_df_=feature_importance_df, top_k = 150)
top_fea_200 = display_importances(feature_importance_df_=feature_importance_df, top_k = 200)
top_fea_250 = display_importances(feature_importance_df_=feature_importance_df, top_k = 250)
top_fea_300 = display_importances(feature_importance_df_=feature_importance_df, top_k = 300)

In [161]:
## top 250 cv - 0.7978353846662485 - std: 0.018426308739477273 1st
## top 200 cv - 0.7937366078480538 - std: 0.021380086178773374
## top 150 cv - 0.7899814071413578 - std: 0.03008877391310114
## top 100 cv - 0.7802258450609931 - std: 0.02259313462464178
## top 300 cv - 0.7930353865893962 - std: 0.026677523879470658
top_fea_50_cols_list =  list(top_fea_250)
cv_score = []
training_score = []
X_train_useful_cols = X_train[top_fea_50_cols_list]
feature_importance_df =  pd.DataFrame()
for n_fold, (train_idx, valid_idx) in enumerate(skf.split(X_train_useful_cols, y_train)):
    print("Fold %s" % (n_fold))
    train_x, train_y = X_train_useful_cols.iloc[train_idx], y_train.iloc[train_idx]
    valid_x, valid_y = X_train_useful_cols.iloc[valid_idx], y_train.iloc[valid_idx]

    clf = lgb(
        **params_k
    )
    clf.fit(train_x, train_y, verbose= False)
    score_cv = f1_score(valid_y, clf.predict(valid_x),  average='macro')
    score_training = f1_score(train_y, clf.predict(train_x),  average='macro')
    print('F1_SCORE_CV: ', round(score_cv, 4), '\n')
    print('F1_SCORE_TRAINING: ', round(score_training, 4), '\n')

    cv_score.append(score_cv)
    training_score.append(score_training)
print("CV SCORE: {} - std: {}".format(sum(cv_score)/ 5,std(cv_score) ))
print("TRAINING SCORE: {} - std: {}".format(sum(training_score)/ 5,std(training_score) ))

Fold 0


Please use categorical_feature argument of the Dataset constructor to pass this parameter.


F1_SCORE_CV:  0.7917 

F1_SCORE_TRAINING:  1.0 

Fold 1


Please use categorical_feature argument of the Dataset constructor to pass this parameter.


F1_SCORE_CV:  0.776 

F1_SCORE_TRAINING:  1.0 

Fold 2


Please use categorical_feature argument of the Dataset constructor to pass this parameter.


F1_SCORE_CV:  0.8233 

F1_SCORE_TRAINING:  1.0 

Fold 3


Please use categorical_feature argument of the Dataset constructor to pass this parameter.


F1_SCORE_CV:  0.8089 

F1_SCORE_TRAINING:  1.0 

Fold 4


Please use categorical_feature argument of the Dataset constructor to pass this parameter.


F1_SCORE_CV:  0.7893 

F1_SCORE_TRAINING:  1.0 

CV SCORE: 0.7978353846662485 - std: 0.018426308739477273
TRAINING SCORE: 0.9999836758985203 - std: 1.7072051926739718e-05


In [162]:
## select top 250 features important

## 7. Predict

### predict for SMOTE data

In [None]:
# submit_file.head()
# # test_label.head()

In [None]:
# submit_file.to_csv("data/submmit/baseline_lgb_addAggFeaPhase2.csv", index=False)

In [None]:
# submit_file['label'].value_counts()

## 8. OPTUNA TUNE

In [163]:
selected_features = list(top_fea_250)
X_train = df_train.drop(columns = ['label', 'id_bh']) 
X_train[category_cols] = X_train[category_cols].astype('category')
X_train = X_train[selected_features]

In [164]:
import optuna
def objective(trial):
    # Specify a search space using distributions across plausible values of hyperparameters.
    param = {
        'boosting_type': 'gbdt',
        'objective': 'multiclass',
        'num_class':7,
        'categorical_feature': category_cols,       
        "seed": 42,
        # 'lambda_l1': trial.suggest_float('lambda_l1', 1e-8, 10.0, log=True),
        # 'lambda_l2': trial.suggest_float('lambda_l2', 1e-8, 10.0, log=True),
        'num_leaves': trial.suggest_int('num_leaves', 2, 12),
        'n_estimators': trial.suggest_int('n_estimators', 100, 1200),
        'learning_rate': trial.suggest_float('learning_rate', 1e-3, 1),

        'feature_fraction': trial.suggest_float('feature_fraction', 0.4, 1.0),
        'bagging_fraction': trial.suggest_float('bagging_fraction', 0.4, 1.0),
        # 'bagging_freq': trial.suggest_int('bagging_freq', 1, 7),
        # 'min_child_samples': trial.suggest_int('min_child_samples', 5, 100)
    }
    
    # Run LightGBM for the hyperparameter values
    lgbcv = lgb(**param,
                #    categorical_feature=ids_of_categorical,              
                                  
                  )
    cv_score = []
    for n_fold, (train_idx, valid_idx) in enumerate(skf.split(X_train , y_train)):
        # print("Fold %s" % (n_fold))
        train_x, train_y = X_train .iloc[train_idx], y_train.iloc[train_idx]
        valid_x, valid_y = X_train .iloc[valid_idx], y_train.iloc[valid_idx]

        clf = lgbcv
        clf.fit(train_x, train_y, verbose= False)
        score = f1_score(valid_y, clf.predict(valid_x),  average='macro')
        print('F1_SCORE: ', round(score, 4), '\n')
        cv_score.append(score)
    
    return sum(cv_score)/len(cv_score)

study = optuna.create_study(direction="maximize")
study.optimize(objective, n_trials=50) 

[32m[I 2022-04-18 22:06:30,172][0m A new study created in memory with name: no-name-fa7446fa-507c-44a1-98af-6489a65a4f5c[0m
Please use categorical_feature argument of the Dataset constructor to pass this parameter.


F1_SCORE:  0.4047 



Please use categorical_feature argument of the Dataset constructor to pass this parameter.


F1_SCORE:  0.3313 



Please use categorical_feature argument of the Dataset constructor to pass this parameter.


F1_SCORE:  0.3658 



Please use categorical_feature argument of the Dataset constructor to pass this parameter.


F1_SCORE:  0.3592 



Please use categorical_feature argument of the Dataset constructor to pass this parameter.
[32m[I 2022-04-18 22:08:55,113][0m Trial 0 finished with value: 0.33262117543310177 and parameters: {'num_leaves': 10, 'n_estimators': 1046, 'learning_rate': 0.7353735947192181, 'feature_fraction': 0.9727895077873797, 'bagging_fraction': 0.5920311821910044}. Best is trial 0 with value: 0.33262117543310177.[0m


F1_SCORE:  0.2021 



Please use categorical_feature argument of the Dataset constructor to pass this parameter.


F1_SCORE:  0.157 



Please use categorical_feature argument of the Dataset constructor to pass this parameter.


F1_SCORE:  0.1475 



Please use categorical_feature argument of the Dataset constructor to pass this parameter.


F1_SCORE:  0.0706 



Please use categorical_feature argument of the Dataset constructor to pass this parameter.


F1_SCORE:  0.103 



Please use categorical_feature argument of the Dataset constructor to pass this parameter.
[32m[I 2022-04-18 22:10:39,760][0m Trial 1 finished with value: 0.12004875000289832 and parameters: {'num_leaves': 2, 'n_estimators': 1155, 'learning_rate': 0.9073965196712113, 'feature_fraction': 0.5582995687477756, 'bagging_fraction': 0.6046236603596633}. Best is trial 0 with value: 0.33262117543310177.[0m


F1_SCORE:  0.1221 



Please use categorical_feature argument of the Dataset constructor to pass this parameter.


F1_SCORE:  0.3906 



Please use categorical_feature argument of the Dataset constructor to pass this parameter.


F1_SCORE:  0.4321 



Please use categorical_feature argument of the Dataset constructor to pass this parameter.


F1_SCORE:  0.1777 



Please use categorical_feature argument of the Dataset constructor to pass this parameter.


F1_SCORE:  0.1828 



Please use categorical_feature argument of the Dataset constructor to pass this parameter.
[32m[I 2022-04-18 22:12:38,717][0m Trial 2 finished with value: 0.3185454186962356 and parameters: {'num_leaves': 12, 'n_estimators': 653, 'learning_rate': 0.46521297746389184, 'feature_fraction': 0.9787114047811016, 'bagging_fraction': 0.9983118436765506}. Best is trial 0 with value: 0.33262117543310177.[0m


F1_SCORE:  0.4095 



Please use categorical_feature argument of the Dataset constructor to pass this parameter.


F1_SCORE:  0.3662 



Please use categorical_feature argument of the Dataset constructor to pass this parameter.


F1_SCORE:  0.2728 



Please use categorical_feature argument of the Dataset constructor to pass this parameter.


F1_SCORE:  0.1666 



Please use categorical_feature argument of the Dataset constructor to pass this parameter.


F1_SCORE:  0.289 



Please use categorical_feature argument of the Dataset constructor to pass this parameter.
[32m[I 2022-04-18 22:15:11,171][0m Trial 3 finished with value: 0.26913777576365594 and parameters: {'num_leaves': 7, 'n_estimators': 1189, 'learning_rate': 0.9071834265198645, 'feature_fraction': 0.8628857854819365, 'bagging_fraction': 0.9190980235403359}. Best is trial 0 with value: 0.33262117543310177.[0m


F1_SCORE:  0.2512 



Please use categorical_feature argument of the Dataset constructor to pass this parameter.


F1_SCORE:  0.5645 



Please use categorical_feature argument of the Dataset constructor to pass this parameter.


F1_SCORE:  0.5415 



Please use categorical_feature argument of the Dataset constructor to pass this parameter.


F1_SCORE:  0.4008 



Please use categorical_feature argument of the Dataset constructor to pass this parameter.


F1_SCORE:  0.4468 



Please use categorical_feature argument of the Dataset constructor to pass this parameter.
[32m[I 2022-04-18 22:15:55,070][0m Trial 4 finished with value: 0.47037929142793544 and parameters: {'num_leaves': 7, 'n_estimators': 211, 'learning_rate': 0.45094121006277416, 'feature_fraction': 0.844477638025565, 'bagging_fraction': 0.588475626412928}. Best is trial 4 with value: 0.47037929142793544.[0m


F1_SCORE:  0.3983 



Please use categorical_feature argument of the Dataset constructor to pass this parameter.


F1_SCORE:  0.8033 



Please use categorical_feature argument of the Dataset constructor to pass this parameter.


F1_SCORE:  0.7802 



Please use categorical_feature argument of the Dataset constructor to pass this parameter.


F1_SCORE:  0.838 



Please use categorical_feature argument of the Dataset constructor to pass this parameter.


F1_SCORE:  0.8055 



Please use categorical_feature argument of the Dataset constructor to pass this parameter.
[32m[I 2022-04-18 22:18:09,296][0m Trial 5 finished with value: 0.8040849795380097 and parameters: {'num_leaves': 10, 'n_estimators': 621, 'learning_rate': 0.055094909017779053, 'feature_fraction': 0.5490638193721177, 'bagging_fraction': 0.9255868731888846}. Best is trial 5 with value: 0.8040849795380097.[0m


F1_SCORE:  0.7934 



Please use categorical_feature argument of the Dataset constructor to pass this parameter.


F1_SCORE:  0.3913 



Please use categorical_feature argument of the Dataset constructor to pass this parameter.


F1_SCORE:  0.2998 



Please use categorical_feature argument of the Dataset constructor to pass this parameter.


F1_SCORE:  0.3285 



Please use categorical_feature argument of the Dataset constructor to pass this parameter.


F1_SCORE:  0.3222 



Please use categorical_feature argument of the Dataset constructor to pass this parameter.
[32m[I 2022-04-18 22:20:31,326][0m Trial 6 finished with value: 0.3284742198118729 and parameters: {'num_leaves': 8, 'n_estimators': 1118, 'learning_rate': 0.331436764167221, 'feature_fraction': 0.7044685335822753, 'bagging_fraction': 0.5392404885139344}. Best is trial 5 with value: 0.8040849795380097.[0m


F1_SCORE:  0.3005 



Please use categorical_feature argument of the Dataset constructor to pass this parameter.


F1_SCORE:  0.2152 


Please use categorical_feature argument of the Dataset constructor to pass this parameter.



F1_SCORE:  0.2923 



Please use categorical_feature argument of the Dataset constructor to pass this parameter.


F1_SCORE:  0.1407 



Please use categorical_feature argument of the Dataset constructor to pass this parameter.


F1_SCORE:  0.4499 



Please use categorical_feature argument of the Dataset constructor to pass this parameter.
[32m[I 2022-04-18 22:22:22,942][0m Trial 7 finished with value: 0.3116839959150476 and parameters: {'num_leaves': 6, 'n_estimators': 897, 'learning_rate': 0.368634106120184, 'feature_fraction': 0.6352150561720549, 'bagging_fraction': 0.802251757729795}. Best is trial 5 with value: 0.8040849795380097.[0m


F1_SCORE:  0.4603 



Please use categorical_feature argument of the Dataset constructor to pass this parameter.


F1_SCORE:  0.4298 



Please use categorical_feature argument of the Dataset constructor to pass this parameter.


F1_SCORE:  0.2591 



Please use categorical_feature argument of the Dataset constructor to pass this parameter.


F1_SCORE:  0.3824 



Please use categorical_feature argument of the Dataset constructor to pass this parameter.


F1_SCORE:  0.2022 



Please use categorical_feature argument of the Dataset constructor to pass this parameter.
[32m[I 2022-04-18 22:23:59,949][0m Trial 8 finished with value: 0.31190596328949516 and parameters: {'num_leaves': 12, 'n_estimators': 645, 'learning_rate': 0.7698340495925156, 'feature_fraction': 0.8612585350183969, 'bagging_fraction': 0.5300221485988336}. Best is trial 5 with value: 0.8040849795380097.[0m


F1_SCORE:  0.286 



Please use categorical_feature argument of the Dataset constructor to pass this parameter.


F1_SCORE:  0.1234 



Please use categorical_feature argument of the Dataset constructor to pass this parameter.


F1_SCORE:  0.1843 



Please use categorical_feature argument of the Dataset constructor to pass this parameter.


F1_SCORE:  0.4419 



Please use categorical_feature argument of the Dataset constructor to pass this parameter.


F1_SCORE:  0.2998 



Please use categorical_feature argument of the Dataset constructor to pass this parameter.
[32m[I 2022-04-18 22:26:47,773][0m Trial 9 finished with value: 0.2592794718638352 and parameters: {'num_leaves': 3, 'n_estimators': 1060, 'learning_rate': 0.536231563809302, 'feature_fraction': 0.9350641868920488, 'bagging_fraction': 0.734198997280791}. Best is trial 5 with value: 0.8040849795380097.[0m


F1_SCORE:  0.2469 



Please use categorical_feature argument of the Dataset constructor to pass this parameter.


F1_SCORE:  0.7493 



Please use categorical_feature argument of the Dataset constructor to pass this parameter.


F1_SCORE:  0.7148 



Please use categorical_feature argument of the Dataset constructor to pass this parameter.


F1_SCORE:  0.7956 



Please use categorical_feature argument of the Dataset constructor to pass this parameter.


F1_SCORE:  0.7621 



Please use categorical_feature argument of the Dataset constructor to pass this parameter.
[32m[I 2022-04-18 22:27:43,364][0m Trial 10 finished with value: 0.7568227463099036 and parameters: {'num_leaves': 10, 'n_estimators': 275, 'learning_rate': 0.011624348885723002, 'feature_fraction': 0.41283369645198736, 'bagging_fraction': 0.8503853043060997}. Best is trial 5 with value: 0.8040849795380097.[0m


F1_SCORE:  0.7622 



Please use categorical_feature argument of the Dataset constructor to pass this parameter.


F1_SCORE:  0.7722 



Please use categorical_feature argument of the Dataset constructor to pass this parameter.


F1_SCORE:  0.7457 



Please use categorical_feature argument of the Dataset constructor to pass this parameter.


F1_SCORE:  0.7973 



Please use categorical_feature argument of the Dataset constructor to pass this parameter.


F1_SCORE:  0.7855 



Please use categorical_feature argument of the Dataset constructor to pass this parameter.
[32m[I 2022-04-18 22:28:14,935][0m Trial 11 finished with value: 0.7779433233702003 and parameters: {'num_leaves': 10, 'n_estimators': 162, 'learning_rate': 0.0316159594449965, 'feature_fraction': 0.42253170498181075, 'bagging_fraction': 0.8534093875412327}. Best is trial 5 with value: 0.8040849795380097.[0m


F1_SCORE:  0.789 



Please use categorical_feature argument of the Dataset constructor to pass this parameter.


F1_SCORE:  0.7958 



Please use categorical_feature argument of the Dataset constructor to pass this parameter.


F1_SCORE:  0.7472 



Please use categorical_feature argument of the Dataset constructor to pass this parameter.


F1_SCORE:  0.8312 



Please use categorical_feature argument of the Dataset constructor to pass this parameter.


F1_SCORE:  0.7628 



Please use categorical_feature argument of the Dataset constructor to pass this parameter.
[32m[I 2022-04-18 22:29:24,270][0m Trial 12 finished with value: 0.7810893746521329 and parameters: {'num_leaves': 10, 'n_estimators': 433, 'learning_rate': 0.092598047270813, 'feature_fraction': 0.4395585973522201, 'bagging_fraction': 0.9179845292701804}. Best is trial 5 with value: 0.8040849795380097.[0m


F1_SCORE:  0.7684 



Please use categorical_feature argument of the Dataset constructor to pass this parameter.


F1_SCORE:  0.7618 



Please use categorical_feature argument of the Dataset constructor to pass this parameter.


F1_SCORE:  0.5908 



Please use categorical_feature argument of the Dataset constructor to pass this parameter.


F1_SCORE:  0.6126 



Please use categorical_feature argument of the Dataset constructor to pass this parameter.


F1_SCORE:  0.5562 



Please use categorical_feature argument of the Dataset constructor to pass this parameter.
[32m[I 2022-04-18 22:30:33,715][0m Trial 13 finished with value: 0.6136076125781801 and parameters: {'num_leaves': 9, 'n_estimators': 415, 'learning_rate': 0.18169063102646357, 'feature_fraction': 0.5248635257434766, 'bagging_fraction': 0.983038746867819}. Best is trial 5 with value: 0.8040849795380097.[0m


F1_SCORE:  0.5465 



Please use categorical_feature argument of the Dataset constructor to pass this parameter.


F1_SCORE:  0.6638 



Please use categorical_feature argument of the Dataset constructor to pass this parameter.


F1_SCORE:  0.6077 



Please use categorical_feature argument of the Dataset constructor to pass this parameter.


F1_SCORE:  0.6015 



Please use categorical_feature argument of the Dataset constructor to pass this parameter.


F1_SCORE:  0.6352 



Please use categorical_feature argument of the Dataset constructor to pass this parameter.
[32m[I 2022-04-18 22:31:47,241][0m Trial 14 finished with value: 0.6095909414656264 and parameters: {'num_leaves': 5, 'n_estimators': 548, 'learning_rate': 0.18039164355402001, 'feature_fraction': 0.5328955783106626, 'bagging_fraction': 0.4404361315090438}. Best is trial 5 with value: 0.8040849795380097.[0m


F1_SCORE:  0.5397 



Please use categorical_feature argument of the Dataset constructor to pass this parameter.


F1_SCORE:  0.7916 



Please use categorical_feature argument of the Dataset constructor to pass this parameter.


F1_SCORE:  0.2311 



Please use categorical_feature argument of the Dataset constructor to pass this parameter.


F1_SCORE:  0.2185 



Please use categorical_feature argument of the Dataset constructor to pass this parameter.


F1_SCORE:  0.4061 



Please use categorical_feature argument of the Dataset constructor to pass this parameter.
[32m[I 2022-04-18 22:34:39,364][0m Trial 15 finished with value: 0.4207873359688552 and parameters: {'num_leaves': 11, 'n_estimators': 832, 'learning_rate': 0.1461878877008136, 'feature_fraction': 0.6702496464002498, 'bagging_fraction': 0.7524903270836536}. Best is trial 5 with value: 0.8040849795380097.[0m


F1_SCORE:  0.4566 



Please use categorical_feature argument of the Dataset constructor to pass this parameter.


F1_SCORE:  0.2968 



Please use categorical_feature argument of the Dataset constructor to pass this parameter.


F1_SCORE:  0.1447 



Please use categorical_feature argument of the Dataset constructor to pass this parameter.


F1_SCORE:  0.4188 



Please use categorical_feature argument of the Dataset constructor to pass this parameter.


F1_SCORE:  0.3204 



Please use categorical_feature argument of the Dataset constructor to pass this parameter.
[32m[I 2022-04-18 22:35:27,059][0m Trial 16 finished with value: 0.3167269799242038 and parameters: {'num_leaves': 9, 'n_estimators': 419, 'learning_rate': 0.3024409345479492, 'feature_fraction': 0.47799351551288427, 'bagging_fraction': 0.9125656456347726}. Best is trial 5 with value: 0.8040849795380097.[0m


F1_SCORE:  0.4029 



Please use categorical_feature argument of the Dataset constructor to pass this parameter.


F1_SCORE:  0.7832 



Please use categorical_feature argument of the Dataset constructor to pass this parameter.


F1_SCORE:  0.7459 



Please use categorical_feature argument of the Dataset constructor to pass this parameter.


F1_SCORE:  0.6872 



Please use categorical_feature argument of the Dataset constructor to pass this parameter.


F1_SCORE:  0.6354 



Please use categorical_feature argument of the Dataset constructor to pass this parameter.
[32m[I 2022-04-18 22:36:19,369][0m Trial 17 finished with value: 0.7034903553163385 and parameters: {'num_leaves': 5, 'n_estimators': 380, 'learning_rate': 0.12868437967781265, 'feature_fraction': 0.6035329984447824, 'bagging_fraction': 0.9031124629375582}. Best is trial 5 with value: 0.8040849795380097.[0m


F1_SCORE:  0.6658 



Please use categorical_feature argument of the Dataset constructor to pass this parameter.


F1_SCORE:  0.289 



Please use categorical_feature argument of the Dataset constructor to pass this parameter.


F1_SCORE:  0.2089 



Please use categorical_feature argument of the Dataset constructor to pass this parameter.


F1_SCORE:  0.3543 



Please use categorical_feature argument of the Dataset constructor to pass this parameter.


F1_SCORE:  0.4218 



Please use categorical_feature argument of the Dataset constructor to pass this parameter.
[32m[I 2022-04-18 22:37:24,992][0m Trial 18 finished with value: 0.3275945738055702 and parameters: {'num_leaves': 8, 'n_estimators': 589, 'learning_rate': 0.5745851824065222, 'feature_fraction': 0.7415535412378645, 'bagging_fraction': 0.666797120763853}. Best is trial 5 with value: 0.8040849795380097.[0m


F1_SCORE:  0.3639 



Please use categorical_feature argument of the Dataset constructor to pass this parameter.


F1_SCORE:  0.2437 



Please use categorical_feature argument of the Dataset constructor to pass this parameter.


F1_SCORE:  0.1802 



Please use categorical_feature argument of the Dataset constructor to pass this parameter.


F1_SCORE:  0.2181 



Please use categorical_feature argument of the Dataset constructor to pass this parameter.


F1_SCORE:  0.1614 



Please use categorical_feature argument of the Dataset constructor to pass this parameter.
[32m[I 2022-04-18 22:38:51,893][0m Trial 19 finished with value: 0.25214430606962607 and parameters: {'num_leaves': 11, 'n_estimators': 787, 'learning_rate': 0.26417823998774986, 'feature_fraction': 0.4760262294297126, 'bagging_fraction': 0.8111913551210587}. Best is trial 5 with value: 0.8040849795380097.[0m


F1_SCORE:  0.4573 



Please use categorical_feature argument of the Dataset constructor to pass this parameter.


F1_SCORE:  0.803 



Please use categorical_feature argument of the Dataset constructor to pass this parameter.


F1_SCORE:  0.7972 



Please use categorical_feature argument of the Dataset constructor to pass this parameter.


F1_SCORE:  0.8255 



Please use categorical_feature argument of the Dataset constructor to pass this parameter.


F1_SCORE:  0.7958 



Please use categorical_feature argument of the Dataset constructor to pass this parameter.
[32m[I 2022-04-18 22:40:08,435][0m Trial 20 finished with value: 0.8032616680040343 and parameters: {'num_leaves': 9, 'n_estimators': 487, 'learning_rate': 0.07468835446475243, 'feature_fraction': 0.5897317786356696, 'bagging_fraction': 0.9521718944737911}. Best is trial 5 with value: 0.8040849795380097.[0m


F1_SCORE:  0.7948 



Please use categorical_feature argument of the Dataset constructor to pass this parameter.


F1_SCORE:  0.8083 



Please use categorical_feature argument of the Dataset constructor to pass this parameter.


F1_SCORE:  0.7937 



Please use categorical_feature argument of the Dataset constructor to pass this parameter.


F1_SCORE:  0.8285 



Please use categorical_feature argument of the Dataset constructor to pass this parameter.


F1_SCORE:  0.7924 



Please use categorical_feature argument of the Dataset constructor to pass this parameter.
[32m[I 2022-04-18 22:41:27,096][0m Trial 21 finished with value: 0.8038617317226876 and parameters: {'num_leaves': 9, 'n_estimators': 510, 'learning_rate': 0.0724125450162992, 'feature_fraction': 0.5839565751409596, 'bagging_fraction': 0.939306743963646}. Best is trial 5 with value: 0.8040849795380097.[0m


F1_SCORE:  0.7963 



Please use categorical_feature argument of the Dataset constructor to pass this parameter.


F1_SCORE:  0.762 



Please use categorical_feature argument of the Dataset constructor to pass this parameter.


F1_SCORE:  0.7341 



Please use categorical_feature argument of the Dataset constructor to pass this parameter.


F1_SCORE:  0.8093 



Please use categorical_feature argument of the Dataset constructor to pass this parameter.


F1_SCORE:  0.7773 



Please use categorical_feature argument of the Dataset constructor to pass this parameter.
[32m[I 2022-04-18 22:43:17,005][0m Trial 22 finished with value: 0.771759625320752 and parameters: {'num_leaves': 8, 'n_estimators': 721, 'learning_rate': 0.0036329747089792336, 'feature_fraction': 0.5970326359689252, 'bagging_fraction': 0.9803829294384141}. Best is trial 5 with value: 0.8040849795380097.[0m


F1_SCORE:  0.7761 



Please use categorical_feature argument of the Dataset constructor to pass this parameter.


F1_SCORE:  0.3637 



Please use categorical_feature argument of the Dataset constructor to pass this parameter.


F1_SCORE:  0.4354 



Please use categorical_feature argument of the Dataset constructor to pass this parameter.


F1_SCORE:  0.372 



Please use categorical_feature argument of the Dataset constructor to pass this parameter.


F1_SCORE:  0.4754 



Please use categorical_feature argument of the Dataset constructor to pass this parameter.
[32m[I 2022-04-18 22:44:32,086][0m Trial 23 finished with value: 0.418896367373467 and parameters: {'num_leaves': 9, 'n_estimators': 543, 'learning_rate': 0.24434204810616694, 'feature_fraction': 0.7502794725183589, 'bagging_fraction': 0.9481688597338999}. Best is trial 5 with value: 0.8040849795380097.[0m


F1_SCORE:  0.448 



Please use categorical_feature argument of the Dataset constructor to pass this parameter.


F1_SCORE:  0.8032 



Please use categorical_feature argument of the Dataset constructor to pass this parameter.


F1_SCORE:  0.775 



Please use categorical_feature argument of the Dataset constructor to pass this parameter.


F1_SCORE:  0.8394 



Please use categorical_feature argument of the Dataset constructor to pass this parameter.


F1_SCORE:  0.7991 



Please use categorical_feature argument of the Dataset constructor to pass this parameter.
[32m[I 2022-04-18 22:46:05,956][0m Trial 24 finished with value: 0.80220349216852 and parameters: {'num_leaves': 11, 'n_estimators': 489, 'learning_rate': 0.0785759144056299, 'feature_fraction': 0.5815521978887462, 'bagging_fraction': 0.8649964067418328}. Best is trial 5 with value: 0.8040849795380097.[0m


F1_SCORE:  0.7944 



Please use categorical_feature argument of the Dataset constructor to pass this parameter.


F1_SCORE:  0.5569 



Please use categorical_feature argument of the Dataset constructor to pass this parameter.


F1_SCORE:  0.5783 



Please use categorical_feature argument of the Dataset constructor to pass this parameter.


F1_SCORE:  0.5972 



Please use categorical_feature argument of the Dataset constructor to pass this parameter.


F1_SCORE:  0.599 



Please use categorical_feature argument of the Dataset constructor to pass this parameter.
[32m[I 2022-04-18 22:46:57,610][0m Trial 25 finished with value: 0.5745137128452412 and parameters: {'num_leaves': 9, 'n_estimators': 290, 'learning_rate': 0.20967770737692124, 'feature_fraction': 0.6584347492981538, 'bagging_fraction': 0.8013571251285241}. Best is trial 5 with value: 0.8040849795380097.[0m


F1_SCORE:  0.5412 



Please use categorical_feature argument of the Dataset constructor to pass this parameter.


F1_SCORE:  0.3149 



Please use categorical_feature argument of the Dataset constructor to pass this parameter.


F1_SCORE:  0.2947 



Please use categorical_feature argument of the Dataset constructor to pass this parameter.


F1_SCORE:  0.2927 



Please use categorical_feature argument of the Dataset constructor to pass this parameter.


F1_SCORE:  0.3654 



Please use categorical_feature argument of the Dataset constructor to pass this parameter.
[32m[I 2022-04-18 22:48:12,407][0m Trial 26 finished with value: 0.31470005918388627 and parameters: {'num_leaves': 8, 'n_estimators': 708, 'learning_rate': 0.372409025403322, 'feature_fraction': 0.5124329060751593, 'bagging_fraction': 0.9496190586771038}. Best is trial 5 with value: 0.8040849795380097.[0m


F1_SCORE:  0.3059 



Please use categorical_feature argument of the Dataset constructor to pass this parameter.


F1_SCORE:  0.7913 



Please use categorical_feature argument of the Dataset constructor to pass this parameter.


F1_SCORE:  0.7657 



Please use categorical_feature argument of the Dataset constructor to pass this parameter.


F1_SCORE:  0.8395 



Please use categorical_feature argument of the Dataset constructor to pass this parameter.


F1_SCORE:  0.8038 



Please use categorical_feature argument of the Dataset constructor to pass this parameter.
[32m[I 2022-04-18 22:49:07,420][0m Trial 27 finished with value: 0.7959171731184218 and parameters: {'num_leaves': 6, 'n_estimators': 314, 'learning_rate': 0.0960699934154885, 'feature_fraction': 0.6291640046314946, 'bagging_fraction': 0.8640892830430873}. Best is trial 5 with value: 0.8040849795380097.[0m


F1_SCORE:  0.7794 



Please use categorical_feature argument of the Dataset constructor to pass this parameter.


F1_SCORE:  0.2371 



Please use categorical_feature argument of the Dataset constructor to pass this parameter.


F1_SCORE:  0.4139 



Please use categorical_feature argument of the Dataset constructor to pass this parameter.


F1_SCORE:  0.3731 



Please use categorical_feature argument of the Dataset constructor to pass this parameter.


F1_SCORE:  0.2438 



Please use categorical_feature argument of the Dataset constructor to pass this parameter.
[32m[I 2022-04-18 22:50:09,742][0m Trial 28 finished with value: 0.30766143524235934 and parameters: {'num_leaves': 11, 'n_estimators': 483, 'learning_rate': 0.632611486821247, 'feature_fraction': 0.7014980332786888, 'bagging_fraction': 0.7417891306939}. Best is trial 5 with value: 0.8040849795380097.[0m


F1_SCORE:  0.2703 



Please use categorical_feature argument of the Dataset constructor to pass this parameter.


F1_SCORE:  0.8029 



Please use categorical_feature argument of the Dataset constructor to pass this parameter.


F1_SCORE:  0.7753 



Please use categorical_feature argument of the Dataset constructor to pass this parameter.


F1_SCORE:  0.845 



Please use categorical_feature argument of the Dataset constructor to pass this parameter.


F1_SCORE:  0.8089 



Please use categorical_feature argument of the Dataset constructor to pass this parameter.
[32m[I 2022-04-18 22:52:37,218][0m Trial 29 finished with value: 0.8025094329490037 and parameters: {'num_leaves': 10, 'n_estimators': 842, 'learning_rate': 0.06798364930454684, 'feature_fraction': 0.4776538885632693, 'bagging_fraction': 0.9473386520346981}. Best is trial 5 with value: 0.8040849795380097.[0m


F1_SCORE:  0.7805 



Please use categorical_feature argument of the Dataset constructor to pass this parameter.


F1_SCORE:  0.1293 



Please use categorical_feature argument of the Dataset constructor to pass this parameter.


F1_SCORE:  0.447 



Please use categorical_feature argument of the Dataset constructor to pass this parameter.


F1_SCORE:  0.3481 



Please use categorical_feature argument of the Dataset constructor to pass this parameter.


F1_SCORE:  0.2745 



Please use categorical_feature argument of the Dataset constructor to pass this parameter.
[32m[I 2022-04-18 22:54:04,256][0m Trial 30 finished with value: 0.27332131730929893 and parameters: {'num_leaves': 9, 'n_estimators': 963, 'learning_rate': 0.9922415952175831, 'feature_fraction': 0.5592086084201222, 'bagging_fraction': 0.6899893388993091}. Best is trial 5 with value: 0.8040849795380097.[0m


F1_SCORE:  0.1678 



Please use categorical_feature argument of the Dataset constructor to pass this parameter.


F1_SCORE:  0.8018 



Please use categorical_feature argument of the Dataset constructor to pass this parameter.


F1_SCORE:  0.7501 



Please use categorical_feature argument of the Dataset constructor to pass this parameter.


F1_SCORE:  0.832 



Please use categorical_feature argument of the Dataset constructor to pass this parameter.


F1_SCORE:  0.8212 



Please use categorical_feature argument of the Dataset constructor to pass this parameter.
[32m[I 2022-04-18 22:56:19,892][0m Trial 31 finished with value: 0.7968593481261431 and parameters: {'num_leaves': 10, 'n_estimators': 789, 'learning_rate': 0.07565390290165155, 'feature_fraction': 0.46653837083834476, 'bagging_fraction': 0.944351524470554}. Best is trial 5 with value: 0.8040849795380097.[0m


F1_SCORE:  0.7793 



Please use categorical_feature argument of the Dataset constructor to pass this parameter.


F1_SCORE:  0.1151 



Please use categorical_feature argument of the Dataset constructor to pass this parameter.


F1_SCORE:  0.3916 



Please use categorical_feature argument of the Dataset constructor to pass this parameter.


F1_SCORE:  0.2974 



Please use categorical_feature argument of the Dataset constructor to pass this parameter.


F1_SCORE:  0.4638 



Please use categorical_feature argument of the Dataset constructor to pass this parameter.
[32m[I 2022-04-18 22:57:55,140][0m Trial 32 finished with value: 0.35145305923833436 and parameters: {'num_leaves': 12, 'n_estimators': 609, 'learning_rate': 0.16497362672800125, 'feature_fraction': 0.5033326707290389, 'bagging_fraction': 0.8847709615640202}. Best is trial 5 with value: 0.8040849795380097.[0m


F1_SCORE:  0.4893 



Please use categorical_feature argument of the Dataset constructor to pass this parameter.


F1_SCORE:  0.8092 



Please use categorical_feature argument of the Dataset constructor to pass this parameter.


F1_SCORE:  0.7813 



Please use categorical_feature argument of the Dataset constructor to pass this parameter.


F1_SCORE:  0.8407 



Please use categorical_feature argument of the Dataset constructor to pass this parameter.


F1_SCORE:  0.8003 



Please use categorical_feature argument of the Dataset constructor to pass this parameter.
[32m[I 2022-04-18 23:00:09,003][0m Trial 33 finished with value: 0.804340488458444 and parameters: {'num_leaves': 10, 'n_estimators': 707, 'learning_rate': 0.05167480920346117, 'feature_fraction': 0.5610142385078413, 'bagging_fraction': 0.9983900760318126}. Best is trial 33 with value: 0.804340488458444.[0m


F1_SCORE:  0.7901 



Please use categorical_feature argument of the Dataset constructor to pass this parameter.


F1_SCORE:  0.3035 



Please use categorical_feature argument of the Dataset constructor to pass this parameter.


F1_SCORE:  0.2902 



Please use categorical_feature argument of the Dataset constructor to pass this parameter.


F1_SCORE:  0.1301 



Please use categorical_feature argument of the Dataset constructor to pass this parameter.


F1_SCORE:  0.2702 



Please use categorical_feature argument of the Dataset constructor to pass this parameter.
[32m[I 2022-04-18 23:01:28,282][0m Trial 34 finished with value: 0.2506187995688145 and parameters: {'num_leaves': 11, 'n_estimators': 698, 'learning_rate': 0.24208554975830376, 'feature_fraction': 0.556896948938272, 'bagging_fraction': 0.989902582559451}. Best is trial 33 with value: 0.804340488458444.[0m


F1_SCORE:  0.2591 



Please use categorical_feature argument of the Dataset constructor to pass this parameter.


F1_SCORE:  0.7576 



Please use categorical_feature argument of the Dataset constructor to pass this parameter.


F1_SCORE:  0.7223 



Please use categorical_feature argument of the Dataset constructor to pass this parameter.


F1_SCORE:  0.7952 



Please use categorical_feature argument of the Dataset constructor to pass this parameter.


F1_SCORE:  0.7422 



Please use categorical_feature argument of the Dataset constructor to pass this parameter.
[32m[I 2022-04-18 23:03:04,636][0m Trial 35 finished with value: 0.7529293446079304 and parameters: {'num_leaves': 7, 'n_estimators': 534, 'learning_rate': 0.004011695653524799, 'feature_fraction': 0.5665493402230786, 'bagging_fraction': 0.9988044352363412}. Best is trial 33 with value: 0.804340488458444.[0m


F1_SCORE:  0.7474 



Please use categorical_feature argument of the Dataset constructor to pass this parameter.


F1_SCORE:  0.3657 



Please use categorical_feature argument of the Dataset constructor to pass this parameter.


F1_SCORE:  0.214 



Please use categorical_feature argument of the Dataset constructor to pass this parameter.


F1_SCORE:  0.3346 



Please use categorical_feature argument of the Dataset constructor to pass this parameter.


F1_SCORE:  0.2247 



Please use categorical_feature argument of the Dataset constructor to pass this parameter.
[32m[I 2022-04-18 23:04:12,765][0m Trial 36 finished with value: 0.28493403660181704 and parameters: {'num_leaves': 9, 'n_estimators': 650, 'learning_rate': 0.43980177164867496, 'feature_fraction': 0.6061841533555183, 'bagging_fraction': 0.9567417220316193}. Best is trial 33 with value: 0.804340488458444.[0m


F1_SCORE:  0.2857 



Please use categorical_feature argument of the Dataset constructor to pass this parameter.


F1_SCORE:  0.6083 



Please use categorical_feature argument of the Dataset constructor to pass this parameter.


F1_SCORE:  0.6213 



Please use categorical_feature argument of the Dataset constructor to pass this parameter.


F1_SCORE:  0.541 



Please use categorical_feature argument of the Dataset constructor to pass this parameter.


F1_SCORE:  0.8027 



Please use categorical_feature argument of the Dataset constructor to pass this parameter.
[32m[I 2022-04-18 23:05:35,834][0m Trial 37 finished with value: 0.6389313211168756 and parameters: {'num_leaves': 7, 'n_estimators': 476, 'learning_rate': 0.1285247864241788, 'feature_fraction': 0.6610826751363293, 'bagging_fraction': 0.8365108526820493}. Best is trial 33 with value: 0.804340488458444.[0m


F1_SCORE:  0.6212 



Please use categorical_feature argument of the Dataset constructor to pass this parameter.


F1_SCORE:  0.3481 



Please use categorical_feature argument of the Dataset constructor to pass this parameter.


F1_SCORE:  0.4588 



Please use categorical_feature argument of the Dataset constructor to pass this parameter.


F1_SCORE:  0.2811 



Please use categorical_feature argument of the Dataset constructor to pass this parameter.


F1_SCORE:  0.3043 



Please use categorical_feature argument of the Dataset constructor to pass this parameter.
[32m[I 2022-04-18 23:06:19,231][0m Trial 38 finished with value: 0.35170311529817033 and parameters: {'num_leaves': 8, 'n_estimators': 338, 'learning_rate': 0.7620741525352577, 'feature_fraction': 0.634593091564967, 'bagging_fraction': 0.8942314944718945}. Best is trial 33 with value: 0.804340488458444.[0m


F1_SCORE:  0.3662 



Please use categorical_feature argument of the Dataset constructor to pass this parameter.


F1_SCORE:  0.8067 



Please use categorical_feature argument of the Dataset constructor to pass this parameter.


F1_SCORE:  0.7814 



Please use categorical_feature argument of the Dataset constructor to pass this parameter.


F1_SCORE:  0.839 



Please use categorical_feature argument of the Dataset constructor to pass this parameter.


F1_SCORE:  0.8257 



Please use categorical_feature argument of the Dataset constructor to pass this parameter.
[32m[I 2022-04-18 23:09:30,329][0m Trial 39 finished with value: 0.8099290842156943 and parameters: {'num_leaves': 12, 'n_estimators': 588, 'learning_rate': 0.05473800714730781, 'feature_fraction': 0.7900926028491161, 'bagging_fraction': 0.7746443499929081}. Best is trial 39 with value: 0.8099290842156943.[0m


F1_SCORE:  0.7969 



Please use categorical_feature argument of the Dataset constructor to pass this parameter.


F1_SCORE:  0.3653 



Please use categorical_feature argument of the Dataset constructor to pass this parameter.


F1_SCORE:  0.2195 



Please use categorical_feature argument of the Dataset constructor to pass this parameter.


F1_SCORE:  0.37 



Please use categorical_feature argument of the Dataset constructor to pass this parameter.


F1_SCORE:  0.3442 



Please use categorical_feature argument of the Dataset constructor to pass this parameter.
[32m[I 2022-04-18 23:11:58,764][0m Trial 40 finished with value: 0.33385519253459317 and parameters: {'num_leaves': 12, 'n_estimators': 950, 'learning_rate': 0.3314384917797593, 'feature_fraction': 0.7830717354089556, 'bagging_fraction': 0.7807370624462594}. Best is trial 39 with value: 0.8099290842156943.[0m


F1_SCORE:  0.3703 



Please use categorical_feature argument of the Dataset constructor to pass this parameter.


F1_SCORE:  0.8118 



Please use categorical_feature argument of the Dataset constructor to pass this parameter.


F1_SCORE:  0.7889 



Please use categorical_feature argument of the Dataset constructor to pass this parameter.


F1_SCORE:  0.8287 



Please use categorical_feature argument of the Dataset constructor to pass this parameter.


F1_SCORE:  0.8298 



Please use categorical_feature argument of the Dataset constructor to pass this parameter.
[32m[I 2022-04-18 23:15:34,773][0m Trial 41 finished with value: 0.8101751814855515 and parameters: {'num_leaves': 12, 'n_estimators': 606, 'learning_rate': 0.04758825215802909, 'feature_fraction': 0.9125911608337296, 'bagging_fraction': 0.6099589912189116}. Best is trial 41 with value: 0.8101751814855515.[0m


F1_SCORE:  0.7917 



Please use categorical_feature argument of the Dataset constructor to pass this parameter.


F1_SCORE:  0.8076 



Please use categorical_feature argument of the Dataset constructor to pass this parameter.


F1_SCORE:  0.7821 



Please use categorical_feature argument of the Dataset constructor to pass this parameter.


F1_SCORE:  0.8282 



Please use categorical_feature argument of the Dataset constructor to pass this parameter.


F1_SCORE:  0.8131 



Please use categorical_feature argument of the Dataset constructor to pass this parameter.
[32m[I 2022-04-18 23:19:10,529][0m Trial 42 finished with value: 0.804427766787627 and parameters: {'num_leaves': 12, 'n_estimators': 605, 'learning_rate': 0.05074665092681257, 'feature_fraction': 0.9270366725249224, 'bagging_fraction': 0.6490432081819215}. Best is trial 41 with value: 0.8101751814855515.[0m


F1_SCORE:  0.7911 



Please use categorical_feature argument of the Dataset constructor to pass this parameter.


F1_SCORE:  0.8044 



Please use categorical_feature argument of the Dataset constructor to pass this parameter.


F1_SCORE:  0.7856 



Please use categorical_feature argument of the Dataset constructor to pass this parameter.


F1_SCORE:  0.8327 



Please use categorical_feature argument of the Dataset constructor to pass this parameter.


F1_SCORE:  0.8291 



Please use categorical_feature argument of the Dataset constructor to pass this parameter.
[32m[I 2022-04-18 23:24:42,910][0m Trial 43 finished with value: 0.8083032050903046 and parameters: {'num_leaves': 12, 'n_estimators': 748, 'learning_rate': 0.04256973436117961, 'feature_fraction': 0.9408235052513154, 'bagging_fraction': 0.6164647539507578}. Best is trial 41 with value: 0.8101751814855515.[0m


F1_SCORE:  0.7898 



Please use categorical_feature argument of the Dataset constructor to pass this parameter.


F1_SCORE:  0.7885 



Please use categorical_feature argument of the Dataset constructor to pass this parameter.


F1_SCORE:  0.7663 



Please use categorical_feature argument of the Dataset constructor to pass this parameter.


F1_SCORE:  0.4407 



Please use categorical_feature argument of the Dataset constructor to pass this parameter.


F1_SCORE:  0.7719 



Please use categorical_feature argument of the Dataset constructor to pass this parameter.
[32m[I 2022-04-18 23:28:51,699][0m Trial 44 finished with value: 0.6659745553157724 and parameters: {'num_leaves': 12, 'n_estimators': 735, 'learning_rate': 0.12352105449955995, 'feature_fraction': 0.9402464283853548, 'bagging_fraction': 0.6323547929848223}. Best is trial 41 with value: 0.8101751814855515.[0m


F1_SCORE:  0.5624 



Please use categorical_feature argument of the Dataset constructor to pass this parameter.


F1_SCORE:  0.7911 



Please use categorical_feature argument of the Dataset constructor to pass this parameter.


F1_SCORE:  0.7785 



Please use categorical_feature argument of the Dataset constructor to pass this parameter.


F1_SCORE:  0.8175 



Please use categorical_feature argument of the Dataset constructor to pass this parameter.


F1_SCORE:  0.8298 



Please use categorical_feature argument of the Dataset constructor to pass this parameter.
[32m[I 2022-04-18 23:33:24,015][0m Trial 45 finished with value: 0.8027176309786819 and parameters: {'num_leaves': 12, 'n_estimators': 590, 'learning_rate': 0.03377426615084557, 'feature_fraction': 0.9957980222482098, 'bagging_fraction': 0.5643685345602776}. Best is trial 41 with value: 0.8101751814855515.[0m


F1_SCORE:  0.7967 



Please use categorical_feature argument of the Dataset constructor to pass this parameter.


F1_SCORE:  0.2984 



Please use categorical_feature argument of the Dataset constructor to pass this parameter.


F1_SCORE:  0.1991 



Please use categorical_feature argument of the Dataset constructor to pass this parameter.


F1_SCORE:  0.4088 



Please use categorical_feature argument of the Dataset constructor to pass this parameter.


F1_SCORE:  0.4227 



Please use categorical_feature argument of the Dataset constructor to pass this parameter.




[32m[I 2022-04-18 23:35:53,922][0m Trial 46 finished with value: 0.30431501087863577 and parameters: {'num_leaves': 12, 'n_estimators': 659, 'learning_rate': 0.2181277209299775, 'feature_fraction': 0.8269026314672314, 'bagging_fraction': 0.6268186218646192}. Best is trial 41 with value: 0.8101751814855515.[0m


F1_SCORE:  0.1925 



Please use categorical_feature argument of the Dataset constructor to pass this parameter.


F1_SCORE:  0.7826 



Please use categorical_feature argument of the Dataset constructor to pass this parameter.


F1_SCORE:  0.7853 



Please use categorical_feature argument of the Dataset constructor to pass this parameter.


F1_SCORE:  0.8286 



Please use categorical_feature argument of the Dataset constructor to pass this parameter.


F1_SCORE:  0.8123 



Please use categorical_feature argument of the Dataset constructor to pass this parameter.
[32m[I 2022-04-18 23:41:24,076][0m Trial 47 finished with value: 0.8026492385477797 and parameters: {'num_leaves': 11, 'n_estimators': 877, 'learning_rate': 0.038771144978104743, 'feature_fraction': 0.9026218636644445, 'bagging_fraction': 0.5861204737077506}. Best is trial 41 with value: 0.8101751814855515.[0m


F1_SCORE:  0.8045 



Please use categorical_feature argument of the Dataset constructor to pass this parameter.


F1_SCORE:  0.4128 



Please use categorical_feature argument of the Dataset constructor to pass this parameter.


F1_SCORE:  0.4128 



Please use categorical_feature argument of the Dataset constructor to pass this parameter.


F1_SCORE:  0.1745 



Please use categorical_feature argument of the Dataset constructor to pass this parameter.


F1_SCORE:  0.1691 



Please use categorical_feature argument of the Dataset constructor to pass this parameter.
[32m[I 2022-04-18 23:44:14,265][0m Trial 48 finished with value: 0.3350268524937191 and parameters: {'num_leaves': 12, 'n_estimators': 802, 'learning_rate': 0.1941915126966386, 'feature_fraction': 0.897176159159028, 'bagging_fraction': 0.48868537065428724}. Best is trial 41 with value: 0.8101751814855515.[0m


F1_SCORE:  0.506 



Please use categorical_feature argument of the Dataset constructor to pass this parameter.


F1_SCORE:  0.7663 



Please use categorical_feature argument of the Dataset constructor to pass this parameter.


F1_SCORE:  0.7758 



Please use categorical_feature argument of the Dataset constructor to pass this parameter.


F1_SCORE:  0.8012 



Please use categorical_feature argument of the Dataset constructor to pass this parameter.


F1_SCORE:  0.7687 



Please use categorical_feature argument of the Dataset constructor to pass this parameter.
[32m[I 2022-04-18 23:45:47,969][0m Trial 49 finished with value: 0.7799246907255758 and parameters: {'num_leaves': 2, 'n_estimators': 676, 'learning_rate': 0.1230158890043171, 'feature_fraction': 0.9508371613914272, 'bagging_fraction': 0.6600856967220281}. Best is trial 41 with value: 0.8101751814855515.[0m


F1_SCORE:  0.7876 



In [166]:
## cv :0.8101751814855515
best_params = {
        'boosting_type': 'gbdt',
        'objective': 'multiclass',
        'num_class':7,
        'categorical_feature': category_cols,       
        "seed": 42,} 
best_params.update(study.best_params)
best_params

{'boosting_type': 'gbdt',
 'objective': 'multiclass',
 'num_class': 7,
 'categorical_feature': ['category_max_fre_company_type',
  'category_max_fre_out_filled_var_company_type',
  'category_last_value_company_type',
  'category_max_fre_id_management',
  'category_max_fre_out_filled_var_id_management',
  'category_last_value_id_management',
  'category_max_fre_id_office',
  'category_max_fre_out_filled_var_id_office',
  'category_last_value_id_office',
  'category_max_fre_job_role_encode_knn',
  'category_max_fre_out_filled_var_job_role_encode_knn',
  'category_last_value_job_role_encode_knn',
  'category_max_fre_new_work_province_id',
  'category_max_fre_out_filled_var_new_work_province_id',
  'category_last_value_new_work_province_id',
  'category_bining_olds',
  'new_province_id'],
 'seed': 42,
 'num_leaves': 12,
 'n_estimators': 606,
 'learning_rate': 0.04758825215802909,
 'feature_fraction': 0.9125911608337296,
 'bagging_fraction': 0.6099589912189116}

In [167]:
id_test = df_test['id_bh']
X_test = df_test.drop(columns = [ 'id_bh']) 
X_test[category_cols] = X_test[category_cols].astype('category')

X_test = X_test[selected_features]
clf = lgb(**best_params)
clf.fit( X_train, y_train)
preds =  clf.predict(X_test)

predict_df = pd.DataFrame()
predict_df['id_bh'] =id_test
predict_df['label'] = preds

## merge voi label_test
submit_file = concat_data(test_label,predict_df )

Please use categorical_feature argument of the Dataset constructor to pass this parameter.




In [168]:
submit_file.to_csv("data/submmit/tuned_lg_with_top250_selected_features_cv081017.csv", index=False)

In [172]:
top_fea_250_df = pd.DataFrame(list(top_fea_250), columns = ['top_250_fea'])
top_fea_250_df.to_csv("top250fea.csv",  index = False)
