In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
##load dataset
file = "data/processed_data"

train_info_path = file + "/train_info.csv"
train_work_path = file + "/train_work.csv"
train_label_path = "data/label_train.csv"

test_info_path = file + "/test_info.csv"
test_work_path = file + "/test_work.csv"
test_label_path =  "data/label_test.csv"


## data frame
train_info = pd.read_csv(train_info_path)
train_work = pd.read_csv(train_work_path)
train_label = pd.read_csv(train_label_path)

test_info = pd.read_csv(test_info_path)
test_work = pd.read_csv(test_work_path)
test_label = pd.read_csv(test_label_path)

In [3]:
train_info.shape, train_work.shape, train_label.shape

((27502, 5), (247559, 19), (27502, 2))

In [4]:
# info: id_bh,	bithYear,	gender,	new_province_id
# label: id_bd, label
# work: main:id_bh , new_work_province_id ,  employee_lv , 
## year_from_date , year_to_date ,  month_from_date, month_to_date 
## num_year_contract, num_month_contract   
## company_type,   id_management ,id_office ,  job_role_encode_knn                                                                                       

## cach xu ly:
## 1. group by thep id_bh o work 
## 2. merge voi info va label

## 1. Tao DF de co the train, test

In [5]:
def concat_data(df, df_add, col='id_bh', how='left'):
    df = pd.merge(df, df_add, on=col, how=how)
    return df

In [6]:
use_cosl_train  = ["id_bh" , "new_work_province_id" ,  "employee_lv" , 
"year_from_date" , "year_to_date" ,  "month_from_date", "month_to_date" ,
"num_year_contract", "num_month_contract"   ,
"company_type",   "id_management" ,"id_office" ,  "job_role_encode_knn", 
"bithYear", "gender", "new_province_id", "label"  ] 
# train
group_train_work = train_work.groupby('id_bh').agg(lambda x: x.tolist())
train = concat_data(group_train_work, train_info, col='id_bh', how='left')
train = concat_data(train, train_label, col='id_bh', how='left')
train = train[use_cosl_train]
# test
group_test_work = test_work.groupby('id_bh').agg(lambda x: x.tolist())
test = concat_data(group_test_work, test_info, col='id_bh', how='left')
test = test[use_cosl_train[:-1]]

In [7]:
## export 
train.to_csv(r"data/processed_data/train_df.csv")
test.to_csv(r"data/processed_data/test_df.csv")

In [8]:
train.shape, test.shape

((27502, 17), (18134, 16))

# 2. Feature engineering

In [9]:
# 2. Feature engineering
## category list: company_type,	id_management,	id_office,	job_role_encode_knn ==> max frequent, last, count unique
### new_work_province_id: max frequnt, last , count unique

### new_work_province_id, id_office, company_type, id_management


## number:
### employee_lv bo gia tri -1: max, min (bo -1), mode(bo -1), std, mean, gia tri (max-min), count unique
### year_from_date: max, min, std, mode, mean, count unique
### year_to_date: max, min, std, mode, mean, count unique
### month_from_date, month_to_date:  max, min, std, mode, mean
### num_year_contract: max, min, mode, mean, std, q(75), q(25), count unique
### num_month_contract: max, min, mode, mean, std, q(75), q(25), count unique
## create new fea:

train['bithYear'].describe()

count    27502.000000
mean      1984.355756
std          8.842248
min       1941.000000
25%       1979.000000
50%       1985.000000
75%       1991.000000
max       2002.000000
Name: bithYear, dtype: float64

In [10]:
import statistics
from statistics import mode
## utils func for category var
def max_frequent(list):
    if (len(list) == 0):
        return -999
    return(mode(list))
def max_frequent_without_filled_var(list):
    if( -999 in list):
        list.remove(-999)
    if(-1 in list):
        list.remove(-1)
    if(-1.0 in list):
        list.remove(-1.0)
    a_set = set(list)
    if ((len(list) == 0) or (len(a_set) == len(list)) ):
        return -999
    return(mode(list))
def last_value(list):
    if (len(list) == 0):
        return -999
    return list[-1]
def count_unique(list):
    return len(set(list))
def count_unique_without_filled_var(list):
    if( -999 in list):
        list.remove(-999)
    if(-1 in list):
        list.remove(-1)
    return len(set(list))

category_cols = [
    "company_type",	"id_management",	"id_office",	"job_role_encode_knn" ,"new_work_province_id"
]
numeric_cols = [
        "employee_lv" , 
        "year_from_date",
        "year_to_date",
        "month_from_date",
        "num_year_contract",
        "num_month_contract",
]
## utils fun for numeric features: max, min, mode, mean, std, q(75), q(25), count unique
def min_without_filled_var(list):
    if(-1 in list):
        list.remove(-1)
        if (len(list) == 0):
            return -1
        else: 
            return min(list)
    return min(list)
def std(list):
    if (len(list) < 2):
            return -999
    if(-1 in list):
        list.remove(-1)
        if (len(list) < 2):
            return -999
        else: 
            return statistics.stdev(list)
    return statistics.stdev(list)

######################
def features_engineering(df):
    ## categorical cols
    for col in category_cols:
        df['category_max_fre_' + col] = df[col].apply(lambda x: max_frequent(x) ) ## category
        df['category_max_fre_out_filled_var_' + col] = df[col].apply(lambda x: max_frequent_without_filled_var(x) ) ## category
        df['category_last_value_' + col] = df[col].apply(lambda x: last_value(x) ) ## category
        df['numeric_count_unique_' + col] = df[col].apply(lambda x: count_unique(x) ) ## numeric
        df['numeric_count_unique_without_filled_var' + col] = df[col].apply(lambda x: count_unique_without_filled_var(x) ) ## numeric

    ## numeric cols:
    for col in numeric_cols:
        ## chi co employee_lv la co fill nan
        if (col == 'employee_lv'):
            df['numeric_max_lv_' + 'of_'+ col] = df[col].apply(lambda x: max(x) )
            df['numeric_min_lv_' + 'of_'+ col] = df[col].apply(lambda x: min_without_filled_var(x) )
            df['numeric_mode_' + 'of_'+ col] = df[col].apply(lambda x: max_frequent(x) ) 
            df['numeric_mode_out_filled_var_' + 'of_'+ col] = df[col].apply(lambda x: max_frequent_without_filled_var(x) ) 
            df['numeric_mean_' + 'of_'+ col] = df[col].apply(lambda x: sum(x) / float(len(x)) ) 
            df['numeric_std_' + 'of_'+ col] = df[col].apply(lambda x: std(x) )
            df['numeric_count_unique_' + 'of_'+ col]  = df[col].apply(lambda x: count_unique(x) ) ## numeric
            df['numeric_count_unique_' + 'of_'+ col]  = df[col].apply(lambda x: count_unique_without_filled_var(x) ) ## numeric
            df['numeric_Q25_' + 'of_'+ col ] = df[col].apply(lambda x: np.quantile(x, .25) )
            df['numeric_Q75_' + 'of_'+ col ] = df[col].apply(lambda x: np.quantile(x, .75) )
        else:
            df['numeric_max_' + 'of_'+ col] = df[col].apply(lambda x: max(x) )
            df['numeric_min_' + 'of_'+ col] = df[col].apply(lambda x: min_without_filled_var(x) )
            df['numeric_mode_' + 'of_'+ col] = df[col].apply(lambda x: max_frequent(x) )  
            df['numeric_mean_' + 'of_'+ col] = df[col].apply(lambda x: sum(x) / float(len(x)) ) 
            df['numeric_std_' + 'of_'+ col] = df[col].apply(lambda x: std(x) )
            df['numeric_count_unique_' + 'of_'+ col]  = df[col].apply(lambda x: count_unique(x) ) ## numeric
            df['numeric_Q25_' + 'of_'+ col ] = df[col].apply(lambda x: np.quantile(x, .25) )
            df['numeric_Q75_' + 'of_'+ col ] = df[col].apply(lambda x: np.quantile(x, .75) )
    ## create_new_fea between 2 features:(year_from_date, year_to_date) with bithYear
    df['numeric_year_diff_max_year_from_vs_bithYear'] = df['numeric_max_of_year_from_date'] - df['bithYear']
    df['numeric_year_diff_min_year_from_vs_bithYear'] = df['numeric_min_of_year_from_date'] - df['bithYear']
    df['numeric_year_diff_max_year_to_vs_bithYear'] = df['numeric_max_of_year_to_date'] - df['bithYear']
    df['numeric_year_diff_min_year_to_vs_bithYear'] = df['numeric_min_of_year_to_date'] - df['bithYear']

    ## old:
    df['numeric_olds'] = 2022 - df['bithYear']
    ## ratio: num_old vs diff_min_year_from_vs_bithYear
    df['numeric_ratio_olds_vs_(diff_min_year_from_vs_bithYear)'] = df['numeric_olds'] / df['numeric_year_diff_min_year_from_vs_bithYear']
    df['numeric_ratio_olds_vs_(diff_min_year_from_vs_bithYear)'] = df['numeric_ratio_olds_vs_(diff_min_year_from_vs_bithYear)'].apply(lambda x: x if(x> 0) else -999)
    ## fix value in create_new_fea cols
    new_cols = [col for col in df.columns if 'numeric_year_diff_' in col]
    for col in new_cols:
        df[col] = df[col].apply(lambda x: x if(x> 0) else -999)
    return df
fe_train = features_engineering(train)

  

In [11]:
fe_test = features_engineering(test)

In [12]:
numeric_ml_useful_cols = [col for col in fe_train.columns if 'numeric_' in col]
category_ml_useful_cols = [col for col in fe_train.columns if 'category_' in col]
old_cols = ["bithYear",	"gender","new_province_id", "id_bh", "label"]
ml_useful_cols_train = numeric_ml_useful_cols + category_ml_useful_cols + old_cols
ml_useful_cols_test = numeric_ml_useful_cols + category_ml_useful_cols + old_cols[:-1]
len(ml_useful_cols_train), len(ml_useful_cols_test)

(85, 84)

## 3. Test baseline

In [14]:
category_cols = category_ml_useful_cols + ["gender", "new_province_id"]

In [15]:
df_train = fe_train[ml_useful_cols_train]
df_test = fe_test[ml_useful_cols_test]

In [16]:
# df_train.info()

In [17]:
from lightgbm import LGBMClassifier as lgb
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import f1_score

In [18]:
## cal class weight
from sklearn.utils import class_weight

class_weights = class_weight.compute_class_weight(
                                    class_weight = "balanced",
                                    classes = [1,2,3,4,5,6,7], 
                                    y = df_train['label'] 
                                    )
class_weights = {i : class_weights[i] for i in range(7)}
class_weights

{0: 32.46989374262102,
 1: 0.3963338185067228,
 2: 1.0159961579666779,
 3: 0.5015775747296237,
 4: 1.557833918658661,
 5: 1.2327760096821911,
 6: 66.590799031477}

In [19]:
skf = StratifiedKFold(n_splits= 5, random_state= 42, shuffle= True)
X_train = df_train.drop(columns = ['label', 'id_bh']) 
X_train[category_cols] = X_train[category_cols].astype('category')
y_train = df_train['label']
cv_score = []
params_k = {
            'boosting_type': 'gbdt',
            'objective': 'multiclass',
            'num_class':7,
            'categorical_feature': category_cols,
            'verbose' : 0,
           # # 'class_weight': class_weights_dict,
            # 'subsample': 0.5,
            # 'subsample_freq': 1,
            # 'learning_rate': 0.01,
            #  'num_leaves': 2**11-1,
            #  'min_data_in_leaf': 2**12-1,
            # 'feature_fraction': 0.5,
            # 'max_bin': 100,
            # 'n_estimators': 500,
            # 'boost_from_average': False,
            # "feature_name": X_train.columns.tolist(),
            "random_seed":42
}


In [20]:
## Kfold
for n_fold, (train_idx, valid_idx) in enumerate(skf.split(X_train, y_train)):
    print("Fold %s" % (n_fold))
    train_x, train_y = X_train.iloc[train_idx], y_train.iloc[train_idx]
    valid_x, valid_y = X_train.iloc[valid_idx], y_train.iloc[valid_idx]

    clf = lgb(
        **params_k
    )
    clf.fit(train_x, train_y, verbose= False)
    score = f1_score(valid_y, clf.predict(valid_x),  average='macro')
    print('F1_SCORE: ', round(score, 4), '\n')
    cv_score.append(score)

Fold 0


Please use categorical_feature argument of the Dataset constructor to pass this parameter.


You can set `force_col_wise=true` to remove the overhead.
F1_SCORE:  0.732 

Fold 1
You can set `force_col_wise=true` to remove the overhead.


Please use categorical_feature argument of the Dataset constructor to pass this parameter.


F1_SCORE:  0.7394 

Fold 2


Please use categorical_feature argument of the Dataset constructor to pass this parameter.


You can set `force_col_wise=true` to remove the overhead.
F1_SCORE:  0.7524 

Fold 3
You can set `force_col_wise=true` to remove the overhead.


Please use categorical_feature argument of the Dataset constructor to pass this parameter.


F1_SCORE:  0.731 

Fold 4


Please use categorical_feature argument of the Dataset constructor to pass this parameter.


You can set `force_col_wise=true` to remove the overhead.
F1_SCORE:  0.7484 



In [21]:
print(sum(cv_score)/ 5)
print(std(cv_score))

0.7406467632104027
0.009577205369576682


## Predict

In [22]:
df_test['id_bh']

0         100000688
1         100005503
2         100006307
3         100008625
4         100009238
            ...    
18129    9714758429
18130    9714758435
18131    9715056070
18132    9715637756
18133    9716003602
Name: id_bh, Length: 18134, dtype: int64

In [23]:
id_test = df_test['id_bh']
X_test = df_test.drop(columns = [ 'id_bh']) 
X_test[category_cols] = X_test[category_cols].astype('category')

clf = lgb(**params_k)
clf.fit( X_train, y_train)
preds =  clf.predict(X_test)

predict_df = pd.DataFrame()
predict_df['id_bh'] =id_test
predict_df['label'] = preds

## merge voi label_test
submit_file = concat_data(test_label,predict_df )

Please use categorical_feature argument of the Dataset constructor to pass this parameter.


You can set `force_col_wise=true` to remove the overhead.


In [24]:
submit_file.head()
# test_label.head()

Unnamed: 0,id_bh,label
0,113118886,2
1,2521527855,2
2,2421701111,2
3,2620332660,2
4,2905004143,6


In [25]:
# submit_file.to_csv("data/submmit/baseline_lgb.csv", index=False)

In [26]:
submit_file['label'].value_counts()

2    6526
4    5424
3    2321
6    2059
5    1747
1      35
7      22
Name: label, dtype: int64

## OPTUNA TUNE

In [29]:
# import optuna
# def objective(trial):
#     # Specify a search space using distributions across plausible values of hyperparameters.
#     param = {
#         'boosting_type': 'gbdt',
#         'objective': 'multiclass',
#         'num_class':7,
#         'categorical_feature': category_cols,       
#         "seed": 42,
#         'lambda_l1': trial.suggest_float('lambda_l1', 1e-8, 10.0, log=True),
#         'lambda_l2': trial.suggest_float('lambda_l2', 1e-8, 10.0, log=True),
#         'num_leaves': trial.suggest_int('num_leaves', 2, 256),
#         'feature_fraction': trial.suggest_float('feature_fraction', 0.4, 1.0),
#         'bagging_fraction': trial.suggest_float('bagging_fraction', 0.4, 1.0),
#         'bagging_freq': trial.suggest_int('bagging_freq', 1, 7),
#         'min_child_samples': trial.suggest_int('min_child_samples', 5, 100)
#     }
    
#     # Run LightGBM for the hyperparameter values
#     lgbcv = lgb(**param,
#                 #    categorical_feature=ids_of_categorical,              
                                  
#                   )
#     cv_score = []
#     for n_fold, (train_idx, valid_idx) in enumerate(skf.split(X_train , y_train)):
#         # print("Fold %s" % (n_fold))
#         train_x, train_y = X_train .iloc[train_idx], y_train.iloc[train_idx]
#         valid_x, valid_y = X_train .iloc[valid_idx], y_train.iloc[valid_idx]

#         clf = lgbcv
#         clf.fit(train_x, train_y, verbose= False)
#         score = f1_score(valid_y, clf.predict(valid_x),  average='macro')
#         print('F1_SCORE: ', round(score, 4), '\n')
#         cv_score.append(score)
    
#     return sum(cv_score)/len(cv_score)

# study = optuna.create_study(direction="maximize")
# study.optimize(objective, n_trials=50) 

## BEST SCORE

In [30]:
## cv_best score: 0.7523 
## best param
# {'boosting_type': 'gbdt',
#  'objective': 'multiclass',
#  'num_class': 7,
#  'categorical_feature': ['category_max_fre_company_type',
#   'category_max_fre_out_filled_var_company_type',
#   'category_last_value_company_type',
#   'category_max_fre_id_management',
#   'category_max_fre_out_filled_var_id_management',
#   'category_last_value_id_management',
#   'category_max_fre_id_office',
#   'category_max_fre_out_filled_var_id_office',
#   'category_last_value_id_office',
#   'category_max_fre_job_role_encode_knn',
#   'category_max_fre_out_filled_var_job_role_encode_knn',
#   'category_last_value_job_role_encode_knn',
#   'category_max_fre_new_work_province_id',
#   'category_max_fre_out_filled_var_new_work_province_id',
#   'category_last_value_new_work_province_id',
#   'gender',
#   'new_province_id'],
#  'seed': 42,
#  'lambda_l1': 0.0009552784116637494,
#  'lambda_l2': 0.9252524868601694,
#  'num_leaves': 194,
#  'feature_fraction': 0.6686224568825423,
#  'bagging_fraction': 0.9710918845473522,
#  'bagging_freq': 6,
#  'min_child_samples': 83}

## update best param
# best_params = {
#         'boosting_type': 'gbdt',
#         'objective': 'multiclass',
#         'num_class':7,
#         'categorical_feature': category_cols,       
#         "seed": 42,
# } 
# best_params.update(study.best_params)
# best_params

In [None]:
## submmit
# id_test = df_test['id_bh']
# X_test = df_test.drop(columns = [ 'id_bh']) 
# X_test[category_cols] = X_test[category_cols].astype('category')

# clf = lgb(**best_params)
# clf.fit( X_train, y_train)
# preds =  clf.predict(X_test)

# predict_df = pd.DataFrame()
# predict_df['id_bh'] =id_test
# predict_df['label'] = preds

# ## merge voi label_test
# submit_file = concat_data(test_label,predict_df )
# submit_file.to_csv("data/submmit/baseline_lgb.csv", index=False)