In [1]:
import os
train_path = '../../raw_train_artifact'
test_path = '../../raw_test_artifact'
input_path = '../../input_artifact'
model_path = '../../model_artifact'
if not os.path.isdir(model_path): os.mkdir(model_path)

In [2]:
import sys
import gc
import time
import numpy as np

import pandas as pd
pd.set_option('display.max_columns',120)
pd.set_option('display.max_rows',2000)
pd.set_option('precision',5)
pd.set_option('float_format', '{:.5f}'.format)

import tqdm
import joblib
import json

from lightgbm import LGBMClassifier
from sklearn.model_selection import KFold
from sklearn.metrics import accuracy_score, roc_auc_score

In [3]:
import logging

log_path = 'LightGBM with Avg-pooled Creative Embedding.log'
if os.path.isfile(log_path): os.remove(log_path)
    
logger = logging.getLogger(__name__)
logger.setLevel(logging.INFO)

formatter = logging.Formatter('%(asctime)s %(levelname)-s: %(message)s', datefmt='%H:%M:%S')

fh = logging.FileHandler(log_path)
fh.setLevel(logging.INFO)
fh.setFormatter(formatter)
logger.addHandler(fh)

sh = logging.StreamHandler(sys.stdout)
sh.setLevel(logging.INFO)
sh.setFormatter(formatter)
logger.addHandler(sh)

## Load Data

In [4]:
age_path = os.path.join(input_path, 'tag_age.npy')
with open(age_path, 'rb') as f:
    tag_age = np.load(f)

In [5]:
gender_path = os.path.join(input_path, 'tag_gender.npy')
with open(gender_path, 'rb') as f:
    tag_gender = np.load(f)

In [6]:
avg_pool_file_path = os.path.join(input_path, 'input_creative_avg_pool.npy')
with open(avg_pool_file_path, 'rb') as f:
    inp = np.load(f)

## Age Model

In [7]:
N_FOLD = 5

folds = KFold(n_splits=N_FOLD, shuffle=True, random_state=1898)
oof_pred = np.zeros(inp.shape[0])
gc.enable()

for n_iter, (train_idx, test_idx) in enumerate(folds.split(inp)):
    x_train, y_train = inp[train_idx,:], tag_age[train_idx]
    x_test, y_test = inp[test_idx,:], tag_age[test_idx]
    
    logger.info(f'Fold {n_iter+1}/{N_FOLD}')
    start = time.time()
    
    clf = LGBMClassifier(
        n_estimators=5000,
        learning_rate=0.01,
        num_leaves=128,
        colsample_bytree=.8,
        subsample=.9,
        max_depth=15,
        reg_alpha=.1,
        reg_lambda=.1,
        min_split_gain=.01,
        min_child_weight=2,
        random_state=1898,
        objective='multiclass'
    )
    
    clf.fit(x_train, y_train, 
            eval_set= [(x_train, y_train), (x_test, y_test)], 
            eval_metric='multi_logloss', verbose=250, early_stopping_rounds=150
    )
    
    logger.info(f'Model training is done after {time.time()-start:.2f}s')
    
    oof_pred[test_idx] = clf.predict(x_test, num_iteration=clf.best_iteration_)
    cur_accuracy = accuracy_score(y_test, oof_pred[test_idx])
    logger.info(f'OOF Accuracy: {cur_accuracy:.6f}')
    
    gbm_artifact = f'GBM_Creative_Age_{n_iter+1}.pkl'
    joblib.dump(clf, os.path.join(model_path, gbm_artifact))
    
    del clf, x_train, y_train, x_test, y_test
    gc.collect()
    
full_accuracy = accuracy_score(tag_age, oof_pred)

logger.info(f'Total OOF Accuracy: {full_accuracy:.6f}')

23:50:15 INFO: Fold 1/5
Training until validation scores don't improve for 150 rounds
[250]	training's multi_logloss: 1.66998	valid_1's multi_logloss: 1.71677
[500]	training's multi_logloss: 1.55583	valid_1's multi_logloss: 1.63458
[750]	training's multi_logloss: 1.48901	valid_1's multi_logloss: 1.59768
[1000]	training's multi_logloss: 1.43936	valid_1's multi_logloss: 1.57686
[1250]	training's multi_logloss: 1.39826	valid_1's multi_logloss: 1.56351
[1500]	training's multi_logloss: 1.36224	valid_1's multi_logloss: 1.55424
[1750]	training's multi_logloss: 1.32978	valid_1's multi_logloss: 1.54742
[2000]	training's multi_logloss: 1.29999	valid_1's multi_logloss: 1.54223
[2250]	training's multi_logloss: 1.27246	valid_1's multi_logloss: 1.53832
[2500]	training's multi_logloss: 1.24686	valid_1's multi_logloss: 1.53534
[2750]	training's multi_logloss: 1.22308	valid_1's multi_logloss: 1.5332
[3000]	training's multi_logloss: 1.20088	valid_1's multi_logloss: 1.53158
[3250]	training's multi_loglos

## Gender Model

In [8]:
N_FOLD = 5

folds = KFold(n_splits=N_FOLD, shuffle=True, random_state=1898)
oof_pred = np.zeros(inp.shape[0])
oof_pred_prob = np.zeros(inp.shape[0])

gc.enable()

for n_iter, (train_idx, test_idx) in enumerate(folds.split(inp)):
    x_train, y_train = inp[train_idx,:], tag_gender[train_idx]
    x_test, y_test = inp[test_idx,:], tag_gender[test_idx]
    
    logger.info(f'Fold {n_iter+1}/{N_FOLD}')
    start = time.time()
    
    clf = LGBMClassifier(
        n_estimators=5000,
        learning_rate=0.01,
        num_leaves=128,
        colsample_bytree=.8,
        subsample=.9,
        max_depth=15,
        reg_alpha=.1,
        reg_lambda=.1,
        min_split_gain=.01,
        min_child_weight=2,
        random_state=1898
    )
    
    clf.fit(x_train, y_train, 
            eval_set= [(x_train, y_train), (x_test, y_test)], 
            eval_metric='auc', verbose=250, early_stopping_rounds=150
    )
    
    logger.info(f'Model training is done after {time.time()-start:.2f}s')
    
    oof_pred[test_idx] = clf.predict(x_test, num_iteration=clf.best_iteration_)
    oof_pred_prob[test_idx] = clf.predict_proba(x_test, num_iteration=clf.best_iteration_)[:,1]
    cur_accuracy = accuracy_score(y_test, oof_pred[test_idx])
    cur_auc = roc_auc_score(y_test, oof_pred_prob[test_idx])
    logger.info(f'OOF Accuracy: {cur_accuracy:.6f}, AUC: {cur_auc:.6f}')
    
    gbm_artifact = f'GBM_Creative_Gender_{n_iter+1}.pkl'
    joblib.dump(clf, os.path.join(model_path, gbm_artifact))
    
    del clf, x_train, y_train, x_test, y_test
    gc.collect()
    
full_accuracy = accuracy_score(tag_gender, oof_pred)
full_auc = roc_auc_score(tag_gender, oof_pred_prob)

logger.info(f'Total OOF Accuracy: {full_accuracy:.6f}, AUC: {full_auc:.6f}')

21:43:29 INFO: Fold 1/5
Training until validation scores don't improve for 150 rounds
[250]	training's auc: 0.955673	training's binary_logloss: 0.275063	valid_1's auc: 0.952709	valid_1's binary_logloss: 0.281623
[500]	training's auc: 0.962635	training's binary_logloss: 0.233767	valid_1's auc: 0.958428	valid_1's binary_logloss: 0.245256
[750]	training's auc: 0.966587	training's binary_logloss: 0.218171	valid_1's auc: 0.961024	valid_1's binary_logloss: 0.234585
[1000]	training's auc: 0.969581	training's binary_logloss: 0.207826	valid_1's auc: 0.962565	valid_1's binary_logloss: 0.229153
[1250]	training's auc: 0.972097	training's binary_logloss: 0.199549	valid_1's auc: 0.96355	valid_1's binary_logloss: 0.22581
[1500]	training's auc: 0.974255	training's binary_logloss: 0.192463	valid_1's auc: 0.964253	valid_1's binary_logloss: 0.223412
[1750]	training's auc: 0.976171	training's binary_logloss: 0.186147	valid_1's auc: 0.964757	valid_1's binary_logloss: 0.221657
[2000]	training's auc: 0.97783