In [None]:
#!/usr/bin/env python
# coding: utf-8
get_ipython().run_line_magic('matplotlib', 'inline')
import pandas as pd
import time
import numpy as np
from datetime import datetime
from sklearn.externals import joblib 
import os
from konlpy.tag import Mecab
import lightgbm as lgb
print(lgb.__version__)

from sklearn import metrics

In [None]:
# Show the working directory so the relative input paths below can be checked.
print(os.getcwd())

base_path = '.'

# Read the labelled training set and the public test set; the first CSV
# column of each file becomes the frame index.
train_csv = os.path.join(base_path, 'input/train.csv')
test_csv = os.path.join(base_path, 'input/public_test.csv')
df_train = pd.read_csv(train_csv, index_col=0)
df_test = pd.read_csv(test_csv, index_col=0)

# Test rows get the placeholder label -1 so both sets share the same columns.
df_test['smishing'] = -1

# Stack train rows first, test rows second.
df_fea = pd.concat([df_train, df_test])
df_fea.shape

### Mecab

In [None]:
# mecab = Mecab()
# # df_space['morphs'] = df_space['spacing'].apply(lambda x: mecab.morphs(x))
# df_fea['nouns'] = df_fea['text'].apply(lambda x: mecab.nouns(x))

# df_fea['nouns_str'] = df_fea['nouns'].apply(lambda x: ' '.join(x))

In [None]:
# df_fea.to_pickle('df_fea.pkl')

In [None]:
# Replace df_fea with the cached feature frame stored in 'df_fea.pkl'.
# NOTE(review): the cells that would build this file (Mecab noun extraction
# and to_pickle) are commented out, so the pickle must already exist on disk;
# it is presumably the source of the 'nouns_str' column used by the
# vectorizers below — confirm.
# pd.read_pickle can execute arbitrary code: only load files you trust.
df_fea = pd.read_pickle('df_fea.pkl')

### Count

In [None]:
from sklearn.feature_extraction.text import CountVectorizer

# Bag-of-words counts over the space-joined nouns. Terms must occur in at
# least 50 documents and in at most 25% of the documents used for fitting.
vectorizer = CountVectorizer(max_df=0.25, min_df=50)

# The vocabulary is learned from smishing == 1 rows only, then applied to
# every row (train and test).
vectorizer = vectorizer.fit(df_fea[df_fea['smishing']==1]['nouns_str'].values)
cnt_vec = vectorizer.transform(df_fea['nouns_str'].values).toarray()

# (term, column index) pairs ordered by column index. Reading vocabulary_
# directly avoids get_feature_names(), which newer scikit-learn releases
# no longer provide.
cnt_vocab_by_index = sorted(vectorizer.vocabulary_.items(), key=lambda item: item[1])

# Short column name -> descriptive name that also carries the term.
cnt_dict = {'cnt_{0:03d}'.format(idx): 'cnt_{0:03d}_{1}'.format(idx, term)
            for term, idx in cnt_vocab_by_index}

# Column names must follow the matrix column order. Sorting the names as
# strings breaks that order once there are 1000+ features
# ('cnt_1000' < 'cnt_101'), so build the list from the indices instead.
cnt_cols = ['cnt_{0:03d}'.format(idx) for _, idx in cnt_vocab_by_index]

# uint8 keeps the dense matrix small; counts above 255 do not fit this dtype.
df_cnt_vec = pd.DataFrame(cnt_vec, columns=cnt_cols, dtype=np.uint8)
df_cnt_vec.shape

In [None]:
# df_cnt_vec = df_cnt_vec.astype(np.int32)

# vectorizer2 = CountVectorizer(analyzer='word', ngram_range=(2, 2))
# X2 = vectorizer2.fit_transform(corpus)
# print(vectorizer2.get_feature_names())

### tfidf

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer

tfidf_size = None
# Candidate stop words; all of them are currently disabled, so the list is empty.
stop_words = [
#    '은행',   # bank
#     '고객',  # customer
#     '가능',  # possible / available
#     '전화',  # phone call
#     '기간',  # period
#     '대출',  # loan
#     '금리',  # interest rate
#     '상담',  # consultation
#     '광고',  # advertisement
#     '상품',  # product
#     '센터',  # center
]
# Fix: the keyword was misspelled as `mid_df`, which TfidfVectorizer rejects
# with a TypeError. `min_df=200` keeps terms found in at least 200 documents.
vectorizer = TfidfVectorizer(max_features=tfidf_size,
                             stop_words=stop_words,
                             min_df=200)

# The vocabulary is learned from smishing == 1 rows only, then applied to
# every row (train and test).
vectorizer = vectorizer.fit(df_fea[df_fea['smishing']==1]['nouns_str'].values)

tfidf = vectorizer.transform(df_fea['nouns_str'].values).toarray()

# (term, column index) pairs ordered by column index.
tfidf_vocab_by_index = sorted(vectorizer.vocabulary_.items(), key=lambda item: item[1])

# Short column name -> descriptive name that also carries the term.
tfidf_dict = {'tfidf_{0:03d}'.format(v): 'tfidf_{0:03d}_{1}'.format(v, k)
              for k, v in tfidf_vocab_by_index}

# Column names must follow the matrix column order. Sorting the names as
# strings breaks that order once there are 1000+ features, so build the
# list from the indices instead.
tfidf_cols = ['tfidf_{0:03d}'.format(v) for _, v in tfidf_vocab_by_index]

df_tfidf = pd.DataFrame(tfidf, columns=tfidf_cols)

### Merged

In [None]:
# Report the shapes of the three pieces before they are joined.
print(df_fea.shape, df_cnt_vec.shape, df_tfidf.shape)

# Join column-wise. reset_index() moves the current index of df_fea into a
# regular column and gives it a fresh 0..n-1 index, like the two feature frames.
merge_parts = [
    df_fea.reset_index(),
    df_cnt_vec[cnt_cols],
    df_tfidf[tfidf_cols],
]
df_merged = pd.concat(merge_parts, axis=1)
print(df_merged.shape)

print(df_merged.info())
df_merged.head(10)

In [None]:
# df_merged.to_pickle('df_merged.pkl')

In [None]:
# Number of labelled rows; later cells slice df_merged at this position.
train_size = len(df_train)
print(train_size)

# No categorical features are used at the moment.
cat_cols = []

# Model inputs: count features, then TF-IDF features, then categoricals.
# (Using only one of the two text feature groups is a possible alternative.)
fea_cols = [*cnt_cols, *tfidf_cols, *cat_cols]
len(fea_cols)

### Model

In [None]:
def eval_summary(y_true, y_score, cut_off=0.5):
    """Summarise binary classification quality for a vector of scores.

    Args:
        y_true: True labels (0 = negative, 1 = positive).
        y_score: Predicted scores, one per label.
        cut_off: Scores strictly greater than this value count as class 1,
            all other scores as class 0.

    Returns:
        dict with the keys
        'auc'              - area under the ROC curve of ``y_score``,
        'confusion_matrix' - 2x2 matrix for labels [0, 1] at ``cut_off``,
        'precision'        - precision of class 1 at ``cut_off``,
        'recall'           - recall of class 1 at ``cut_off``.
    """
    y_score = np.asarray(y_score)

    # One comparison instead of two in-place assignments: with the two-step
    # version a cut_off >= 1 turned freshly assigned 1s back into 0s.
    y_pred = (y_score > cut_off).astype(int)

    # Pin the label order so index 1 always refers to the positive class,
    # even when only one class shows up in y_true / y_pred.
    labels = [0, 1]

    fpr, tpr, _ = metrics.roc_curve(y_true, y_score, pos_label=1)
    pre, rec, _, _ = metrics.precision_recall_fscore_support(y_true, y_pred, labels=labels)

    return {
        'auc': metrics.auc(fpr, tpr),
        'confusion_matrix': metrics.confusion_matrix(y_true, y_pred, labels=labels),
        'precision': pre[1],
        'recall': rec[1],
    }

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.model_selection import StratifiedKFold

# Timestamp of this run; used as the prefix of the output file names.
lgb_ts = datetime.now().strftime('%Y%m%dT%H%M%S')

initscore_filename = ''
# LightGBM training parameters.
params = {
    'boosting':'gbdt',
#     'boosting':'dart',
    'num_leaves': 15, 
    'objective': 'binary',
    'metric':'auc',
    'num_threads': -1,
    'learning_rate': 0.01,
#     'is_unbalance': True,
    'scale_pos_weight':20,
    'bagging_fraction':0.3,
    'bagging_freq':10,
    'feature_fraction':0.4,
    'initscore_filename':initscore_filename,
#     'lambda_l1':200,
#     'lambda_l2':2000,
    
    # NOTE(review): needs a GPU-enabled LightGBM build; switch to 'cpu' otherwise.
    'device_type':'gpu',
    
    
}

# Parameters passed to lgb.Dataset / Dataset.subset.
data_params = {
    'max_bin':64,
    'enable_bundle': False,
    
}
# Upper bound on boosting rounds.
num_round = 5000

# Test rows are everything after the first train_size rows of df_merged.
# A positional slice plus an explicit copy is used so that the prediction
# columns added later are written to an independent frame, not to a slice
# of df_merged. (The old in-place column drop on df_test was dead code:
# df_test is rebound here.)
df_test = df_merged.iloc[train_size:].copy()

import gc
gc.collect()

In [None]:
# Nested cross-validation:
#   outer loop (11 folds): split the training rows into a model set and a
#                          hidden (held-out) set;
#   inner loop (10 folds): train one LightGBM booster per fold of the model
#                          set and store its predictions for the model set,
#                          the hidden set and the test set.
#
# shuffle=True is needed for random_state to have any effect; without it
# older scikit-learn ignores the seed and scikit-learn >= 0.24 raises a
# ValueError.
skf_g = StratifiedKFold(n_splits=11, shuffle=True, random_state=1984)
cnt_g = 0

# Descriptive feature names (index + term) for the importance print-outs.
# They do not depend on the fold, so compute them once.
renamed_cols = [cnt_dict.get(c, c) for c in fea_cols]
renamed_cols = [tfidf_dict.get(c, c) for c in renamed_cols]

for model_index, hidden_index in skf_g.split(range(train_size), df_train['smishing'].values):
    cnt_g = cnt_g + 1

    # Explicit copies: prediction columns are added to these frames below and
    # must not be written to a slice of df_merged.
    df_model = df_merged.iloc[model_index].copy()
    df_hidden = df_merged.iloc[hidden_index].copy()
    print('model_set\n', df_model['smishing'].value_counts())
    print('hidden_set\n', df_hidden['smishing'].value_counts())

    # Remove any per-fold prediction columns ('smishing_*') carried over
    # from the source frame.
    for fold_frame in (df_model, df_hidden):
        stale_cols = [c for c in fold_frame.columns if 'smishing_' in c]
        if stale_cols:
            fold_frame.drop(columns=stale_cols, inplace=True)

    X = df_model[fea_cols].values
    y = df_model['smishing'].values

    X_hidden = df_hidden[fea_cols].values
    y_hidden = df_hidden['smishing'].values

    skf = StratifiedKFold(n_splits=10, shuffle=True, random_state=8405)
    cnt = 0
    # free_raw_data=False keeps the raw matrix so subsets can be built per fold.
    train_data = lgb.Dataset(X, label=y, feature_name=fea_cols, categorical_feature=cat_cols, 
                             free_raw_data=False, params=data_params)

    for train_index, valid_index in skf.split(X, y):
        cnt = cnt + 1
        pred_col = 'smishing_{}_{}'.format(cnt_g, cnt)
        print('\n', cnt_g, '*' * 20, cnt, '*' * 20)

        train_set = train_data.subset(train_index, params=data_params).construct()
        valid_set = train_data.subset(valid_index, params=data_params).construct()
        print('train_set', np.unique(train_set.get_label(), return_counts=True))
        print('valid_set', np.unique(valid_set.get_label(), return_counts=True))

        # NOTE(review): early_stopping_rounds / verbose_eval are keyword
        # arguments of older LightGBM releases; LightGBM 4.x expects the
        # lgb.early_stopping / lgb.log_evaluation callbacks instead.
        bst = lgb.train(params, train_set, num_round, categorical_feature=cat_cols,
                        early_stopping_rounds=300, 
                        valid_sets=[train_set, valid_set],
                        verbose_eval=200,
                       )

        # Top-5 features by number of splits and by total gain.
        impt_dict = dict(zip(renamed_cols, bst.feature_importance(importance_type='split')))
        print('split:', sorted(impt_dict.items(), key=(lambda x:x[1]), reverse=True)[:5])

        impt_dict = dict(zip(renamed_cols, bst.feature_importance(importance_type='gain')))
        print('gain:', sorted(impt_dict.items(), key=(lambda x:x[1]), reverse=True)[:5])

        # Store this booster's scores under a column unique to (outer, inner) fold.
        df_model[pred_col] = bst.predict(X)
        df_hidden[pred_col] = bst.predict(X_hidden)
        df_test[pred_col] = bst.predict(df_test[fea_cols].values)

        print('model\n', eval_summary(y, df_model[pred_col].values, cut_off=0.5))
        print('hidden\n', eval_summary(y_hidden, df_hidden[pred_col].values, cut_off=0.5))

    # Summarise the outer fold: average the inner-fold predictions on the
    # hidden set and report the AUC of that average.
    y = df_hidden['smishing'].values
    pred = df_hidden[[c for c in df_hidden.columns if 'smishing_' in c]].mean(axis=1)

    fpr, tpr, thresholds = metrics.roc_curve(y, pred, pos_label=1)
    print('\n', '#' * 10, cnt_g, 'auc:', metrics.auc(fpr, tpr))

In [None]:
# Columns holding one prediction per trained booster.
pred_cols = [col for col in df_test.columns if 'smishing_' in col]
print(len(pred_cols))

# Row-wise summary statistics across all booster predictions.
fold_preds = df_test[pred_cols]
df_test['pred_max'] = fold_preds.max(axis=1)
df_test['pred_min'] = fold_preds.min(axis=1)
df_test['pred_mean'] = fold_preds.mean(axis=1)
df_test['pred_std'] = fold_preds.std(axis=1)

In [None]:
# Largest and smallest row-wise standard deviation of the booster predictions.
df_test['pred_std'].max(), df_test['pred_std'].min()

In [None]:
# Histogram (100 bins) of the row-wise mean prediction on the test rows.
df_test['pred_mean'].hist(bins=100)

In [None]:
# Display the run timestamp; it is the file name prefix of the CSV outputs.
lgb_ts

In [None]:
# The averaged booster prediction becomes the submitted score.
df_test['smishing'] = df_test['pred_mean']

# Submission file: id and score only.
submission_path = '{}.csv'.format(lgb_ts)
df_test[['id', 'smishing']].to_csv(submission_path, index=False)

# Companion file for manual inspection: same scores plus the message text,
# highest score first.
ranked_with_text = df_test[['id', 'smishing', 'text']].sort_values('smishing', ascending=False)
ranked_with_text.to_csv('{}_text.csv'.format(lgb_ts), index=False)

In [None]:
# eval_hist = lgb.cv(params, train_data, num_boost_round=num_round, nfold=20, stratified=True, shuffle=True, 
#        metrics=None, fobj=None, feval=None, init_model=None, feature_name='auto', categorical_feature='auto',
#        early_stopping_rounds=500, fpreproc=None, verbose_eval=100, show_stdv=True, seed=0, callbacks=None,
#        eval_train_metric=False)


In [None]:
# for train_index, valid_index in skf.split(X, y):
#     cnt = cnt + 1
#     print('\n', '*' * 20, cnt, '*' * 20)
    
#     # init_bst_name = 'model.txt'
#     # init_model = joblib.load(init_bst_name)
#     # bst = None
#     # init_model = bst
#     train_set = train_data.subset(train_index).construct()
#     valid_set = train_data.subset(valid_index).construct()
#     print('train_set', np.unique(train_set.get_label(), return_counts=True))
#     print('valid_set', np.unique(valid_set.get_label(), return_counts=True))
#     bst = lgb.train(param, train_set, num_round, categorical_feature=cat_cols,
#                     early_stopping_rounds=200, 
#                     valid_sets=[train_set, valid_set],
#                     verbose_eval=200,
#                     # init_model=init_model,
#                    )
#     impt_dict = {k:v for k, v in zip(fea_cols, bst.feature_importance())}
#     del_fea_cols = []
#     print(sorted(impt_dict.items(), key=(lambda x:x[1]), reverse=True)[:5])
# #     for r in sorted(impt_dict.items(), key=(lambda x:x[1]), reverse=True):
# #         print(r)
# #         if r[1] < 1:
# #             del_fea_cols.append(r[0])
# #     print(del_fea_cols)
# #     print(len(del_fea_cols))
# #     lgb.plot_importance(bst, height=0.3, figsize=(20, 100), max_num_features=100)
#     df_model['smishing_{}'.format(cnt)] = bst.predict(X)        
#     df_hidden['smishing_{}'.format(cnt)] = bst.predict(X_hidden)        
#     df_test['smishing_{}'.format(cnt)] = bst.predict(df_test[fea_cols].values)    