In [None]:
#!/usr/bin/env python
# coding: utf-8
get_ipython().run_line_magic('matplotlib', 'inline')
import pandas as pd
import time
import numpy as np
from datetime import datetime
from sklearn.externals import joblib 
import os
from konlpy.tag import Mecab
import lightgbm as lgb
print(lgb.__version__)

from sklearn import metrics

In [None]:
# Show the working directory; the data and model paths below are resolved relative to it (base_path = '.').
os.getcwd()

In [None]:
# Root directory against which the input CSVs and the saved model are resolved.
base_path = '.'

# Both CSVs are read with their first column as the index.
train_csv = os.path.join(base_path, 'input/train.csv')
test_csv = os.path.join(base_path, 'input/public_test.csv')
df_train = pd.read_csv(train_csv, index_col=0)
df_test = pd.read_csv(test_csv, index_col=0)
# Placeholder label for the test rows (the train frame carries the real 'smishing' label).
df_test['smishing'] = -1

# Stack train on top of test so that features are computed over both at once.
df_fea = pd.concat([df_train, df_test])
df_fea.shape

### Mecab

In [None]:
# Run every message through the Mecab tagger and keep the nouns it returns.
# (Morpheme-level tokens via mecab.morphs were tried earlier and left disabled.)
mecab = Mecab()
df_fea['nouns'] = df_fea['text'].apply(mecab.nouns)

### W2V

In [None]:
# from gensim.models import Word2Vec
# w2v_size = 100
# w2v_model = Word2Vec(df_fea['nouns'].values, 
#                            size=w2v_size, 
#                            window = 5, 
#                            min_count=25, 
#                            workers=16, 
#                            iter=100, 
#                            sg=1)
# ts = datetime.now().strftime('%Y%m%dT%H%M%S')
# file_name = 'model/gensim_w2v_{}'.format(ts)
# print(file_name)
# joblib.dump(w2v_model, '{}.pkl'.format(file_name))
# w2v_model.save('{}.model'.format(file_name))
# w2v_model

In [None]:
# Timestamp identifying the pre-trained Word2Vec artefact to load.
ts = '20191214T055747'
file_name = 'model/gensim_{}'.format(ts)

# NOTE(review): the commented-out training cell saves under 'model/gensim_w2v_<ts>',
# while this cell loads 'model/gensim_<ts>' -- confirm the file name on disk.
# NOTE(review): joblib.load unpickles the file, which can execute arbitrary code;
# only load model files that come from a trusted source.
w2v_path = os.path.join(base_path, '{}.pkl'.format(file_name))
w2v_model = joblib.load(w2v_path)
# Embedding dimensionality = number of columns of the model's vector matrix.
w2v_size = w2v_model.wv.vectors.shape[1]

In [None]:
def mean_w2v(row):
    """Return the averaged Word2Vec embedding for the nouns of one row.

    The vectors of all nouns found in the model vocabulary are summed, and the
    sum is divided by the total number of nouns in the row. Out-of-vocabulary
    nouns add nothing to the sum but still count in the divisor. A row without
    nouns yields an all-zero vector.

    Relies on the module-level ``w2v_model`` and ``w2v_size``.

    Args:
        row: mapping-like object whose ``'nouns'`` entry is a sequence of tokens.

    Returns:
        A numpy array of length ``w2v_size``.
    """
    nouns = row['nouns']
    total = np.zeros(w2v_size)

    # Lazily look up vectors for in-vocabulary nouns, in their original order.
    known_vectors = (
        w2v_model.wv[noun] for noun in nouns if noun in w2v_model.wv.vocab
    )
    for vector in known_vectors:
        total = total + vector

    if len(nouns) == 0:
        return total
    return total / len(nouns)

# One output column per embedding dimension.
w2v_cols = list(map('w2v_{}'.format, range(w2v_size)))

# Compute the averaged embedding row by row and expand each vector into
# separate columns before attaching them to the feature frame.
w2v_features = df_fea.apply(mean_w2v, axis=1, result_type='expand')
df_fea[w2v_cols] = w2v_features

In [None]:
# Preview the first rows of the feature frame after the w2v_* columns were added.
df_fea.head(10)

### tfidf

In [None]:
# Number of TF-IDF features (passed as max_features to the vectorizer) and the
# matching column names tfidf_0 .. tfidf_<size-1>.
tfidf_size = 100
tfidf_cols = list(map('tfidf_{}'.format, range(tfidf_size)))

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer
# TF-IDF over the space-joined noun tokens, with the vocabulary capped at tfidf_size.
vectorizer = TfidfVectorizer(max_features=tfidf_size)

noun_docs = df_fea['nouns'].apply(' '.join)
tfidf_matrix = vectorizer.fit_transform(noun_docs)
# The dense matrix is wrapped without an explicit index, so df_tfidf gets a
# default 0..n-1 index rather than df_fea's index.
df_tfidf = pd.DataFrame(tfidf_matrix.toarray(), columns=tfidf_cols)

### Merged

In [None]:
# Sanity check before merging: both frames should have the same number of rows.
df_fea.shape, df_tfidf.shape

In [None]:
# Column-wise join of the base/W2V features with the TF-IDF block.
# reset_index() moves df_fea's index into a regular column and gives the frame a
# fresh positional index, which is what the concat aligns on.
fea_reset = df_fea.reset_index()
tfidf_block = df_tfidf[tfidf_cols]
df_merged = pd.concat([fea_reset, tfidf_block], axis=1)
df_merged.shape

In [None]:
# Preview the merged feature frame.
df_merged.head(10)

In [None]:
# df_merged.to_csv('data_{}.csv'.format(ts))

In [None]:
# No categorical features are used; the list is kept so it can be passed to LightGBM.
cat_cols = []
# Model inputs: W2V dimensions first, then TF-IDF terms, then categoricals.
fea_cols = [*w2v_cols, *tfidf_cols, *cat_cols]

fea_cols

In [None]:
# Number of training rows. df_fea was built as train followed by test, so the
# first train_size rows of the merged frame are the training rows.
train_size = df_train.shape[0]
print(train_size)

### Model

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.model_selection import StratifiedKFold

# Timestamp of this run; used later to name the output CSV files.
lgb_ts = datetime.now().strftime('%Y%m%dT%H%M%S')

initscore_filename = ''
# LightGBM parameters shared by every model trained below.
params = {
    'boosting':'gbdt',
#     'boosting':'dart',
    'num_leaves': 15,
    'objective': 'binary',
    'metric':'auc',
    'num_threads': 16,
    'learning_rate': 0.01,
    'is_unbalance': True,
    'bagging_fraction':0.15,
    'bagging_freq':20,
    'feature_fraction':0.1,
    'initscore_filename':initscore_filename,
#     'lambda_l1':200,
#     'lambda_l2':2000,
    'device_type':'gpu',  # NOTE(review): presumably needs a GPU-enabled LightGBM build -- confirm
}
# Upper bound on boosting rounds per model (training also uses early stopping).
num_round = 10000

# Test rows are everything from position train_size onwards in the merged frame.
# Take an explicit copy: later cells add prediction columns to df_test, and
# writing into a slice of df_merged triggers pandas' SettingWithCopyWarning.
# (Rebuilding df_test here also makes any cleanup of the previous df_test
# object unnecessary, so re-running this cell starts from a clean frame.)
df_test = df_merged.loc[train_size:].copy()

In [None]:
# model_index, hidden_index = train_test_split(range(train_size),     
#     test_size=0.1, random_state=1984)

# Nested cross-validation.
# Outer loop: 11 stratified folds over the training rows; the held-out part of
# each fold (the "hidden" set) is only scored, never trained on.
# Inner loop: 10 stratified folds over the remaining rows, one LightGBM model
# per fold with early stopping. Every model also scores the test rows; those
# scores are stored in df_test as 'smishing_<outer>_<inner>' columns.
skf_g = StratifiedKFold(n_splits=11)
cnt_g = 0

for model_index, hidden_index in skf_g.split(range(train_size), df_train['smishing'].values):
    cnt_g = cnt_g + 1
    # Explicit copies so the per-model prediction columns added below are
    # written to independent frames rather than to slices of df_merged.
    df_model = df_merged.iloc[model_index].copy()
    df_hidden = df_merged.iloc[hidden_index].copy()
    print('model_set\n', df_model['smishing'].value_counts())
    print('hidden_set\n', df_hidden['smishing'].value_counts())

    # Remove any pre-existing per-model prediction columns so that only this
    # outer fold's predictions are averaged when the hidden set is scored.
    stale_cols = [c for c in df_model.columns if 'smishing_' in c]
    df_model.drop(columns=stale_cols, inplace=True)
    df_hidden.drop(columns=stale_cols, inplace=True)

    X = df_model[fea_cols].values
    y = df_model['smishing'].values

    X_hidden = df_hidden[fea_cols].values
    y_hidden = df_hidden['smishing'].values

    skf = StratifiedKFold(n_splits=10)
    cnt = 0
    # free_raw_data=False keeps the raw matrix so subsets can be built per inner fold.
    train_data = lgb.Dataset(X, label=y, feature_name=fea_cols, categorical_feature=cat_cols, free_raw_data=False)

    for train_index, valid_index in skf.split(X, y):
        cnt = cnt + 1
        print('\n', cnt_g, '*' * 20, cnt, '*' * 20)

        train_set = train_data.subset(train_index).construct()
        valid_set = train_data.subset(valid_index).construct()
        print('train_set', np.unique(train_set.get_label(), return_counts=True))
        print('valid_set', np.unique(valid_set.get_label(), return_counts=True))
        bst = lgb.train(params, train_set, num_round, categorical_feature=cat_cols,
                        early_stopping_rounds=500,
                        valid_sets=[train_set, valid_set],
                        verbose_eval=200,
                        # init_model=init_model,
                       )
        # Show the five most important features of this model.
        impt_dict = dict(zip(fea_cols, bst.feature_importance()))
        print(sorted(impt_dict.items(), key=(lambda x:x[1]), reverse=True)[:5])

        # One prediction column per (outer fold, inner fold) model.
        pred_col = 'smishing_{}_{}'.format(cnt_g, cnt)
        df_model[pred_col] = bst.predict(X)
        df_hidden[pred_col] = bst.predict(X_hidden)
        df_test[pred_col] = bst.predict(df_test[fea_cols].values)

    # Summarise predictions: average this outer fold's inner models on the
    # hidden set and report the resulting AUC.
    hidden_pred_cols = [c for c in df_hidden.columns if 'smishing_' in c]
    pred = df_hidden[hidden_pred_cols].mean(axis=1)

    fpr, tpr, thresholds = metrics.roc_curve(y_hidden, pred, pos_label=1)
    print('\n', '#' * 10, cnt_g, 'auc:', metrics.auc(fpr, tpr))

In [None]:
# Collect the per-model prediction columns and summarise them row by row.
pred_cols = [c for c in df_test.columns if 'smishing_' in c]
print(len(pred_cols))

model_preds = df_test[pred_cols]
df_test['pred_max'] = model_preds.max(axis=1)
df_test['pred_min'] = model_preds.min(axis=1)
df_test['pred_mean'] = model_preds.mean(axis=1)
df_test['pred_std'] = model_preds.std(axis=1)

In [None]:
# Largest per-row standard deviation across the models' predictions.
df_test['pred_std'].max()

In [None]:
# Smallest per-row standard deviation across the models' predictions.
df_test['pred_std'].min()

In [None]:
# Histogram (100 bins) of the averaged test predictions.
df_test['pred_mean'].hist(bins=100)

In [None]:
# Dump the test messages ranked by their averaged prediction, highest first.
# An empty path cannot be opened for writing, so the file is named after this
# run's timestamp, like the other outputs of this notebook.
ranked_path = '{}_pred_mean.csv'.format(lgb_ts)
df_test.sort_values('pred_mean', ascending=False)[['text', 'pred_mean']].to_csv(ranked_path)

In [None]:
# Show the run timestamp that names the output files written below.
lgb_ts

In [None]:
# The submitted score is the mean prediction over all trained models.
df_test['smishing'] = df_test['pred_mean']

# Submission file: id and score only.
submission = df_test[['id', 'smishing']]
submission.to_csv('{}.csv'.format(lgb_ts), index=False)

# Companion file for manual inspection: same scores plus the message text,
# sorted from the highest score down.
ranked_with_text = df_test[['id', 'smishing', 'text']].sort_values('smishing', ascending=False)
ranked_with_text.to_csv('{}_text.csv'.format(lgb_ts), index=False)

In [None]:
# eval_hist = lgb.cv(params, train_data, num_boost_round=num_round, nfold=20, stratified=True, shuffle=True, 
#        metrics=None, fobj=None, feval=None, init_model=None, feature_name='auto', categorical_feature='auto',
#        early_stopping_rounds=500, fpreproc=None, verbose_eval=100, show_stdv=True, seed=0, callbacks=None,
#        eval_train_metric=False)


In [None]:
# for train_index, valid_index in skf.split(X, y):
#     cnt = cnt + 1
#     print('\n', '*' * 20, cnt, '*' * 20)
    
#     # init_bst_name = 'model.txt'
#     # init_model = joblib.load(init_bst_name)
#     # bst = None
#     # init_model = bst
#     train_set = train_data.subset(train_index).construct()
#     valid_set = train_data.subset(valid_index).construct()
#     print('train_set', np.unique(train_set.get_label(), return_counts=True))
#     print('valid_set', np.unique(valid_set.get_label(), return_counts=True))
#     bst = lgb.train(param, train_set, num_round, categorical_feature=cat_cols,
#                     early_stopping_rounds=200, 
#                     valid_sets=[train_set, valid_set],
#                     verbose_eval=200,
#                     # init_model=init_model,
#                    )
#     impt_dict = {k:v for k, v in zip(fea_cols, bst.feature_importance())}
#     del_fea_cols = []
#     print(sorted(impt_dict.items(), key=(lambda x:x[1]), reverse=True)[:5])
# #     for r in sorted(impt_dict.items(), key=(lambda x:x[1]), reverse=True):
# #         print(r)
# #         if r[1] < 1:
# #             del_fea_cols.append(r[0])
# #     print(del_fea_cols)
# #     print(len(del_fea_cols))
# #     lgb.plot_importance(bst, height=0.3, figsize=(20, 100), max_num_features=100)
#     df_model['smishing_{}'.format(cnt)] = bst.predict(X)        
#     df_hidden['smishing_{}'.format(cnt)] = bst.predict(X_hidden)        
#     df_test['smishing_{}'.format(cnt)] = bst.predict(df_test[fea_cols].values)    