In [None]:
#!/usr/bin/env python
# coding: utf-8
get_ipython().run_line_magic('matplotlib', 'inline')
import pandas as pd
import time
import numpy as np
from datetime import datetime
from sklearn.externals import joblib 
import os
from konlpy.tag import Mecab
import lightgbm as lgb
print(lgb.__version__)

from sklearn import metrics
from sklearn.preprocessing import MinMaxScaler
from sklearn.externals import joblib 

import gc

from tqdm import tqdm_notebook



import warnings
warnings.filterwarnings(action='ignore')


### Model

In [None]:
import torch
print(torch.__version__)
import torch.nn.functional as F
from torch.utils.data import Dataset,DataLoader

from torchsummary import summary

# Query CUDA availability once. torch.cuda.get_device_name() raises on a machine
# without a usable GPU, so the name is only requested when CUDA is present;
# otherwise the CPU fallback below could never be reached.
cuda_available = torch.cuda.is_available()
if cuda_available:
    print(torch.cuda.get_device_name(0))
print(cuda_available)

# Device used by the later cells for the model and the batches.
device = torch.device("cuda:0" if cuda_available else "cpu")
# device = torch.device('cpu')  # uncomment to force CPU
device

#### eval_summary

In [None]:
def eval_summary(y_true, y_score, cut_off=0.5):
    """Summarise binary-classification quality for a set of scores.

    Args:
        y_true: Array-like of ground-truth labels (0 or 1).
        y_score: Array-like of scores for the positive class (label 1).
        cut_off: Scores strictly greater than this are predicted as 1,
            all others as 0.

    Returns:
        A dict with the keys 'auc', 'confusion_matrix', 'precision' and
        'recall' (precision/recall are those of label 1), or the string
        'zero length' / 'diff length' when the inputs are empty or their
        lengths differ.
    """
    if len(y_true) == 0 or len(y_score) == 0:
        return 'zero length'
    if len(y_true) != len(y_score):
        return 'diff length'

    # Accept plain lists / Series as well as ndarrays; the thresholding below
    # needs element-wise comparison, which a Python list does not support.
    y_true = np.asarray(y_true)
    y_score = np.asarray(y_score)

    # Hard predictions, built as a new array so the caller's scores are untouched.
    y_pred = (y_score > cut_off).astype(y_score.dtype)

    eval_dict = {}
    fpr, tpr, thresholds = metrics.roc_curve(y_true, y_score, pos_label=1)

    eval_dict['auc'] = metrics.auc(fpr, tpr)
    # Pin the label set so the matrix is always 2x2 and index 1 below always
    # refers to the positive class, even when a class is absent from the data
    # (without `labels`, a single-class input made `pre[1]` raise IndexError).
    eval_dict['confusion_matrix'] = metrics.confusion_matrix(y_true, y_pred, labels=[0, 1])

    pre, rec, _, _ = metrics.precision_recall_fscore_support(
        y_true, y_pred, labels=[0, 1], pos_label=1)
    eval_dict['precision'] = pre[1]
    eval_dict['recall'] = rec[1]

    return eval_dict

In [None]:
import subprocess
def show_gpu(msg):
    """
    Print GPU memory usage as reported by ``nvidia-smi``, prefixed with ``msg``.

    Only the first line of each ``nvidia-smi`` answer is used.

    ref: https://discuss.pytorch.org/t/access-gpu-memory-usage-in-pytorch/3192/4
    """
    def first_value(field):
        # Ask nvidia-smi for one field as bare CSV (no header, no units) and
        # parse the first output line as an integer.
        raw = subprocess.check_output(
            ['nvidia-smi', f'--query-gpu={field}', '--format=csv,nounits,noheader'],
            encoding='utf-8',
        )
        first_line = raw.strip().split('\n')[0]
        return int(first_line)

    used = first_value('memory.used')
    total = first_value('memory.total')
    pct = used / total
    print('\n' + msg, f'{100*pct:2.1f}% ({used} out of {total})')

In [None]:
# import gc

# for tracked_object in gc.get_objects():
#     if torch.is_tensor(tracked_object):
#         print("{} {} {}".format(
#             type(tracked_object).__name__,
#            "GPU" if tracked_object.is_cuda else "" ,
#           " pinned" if tracked_object.is_pinned() else "",
# ))

#### predict_train_data

In [None]:
def predict_train_data(loader=None, net=None, run_device=None):
    """Score every batch of a loader with the model and collect labels and scores.

    Args:
        loader: Iterable of ``(X_batch, y_batch)`` tensor pairs. Defaults to the
            notebook-level ``train_loader``.
        net: Model to evaluate. Defaults to the notebook-level ``model``.
        run_device: Device the input batches are moved to. Defaults to the
            notebook-level ``device``.

    Returns:
        Tuple ``(y_true, y_score)`` of 1-D numpy arrays: column 1 of every
        label batch, and the sigmoid of column 1 of the model output, both
        concatenated in loader order.

    Note:
        The model is switched to eval mode and is left in eval mode on return.
    """
    print('predict_train_data ...')
    if loader is None:
        loader = train_loader
    if net is None:
        net = model
    if run_device is None:
        run_device = device

    y_true_list = []
    y_score_list = []

    with torch.no_grad():
        net.eval()
        for X_batch, y_batch in loader:
            # Labels never need to leave the CPU: only their second column is kept.
            y_true_list.append(y_batch[:, 1].cpu().numpy())

            # The model emits raw logits; the sigmoid turns them into scores.
            # (The loss that used to be computed here was never used.)
            logits = net(X_batch.to(run_device))
            y_score_list.append(torch.sigmoid(logits[:, 1]).cpu().numpy())

    return np.concatenate(y_true_list, axis=0), np.concatenate(y_score_list, axis=0)


In [None]:
class DNNModel(torch.nn.Module):
    """Fully connected classifier that emits two raw logits per sample.

    The network stacks four ``Linear -> ReLU -> BatchNorm1d -> Dropout`` stages
    of widths 1024, 512, 256 and 128, followed by a final ``Linear`` layer with
    2 outputs. No sigmoid is applied inside the model.

    Args:
        input_size: Number of input features.
        dropout_probability: Drop probability of the dropout module.
    """

    def __init__(self, input_size, dropout_probability=0.3):
        super().__init__()
        # A single ReLU and a single Dropout instance are reused at every stage.
        activation = torch.nn.ReLU()
        drop = torch.nn.Dropout(p=dropout_probability)

        layers = []
        in_features = input_size
        for out_features in (1024, 512, 256, 128):
            layers.extend([
                torch.nn.Linear(in_features, out_features),
                activation,
                torch.nn.BatchNorm1d(out_features),
                drop,
            ])
            in_features = out_features
        # Output head: one logit per class column.
        layers.append(torch.nn.Linear(in_features, 2))

        self.model = torch.nn.Sequential(*layers)

    def forward(self, x):
        """Return the raw logits of the network for the batch ``x``."""
        return self.model(x)

class KBDataset(Dataset):
    """Torch dataset backed by a DataFrame stored in a joblib file.

    Every column except the label column and 'id' is used as a feature; the
    label column is one-hot encoded with ``pd.get_dummies``.

    Args:
        file_path: Path of the joblib file holding the DataFrame.
        y_col: Name of the label column.
        pos_only: When true, keep only the rows whose label equals 1.
    """

    def __init__(self, file_path, y_col, pos_only=False):
#         df = pd.read_pickle(file_path)
        df = joblib.load(file_path)
        print(df.shape)
        # DataFrame.info() prints its report itself and returns None, so it is
        # not wrapped in print() (doing so emitted a stray "None" line).
        df.info()

        self.columns = [c for c in df.columns if c not in [y_col, 'id']]
        self.X = df[self.columns].values
        # One column per distinct label value.
        # NOTE(review): downstream code reads column 1, which presumably assumes
        # both labels 0 and 1 occur in the file - confirm.
        self.y = pd.get_dummies(df[y_col], prefix=y_col).values

        if pos_only:
            # Build the row mask once and apply it to features and labels alike.
            pos_mask = (df[y_col] == 1).values
            self.X = self.X[pos_mask]
            self.y = self.y[pos_mask]
            print('pos only')

    def __len__(self):
        """Number of rows held by the dataset."""
        return len(self.X)

    def get_feature_names(self):
        """Return the feature column names, in the column order of ``X``."""
        return self.columns

    def __getitem__(self, idx):
        """Return the ``(features, one_hot_label)`` pair at ``idx`` as float32 arrays."""
        return self.X[idx].astype(np.float32), self.y[idx].astype(np.float32)

#### Train

In [None]:
# Tag of the merged-feature snapshot to use (formatted like a timestamp);
# it selects the matching train/test files below.
merged_ts = '20191229T155539'
train_path = 'data/df_merged_{}_train.pkl'.format(merged_ts)
test_path = 'data/df_merged_{}_test.pkl'.format(merged_ts)

# Load the test split up front; it is scored after training.
df_test = joblib.load(test_path) 

In [None]:
# Training split; 'smishing' is the label column.
dataset = KBDataset(file_path=train_path, y_col='smishing')
fea_cols = dataset.get_feature_names()
train_size = len(dataset)

# One network input per feature column; the model lives on the selected device.
model = DNNModel(dropout_probability=0.7, input_size=len(fea_cols))
model = model.to(device)

# First epoch number used by the training loop.
epoch = 1
print(summary(model, (len(fea_cols),)))

In [None]:
# pos_weight scales the loss of positive targets per output column:
# 1 for column 0 and 50 for column 1.
# (Alternatives kept for reference: BCELoss(reduction='mean'); weights [1., 1.].)
pos_weight = torch.Tensor([1., 50.])
criterion = torch.nn.BCEWithLogitsLoss(pos_weight=pos_weight, reduction='sum')
criterion = criterion.to(device)

optimizer = torch.optim.Adam(params=model.parameters(), lr=1e-4)

In [None]:
# Shuffled training batches, each one sixteenth of the training set; an
# incomplete final batch is dropped.
# (Other batch sizes tried: 100000, int(train_size * 0.7), 10000.)
train_loader = DataLoader(
    dataset,
    batch_size=train_size // 16,
    shuffle=True,
    num_workers=16,
    drop_last=True,
)

In [None]:
# Positive-only view of the training split, served as one shuffled batch that
# contains every positive row.
dataset_pos = KBDataset(file_path=train_path, y_col='smishing', pos_only=True)
train_pos_loader = DataLoader(
    dataset_pos,
    batch_size=len(dataset_pos),
    shuffle=True,
    num_workers=0,
    drop_last=True,
)

In [None]:
# Timestamp identifying this training run; used in the checkpoint file name.
model_ts = datetime.now().strftime('%Y%m%dT%H%M%S')
print(model_ts)
print('epoch:', epoch)
total_epoch = 20
print('# of train_loader:', len(train_loader))

for e in tqdm_notebook(range(epoch, epoch + total_epoch), total=total_epoch, desc = 'epoch'):
    # predict_train_data() puts the model into eval mode, so training mode is
    # re-enabled at the start of every epoch.
    model.train()
    total_loss = 0
    for X_batch, y_batch in train_loader:
        X_batch = X_batch.to(device)
        y_batch = y_batch.to(device)

        # Forward pass. (The per-batch dump of the full prediction and label
        # tensors was leftover debugging output and has been removed.)
        y_pred = model(X_batch)
        loss = criterion(y_pred, y_batch)

        total_loss = total_loss + loss.item()

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        # Release the batch tensors before the next (large) batch is loaded.
        del X_batch, y_batch, y_pred
        gc.collect()

    print(e-1, 'loss_sum', total_loss)

    # Every 10th epoch, evaluate on the training data.
    if e % 10 == 0:
        print('epoch', e)
        y_true, y_score = predict_train_data()
        print(eval_summary(y_true, y_score, cut_off=0.5))

# Label the checkpoint with the last epoch that was actually trained (it used to
# carry the start epoch), then advance the counter so that re-running this cell
# continues the epoch numbering instead of restarting it.
last_epoch = epoch + total_epoch - 1
torch.save(model.state_dict(), 'model/{}_{}.model'.format(model_ts, last_epoch))
epoch = last_epoch + 1


In [None]:
# Histogram of the training-set scores from the latest evaluation.
df = pd.Series(y_score)
df.hist(figsize=(20, 5), bins=100)
# Bucket the scores into tenths and count the rows per bucket.
df.mul(10).astype(int).value_counts(sort=False)

In [None]:
# df_model[(y_score <= 0.5) & (y_true == 1)]['text']

In [None]:
# df_model[(y_score > 0.5) & (y_true == 0)]['text']

In [None]:
# Score the test split. Eval mode and no_grad are set explicitly so the result
# does not depend on whichever mode the previous cells left the model in
# (dropout and batch-norm behave differently in training mode) and no autograd
# graph is built for the whole test set.
model.eval()
with torch.no_grad():
    x_test = torch.Tensor(df_test[fea_cols].values).to(device)
    # The model emits logits; column 1 after the sigmoid is the positive-class score.
    df_test['smishing'] = torch.sigmoid(model(x_test))[:, 1].cpu().numpy()
df_test['smishing'].hist(bins=100, figsize=(20, 5))

In [None]:
# Bucket counts, presumably recorded from an earlier run (tenth -> rows):
# 0     1504
# 1       11
# 2        6
# 3        6
# 4        2
# 5        3
# 6        2
# 9       39
# 10      53
# Bucket the test scores into tenths and count the rows per bucket.
df_test['smishing'].mul(10).astype(int).value_counts(sort=False)

In [None]:
# Display the run timestamp; the CSV written in the next cell is named after it.
model_ts

In [None]:
# Write the predicted scores to '<model_ts>.csv'; index=True also writes the
# DataFrame index as the first column.
df_test[['smishing']].to_csv('{}.csv'.format(model_ts), index=True)
# df_test[['id', 'smishing', 'text']].sort_values('smishing', ascending=False).to_csv('{}_text.csv'.format(model_ts), index=False)
