In [None]:
#!/usr/bin/env python
# coding: utf-8
# Notebook setup: enable inline plots, import every dependency used below and
# print the library versions for reproducibility.
get_ipython().run_line_magic('matplotlib', 'inline')
import pandas as pd
import time
import numpy as np
from datetime import datetime
# sklearn.externals.joblib was deprecated in scikit-learn 0.21 and removed in 0.23.
# Prefer the standalone joblib package (a scikit-learn dependency) and only fall
# back to the vendored copy on very old installations.
try:
    import joblib
except ImportError:
    from sklearn.externals import joblib
import os
from konlpy.tag import Mecab
import lightgbm as lgb
print(lgb.__version__)

from sklearn import metrics
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import StratifiedKFold

import gc

from tqdm import tqdm_notebook

import warnings
# NOTE(review): this silences *all* warnings for the rest of the session.
warnings.filterwarnings(action='ignore')


import torch
print(torch.__version__)
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader

from torchsummary import summary

# Pick the device used for every tensor and model below, falling back to CPU.
cuda_available = torch.cuda.is_available()
if cuda_available:
    # get_device_name raises on machines without CUDA, so only query it when a GPU exists.
    print(torch.cuda.get_device_name(0))
print(cuda_available)

device = torch.device("cuda:0" if cuda_available else "cpu")
# To force CPU execution instead, use:
# device = torch.device('cpu')
device

#### Model

In [None]:
class LRModel(torch.nn.Module):
    """Linear baseline: one affine layer mapping the features to two raw logits.

    No sigmoid is applied inside the module; the output is unnormalised.

    Args:
        input_size: number of input features per sample.
    """

    def __init__(self, input_size):
        super().__init__()
        affine = torch.nn.Linear(in_features=input_size, out_features=2)
        # Wrapped in a Sequential so the parameters are named ``model.0.*``.
        self.model = torch.nn.Sequential(affine)

    def forward(self, x):
        """Return the two-column logits computed from ``x``."""
        logits = self.model(x)
        return logits

In [None]:
class NNModel(torch.nn.Module):
    """Fully connected classifier producing two raw logits.

    The network is six hidden stages of Linear -> ReLU -> BatchNorm1d -> Dropout
    with widths 2048, 1024, 512, 512, 256 and 128, followed by a final
    Linear(128, 2). No sigmoid is applied inside the module.

    Args:
        input_size: number of input features per sample.
        dropout_probability: drop probability used by every dropout stage.
    """

    def __init__(self, input_size, dropout_probability=0.3):
        super().__init__()
        # ReLU and Dropout hold no parameters, so a single instance of each is
        # reused at every stage.
        activation = torch.nn.ReLU()
        drop = torch.nn.Dropout(p=dropout_probability)

        widths = (input_size, 2048, 1024, 512, 512, 256, 128)
        stages = []
        for fan_in, fan_out in zip(widths[:-1], widths[1:]):
            stages.extend([
                torch.nn.Linear(fan_in, fan_out),
                activation,
                torch.nn.BatchNorm1d(fan_out),
                drop,
            ])
        # Output head: raw logits for the two classes.
        stages.append(torch.nn.Linear(widths[-1], 2))

        self.model = torch.nn.Sequential(*stages)

    def forward(self, x):
        """Return the two-column logits computed from ``x``."""
        logits = self.model(x)
        return logits

#### Load Data

In [None]:
# Allocate two one-element tensors of ones on the selected device
# (presumably a quick check that the device is usable — TODO confirm).
temp = torch.ones(1, device=device)
temp2 = torch.ones(1, device=device)

In [None]:
# Load the training frame stored under the given snapshot identifier.
merged_ts = '20191231T165424_6099'  # identifier embedded in the data file name
train_path = 'data/df_merged_{}_train.pkl'.format(merged_ts)

# NOTE(review): joblib.load unpickles the file, which can execute arbitrary code;
# only load files from a trusted source.
df_model = joblib.load(train_path)
df_model.info()


In [None]:
# Keep only the first 10,000 rows for the quick tensor experiments in the next cells.
df = df_model[:10000]

In [None]:
# Show the column dtypes and memory usage of the subset.
df.info()

In [None]:
# Cast every column to float32 and show the resulting dtypes / memory usage.
df_32 = df.astype(np.float32)
df_32.info()

In [None]:
# Move the whole float32 frame to the device as a single tensor.
# NOTE(review): df_32 still contains the 'smishing' label column (the next cell
# reads it from df_32), so the label is part of x_gpu — confirm this is
# acceptable for this scratch experiment.
x_gpu = torch.Tensor(df_32.values).to(device)

In [None]:
# Move the 'smishing' column to the device as a tensor.
y_gpu = torch.Tensor(df_32['smishing'].values).to(device)

In [None]:
# Single-output linear layer sized to the number of columns in x_gpu, placed on the device.
linear = torch.nn.Linear(x_gpu.size()[1], 1).to(device)

In [None]:
# Forward pass of the (untrained) linear layer over all rows of x_gpu.
o1 = linear(x_gpu)

In [None]:
# Display the raw linear outputs.
o1

In [None]:
# Squash the raw linear outputs into (0, 1) with a sigmoid.
output = torch.sigmoid(o1)

In [None]:
# Display the sigmoid outputs.
output

#### Load Model

In [None]:
# model = NNModel(input_size=input_size, dropout_probability=0.7).to(device)
# epoch = 1
# print(summary(model, (input_size, )))

#### Training

In [None]:
# import EarlyStopping
from torchtools import EarlyStopping

In [None]:
# Cross-validated training: for each of the 5 stratified folds, train a fresh
# NNModel with early stopping, report validation/train summaries, save the
# weights, and write that fold's test predictions into df_test['smishing_<cv>'].
#
# NOTE(review): this cell relies on names defined outside this section
# (df_test, x_test, fea_cols, input_size, KBDataset, train_torch, test_torch,
# pred_torch, eval_summary). train_torch / test_torch / pred_torch receive only
# a dataset, so they presumably read the globals `model`, `criterion` and
# `optimizer` assigned below — the names are kept unchanged for that reason.

# Remove fold-prediction columns (names containing 'smishing_') already present in df_test.
stale_pred_cols = [c for c in df_test.columns if 'smishing_' in c]
df_test.drop(columns=stale_pred_cols, inplace=True)

# The loop writes into these directories; create them up front so a missing
# directory cannot fail a fold after its training has finished.
os.makedirs('model', exist_ok=True)
os.makedirs('submit', exist_ok=True)

# random_state only has an effect together with shuffle=True; scikit-learn >= 0.24
# raises ValueError when random_state is set while shuffle is False.
# Fold membership therefore differs from older, unshuffled runs.
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=8405)


def _make_fold_dataset(row_positions):
    """Build a KBDataset from the rows of df_model at the given positions.

    skf.split yields positional indices, so rows are selected with .iloc;
    label-based .loc is only equivalent when df_model has a default RangeIndex.
    """
    return KBDataset(df_model.iloc[row_positions][fea_cols + ['smishing']], 'smishing')


for cv, (train_index, valid_index) in enumerate(skf.split(df_model[fea_cols], df_model['smishing'])):
    print(len(train_index), len(valid_index))
    print('\nCV', cv)
    model = NNModel(input_size=input_size, dropout_probability=0.7).to(device)
#     model =  LRModel(input_size=input_size).to(device)

    # initialize the early_stopping object
    early_stopping = EarlyStopping(patience=15, verbose=True)

    epoch = 1
    if cv == 0:
        print(summary(model, (input_size, )))

    # One weight per output column: positive targets in column 1 count 10x.
    pos_weight = torch.Tensor([1., 10.,])
#     pos_weight = torch.Tensor([1., 1.,])
    criterion = torch.nn.BCEWithLogitsLoss(reduction='sum', pos_weight=pos_weight).to(device)
    optimizer = torch.optim.Adam(model.parameters(), lr = 0.001)

    model_ts = datetime.now().strftime('%Y%m%dT%H%M%S')
    print(model_ts)
    print('Epoch:', epoch)

    N_EPOCHS = 100
    # True when the next improvement should print an evaluation summary; it is
    # re-armed after any epoch without improvement, so consecutive improving
    # epochs print only once.
    is_summary = True
    for e in tqdm_notebook(range(epoch, epoch + N_EPOCHS), total=N_EPOCHS, desc = 'CV {} Epoch'.format(cv)):
        start_time = time.time()  # not read in this cell; kept in case a helper reads the global
        train_set = _make_fold_dataset(train_index)
        valid_set = _make_fold_dataset(valid_index)

        train_loss, train_acc = train_torch(train_set)
        valid_loss, valid_acc = test_torch(valid_set)
        print('CV {} Epoch {}\n\tTrain loss: {}\tValid loss: {}\t{}'.format(cv, e, train_loss, valid_loss, train_loss / valid_loss))

        early_stopping(valid_loss, model)

        # counter == 0 means this epoch improved on the best validation loss so far.
        if early_stopping.counter == 0:
            if is_summary:
                _, _, y_true, y_score = pred_torch(valid_set)
                print('\t', eval_summary(y_true, y_score, cut_off=0.5))
                is_summary = False
        else:
            is_summary = True

        if early_stopping.early_stop:
            print("\tEarly stopping epoch {}, valid loss {}".format(e, valid_loss))
            break

        del train_set, valid_set
        gc.collect()

        epoch = e + 1

    # load the last checkpoint with the best model
    # (assumes EarlyStopping saved it to 'checkpoint.pt' — TODO confirm against torchtools)
    model.load_state_dict(torch.load('checkpoint.pt', map_location=device))

    valid_set = _make_fold_dataset(valid_index)
    _, _, y_true, y_score = pred_torch(valid_set)
    print('\t', eval_summary(y_true, y_score, cut_off=0.5))

    train_set = _make_fold_dataset(train_index)
    _, _, y_true, y_score = pred_torch(train_set)
    print('END CV {} eval summary (train)\n'.format(cv), eval_summary(y_true, y_score, cut_off=0.5))

    torch.save(model.state_dict(), 'model/{}_{}_{}.model'.format(model_ts, cv, epoch-1))

    # Score the test set with dropout / batch-norm in inference mode and without
    # building an autograd graph; column 1 of the sigmoid output is the score.
    model.eval()
    pred_col = 'smishing_{}'.format(cv)
    with torch.no_grad():
        df_test[pred_col] = torch.sigmoid(model(x_test))[:, 1].cpu().numpy()
    df_test[[pred_col]].to_csv('submit/{}_{}_nn.csv'.format(model_ts, pred_col), index=True)
    
#     break

#### Predict Train

In [None]:
# Distribution of y_score as left by the training cell: its last assignment
# there is the train-set prediction of the final fold.
df = pd.Series(y_score)
df.hist(bins=100, figsize=(20, 5))
# Row counts per score decile (0..10).
df.mul(10).astype(int).value_counts(sort=False)

In [None]:
# df_model[(y_score <= 0.5) & (y_true == 1)]['text']

In [None]:
# df_model[(y_score > 0.5) & (y_true == 0)]['text']

#### Predict Test

In [None]:
# Combine the per-fold test predictions: row-wise max / min / mean / std over
# the 'smishing_<cv>' columns; the mean becomes the final 'smishing' score.
pred_cols = [name for name in df_test.columns if 'smishing_' in name]
print(len(pred_cols))

fold_preds = df_test[pred_cols]
row_stats = (
    ('pred_max', fold_preds.max),
    ('pred_min', fold_preds.min),
    ('pred_mean', fold_preds.mean),
    ('pred_std', fold_preds.std),
)
for stat_col, reducer in row_stats:
    df_test[stat_col] = reducer(axis=1)

# How much the folds disagree: largest, smallest and average per-row std.
fold_spread = df_test['pred_std']
print(fold_spread.max(), fold_spread.min(), fold_spread.mean())

df_test['smishing'] = df_test['pred_mean']

In [None]:
# Histogram of the final (fold-averaged) test scores.
df_test['smishing'].hist(bins=100, figsize=(20, 5))

In [None]:
# For every fold column, show how many test rows fall into each score decile (0..10).
for fold_col in pred_cols:
    print(fold_col)
    deciles = (df_test[fold_col] * 10).astype(int)
    display(deciles.value_counts(sort=False))

In [None]:
# Row counts per score decile (0..10) for the final test score.
# The commented numbers below are presumably the output of an earlier run,
# kept for comparison — TODO confirm.
# 0     1504
# 1       11
# 2        6
# 3        6
# 4        2
# 5        3
# 6        2
# 9       39
# 10      53
(df_test['smishing'] * 10).astype(int).value_counts(sort=False)

In [None]:
# Display the timestamp of the last trained fold; it names the submission file written below.
model_ts

In [None]:
# Write the final submission: the 'smishing' column plus the frame's index.
df_test[['smishing']].to_csv('submit/{}_nn.csv'.format(model_ts), index=True)
# Optional export including id and text, sorted by score (disabled):
# df_test[['id', 'smishing', 'text']].sort_values('smishing', ascending=False).to_csv('{}_text.csv'.format(model_ts), index=False)
