In [1]:
%load_ext autoreload
%autoreload 2
from glob import glob
import os
from pathlib import Path
import gc
import re
import sys
import yaml
import datetime
import numpy as np
import pandas as pd
from tqdm import tqdm
from func.utils import get_numeric_features, get_categorical_features,\
read_pkl_gzip, to_pkl_gzip, parallel_load_data, get_filename, logger_func, timer
from ieee_train import eval_train, eval_check_feature
from kaggle_utils import reduce_mem_usage, move_feature
logger = logger_func()

import torch
import torch.nn as nn
from torch.utils.data import DataLoader,TensorDataset
from torch.nn.functional import binary_cross_entropy
from torch.optim import Adam

import time

from sklearn.metrics import roc_auc_score
from sklearn.preprocessing import MinMaxScaler

import random
import  warnings
pd.set_option('display.max_columns', 500)
warnings.filterwarnings('ignore')

2019-09-23 23:45:57,198 func.utils 347 [INFO]    [logger_func] start 


In [2]:
def seed_everything(seed=1208):
    random.seed(seed)
    os.environ['PYTHONHASHSEED'] = str(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    torch.backends.cudnn.deterministic = True

In [3]:
COLUMN_ID = 'TransactionID'
COLUMN_DT = 'TransactionDT'
COLUMN_TARGET = 'isFraud'
COLUMN_GROUP = 'DT-M'
COLUMNS_IGNORE = [COLUMN_ID, COLUMN_DT, COLUMN_TARGET, COLUMN_GROUP, 'is_train', 'date']

def filter_feature(path):
    if path.count(''):
        return True
    else:
        return False

paths_train = glob('../feature/raw_use/*_train.gz')
paths_test  = glob('../feature/raw_use/*_test.gz')

# paths_train = glob('../submit/re_sub/*_train.gz')
# paths_test  = glob('../submit/re_sub/*_test.gz')
# paths_train += glob('../submit/add_feature/*_train.gz')
# paths_test  += glob('../submit/add_feature/*_test.gz')
# paths_train += glob('../feature/valid_use/531*_train.gz')
# paths_test  += glob('../feature/valid_use/531*_test.gz')
# paths_train += glob('../feature/valid_use/532*_train.gz')
# paths_test  += glob('../feature/valid_use/532*_test.gz')


paths_train_feature = []
paths_test_feature  = []

# df_train = reduce_mem_usage( parallel_load_data(paths_train) )
# df_test  = reduce_mem_usage( parallel_load_data(paths_test) )
df_train = parallel_load_data(paths_train).iloc[:10000]
df_test  = parallel_load_data(paths_test).iloc[:10000]
Y = df_train[COLUMN_TARGET]
df_train.drop(COLUMN_TARGET, axis=1, inplace=True)

In [4]:
train_ids = df_train[COLUMN_ID]
test_ids = df_test[COLUMN_ID]

df_all = pd.concat((df_train, df_test),axis=0)

del df_train, df_test
gc.collect()

36

In [5]:
# In practice, among the numerical variables, many corresponds to identifiers. *In the current dataset, the truly numerical variables are in fact rare*. Below, I make a list of the variables which are truly numerical, according the the description of the data.
cols_all_num = get_numeric_features(df_all, COLUMNS_IGNORE)

cols_binary = [col for col in cols_all_num if df_all[col].nunique() == 2]

cols_cat = [col for col in df_all.columns if (col.count('uid')) & (col not in cols_binary)]
cols_num = list(set(cols_all_num) - set(cols_cat))

## Frequency Encoding

In [6]:
# def frequency_encoding(variable):
#     # t = pd.concat([train[variable], test[variable]]).value_counts().reset_index()
#     t = df_all[variable].value_counts().reset_index()
#     t = t.reset_index()
#     t.loc[t[variable] == 1, 'level_0'] = np.nan
#     t.set_index('index', inplace=True)
#     max_label = t['level_0'].max() + 1
#     t.fillna(max_label, inplace=True)
#     return t.to_dict()['level_0']

# frequency_encoded_variables = [
#     'Census_OEMModelIdentifier',
#     'CityIdentifier',
#     'Census_FirmwareVersionIdentifier',
#     'AvSigVersion',
#     'Census_ProcessorModelIdentifier',
#     'Census_OEMNameIdentifier',
#     'DefaultBrowsersIdentifier',
#     'AVProductStatesIdentifier',
#     'OsBuildLab',
# ]

# for variable in tqdm(frequency_encoded_variables):
#     freq_enc_dict = frequency_encoding(variable)
#     df_all[variable] = df_all[variable].map(lambda x: freq_enc_dict.get(x, np.nan))
#     categorical_columns.remove(variable)

## Prepare Embedding Columns

In [7]:
embed_cols = []
len_embed_cols = []
for col in cols_cat:
    embed_cols.append(col)
    len_embed_cols.append(df_all[col].nunique())
print('\n Number of embed features :', len(embed_cols))


 Number of embed features : 19


## Preprocess Other Columns

In [8]:
# set index to unique identifier
df_all = df_all.set_index(COLUMN_ID)

# Select the numeric features
other_cols = [x for x in df_all.columns if x not in embed_cols]

# Impute missing values in order to scale
df_all[other_cols] = df_all[other_cols].fillna(value=0)


# Fit the scaler only on df_all data
scaler = MinMaxScaler().fit(df_all[other_cols])
df_all.loc[:, other_cols] = scaler.transform(df_all[other_cols])

# other_cols = [c for c in df_all.columns if (not c in embed_cols)]

# Create Model

In [9]:
train = df_all.loc[train_ids, embed_cols+other_cols]
test = df_all.loc[test_ids, embed_cols+other_cols]
print(train.shape, test.shape)

del df_all
gc.collect()

(10000, 63) (10000, 63)


137

In [10]:
from sklearn.model_selection import train_test_split
seed = 1208
X_train, X_valid, y_train, y_valid = train_test_split(train, Y , test_size=0.20, random_state=seed)

torch_X_train = torch.FloatTensor(X_train.values)
torch_X_valid = torch.FloatTensor(X_valid.values)
torch_y_train = torch.FloatTensor(y_train.values.astype(np.int32))
torch_y_valid = torch.FloatTensor(y_valid.values.astype(np.int32))
torch_test  = torch.FloatTensor(test.values)

In [28]:
a = torch.tensor([0,1,2])
model.emb_layers[0](a)

tensor([[-0.0463,  0.7180,  0.3123,  ..., -0.0706,  2.3758, -0.4642],
        [-0.3286, -0.1409,  0.8102,  ...,  0.6836,  0.6042,  1.0068],
        [-1.1372, -0.1712, -1.2252,  ..., -0.0583,  0.0772,  0.0157]],
       grad_fn=<EmbeddingBackward>)

In [29]:
class NeuralNet(nn.Module):
    def __init__(self):
        super().__init__()
        self.emb_layers = nn.ModuleList()
        self.dropout = nn.Dropout(.20)
        self.num_categorical = len(len_embed_cols)
        self.num_numeric = len(other_cols)
        
        for embed_col, len_embed_col in zip(embed_cols, len_embed_cols):
            self.emb_layers.append(nn.Embedding(len_embed_col, len_embed_col // 2))

        ff_inp_dim = sum(e.embedding_dim for e in self.emb_layers) + self.num_numeric
        self.ff = nn.Sequential(
            nn.Linear(ff_inp_dim, 128),
            nn.ReLU(),
            nn.Dropout(p=.20),
            nn.Linear(128, 64),
            nn.ReLU(),
            nn.Dropout(p=.15),
            nn.Linear(64, 32),
            nn.Sigmoid()
        )


    def forward(self, x_batch):
        emb_indices = x_batch[:, :self.num_categorical].long()
        emb_outs = []
        for i, emb_layer in enumerate(self.emb_layers):
            tmp = emb_indices[:, i]
            print(tmp.shape)
            print(emb_layer)
            emb_out = emb_layer(emb_indices[:, i])
            emb_out = self.dropout(emb_out)
            emb_outs.append(emb_out)
        
        embs = torch.cat(emb_outs, dim=1)

        x_numerical = x_batch[:, self.num_categorical:]
        embs_num = torch.cat([embs, x_numerical], dim=1)
        out = self.ff(embs_num)
        return out
    
    
# always call this before training for deterministic results
seed_everything(seed)

batch_size = 512
n_epochs = 6

# init model
model = NeuralNet()

# init Binary Cross Entropy loss
loss_fn = torch.nn.BCELoss(reduction='mean')

# init optimizer
optimizer = Adam(model.parameters())

In [21]:
#prepare iterators for training
torch_train = torch.utils.data.TensorDataset(torch_X_train, torch_y_train)
train_loader = torch.utils.data.DataLoader(torch_train, batch_size=batch_size, shuffle=True)
torch_valid = torch.utils.data.TensorDataset(torch_X_valid, torch_y_valid)
valid_loader = torch.utils.data.DataLoader(torch_valid, batch_size=batch_size, shuffle=False)

# init predictions
train_preds = np.zeros((torch_X_train.size(0)))
valid_preds = np.zeros((torch_X_valid.size(0)))

In [30]:
for epoch in tqdm(range(n_epochs)): 
    
    with timer(f"  * Epoch{epoch}"):
        avg_loss = 0.  
        # set the module in training mode.
        model.train()
        
        with timer(f"  * Train"):
    
            for x_batch, y_batch in tqdm(train_loader, disable=True):
                # Forward pass: compute predicted y by passing x to the model.
                y_pred = model(x_batch)
                # Compute and print loss.
                loss = loss_fn(y_pred, y_batch)
                # Before the backward pass, use the optimizer object to zero all of the
                # gradients for the Tensors it will update (which are the learnable weights
                # of the model)
                optimizer.zero_grad()
                # Backward pass: compute gradient of the loss with respect to model parameters
                loss.backward()
                # Calling the step function on an Optimizer makes an update to its parameters
                optimizer.step()
                avg_loss += loss.item() / len(train_loader)
    
        with timer(f"  * Valid"):
            
            # set evaluation mode of the model. This disabled operations which are only applied during training like dropout
            model.eval()
        
            avg_val_loss = 0.
            for i, (x_batch, y_batch) in enumerate(valid_loader):
                # detach returns a new Tensor, detached from the current graph whose result will never require gradient
                y_val_pred = model(x_batch).detach()
                avg_val_loss += loss_fn(y_val_pred, y_batch).item() / len(valid_loader)
        
                valid_preds[i * batch_size:(i+1) * batch_size] = y_val_pred.cpu().numpy()[:, 0]
            elapsed_time = time.time() - start_time 
            print('\nEpoch {}/{} \t loss={:.4f} \t val_loss={:.4f} \t time={:.2f}s'.format(
                epoch + 1, n_epochs, avg_loss, avg_val_loss, elapsed_time))
            print('AUC_VAL{} '.format(roc_auc_score(torch_y_val.cpu(),valid_preds).round(3)))

  0%|          | 0/6 [00:00<?, ?it/s]

torch.Size([512])
Embedding(3007, 1503)





RuntimeError: index out of range: Tried to access index 1414356048 out of table with 3006 rows. at /pytorch/aten/src/TH/generic/THTensorEvenMoreMath.cpp:237

In [None]:
torch_test = torch.utils.data.TensorDataset(torch_test)
test_loader = torch.utils.data.DataLoader(torch_test, batch_size=batch_size, shuffle=False)
test_preds = np.zeros((len(torch_test)))


for i, (x_batch,) in enumerate(test_loader):
    y_pred = model(x_batch).detach()
    test_preds[i * batch_size:(i+1) * batch_size] = y_pred.cpu().numpy()[:, 0]

In [None]:
fpr, tpr, _ = roc_curve(torch_y_val.cpu(),valid_preds)

roc_auc = auc(fpr,tpr)

plt.figure(figsize=(10,6))
lw = 2
plt.plot(fpr, tpr, color='darkorange',
         lw=lw, label='ROC curve (area = %0.2f)' % roc_auc)
plt.plot([0, 1], [0, 1], color='navy', lw=lw, linestyle='--')
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.05])
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('Receiver operating characteristic example')
plt.legend(loc="lower right")
plt.show()

# Submission

In [None]:
submission = pd.DataFrame({'MachineIdentifier':test_ids,'HasDetections':test_preds})

In [None]:
submission.head()

In [None]:
# submission.to_csv('nn_embeddings.csv.gz', index=False, ,compression='gzip')