# Import Libraries and Initial Settings

In [1]:
import os
import pandas as pd
import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.utils.data import DataLoader
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import KFold
from sklearn.metrics import roc_auc_score
from tqdm import tqdm

In [2]:
CUDA = 1             # preferred GPU index
RANDOM_STATE = 2021  # seed shared by torch and numpy

# Use the preferred GPU only when that index really exists. With a single
# visible GPU, 'cuda:1' would be an invalid device and every later `.to(device)`
# would fail, so fall back to GPU 0, and to the CPU when no GPU is available.
if torch.cuda.is_available():
    device = torch.device(f'cuda:{CUDA if CUDA < torch.cuda.device_count() else 0}')
else:
    device = torch.device('cpu')

torch.manual_seed(RANDOM_STATE)
np.random.seed(RANDOM_STATE)

# Load data

In [3]:
# Directory holding the competition CSV files.
PATH = '../data'

# Read the three input tables in one pass: train error log, train problem
# reports and test error log.
train_err, train_problem, test_err = (
    pd.read_csv(os.path.join(PATH, csv_name))
    for csv_name in ('train_err_data.csv',
                     'train_problem_data.csv',
                     'test_err_data.csv')
)

# Define Dataset and Model Class

In [4]:
class Dataset(torch.utils.data.Dataset):
    """Map-style dataset over an array indexed as ``data[row, column]``.

    For each row, every column except the last is returned as the input
    tensor and the last column is returned as a length-1 label tensor.
    """

    def __init__(self, data):
        super().__init__()
        self.data = data

    def __len__(self):
        return len(self.data)

    def __getitem__(self, index):
        # `-1:` (rather than `-1`) keeps a length-1 axis on the label.
        features = torch.Tensor(self.data[index, :-1])
        target = torch.Tensor(self.data[index, -1:])
        return features, target

In [5]:
class Net(nn.Module):
    """Fully connected classifier: input_num -> 256 -> 512 -> 256 -> 1.

    Each hidden Linear layer is followed by ReLU and Dropout (default rate);
    the single output unit goes through a Sigmoid.
    """

    def __init__(self, input_num):
        super().__init__()

        # Build the stack in order so the module indices inside `self.dense`
        # (and therefore the state-dict keys) are 0..10.
        stack = []
        width_in = input_num
        for width_out in (256, 512, 256):
            stack += [nn.Linear(width_in, width_out), nn.ReLU(), nn.Dropout()]
            width_in = width_out
        stack += [nn.Linear(width_in, 1), nn.Sigmoid()]

        self.dense = nn.Sequential(*stack)

    def forward(self, x):
        return self.dense(x)

# Data Preparation

In [6]:
def make_data(err, problem, users, scaler=None):
    """Build the per-user feature matrix (plus a label column) from an error log.

    Reads the module-level ``TOTAL_ERR_NUM``, ``INPUT_NUM`` and
    ``used_code_dict``. The returned array has ``INPUT_NUM + 1`` columns:

    * column 0: number of error rows of the user (total_errs)
    * columns indexed by the raw ``errtype`` value: count per errtype
      (assumes errtype lies in 1..TOTAL_ERR_NUM -- TODO confirm against data)
    * ``len(used_code_dict)`` columns starting at ``1 + TOTAL_ERR_NUM``:
      count per tracked errcode; codes missing from ``used_code_dict`` are ignored
    * two 0/1 flags: model_changed, fwver_changed
    * last column: label, set to 1 for users listed in ``problem``

    Args:
        err: DataFrame with ``user_id``, ``errtype``, ``errcode``, ``model_nm``
            and ``fwver`` columns. Every ``user_id`` must appear in ``users``.
        problem: DataFrame with a ``user_id`` column, or None (labels stay 0).
        users: sequence of user ids; a user's position is its row index.
        scaler: an already fitted ``StandardScaler`` to reuse (e.g. the one
            fitted on the training data). When None, a new scaler is fitted
            on this data.

    Returns:
        ``(data, scaler)``: the feature/label array and the scaler that was
        applied to it.

    Raises:
        KeyError: if ``err`` or ``problem`` contains a user id not in ``users``.
    """
    data = np.zeros((len(users), INPUT_NUM + 1))
    user2idx = {user: idx for idx, user in enumerate(users)}

    # errtype features
    user_err = err[['user_id', 'errtype']].values
    for user, errtype in tqdm(user_err):
        idx = user2idx[user]
        data[idx, errtype] += 1
        data[idx, 0] += 1  # total_errs

    # errcode features
    code_start_index = 1 + TOTAL_ERR_NUM
    user_ids = err['user_id'].values
    errcodes = err['errcode'].values
    for user, code in tqdm(zip(user_ids, errcodes)):
        if code in used_code_dict:
            col = code_start_index + used_code_dict[code]
            data[user2idx[user], col] += 1

    # model_changed / fwver_changed features: flag a user when two consecutive
    # rows of that user differ in model or firmware version.
    # NOTE(review): this relies on each user's rows being contiguous in `err`.
    model_changed_col = 1 + TOTAL_ERR_NUM + len(used_code_dict)
    fwver_changed_col = model_changed_col + 1
    model_nm = err['model_nm'].values
    fwvers = err['fwver'].values
    prev_user, prev_model, prev_fwver = None, None, None
    for user, model, fwver in tqdm(zip(user_ids, model_nm, fwvers)):
        if prev_user is None or prev_user != user:
            # first row of a new user: just remember its values
            prev_user = user
            prev_model = model
            prev_fwver = fwver
        else:
            if prev_model is None:
                prev_model = model
            elif prev_model != model:
                prev_model = model
                data[user2idx[user], model_changed_col] = 1.
            if prev_fwver is None:
                prev_fwver = fwver
            elif prev_fwver != fwver:
                prev_fwver = fwver
                data[user2idx[user], fwver_changed_col] = 1.

    # Standard-scale the count features only; the last three columns (the two
    # binary flags and the label) are left untouched.
    # A scaler passed by the caller is reused as-is so that test data is
    # standardised with the statistics of the training data.
    if scaler is None:
        scaler = StandardScaler()
        scaler.fit(data[:, :-3])
    data[:, :-3] = scaler.transform(data[:, :-3])

    if problem is not None:  # if train data
        true_targets = [user2idx[user] for user in problem['user_id']]
        data[true_targets, -1] = 1.

    return data, scaler

In [7]:
# preprocessing constants
TOTAL_ERR_NUM = 42
USED_CODES = [
    '5', '6', 'V-21008', 'terminate by peer user', 'H-51042', '4',
    'connection fail to establish', '82', '13', '14', '83', '99', '3', '88',
    'connection timeout', '100', 'connectionterminated by local host', '91',
    'UNKNOWN', '95', '87', '94', '78', '89', '90', '81', '86', 'active',
    '85', '84', '2', 'NFANDROID2', 'S-61001', '1', '80', '79', 'B-A8002',
    'standby', '8.0', '0', 'S-65002', 'Q-64002',
]
# feature columns: total_errs + one per errtype + one per tracked errcode
#                  + model_changed + fwver_changed
INPUT_NUM = sum((1, TOTAL_ERR_NUM, len(USED_CODES), 2))
tv_users = np.arange(10000, 25000)
test_users = np.arange(30000, 44999)
print('# of inputs:', INPUT_NUM)

# lookup table: errcode -> position inside the errcode feature group
used_code_dict = dict(zip(USED_CODES, range(len(USED_CODES))))

# build the train/valid matrix first, then hand the scaler it returns to the
# call that builds the test matrix
tvdata, scaler = make_data(train_err, train_problem, tv_users)
testdata, _ = make_data(test_err, None, test_users, scaler)

  1%|          | 106088/16554663 [00:00<00:30, 533836.67it/s]# of inputs: 87
100%|██████████| 16554663/16554663 [00:30<00:00, 549330.52it/s]
16554663it [00:13, 1227280.31it/s]
16554663it [00:05, 2874463.82it/s]
100%|██████████| 16532648/16532648 [00:29<00:00, 551165.63it/s]
16532648it [00:13, 1235567.17it/s]
16532648it [00:05, 2853819.29it/s]


# Training

In [8]:
# hyperparameters for learning
LR = 1e-3
BATCH_SIZE = 1024
MAX_EPOCHS = 100
PATIENCE = 15  # training stops once more than PATIENCE epochs in a row fail to improve valid AUC

k_fold = KFold(n_splits=5, shuffle=True, random_state=RANDOM_STATE)
best_aucs = [0.] * k_fold.get_n_splits()  # best validation AUC reached in each fold

# torch.save does not create missing directories, so make sure the checkpoint
# folder exists before the first improvement is saved.
os.makedirs('../models', exist_ok=True)

for i_fold, (train_idx, valid_idx) in enumerate(k_fold.split(tvdata)):
    print(f'Fold {i_fold} started!')

    trainset = Dataset(tvdata[train_idx])
    validset = Dataset(tvdata[valid_idx])

    trainloader = DataLoader(trainset, batch_size=BATCH_SIZE, shuffle=True)
    validloader = DataLoader(validset, batch_size=BATCH_SIZE, shuffle=False)

    # a fresh model and optimizer for every fold
    net = Net(INPUT_NUM).to(device)
    optimizer = torch.optim.Adam(net.parameters(), lr=LR)
    criterion = nn.BCELoss()

    stop_cnt = 0  # consecutive epochs without a new best validation AUC

    # loop over the dataset multiple times
    for epoch in range(1, MAX_EPOCHS + 1):
        print(f'Epoch {epoch} ', end='\t')

        # Training
        net.train()
        running_loss = 0.0
        saved_outputs = []
        saved_labels = []

        for inputs, labels in trainloader:
            inputs, labels = inputs.to(device), labels.to(device)

            # zero the parameter gradients
            optimizer.zero_grad()

            # forward + backward + optimize
            outputs = net(inputs)
            loss = criterion(outputs, labels)
            loss.backward()
            optimizer.step()

            # BCELoss averages over the batch, so weight it by the batch size
            # to obtain a per-sample mean after the loop.
            running_loss += loss.item() * len(outputs)
            # reshape(-1) always yields a 1-D tensor, so tolist() returns a
            # list even when the last batch holds a single sample
            # (squeeze() would return a bare float there).
            saved_outputs += outputs.detach().reshape(-1).tolist()
            saved_labels += (labels.reshape(-1) >= 0.5).tolist()

        running_loss /= len(trainloader.dataset)
        auc = roc_auc_score(saved_labels, saved_outputs)

        print(f'train loss={running_loss:.3f} \t'
              f'train auc={auc:.3f} \t', end='')

        # Validation
        net.eval()
        running_loss = 0.0
        saved_outputs = []
        saved_labels = []

        # No gradients are needed for evaluation; skipping the autograd graph
        # saves memory and time without changing the outputs.
        with torch.no_grad():
            for inputs, labels in validloader:
                inputs, labels = inputs.to(device), labels.to(device)

                outputs = net(inputs)
                loss = criterion(outputs, labels)

                running_loss += loss.item() * len(outputs)
                saved_outputs += outputs.reshape(-1).tolist()
                saved_labels += (labels.reshape(-1) >= 0.5).tolist()

        running_loss /= len(validloader.dataset)
        auc = roc_auc_score(saved_labels, saved_outputs)

        print(f'valid loss={running_loss:.3f} \t'
              f'valid auc={auc:.3f} \t', end='')

        # Checkpoint on every new best validation AUC; otherwise count towards
        # early stopping.
        if auc > best_aucs[i_fold]:
            best_aucs[i_fold] = auc
            torch.save(net.state_dict(), f'../models/{i_fold}.pt')
            print(f'model saved!', end='')
            stop_cnt = 0
        else:
            stop_cnt += 1
            if stop_cnt > PATIENCE:
                print()
                break
        print()

uc=0.820 	valid loss=0.491 	valid auc=0.801 	
Epoch 44 	train loss=0.453 	train auc=0.821 	valid loss=0.494 	valid auc=0.801 	
Epoch 45 	train loss=0.454 	train auc=0.820 	valid loss=0.491 	valid auc=0.800 	
Epoch 46 	train loss=0.453 	train auc=0.821 	valid loss=0.494 	valid auc=0.802 	model saved!
Epoch 47 	train loss=0.450 	train auc=0.823 	valid loss=0.495 	valid auc=0.800 	
Epoch 48 	train loss=0.453 	train auc=0.823 	valid loss=0.495 	valid auc=0.800 	
Epoch 49 	train loss=0.451 	train auc=0.823 	valid loss=0.495 	valid auc=0.800 	
Epoch 50 	train loss=0.458 	train auc=0.823 	valid loss=0.495 	valid auc=0.800 	
Epoch 51 	train loss=0.456 	train auc=0.824 	valid loss=0.493 	valid auc=0.802 	
Epoch 52 	train loss=0.451 	train auc=0.821 	valid loss=0.494 	valid auc=0.800 	
Epoch 53 	train loss=0.445 	train auc=0.827 	valid loss=0.495 	valid auc=0.800 	
Epoch 54 	train loss=0.444 	train auc=0.827 	valid loss=0.497 	valid auc=0.799 	
Epoch 55 	train loss=0.447 	train auc=0.826 	valid 

# Make a Submission File

In [9]:
testset = Dataset(testdata)
testloader = DataLoader(testset, batch_size=BATCH_SIZE, shuffle=False)

net = Net(INPUT_NUM)
net.to(device)
# A freshly built module starts in training mode. Switch to eval mode so the
# Dropout layers are disabled and predictions are deterministic.
net.eval()

# running sum of the per-fold predictions for every test row
votes = np.zeros(len(testloader.dataset))

for i_fold in range(k_fold.get_n_splits()):
    # map_location lets checkpoints saved on another device load onto `device`
    net.load_state_dict(torch.load(f'../models/{i_fold}.pt', map_location=device))

    saved_outputs = []

    # inference only: no autograd graph is needed
    with torch.no_grad():
        for inputs, _ in testloader:  # the label column of the test data is unused
            outputs = net(inputs.to(device))
            # reshape(-1) keeps a list even for a final batch of one sample
            saved_outputs += outputs.reshape(-1).tolist()

    votes += np.array(saved_outputs)

# average the fold models' predicted probabilities
votes = votes / k_fold.get_n_splits()

sample_submission = pd.read_csv(os.path.join(PATH, 'sample_submission.csv'))
sample_submission['problem'] = votes
sample_submission.to_csv('../submission.csv', index=False)

# Print the AUC Score

In [10]:
# Cross-validation score: mean of the best validation AUC of each fold.
n_folds = len(best_aucs)
cv_auc = sum(best_aucs) / n_folds
cv_auc

0.8041029649548914