# Import Libraries and Initial Settings

In [1]:
import os
import pandas as pd
import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.utils.data import DataLoader
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import KFold
from sklearn.metrics import roc_auc_score
from tqdm import tqdm

In [2]:
CUDA = 1             # preferred GPU index
RANDOM_STATE = 2021  # seed used for torch, numpy and the K-fold split below

# Pick the preferred GPU when it exists. On hosts that have CUDA but fewer
# devices than `CUDA + 1`, fall back to GPU 0 instead of building an invalid
# device such as 'cuda:1'; without CUDA, run on the CPU.
if torch.cuda.is_available():
    gpu_index = CUDA if CUDA < torch.cuda.device_count() else 0
    device = torch.device(f'cuda:{gpu_index}')
else:
    device = torch.device('cpu')

torch.manual_seed(RANDOM_STATE)
np.random.seed(RANDOM_STATE)

# Load data

In [3]:
PATH = '../data'  # directory the three CSV files below are read from

# Read the CSVs in one pass; the order of the file names matches the order of
# the names being assigned.
train_err, train_problem, test_err = (
    pd.read_csv(os.path.join(PATH, csv_name))
    for csv_name in ('train_err_data.csv',
                     'train_problem_data.csv',
                     'test_err_data.csv')
)

# Define Dataset and Model Class

In [4]:
class Dataset(torch.utils.data.Dataset):
    """Map-style dataset over a 2-D array whose last column is the label.

    Each item is a `(features, label)` pair of float tensors: `features` holds
    every column but the last, `label` holds the last column as a length-1
    tensor.
    """

    def __init__(self, data):
        super().__init__()
        self.data = data

    def __len__(self):
        # Number of rows in the wrapped array.
        return len(self.data)

    def __getitem__(self, index):
        features = torch.Tensor(self.data[index, :-1])
        # `-1:` (not `-1`) keeps the label as a 1-element vector.
        target = torch.Tensor(self.data[index, -1:])
        return features, target

In [5]:
class Net(nn.Module):
    """Fully connected binary classifier: input_num -> 256 -> 512 -> 256 -> 1.

    Every hidden layer is followed by ReLU and dropout (default probability);
    the single output unit goes through a sigmoid, so `forward` returns values
    in [0, 1] with shape (batch, 1).
    """

    def __init__(self, input_num):
        super().__init__()

        widths = [input_num, 256, 512, 256]
        layers = []
        # Hidden blocks are created in order so that the positions inside
        # `self.dense` (and therefore the state-dict keys) stay 0, 3, 6, 9.
        for n_in, n_out in zip(widths[:-1], widths[1:]):
            layers += [nn.Linear(n_in, n_out), nn.ReLU(), nn.Dropout()]
        layers += [nn.Linear(widths[-1], 1), nn.Sigmoid()]

        self.dense = nn.Sequential(*layers)

    def forward(self, x):
        return self.dense(x)

# Data Preparation

In [6]:
def make_data(err, problem, users, scaler=None, time_before=False):
    """Build a per-user feature matrix (plus a label column) from an error log.

    Column layout of the returned array (``INPUT_NUM + 1`` columns):
      * column 0: number of error rows of the user,
      * column ``errtype``: number of rows with that ``errtype`` value
        (assumes errtype is an integer in 1..TOTAL_ERR_NUM -- TODO confirm),
      * columns from ``1 + TOTAL_ERR_NUM``: counts of the error codes listed
        in the module-level ``used_code_dict``; other codes are ignored,
      * last column: 1. for users that appear in ``problem``, otherwise 0.

    All columns except the label are standardised with ``scaler``.

    Args:
        err: DataFrame with 'user_id', 'errtype' and 'errcode' columns
            (and 'time' when ``time_before`` is True).
        problem: DataFrame with a 'user_id' column, or None when there are no
            labels (the label column then stays 0).
        users: sequence of user ids; its order defines the row order.
        scaler: already fitted scaler to reuse. When None, a new
            ``StandardScaler`` is fitted on the feature columns built here.
        time_before: when True, rows of users found in the module-level
            ``init_report_time`` are kept only if their 'time' is not later
            than that user's entry; rows of other users are always kept.

    Returns:
        ``(data, scaler)``: the float array of shape
        ``(len(users), INPUT_NUM + 1)`` and the scaler that was applied.

    Raises:
        KeyError: if ``err`` or ``problem`` contains a user id that is not in
            ``users``.
    """
    data = np.zeros((len(users), INPUT_NUM + 1))
    user2idx = {user: idx for idx, user in enumerate(users)}

    if time_before:
        # Vectorised replacement for a row-wise `DataFrame.apply`, which is
        # very slow on logs with millions of rows and misbehaves on an empty
        # frame. Only rows of users with a known report time are compared.
        user_ids = err['user_id']
        reported = user_ids.isin(list(init_report_time)).to_numpy()
        alive = np.ones(len(err), dtype=bool)
        if reported.any():
            first_report = user_ids[reported].map(init_report_time).to_numpy()
            alive[reported] = err['time'].to_numpy()[reported] <= first_report
        err = err[alive]

    # Per-user counts: one column per errtype plus the running total in column 0.
    for user, errtype in tqdm(err[['user_id', 'errtype']].values):
        row = user2idx[user]
        data[row, errtype] += 1
        data[row, 0] += 1  # total_errs

    # Per-user counts of the selected error codes.
    # NOTE(review): codes are matched by exact equality with the string keys
    # of `used_code_dict`; a code parsed as a number would not match -- verify
    # the dtype of the 'errcode' column.
    code_start_index = 1 + TOTAL_ERR_NUM
    user_codes = zip(err['user_id'].values, err['errcode'].values)
    for user, code in tqdm(user_codes, total=len(err)):
        if code in used_code_dict:
            col = code_start_index + used_code_dict[code]
            data[user2idx[user], col] += 1

    # Standard-scale every feature column; the label column is left untouched.
    if scaler is None:
        scaler = StandardScaler()
        scaler.fit(data[:, :-1])
    data[:, :-1] = scaler.transform(data[:, :-1])

    if problem is not None:  # if train data
        true_targets = [user2idx[user] for user in problem['user_id']]
        data[true_targets, -1] = 1.

    return data, scaler

In [7]:
# ---- constants for data preprocessing ----
TOTAL_ERR_NUM = 42
# The position of a code in this list fixes its feature column, so the order
# must not change.
USED_CODES = ['5', '6', 'V-21008', 'terminate by peer user', 'H-51042', '4',
              'connection fail to establish', '82', '13', '14', '83', '99', '3', '88',
              'connection timeout', '100', 'connectionterminated by local host', '91',
              'UNKNOWN', '95', '87', '94', '78', '89', '90', '81', '86', 'active',
              '85', '84', '2', 'NFANDROID2', 'S-61001', '1', '80', '79', 'B-A8002',
              'standby', '8.0', '0', 'S-65002', 'Q-64002']
INPUT_NUM = 1 + TOTAL_ERR_NUM + len(USED_CODES)  # total_errs + errtype counts + code counts
tv_users = np.arange(10000, 25000)    # user ids used for training / validation
test_users = np.arange(30000, 44999)  # user ids used for the test predictions
print('# of inputs:', INPUT_NUM)

# ---- lookup tables used by make_data ----
# error code -> offset of its column inside the error-code feature group
used_code_dict = dict(zip(USED_CODES, range(len(USED_CODES))))

# user id -> earliest 'time' among that user's rows in train_problem
init_report_time = {}
for _, report in train_problem.iterrows():
    user, reported_at = report['user_id'], report['time']
    if user not in init_report_time or init_report_time[user] > reported_at:
        init_report_time[user] = reported_at

# Fit the scaler on the whole training log, then reuse it for the test data.
_, scaler = make_data(train_err, train_problem, tv_users)
testdata, _ = make_data(test_err, None, test_users, scaler)

# of inputs: 85
100%|██████████| 16554663/16554663 [00:30<00:00, 549228.37it/s]
16554663it [00:13, 1206089.30it/s]
100%|██████████| 16532648/16532648 [00:30<00:00, 541081.16it/s]
16532648it [00:13, 1220338.76it/s]


# Training

In [8]:
# hyperparameters for learning
LR = 1e-3
BATCH_SIZE = 1024
MAX_EPOCHS = 100
PATIENCE = 15  # training stops once more than PATIENCE epochs pass without a better AUC


def _run_epoch(net, loader, criterion, optimizer=None):
    """Run one pass over `loader` and return `(mean loss per sample, ROC AUC)`.

    With an `optimizer` the network is put in training mode and updated after
    every batch; without one it is put in eval mode and gradients are disabled.
    """
    training = optimizer is not None
    net.train(training)

    total_loss = 0.0
    scores = []
    targets = []

    # Gradients are only needed while training; disabling them during
    # validation avoids building autograd graphs that are never used.
    with torch.set_grad_enabled(training):
        for inputs, labels in loader:
            inputs, labels = inputs.to(device), labels.to(device)

            if training:
                optimizer.zero_grad()

            outputs = net(inputs)
            loss = criterion(outputs, labels)

            if training:
                loss.backward()
                optimizer.step()

            # The criterion returns a batch mean; weight it by the batch size
            # so the final division yields a per-sample mean.
            total_loss += loss.item() * len(outputs)
            # reshape(-1) keeps a 1-D result even for a batch of one sample,
            # where squeeze() would give a 0-d tensor and tolist() a bare float.
            scores += outputs.detach().reshape(-1).tolist()
            targets += (labels.reshape(-1) >= 0.5).tolist()

    return total_loss / len(loader.dataset), roc_auc_score(targets, scores)


k_fold = KFold(n_splits=5, shuffle=True, random_state=RANDOM_STATE)
best_aucs = [0.] * k_fold.get_n_splits()

# torch.save does not create missing directories.
os.makedirs('../models', exist_ok=True)

for i_fold, (train_idx, valid_idx) in enumerate(k_fold.split(tv_users)):
    print(f'Fold {i_fold} started!')

    train_users = tv_users[train_idx]
    valid_users = tv_users[valid_idx]

    te = train_err[train_err['user_id'].isin(train_users)]  # takes a long time
    ve = train_err[train_err['user_id'].isin(valid_users)]  # takes a long time
    tp = train_problem[train_problem['user_id'].isin(train_users)]
    vp = train_problem[train_problem['user_id'].isin(valid_users)]

    # Training rows are restricted by time (time_before=True); validation rows are not.
    traindata, _ = make_data(te, tp, train_users, scaler, True)
    validdata, _ = make_data(ve, vp, valid_users, scaler)

    trainset = Dataset(traindata)
    validset = Dataset(validdata)

    trainloader = DataLoader(trainset, batch_size=BATCH_SIZE, shuffle=True)
    validloader = DataLoader(validset, batch_size=BATCH_SIZE, shuffle=False)

    net = Net(INPUT_NUM).to(device)
    optimizer = torch.optim.Adam(net.parameters(), lr=LR)
    criterion = nn.BCELoss()

    stop_cnt = 0  # consecutive epochs without a new best validation AUC

    # loop over the dataset multiple times
    for epoch in range(1, MAX_EPOCHS + 1):
        print(f'Epoch {epoch} ', end='\t')

        # Training
        train_loss, train_auc = _run_epoch(net, trainloader, criterion, optimizer)
        print(f'train loss={train_loss:.3f} \t'
              f'train auc={train_auc:.3f} \t', end='')

        # Validation
        valid_loss, valid_auc = _run_epoch(net, validloader, criterion)
        print(f'valid loss={valid_loss:.3f} \t'
              f'valid auc={valid_auc:.3f} \t', end='')

        # Keep the checkpoint with the best validation AUC of this fold.
        if valid_auc > best_aucs[i_fold]:
            best_aucs[i_fold] = valid_auc
            torch.save(net.state_dict(), f'../models/{i_fold}.pt')
            print(f'model saved!', end='')
            stop_cnt = 0
        else:
            stop_cnt += 1
            if stop_cnt > PATIENCE:
                print()
                break
        print()

d loss=0.665 	valid auc=0.655 	model saved!
Epoch 11 	train loss=0.435 	train auc=0.857 	valid loss=0.663 	valid auc=0.659 	model saved!
Epoch 12 	train loss=0.426 	train auc=0.864 	valid loss=0.674 	valid auc=0.664 	model saved!
Epoch 13 	train loss=0.421 	train auc=0.867 	valid loss=0.703 	valid auc=0.653 	
Epoch 14 	train loss=0.414 	train auc=0.872 	valid loss=0.693 	valid auc=0.657 	
Epoch 15 	train loss=0.409 	train auc=0.875 	valid loss=0.664 	valid auc=0.670 	model saved!
Epoch 16 	train loss=0.403 	train auc=0.879 	valid loss=0.696 	valid auc=0.664 	
Epoch 17 	train loss=0.398 	train auc=0.880 	valid loss=0.707 	valid auc=0.667 	
Epoch 18 	train loss=0.392 	train auc=0.884 	valid loss=0.711 	valid auc=0.669 	
Epoch 19 	train loss=0.386 	train auc=0.888 	valid loss=0.690 	valid auc=0.672 	model saved!
Epoch 20 	train loss=0.381 	train auc=0.891 	valid loss=0.683 	valid auc=0.675 	model saved!
Epoch 21 	train loss=0.382 	train auc=0.890 	valid loss=0.722 	valid auc=0.660 	
Epoch

# Make a Submission File

In [9]:
testset = Dataset(testdata)
testloader = DataLoader(testset, batch_size=BATCH_SIZE, shuffle=False)

net = Net(INPUT_NUM)
net.to(device)
# A freshly constructed module starts in training mode and load_state_dict
# does not change that; switch to eval mode so dropout is off at inference.
net.eval()

# Sum of the per-fold predictions, one entry per test row.
votes = np.zeros(len(testloader.dataset))

with torch.no_grad():  # inference only, no autograd graph needed
    for fold in range(k_fold.get_n_splits()):
        # map_location lets checkpoints saved on another device load here.
        state = torch.load(f'../models/{fold}.pt', map_location=device)
        net.load_state_dict(state)

        saved_outputs = []

        # The label column of the test data is unused.
        for inputs, _ in testloader:
            outputs = net(inputs.to(device))
            # reshape(-1) stays 1-D even when the last batch holds one sample.
            saved_outputs += outputs.reshape(-1).tolist()

        votes += np.array(saved_outputs)

# Average the fold models' predicted probabilities.
votes = votes / k_fold.get_n_splits()

# NOTE(review): assumes the rows of sample_submission.csv are in the same
# order as test_users -- confirm against the file.
sample_submission = pd.read_csv(os.path.join(PATH, 'sample_submission.csv'))
sample_submission['problem'] = votes
sample_submission.to_csv('../submission.csv', index=False)

# Print the AUC Score

In [10]:
# Mean over folds of the best validation AUC reached in each fold. Each entry
# is the maximum over epochs on that fold's own validation split, so this
# figure is an optimistic estimate.
n_folds = len(best_aucs)
cv_auc = sum(best_aucs) / n_folds
cv_auc

0.683711447134824