In [1]:
import os
import pandas as pd
import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.utils.data import DataLoader
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import KFold
from sklearn.metrics import precision_score, recall_score, roc_auc_score
from tqdm import tqdm

In [None]:
train_err = pd.read_csv(os.path.join(PATH, 'train_err_data.csv'))
train_problem = pd.read_csv(os.path.join(PATH, 'train_problem_data.csv'))
test_err = pd.read_csv(os.path.join(PATH, 'test_err_data.csv'))

In [6]:
PATH = '../data'
TOTAL_ERR_NUM = 42
USED_CODES = ['5', '6', 'V-21008', 'terminate by peer user', 'H-51042', '4',
             'connection fail to establish', '82', '13', '14', '83', '99', '3', '88',
             'connection timeout', '100', 'connectionterminated by local host', '91',
             'UNKNOWN', '95', '87', '94', '78', '89', '90', '81', '86', 'active',
             '85', '84', '2', 'NFANDROID2', 'S-61001', '1', '80', '79', 'B-A8002',
             'standby', '8.0', '0', 'S-65002', 'Q-64002']
INPUT_NUM = 1 + TOTAL_ERR_NUM + len(USED_CODES)  # total_errs + ...

In [7]:
used_code_dict = {code: i for i, code in enumerate(USED_CODES)}

In [2]:
def make_data(err, problem, user_min, user_num, scaler=None):
    global TOTAL_ERR_NUM, INPUT_NUM, used_err, used_code_dict

    data = np.zeros((user_num, INPUT_NUM + 1))  # + total_errs

    user_err = err[['user_id', 'errtype']].values
    for user, err in tqdm(user_err):
        data[user - user_min, err] += 1
        data[user - user_min, 0] += 1  # total_errs

    code_start_index = TOTAL_ERR_NUM + 1
    for user, code in tqdm()

    # Standard-scale
    if scaler is None:
        scaler = StandardScaler()
        scaler.fit(data[:, :-1])
    data[:, :-1] = scaler.transform(data[:, :-1])

    if problem is not None:  # if train data
        data[problem.user_id.unique() - user_min, -1] = 1.

    return data, scaler

In [17]:
tvdata, scaler = make_data(train_err, train_problem, 10000, 15000)
testdata, _    = make_data(test_err , None         , 30000, 14999, scaler)

100%|██████████| 16554663/16554663 [00:37<00:00, 437438.05it/s]
100%|██████████| 16532648/16532648 [00:36<00:00, 453362.28it/s]


In [18]:
len(train_err['errcode'].unique())

2806

In [19]:
display(tvdata.shape)
display(testdata.shape)

(15000, 34)

(14999, 34)

In [20]:
LR = 1e-3
BATCH_SIZE = 1024
MAX_EPOCHS = 100
PATIENCE = 15
CUDA = 1
RANDOM_STATE = 2021

In [21]:
device = torch.device(f'cuda:{CUDA}' if torch.cuda.is_available() else 'cpu')

In [22]:
torch.manual_seed(RANDOM_STATE)
np.random.seed(RANDOM_STATE)

In [23]:
class Dataset(torch.utils.data.Dataset):
    def __init__(self, data):
        super(Dataset, self).__init__()
        self.data = data

    def __getitem__(self, index):
        input = torch.Tensor(self.data[index, :-1])
        label = torch.Tensor(self.data[index, -1:])
        return input, label

    def __len__(self):
        return len(self.data)

In [24]:
class Net(nn.Module):
    def __init__(self, input_num):
        super(Net, self).__init__()

        self.dense = nn.Sequential(
            nn.Linear(input_num, 512),
            nn.ReLU(),
            nn.Dropout(),
            nn.Linear(512, 1024),
            nn.ReLU(),
            nn.Dropout(),
            nn.Linear(1024, 512),
            nn.ReLU(),
            nn.Dropout(),
            nn.Linear(512, 128),
            nn.ReLU(),
            nn.Dropout(),
            nn.Linear(128, 1),
            nn.Sigmoid()
        )

    def forward(self, x):
        x = self.dense(x)
        return x

In [25]:
k_fold = KFold(n_splits=5, shuffle=True, random_state=RANDOM_STATE)

best_aucs = [0.] * k_fold.get_n_splits()

for i_fold, (train_idx, valid_idx) in enumerate(k_fold.split(tvdata)):
    print(f'Fold {i_fold} started!')

    validdata = tvdata[valid_idx]
    traindata = tvdata[train_idx]

    trainset = Dataset(traindata)
    validset = Dataset(validdata)

    trainloader = DataLoader(trainset, batch_size=BATCH_SIZE, shuffle=True)
    validloader = DataLoader(validset, batch_size=BATCH_SIZE, shuffle=False)

    net = Net(INPUT_NUM).to(device)
    optimizer = torch.optim.Adam(net.parameters(), lr=LR)
    criterion = nn.BCELoss()

    stop_cnt = 0

    # loop over the dataset multiple times
    for epoch in range(1, MAX_EPOCHS + 1):
        print(f'Epoch {epoch} ', end='\t')

        # Training
        net.train()
        running_loss = 0.0
        saved_outputs = []
        saved_labels = []

        for i, data in enumerate(trainloader, 0):
            inputs, labels = data
            inputs, labels = inputs.to(device), labels.to(device)

            # zero the parameter gradients
            optimizer.zero_grad()

            # forward + backward + optimize
            outputs = net(inputs)
            loss = criterion(outputs, labels)
            loss.backward()
            optimizer.step()

            running_loss  += loss.item() * len(outputs)
            saved_outputs += outputs.squeeze()        .tolist()
            saved_labels  += (labels.squeeze() >= 0.5).tolist()

        running_loss /= len(trainloader.dataset)
        auc = roc_auc_score(saved_labels, saved_outputs)

        print(f'train loss={running_loss:.3f} \t'
              f'train auc={ auc         :.3f} \t', end='')

        # Validation
        net.eval()
        running_loss = 0.0
        saved_outputs = []
        saved_labels = []

        for i, data in enumerate(validloader, 0):
            inputs, labels = data
            inputs, labels = inputs.to(device), labels.to(device)

            outputs = net(inputs)
            loss = criterion(outputs, labels)

            running_loss  += loss.item() * len(outputs)
            saved_outputs += outputs.squeeze()        .tolist()
            saved_labels  += (labels.squeeze() >= 0.5).tolist()

        running_loss /= len(validloader.dataset)
        auc = roc_auc_score(saved_labels, saved_outputs)

        print(f'valid loss={running_loss:.3f} \t'
              f'valid auc={ auc         :.3f} \t', end='')

        if auc > best_aucs[i_fold]:
            best_aucs[i_fold] = auc
            torch.save(net.state_dict(), f'../models/{i_fold}.pt')
            print(f'model saved!', end='')
            stop_cnt = 0
        else:
            stop_cnt += 1
            if stop_cnt > PATIENCE:
                print()
                break

        print()

in auc=0.768 	valid loss=0.542 	valid auc=0.745 	
Epoch 54 	train loss=0.523 	train auc=0.770 	valid loss=0.542 	valid auc=0.745 	
Epoch 55 	train loss=0.520 	train auc=0.771 	valid loss=0.542 	valid auc=0.744 	
Epoch 56 	train loss=0.519 	train auc=0.772 	valid loss=0.543 	valid auc=0.745 	
Epoch 57 	train loss=0.520 	train auc=0.772 	valid loss=0.543 	valid auc=0.744 	
Fold 2 started!
Epoch 1 	train loss=0.626 	train auc=0.609 	valid loss=0.602 	valid auc=0.693 	model saved!
Epoch 2 	train loss=0.584 	train auc=0.694 	valid loss=0.579 	valid auc=0.710 	model saved!
Epoch 3 	train loss=0.570 	train auc=0.707 	valid loss=0.575 	valid auc=0.715 	model saved!
Epoch 4 	train loss=0.569 	train auc=0.718 	valid loss=0.575 	valid auc=0.717 	model saved!
Epoch 5 	train loss=0.571 	train auc=0.714 	valid loss=0.569 	valid auc=0.722 	model saved!
Epoch 6 	train loss=0.570 	train auc=0.720 	valid loss=0.571 	valid auc=0.722 	
Epoch 7 	train loss=0.560 	train auc=0.720 	valid loss=0.565 	valid au

In [26]:
cv_auc = sum(best_aucs) / len(best_aucs)
print(f'CV auc={cv_auc:.3f}')

CV auc=0.754


In [27]:
testset = Dataset(testdata)
testloader = DataLoader(testset, batch_size=BATCH_SIZE, shuffle=False)

net = Net(INPUT_NUM)
net.to(device) 

votes = np.zeros(len(testloader.dataset))

for i in range(k_fold.get_n_splits()):
    net.load_state_dict(torch.load(f'../models/{i}.pt'))

    saved_outputs = []

    for i, data in enumerate(testloader, 0):
        inputs, labels = data
        inputs, labels = inputs.to(device), labels.to(device)

        outputs = net(inputs)
        saved_outputs += outputs.squeeze().tolist()

    votes += np.array(saved_outputs)

votes = votes / k_fold.get_n_splits()

In [28]:
sample_submission = pd.read_csv(os.path.join(PATH, 'sample_submission.csv'))
sample_submission['problem'] = votes
sample_submission.to_csv('../submission.csv', index=False)
sample_submission


Unnamed: 0,user_id,problem
0,30000,0.938583
1,30001,0.207983
2,30002,0.401872
3,30003,0.211236
4,30004,0.610800
...,...,...
14994,44994,0.485103
14995,44995,0.329633
14996,44996,0.757862
14997,44997,0.517425
