In [1]:
import os
import pandas as pd
import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.utils.data import DataLoader
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import KFold
from sklearn.metrics import precision_score, recall_score, roc_auc_score
from tqdm import tqdm

In [2]:
PATH = '../data'
train_err = pd.read_csv(os.path.join(PATH, 'train_err_data.csv'))
train_problem = pd.read_csv(os.path.join(PATH, 'train_problem_data.csv'))
test_err = pd.read_csv(os.path.join(PATH, 'test_err_data.csv'))

In [3]:
TOTAL_ERR_NUM = 42
USED_CODES = ['5', '6', 'V-21008', 'terminate by peer user', 'H-51042', '4',
              'connection fail to establish', '82', '13', '14', '83', '99', '3', '88',
              'connection timeout', '100', 'connectionterminated by local host', '91',
              'UNKNOWN', '95', '87', '94', '78', '89', '90', '81', '86', 'active',
              '85', '84', '2', 'NFANDROID2', 'S-61001', '1', '80', '79', 'B-A8002',
              'standby', '8.0', '0', 'S-65002', 'Q-64002']
INPUT_NUM = 1 + TOTAL_ERR_NUM + len(USED_CODES)  # total_errs + ...
print('# of inputs:', INPUT_NUM)

# of inputs: 85


In [4]:
used_code_dict = {code: i for i, code in enumerate(USED_CODES)}

In [5]:
def make_data(err, problem, user_min, user_num, scaler=None):
    global TOTAL_ERR_NUM, INPUT_NUM, used_err, used_code_dict

    data = np.zeros((user_num, INPUT_NUM + 1))

    user_err = err[['user_id', 'errtype']].values
    for user, errtype in tqdm(user_err):
        data[user - user_min, errtype] += 1
        data[user - user_min, 0] += 1  # total_errs

    code_start_index = 1 + TOTAL_ERR_NUM
    user_id = err['user_id'].values
    errcode = err['errcode'].values
    for user, errcode in tqdm(zip(user_id, errcode)):
        if errcode in used_code_dict:
            index = code_start_index + used_code_dict[errcode]
            data[user - user_min, index] += 1

    # Standard-scale
    if scaler is None:
        scaler = StandardScaler()
        scaler.fit(data[:, :-1])
    data[:, :-1] = scaler.transform(data[:, :-1])

    if problem is not None:  # if train data
        data[problem.user_id.unique() - user_min, -1] = 1.

    return data, scaler

In [6]:
tvdata, scaler = make_data(train_err, train_problem, 10000, 15000)
testdata, _    = make_data(test_err , None         , 30000, 14999, scaler)

100%|██████████| 16554663/16554663 [00:38<00:00, 433951.08it/s]
16554663it [00:16, 1000880.86it/s]
100%|██████████| 16532648/16532648 [00:38<00:00, 434797.57it/s]
16532648it [00:16, 982485.01it/s] 


In [7]:
display(tvdata.shape)
display(testdata.shape)

(15000, 86)

(14999, 86)

파라미터 바꿀 시 여기부터 실행

In [8]:
LR = 1e-3
BATCH_SIZE = 1024
MAX_EPOCHS = 100
PATIENCE = 15
CUDA = 1
RANDOM_STATE = 2021

In [9]:
device = torch.device(f'cuda:{CUDA}' if torch.cuda.is_available() else 'cpu')

In [10]:
torch.manual_seed(RANDOM_STATE)
np.random.seed(RANDOM_STATE)

In [11]:
class Dataset(torch.utils.data.Dataset):
    def __init__(self, data):
        super(Dataset, self).__init__()
        self.data = data

    def __getitem__(self, index):
        input = torch.Tensor(self.data[index, :-1])
        label = torch.Tensor(self.data[index, -1:])
        return input, label

    def __len__(self):
        return len(self.data)

모델 변경 시 여기부터 실행

In [12]:
class Net(nn.Module):
    def __init__(self, input_num):
        super(Net, self).__init__()

        self.dense = nn.Sequential(
            nn.Linear(input_num, 256),
            nn.ReLU(),
            nn.Dropout(),
            nn.Linear(256, 512),
            nn.ReLU(),
            nn.Dropout(),
            nn.Linear(512, 256),
            nn.ReLU(),
            nn.Dropout(),
            nn.Linear(256, 1),
            nn.Sigmoid()
        )

    def forward(self, x):
        x = self.dense(x)
        return x

In [13]:
k_fold = KFold(n_splits=5, shuffle=True, random_state=RANDOM_STATE)

best_aucs = [0.] * k_fold.get_n_splits()

for i_fold, (train_idx, valid_idx) in enumerate(k_fold.split(tvdata)):
    print(f'Fold {i_fold} started!')

    validdata = tvdata[valid_idx]
    traindata = tvdata[train_idx]

    trainset = Dataset(traindata)
    validset = Dataset(validdata)

    trainloader = DataLoader(trainset, batch_size=BATCH_SIZE, shuffle=True)
    validloader = DataLoader(validset, batch_size=BATCH_SIZE, shuffle=False)

    net = Net(INPUT_NUM).to(device)
    optimizer = torch.optim.Adam(net.parameters(), lr=LR)
    criterion = nn.BCELoss()

    stop_cnt = 0

    # loop over the dataset multiple times
    for epoch in range(1, MAX_EPOCHS + 1):
        print(f'Epoch {epoch} ', end='\t')

        # Training
        net.train()
        running_loss = 0.0
        saved_outputs = []
        saved_labels = []

        for i, data in enumerate(trainloader, 0):
            inputs, labels = data
            inputs, labels = inputs.to(device), labels.to(device)

            # zero the parameter gradients
            optimizer.zero_grad()

            # forward + backward + optimize
            outputs = net(inputs)
            loss = criterion(outputs, labels)
            loss.backward()
            optimizer.step()

            running_loss  += loss.item() * len(outputs)
            saved_outputs += outputs.squeeze()        .tolist()
            saved_labels  += (labels.squeeze() >= 0.5).tolist()

        running_loss /= len(trainloader.dataset)
        auc = roc_auc_score(saved_labels, saved_outputs)

        print(f'train loss={running_loss:.3f} \t'
              f'train auc={ auc         :.3f} \t', end='')

        # Validation
        net.eval()
        running_loss = 0.0
        saved_outputs = []
        saved_labels = []

        for i, data in enumerate(validloader, 0):
            inputs, labels = data
            inputs, labels = inputs.to(device), labels.to(device)

            outputs = net(inputs)
            loss = criterion(outputs, labels)

            running_loss  += loss.item() * len(outputs)
            saved_outputs += outputs.squeeze()        .tolist()
            saved_labels  += (labels.squeeze() >= 0.5).tolist()

        running_loss /= len(validloader.dataset)
        auc = roc_auc_score(saved_labels, saved_outputs)

        print(f'valid loss={running_loss:.3f} \t'
              f'valid auc={ auc         :.3f} \t', end='')

        if auc > best_aucs[i_fold]:
            best_aucs[i_fold] = auc
            torch.save(net.state_dict(), f'../models/{i_fold}.pt')
            print(f'model saved!', end='')
            stop_cnt = 0
        else:
            stop_cnt += 1
            if stop_cnt > PATIENCE:
                print()
                break

        print()

uc=0.775 	
Epoch 41 	train loss=0.468 	train auc=0.815 	valid loss=0.511 	valid auc=0.776 	
Epoch 42 	train loss=0.470 	train auc=0.812 	valid loss=0.508 	valid auc=0.781 	model saved!
Epoch 43 	train loss=0.469 	train auc=0.815 	valid loss=0.509 	valid auc=0.779 	
Epoch 44 	train loss=0.476 	train auc=0.815 	valid loss=0.506 	valid auc=0.780 	
Epoch 45 	train loss=0.466 	train auc=0.817 	valid loss=0.511 	valid auc=0.776 	
Epoch 46 	train loss=0.464 	train auc=0.817 	valid loss=0.511 	valid auc=0.778 	
Epoch 47 	train loss=0.465 	train auc=0.818 	valid loss=0.511 	valid auc=0.778 	
Epoch 48 	train loss=0.465 	train auc=0.817 	valid loss=0.509 	valid auc=0.779 	
Epoch 49 	train loss=0.465 	train auc=0.817 	valid loss=0.510 	valid auc=0.778 	
Epoch 50 	train loss=0.462 	train auc=0.819 	valid loss=0.509 	valid auc=0.781 	
Epoch 51 	train loss=0.464 	train auc=0.820 	valid loss=0.508 	valid auc=0.780 	
Epoch 52 	train loss=0.462 	train auc=0.821 	valid loss=0.509 	valid auc=0.779 	
Epoch

In [14]:
testset = Dataset(testdata)
testloader = DataLoader(testset, batch_size=BATCH_SIZE, shuffle=False)

net = Net(INPUT_NUM)
net.to(device) 

votes = np.zeros(len(testloader.dataset))

for i in range(k_fold.get_n_splits()):
    net.load_state_dict(torch.load(f'../models/{i}.pt'))

    saved_outputs = []

    for i, data in enumerate(testloader, 0):
        inputs, labels = data
        inputs, labels = inputs.to(device), labels.to(device)

        outputs = net(inputs)
        saved_outputs += outputs.squeeze().tolist()

    votes += np.array(saved_outputs)

votes = votes / k_fold.get_n_splits()

In [15]:
sample_submission = pd.read_csv(os.path.join(PATH, 'sample_submission.csv'))
sample_submission['problem'] = votes
sample_submission.to_csv('../submission.csv', index=False)
sample_submission


Unnamed: 0,user_id,problem
0,30000,0.951213
1,30001,0.194668
2,30002,0.579603
3,30003,0.560690
4,30004,0.426514
...,...,...
14994,44994,0.458819
14995,44995,0.316946
14996,44996,0.861230
14997,44997,0.843933


In [16]:
cv_auc = sum(best_aucs) / len(best_aucs)
print(f'CV auc={cv_auc:.3f}')

CV auc=0.797
