In [1]:
import os
import pandas as pd
import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.utils.data import DataLoader
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import KFold
from sklearn.metrics import precision_score, recall_score, roc_auc_score
from tqdm import tqdm

In [2]:
def make_data(err, problem, user_min, user_num, input_num):
    data = np.zeros((user_num, input_num + 1))
    user_err = err[['user_id', 'errtype']].values

    for user, err in tqdm(user_err):
        data[user - user_min, err - 1] += 1

    # Standard-scale
    data[:, :-1] = StandardScaler().fit_transform(data[:, :-1])

    if problem is not None:  # if train data
        data[problem.user_id.unique() - user_min, -1] = 1.

    return data

In [3]:
PATH = '../data'
INPUT_NUM = 42

In [4]:
train_err = pd.read_csv(os.path.join(PATH, 'train_err_data.csv'))
train_problem = pd.read_csv(os.path.join(PATH, 'train_problem_data.csv'))
test_err = pd.read_csv(os.path.join(PATH, 'test_err_data.csv'))

tvdata   = make_data(train_err, train_problem, 10000, 15000, INPUT_NUM)
testdata = make_data(test_err , None         , 30000, 14999, INPUT_NUM)

100%|██████████| 16554663/16554663 [00:29<00:00, 568886.46it/s]
100%|██████████| 16532648/16532648 [00:28<00:00, 572991.32it/s]


In [5]:
LR = 3e-4
BATCH_SIZE = 1024
MAX_EPOCHS = 100
PATIENCE = 15
CUDA = 1
RANDOM_STATE = 2021

In [6]:
device = torch.device(f'cuda:{CUDA}' if torch.cuda.is_available() else 'cpu')

In [7]:
torch.manual_seed(RANDOM_STATE)
np.random.seed(RANDOM_STATE)

In [8]:
class Dataset(torch.utils.data.Dataset):
    def __init__(self, data):
        super(Dataset, self).__init__()
        self.data = data

    def __getitem__(self, index):
        input = torch.Tensor(self.data[index, :-1])
        label = torch.Tensor(self.data[index, -1:])
        return input, label

    def __len__(self):
        return len(self.data)

In [9]:
class Net(nn.Module):
    def __init__(self, input_num):
        super(Net, self).__init__()

        self.dense = nn.Sequential(
            nn.Linear(input_num, 512),
            nn.ReLU(),
            nn.Dropout(),
            nn.Linear(512, 1024),
            nn.ReLU(),
            nn.Dropout(),
            nn.Linear(1024, 512),
            nn.ReLU(),
            nn.Dropout(),
            nn.Linear(512, 128),
            nn.ReLU(),
            nn.Dropout(),
            nn.Linear(128, 1),
            nn.Sigmoid()
        )

    def forward(self, x):
        x = self.dense(x)
        return x

In [10]:
k_fold = KFold(n_splits=5, shuffle=True, random_state=RANDOM_STATE)

best_aucs = [0.] * k_fold.get_n_splits()

for i_fold, (train_idx, valid_idx) in enumerate(k_fold.split(tvdata)):
    print(f'Fold {i_fold} started!')

    validdata = tvdata[valid_idx]
    traindata = tvdata[train_idx]

    trainset = Dataset(traindata)
    validset = Dataset(validdata)

    trainloader = DataLoader(trainset, batch_size=BATCH_SIZE, shuffle=True)
    validloader = DataLoader(validset, batch_size=BATCH_SIZE, shuffle=False)

    net = Net(INPUT_NUM).to(device)
    optimizer = torch.optim.Adam(net.parameters(), lr=LR)
    criterion = nn.BCELoss()

    stop_cnt = 0

    # loop over the dataset multiple times
    for epoch in range(1, MAX_EPOCHS + 1):
        print(f'Epoch {epoch} ', end='\t')

        # Training
        net.train()
        running_loss = 0.0
        saved_outputs = []
        saved_labels = []

        for i, data in enumerate(trainloader, 0):
            inputs, labels = data
            inputs, labels = inputs.to(device), labels.to(device)

            # zero the parameter gradients
            optimizer.zero_grad()

            # forward + backward + optimize
            outputs = net(inputs)
            loss = criterion(outputs, labels)
            loss.backward()
            optimizer.step()

            running_loss  += loss.item()
            saved_outputs += outputs.squeeze()        .tolist()
            saved_labels  += (labels.squeeze() >= 0.5).tolist()

        auc = roc_auc_score(saved_labels, saved_outputs)

        print(f'train loss={running_loss:.3f} \t'
              f'train auc={ auc         :.3f} \t', end='')

        # Validation
        net.eval()
        running_loss = 0.0
        saved_outputs = []
        saved_labels = []

        for i, data in enumerate(validloader, 0):
            inputs, labels = data
            inputs, labels = inputs.to(device), labels.to(device)

            outputs = net(inputs)
            loss = criterion(outputs, labels)

            running_loss  += loss.item()
            saved_outputs += outputs.squeeze()        .tolist()
            saved_labels  += (labels.squeeze() >= 0.5).tolist()

        auc = roc_auc_score(saved_labels, saved_outputs)

        print(f'valid loss={running_loss:.3f} \t'
              f'valid auc={ auc         :.3f} \t', end='')

        if auc > best_aucs[i_fold]:
            best_aucs[i_fold] = auc
            torch.save(net.state_dict(), f'../models/{i_fold}.pt')
            print(f'model saved!', end='')
            stop_cnt = 0
        else:
            stop_cnt += 1
            if stop_cnt > PATIENCE:
                print()
                break

        print()

	model saved!
Epoch 26 	train loss=5.980 	train auc=0.786 	valid loss=1.535 	valid auc=0.777 	model saved!
Epoch 27 	train loss=5.963 	train auc=0.785 	valid loss=1.534 	valid auc=0.778 	model saved!
Epoch 28 	train loss=5.971 	train auc=0.785 	valid loss=1.535 	valid auc=0.778 	model saved!
Epoch 29 	train loss=6.000 	train auc=0.782 	valid loss=1.537 	valid auc=0.778 	model saved!
Epoch 30 	train loss=5.937 	train auc=0.788 	valid loss=1.537 	valid auc=0.778 	
Epoch 31 	train loss=5.933 	train auc=0.788 	valid loss=1.535 	valid auc=0.779 	model saved!
Epoch 32 	train loss=5.951 	train auc=0.786 	valid loss=1.535 	valid auc=0.780 	model saved!
Epoch 33 	train loss=5.991 	train auc=0.785 	valid loss=1.531 	valid auc=0.780 	model saved!
Epoch 34 	train loss=5.975 	train auc=0.785 	valid loss=1.533 	valid auc=0.779 	
Epoch 35 	train loss=5.912 	train auc=0.790 	valid loss=1.531 	valid auc=0.780 	
Epoch 36 	train loss=5.968 	train auc=0.784 	valid loss=1.528 	valid auc=0.781 	model saved!

In [11]:
cv_auc = sum(best_aucs) / len(best_aucs)
print(f'CV auc={cv_auc:.3f}')

CV auc=0.795


In [12]:
testset = Dataset(testdata)
testloader = DataLoader(testset, batch_size=BATCH_SIZE, shuffle=False)

net = Net(INPUT_NUM)
net.to(device) 

votes = np.zeros(len(testloader.dataset))

for i in range(k_fold.get_n_splits()):
    net.load_state_dict(torch.load(f'../models/{i}.pt'))

    saved_outputs = []

    for i, data in enumerate(testloader, 0):
        inputs, labels = data
        inputs, labels = inputs.to(device), labels.to(device)

        outputs = net(inputs)
        saved_outputs += outputs.squeeze().tolist()

    votes += np.array(saved_outputs)

votes = votes / k_fold.get_n_splits()

In [13]:
sample_submission = pd.read_csv(os.path.join(PATH, 'sample_submission.csv'))
sample_submission['problem'] = votes
sample_submission.to_csv('../submission.csv', index=False)
sample_submission


Unnamed: 0,user_id,problem
0,30000,0.888938
1,30001,0.299995
2,30002,0.361327
3,30003,0.487069
4,30004,0.430292
...,...,...
14994,44994,0.479245
14995,44995,0.350849
14996,44996,0.758904
14997,44997,0.792922
