# Import Libraries and Initial Settings

In [1]:
import os
import pandas as pd
import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.utils.data import DataLoader
from sklearn.preprocessing import LabelBinarizer
from sklearn.model_selection import KFold
from sklearn.metrics import roc_auc_score
from tqdm import tqdm

In [2]:
# Index of the CUDA device to run on when a GPU is available.
CUDA = 1
# Seed shared by the PyTorch/NumPy RNGs and the K-fold shuffling.
RANDOM_STATE = 2021

In [3]:
# Seed the NumPy and PyTorch random number generators.
torch.manual_seed(RANDOM_STATE)
np.random.seed(RANDOM_STATE)

# Use the configured GPU when CUDA is usable, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device(f'cuda:{CUDA}')
else:
    device = torch.device('cpu')

# Load Data

In [4]:
PATH = '../data'


def _read_csv(file_name):
    """Read one CSV file located in the data directory PATH."""
    return pd.read_csv(os.path.join(PATH, file_name))


# Error logs for the train/test users and the problem reports of train users.
train_err = _read_csv('train_err_data.csv')
train_problem = _read_csv('train_problem_data.csv')
test_err = _read_csv('test_err_data.csv')

# Data Preparation

In [None]:
def preprocess(err, onehot_encoder, min_id, max_id):
    """Group one-hot encoded error types into one feature matrix per user.

    Args:
        err: DataFrame with a 'user_id' column and an 'errtype' column.
        onehot_encoder: fitted encoder whose ``transform`` maps the
            'errtype' column to one encoded row per record.
        min_id: user_id stored at position 0 of the result.
        max_id: user_id stored at the last position of the result.

    Returns:
        A list of ``max_id - min_id + 1`` arrays. Entry ``k`` holds the
        encoded rows of user ``min_id + k`` in their order of appearance in
        ``err``, with shape (seq_len, #features). Users without any record
        get an empty array of shape (0, #features), so the feature axis is
        present for every entry.
    """
    onehot_errtypes = np.asarray(onehot_encoder.transform(err['errtype']))

    # One bucket per user_id in [min_id, max_id], addressed by id offset.
    buckets = [[] for _ in range(max_id - min_id + 1)]
    for user_id, feature in zip(err['user_id'], onehot_errtypes):
        buckets[user_id - min_id].append(feature)

    # np.array([]) would collapse to shape (0,); build the empty case
    # explicitly so it keeps the trailing feature dimension(s).
    empty_shape = (0,) + onehot_errtypes.shape[1:]
    return [
        np.array(rows) if rows
        else np.empty(empty_shape, dtype=onehot_errtypes.dtype)
        for rows in buckets
    ]

In [None]:
# Smallest and largest user_id present in each error log.
train_user_ids = train_err['user_id']
test_user_ids = test_err['user_id']

train_min_id = train_user_ids.min()
train_max_id = train_user_ids.max()
test_min_id = test_user_ids.min()
test_max_id = test_user_ids.max()


In [None]:
# Fit the error-type encoder on the training log only; the same fitted
# encoder is reused for the test log.
onehot_encoder = LabelBinarizer().fit(train_err['errtype'])
onehot_encoder

In [None]:
# Per-user feature matrices, one np.array(seq_len, #features) per user_id
# offset within the respective [min_id, max_id] range.
train_x = preprocess(
    err=train_err,
    onehot_encoder=onehot_encoder,
    min_id=train_min_id,
    max_id=train_max_id,
)
test_x = preprocess(
    err=test_err,
    onehot_encoder=onehot_encoder,
    min_id=test_min_id,
    max_id=test_max_id,
)

In [None]:
# Binary targets aligned with train_x: position k is 1 when user
# (train_min_id + k) appears in the problem reports, else 0.
n_train_users = train_max_id - train_min_id + 1
train_y = [0] * n_train_users

problem_user_ids = sorted(train_problem['user_id'].unique())
for problem_user_id in problem_user_ids:
    train_y[problem_user_id - train_min_id] = 1

In [66]:
# Width of the feature axis, read off the first user's feature matrix.
first_user_sequence = train_x[0]
input_size = first_user_sequence.shape[-1]
input_size

41

# Define a Model Class

In [92]:
class Net(nn.Module):
    """GRU encoder with max-pooling over time and a sigmoid output head.

    Args:
        input_size: number of features per time step.
        hidden_size: width of the GRU hidden state and of the hidden
            dense layer.
    """

    def __init__(self, input_size, hidden_size):
        super().__init__()
        # Single-layer GRU. nn.GRU applies its `dropout` only between
        # stacked layers, so passing it here would do nothing except raise
        # a UserWarning; regularisation is left to the dense head.
        self.rnn = nn.GRU(input_size, hidden_size)
        self.dense = nn.Sequential(
            nn.Linear(hidden_size, hidden_size),
            nn.Dropout(),
            nn.ReLU(),
            nn.Linear(hidden_size, 1),
            nn.Sigmoid(),
        )

    def forward(self, x):
        """Map a (seq_len, batch_size, input_size) tensor to (batch_size, 1).

        The output has passed through a sigmoid, so every value lies in
        [0, 1].
        """
        x, _ = self.rnn(x)  # (seq_len, batch_size, hidden_size)
        # Element-wise maximum over the time axis.
        x = torch.max(x, dim=0).values  # (batch_size, hidden_size)
        x = self.dense(x)  # (batch_size, 1)
        return x

# Training

In [96]:
# hyperparameters for learning
LR = 1e-3
HIDDEN_SIZE = 256
MAX_EPOCHS = 100
PATIENCE = 15  # non-improving epochs tolerated before early stopping


def _run_epoch(net, indices, criterion, optimizer=None):
    """Run one pass over the given users and return (mean loss, ROC AUC).

    Args:
        net: the model to run.
        indices: positions into train_x / train_y to process, one user per
            step (batch size 1).
        criterion: loss applied to the model output and the label.
        optimizer: when given, the pass trains the model (train mode,
            backward pass, optimizer step). When None, the pass only
            evaluates: eval mode with gradient tracking switched off.

    Returns:
        Tuple of the loss averaged over ``indices`` and the ROC AUC of the
        collected outputs against the labels.
    """
    training = optimizer is not None
    net.train(training)

    total_loss = 0.0
    scores = []
    targets = []

    # Evaluation needs no autograd graph; skipping it saves memory and time.
    with torch.set_grad_enabled(training):
        for idx in tqdm(indices):
            # (seq_len, #features) -> (seq_len, 1, #features)
            inputs = torch.tensor(train_x[idx],
                dtype=torch.float, device=device).unsqueeze(1)
            labels = torch.tensor([train_y[idx]],
                dtype=torch.float, device=device).unsqueeze(1)

            if training:
                # zero the parameter gradients
                optimizer.zero_grad()

            outputs = net(inputs)
            loss = criterion(outputs, labels)

            if training:
                loss.backward()
                optimizer.step()

            total_loss += loss.item() * len(outputs)
            scores.append(outputs.item())
            targets.append((labels >= 0.5).item())

    return total_loss / len(indices), roc_auc_score(targets, scores)


k_fold = KFold(n_splits=5, shuffle=True, random_state=RANDOM_STATE)
best_aucs = [0.] * k_fold.get_n_splits()

# torch.save does not create missing directories.
os.makedirs('../models', exist_ok=True)

for i_fold, (train_idx, valid_idx) in enumerate(k_fold.split(train_x)):
    print(f'Fold {i_fold} started!')

    net = Net(input_size, HIDDEN_SIZE).to(device)
    optimizer = torch.optim.Adam(net.parameters(), lr=LR)
    criterion = nn.BCELoss()

    stop_cnt = 0

    # loop over the dataset multiple times
    for epoch in range(1, MAX_EPOCHS + 1):
        print(f'Epoch {epoch} ', end='\t')

        # training
        train_loss, train_auc = _run_epoch(net, train_idx, criterion, optimizer)
        print(f'train loss={train_loss:.3f} \t'
              f'train auc={train_auc:.3f} \t', end='')

        # validation
        valid_loss, valid_auc = _run_epoch(net, valid_idx, criterion)
        print(f'valid loss={valid_loss:.3f} \t'
              f'valid auc={valid_auc:.3f} \t', end='')

        # Keep the checkpoint with the best validation AUC of this fold.
        if valid_auc > best_aucs[i_fold]:
            best_aucs[i_fold] = valid_auc
            torch.save(net.state_dict(), f'../models/{i_fold}.pt')
            print(f'model saved!', end='')
            stop_cnt = 0
        else:
            stop_cnt += 1
        print()

        # Early stopping: give up on this fold once more than PATIENCE
        # consecutive epochs failed to improve the validation AUC.
        if stop_cnt > PATIENCE:
            break

  0%|          | 2/12000 [00:00<17:30, 11.42it/s]Fold 0 started!
100%|██████████| 12000/12000 [09:07<00:00, 21.92it/s]


NameError: name 'trainloader' is not defined

# Make a Submission File

In [None]:
# Average the predictions of the per-fold checkpoints over all test users.
net = Net(input_size, HIDDEN_SIZE).to(device)
net.eval()  # disable dropout for inference

n_folds = k_fold.get_n_splits()
votes = np.zeros(len(test_x))

with torch.no_grad():
    for i_fold in range(n_folds):
        # map_location lets checkpoints saved on another device load here.
        net.load_state_dict(
            torch.load(f'../models/{i_fold}.pt', map_location=device))

        for i_user, sequence in enumerate(test_x):
            # A user without any error record has nothing to feed the GRU;
            # their vote stays at 0.
            # NOTE(review): confirm 0 is the desired default for such users.
            if len(sequence) == 0:
                continue

            # (seq_len, #features) -> (seq_len, 1, #features), as in training
            inputs = torch.tensor(sequence,
                dtype=torch.float, device=device).unsqueeze(1)
            votes[i_user] += net(inputs).item()

votes = votes / n_folds

# NOTE(review): assumes sample_submission has one row per user_id from
# test_min_id to test_max_id, in ascending order — confirm against the file.
sample_submission = pd.read_csv(os.path.join(PATH, 'sample_submission.csv'))
sample_submission['problem'] = votes
sample_submission.to_csv('../submission.csv', index=False)

# Print the AUC Score

In [None]:
# Cross-validation score: mean of the best validation AUC kept for each fold.
n_scored_folds = len(best_aucs)
cv_auc = sum(best_aucs) / n_scored_folds
cv_auc