# Import Libraries and Initial Settings

In [29]:
import os
import pandas as pd
import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.utils.data import DataLoader
from sklearn.preprocessing import LabelBinarizer
from sklearn.model_selection import KFold
from sklearn.metrics import roc_auc_score
from tqdm import tqdm

In [30]:
CUDA         = 1
RANDOM_STATE = 2021

In [31]:
os.environ["CUDA_VISIBLE_DEVICES"] = f"{CUDA}"
device = torch.device(f'cuda' if torch.cuda.is_available() else 'cpu')
torch.manual_seed(RANDOM_STATE)
np   .random.seed(RANDOM_STATE)

# Load Data

In [32]:
PATH = '../data'
train_err     = pd.read_csv(os.path.join(PATH,     'train_err_data.csv'))
train_problem = pd.read_csv(os.path.join(PATH, 'train_problem_data.csv'))
test_err      = pd.read_csv(os.path.join(PATH,      'test_err_data.csv'))

# Data Prepration

In [33]:
def preprocess(err, onehot_encoder, min_id, max_id):
    user_ids = list(err['user_id'])

    # one-hot encode errtype
    onehot_errtypes = onehot_encoder.transform(err['errtype'])
    features = list(onehot_errtypes)  # [np.array([0, ...]), ...]

    x = [[] for _ in range(max_id - min_id + 1)]

    # store one-hot errtype into x
    for user_id, feature in zip(user_ids, features):
        i = user_id - min_id
        x[i].append(feature)

    for i in range(len(x)):
        x[i] = np.array(x[i])

    return x

In [34]:
train_min_id, train_max_id = train_err['user_id'].min(), train_err['user_id'].max()
test_min_id ,  test_max_id =  test_err['user_id'].min(),  test_err['user_id'].max()


In [35]:
onehot_encoder = LabelBinarizer()
onehot_encoder.fit(train_err['errtype'])

LabelBinarizer()

In [36]:
# lists of np.array(seq_len, #features)
train_x = preprocess(train_err, onehot_encoder, train_min_id, train_max_id)
test_x  = preprocess( test_err, onehot_encoder,  test_min_id,  test_max_id)

In [38]:
max_len = 0
for arr in train_x:
    max_len = max(max_len, len(arr))
print(max_len)
for arr in test_x:
    max_len = max(max_len, len(arr))
print(max_len)

222186
396478


In [None]:
# make train_y
problem_user_ids = sorted(train_problem['user_id'].unique())
train_y = [0] * (train_max_id - train_min_id + 1)
for user_id in problem_user_ids:
    i = user_id - train_min_id
    train_y[i] = 1

In [66]:
input_size = train_x[0].shape[-1]
input_size

41

# Define a Model Class

In [61]:
class Net(nn.Module):
    def __init__(self, input_size, hidden_size, kernel_size, stride, dropout=0.5):
        super(Net, self).__init__()

        self.conv1 = nn.Conv1d(input_size , hidden_size, kernel_size, stride)
        self.conv2 = nn.Conv1d(hidden_size, hidden_size, kernel_size, stride)
        self.pool = nn.MaxPool1d(8)
        self.small_dropout = nn.Dropout(0.15)
        self.dropout = nn.Dropout(dropout)
        self.dense = nn.Sequential(
            nn.Linear(hidden_size * 126, hidden_size * 16),
            nn.Dropout(),
            nn.ReLU(),
            nn.Linear(hidden_size * 16, 1),
            nn.Sigmoid()
        )

    def forward(self, x):  # (batch_size, input_size, seq_len)
        x = self.small_dropout(x)

        x = self.conv1(x)
        x = self.dropout(x)
        x = F.relu(x)
        x = self.pool(x)
        # x=(batch_size, hidden_size, *)

        x = self.conv2(x)
        x = self.dropout(x)
        x = F.relu(x)
        x = self.pool(x)
        # x=(batch_size, hidden_size, *)

        x = torch.flatten(x, start_dim=1)
        x = self.dense(x)

        return x

In [62]:
net = Net(41, 64, 15, 7).to(device)
from torchsummary import summary
summary(net, (41, 396478))

----------------------------------------------------------------
        Layer (type)               Output Shape         Param #
           Dropout-1           [-1, 41, 396478]               0
            Conv1d-2            [-1, 64, 56638]          39,424
           Dropout-3            [-1, 64, 56638]               0
         MaxPool1d-4             [-1, 64, 7079]               0
            Conv1d-5             [-1, 64, 1010]          61,504
           Dropout-6             [-1, 64, 1010]               0
         MaxPool1d-7              [-1, 64, 126]               0
            Linear-8                 [-1, 1024]       8,258,560
           Dropout-9                 [-1, 1024]               0
             ReLU-10                 [-1, 1024]               0
           Linear-11                    [-1, 1]           1,025
          Sigmoid-12                    [-1, 1]               0
Total params: 8,360,513
Trainable params: 8,360,513
Non-trainable params: 0
---------------------------

# Training

In [96]:
# hyperparameters for learning
LR = 1e-3
HIDDEN_SIZE = 256
MAX_EPOCHS = 100
PATIENCE = 15

k_fold = KFold(n_splits=5, shuffle=True, random_state=RANDOM_STATE)
best_aucs = [0.] * k_fold.get_n_splits()

for i_fold, (train_idx, valid_idx) in enumerate(k_fold.split(train_x)):
    print(f'Fold {i_fold} started!')

    net = Net(input_size, HIDDEN_SIZE).to(device)
    optimizer = torch.optim.Adam(net.parameters(), lr=LR)
    criterion = nn.BCELoss()

    stop_cnt = 0

    # loop over the dataset multiple times
    for epoch in range(1, MAX_EPOCHS + 1):
        print(f'Epoch {epoch} ', end='\t')

        # training
        net.train()
        running_loss = 0.0
        saved_outputs = []
        saved_labels = []

        for idx in tqdm(train_idx):
            inputs = torch.tensor(train_x[idx],
                dtype=torch.float, device=device).unsqueeze(1)
            labels = torch.tensor([train_y[idx]],
                dtype=torch.float, device=device).unsqueeze(1)

            # zero the parameter gradients
            optimizer.zero_grad()

            # forward + backward + optimize
            outputs = net(inputs)
            loss = criterion(outputs, labels)
            loss.backward()
            optimizer.step()

            running_loss += loss.item() * len(outputs)
            saved_outputs.append(outputs        .item())
            saved_labels .append((labels >= 0.5).item())

        running_loss /= len(train_idx)
        auc = roc_auc_score(saved_labels, saved_outputs)

        print(f'train loss={running_loss:.3f} \t'
              f'train auc={ auc         :.3f} \t', end='')

        # validation
        net.eval()
        running_loss = 0.0
        saved_outputs = []
        saved_labels = []

        for idx in tqdm(valid_idx):
            inputs = torch.tensor(train_x[idx],
                dtype=torch.float, device=device).unsqueeze(1)
            labels = torch.tensor([train_y[idx]],
                dtype=torch.float, device=device).unsqueeze(1)

            # forward
            outputs = net(inputs)
            loss = criterion(outputs, labels)

            running_loss += loss.item() * len(outputs)
            saved_outputs.append(outputs        .item())
            saved_labels .append((labels >= 0.5).item())

        running_loss /= len(valid_idx)
        auc = roc_auc_score(saved_labels, saved_outputs)

        print(f'valid loss={running_loss:.3f} \t'
              f'valid auc={ auc         :.3f} \t', end='')

        if auc > best_aucs[i_fold]:
            best_aucs[i_fold] = auc
            torch.save(net.state_dict(), f'../models/{i_fold}.pt')
            print(f'model saved!', end='')
            stop_cnt = 0
        else:
            stop_cnt += 1
            if stop_cnt > PATIENCE:
                print()
                break
        print()

  0%|          | 2/12000 [00:00<17:30, 11.42it/s]Fold 0 started!
100%|██████████| 12000/12000 [09:07<00:00, 21.92it/s]


NameError: name 'trainloader' is not defined

# Make a Submission File

In [None]:
testset = Dataset(testdata)
testloader = DataLoader(testset, batch_size=BATCH_SIZE, shuffle=False)

net = Net(INPUT_NUM)
net.to(device) 

votes = np.zeros(len(testloader.dataset))

for i in range(k_fold.get_n_splits()):
    net.load_state_dict(torch.load(f'../models/{i}.pt'))

    saved_outputs = []

    for i, data in enumerate(testloader, 0):
        inputs, labels = data
        inputs, labels = inputs.to(device), labels.to(device)

        outputs = net(inputs)
        saved_outputs += outputs.squeeze().tolist()

    votes += np.array(saved_outputs)

votes = votes / k_fold.get_n_splits()

sample_submission = pd.read_csv(os.path.join(PATH, 'sample_submission.csv'))
sample_submission['problem'] = votes
sample_submission.to_csv('../submission.csv', index=False)

# Print the AUC Score

In [None]:
cv_auc = sum(best_aucs) / len(best_aucs)
cv_auc