In [6]:
import os

# This variable is used by helperbot to make the training deterministic
os.environ["SEED"] = "828"

import logging
import gc
from pathlib import Path

import torch
import torch.nn as nn
import numpy as np
import pandas as pd
from sklearn.metrics import log_loss
from sklearn.model_selection import StratifiedKFold
from torch.optim.lr_scheduler import CosineAnnealingLR
from torch.utils.data import Dataset, DataLoader
from pytorch_pretrained_bert import BertTokenizer
from pytorch_pretrained_bert.modeling import BertModel
# from allennlp.modules.span_extractors import SelfAttentiveSpanExtractor, EndpointSpanExtractor

# from helperbot import (
#     TriangularLR, BaseBot, WeightDecayOptimizerWrapper,
#     GradualWarmupScheduler
# )

In [14]:
def extract_target(df):
    """Add the ``Neither`` flag and the 3-class ``target`` column to ``df``.

    Class encoding written to ``target``:
        0 -- default (no other rule matched),
        1 -- ``B-coref`` equals 1,
        2 -- neither ``A-coref`` nor ``B-coref`` is set.

    The frame is modified in place and also returned. The class counts
    of ``target`` are printed as a side effect.
    """
    a_hit = df['A-coref']
    b_hit = df['B-coref']

    # Flag the rows where neither candidate is marked as the referent.
    df["Neither"] = 0
    df.loc[~(a_hit | b_hit), "Neither"] = 1

    # Apply the label rules in order; the "Neither" rule is applied last.
    df["target"] = 0
    for label, mask in ((1, b_hit == 1), (2, df["Neither"] == 1)):
        df.loc[mask, "target"] = label

    print(df["target"].value_counts())
    return df

In [7]:
# Training frame: the GAP "test" and "validation" TSV files stacked row-wise.
# The original row indices of each file are kept (no re-indexing).
df_train = pd.concat(
    [
        pd.read_csv(tsv_path, delimiter="\t")
        for tsv_path in ("../input/gap-test.tsv", "../input/gap-validation.tsv")
    ],
    axis=0,
)

In [8]:
# The GAP "development" file is used as the test set in this notebook; a later
# cell asserts that its row count matches the stage-1 sample submission.
df_test = pd.read_csv("../input/gap-development.tsv", delimiter="\t")

In [15]:
# Adds the "Neither" and "target" columns to the training frame (in place)
# and prints the class counts of "target".
df_train = extract_target(df_train)

0    1105
1    1060
2     289
Name: target, dtype: int64


In [17]:
# Same labelling for the test frame; the class counts are printed below.
df_test = extract_target(df_test)

1    925
0    874
2    201
Name: target, dtype: int64


In [20]:
# Load the stage-1 submission template and check that it has exactly one row
# per test example.
sample_sub = pd.read_csv("../input/sample_submission_stage_1.csv")
assert len(sample_sub) == len(df_test)

In [21]:
BERT_MODEL = 'bert-large-uncased'
# CASED describes the checkpoint: False means the vocabulary is lower-cased.
CASED = False
tokenizer = BertTokenizer.from_pretrained(
    BERT_MODEL,
    # An uncased checkpoint needs lower-cased input, so the flag is the
    # negation of CASED. Passing CASED directly disabled lower-casing for the
    # uncased vocabulary.
    do_lower_case=not CASED,
    never_split = ("[UNK]", "[SEP]", "[PAD]", "[CLS]", "[MASK]")
)

100%|██████████| 231508/231508 [00:00<00:00, 344314.08B/s]


In [30]:
def tokenize(row, tokenizer):
    """Tokenize one GAP example and locate the A / B / pronoun spans.

    The text is cut at the three mentions (A, B and the pronoun) and each
    piece is tokenized separately, so the token positions of the mentions
    are known exactly.

    Parameters
    ----------
    row : mapping
        Must provide "Text", "A", "A-offset", "B", "B-offset", "Pronoun"
        and "Pronoun-offset"; the offsets are character positions in "Text".
    tokenizer : object
        Anything with a ``tokenize(str) -> list`` method.

    Returns
    -------
    tokens : list
        Tokens of the whole text, in reading order.
    offsets : list of int
        ``[A_start, A_end, B_start, B_end, P]`` token indices (ends are
        inclusive), relative to ``tokens``.

    Raises
    ------
    AssertionError
        If a mention is not found at its stated offset, or if the pronoun
        does not tokenize to exactly one token.
    """
    text = row["Text"]
    # Process the mentions in the order they occur in the text, i.e. sorted
    # by character offset (not by name), so the pieces stay in reading order.
    break_points = sorted(
        [
            ("A", row["A-offset"], row["A"]),
            ("B", row["B-offset"], row["B"]),
            ("P", row["Pronoun-offset"], row["Pronoun"]),
        ], key=lambda x: x[1]
    )
    tokens, spans, current_pos = [], {}, 0
    for name, offset, mention in break_points:
        # Text between the previous mention (or the start) and this one.
        tokens.extend(tokenizer.tokenize(text[current_pos:offset]))
        # Make sure we do not get it wrong
        assert text[offset:offset+len(mention)] == mention
        # Tokenize the target
        tmp_tokens = tokenizer.tokenize(text[offset:offset+len(mention)])
        spans[name] = [len(tokens), len(tokens) + len(tmp_tokens) - 1] # inclusive
        tokens.extend(tmp_tokens)
        current_pos = offset + len(mention)
    # Remainder of the text after the last mention.
    tokens.extend(tokenizer.tokenize(text[current_pos:]))
    # The pronoun is expected to be a single token.
    assert spans["P"][0] == spans["P"][1]
    return tokens, (spans["A"] + spans["B"] + [spans["P"][0]])

def collate_examples(batch, truncate_len=490):
    """Batch preparation.

    1. Pad the token-id sequences with zeros to a common length (the longest
       sequence in the batch, capped at ``truncate_len``); longer sequences
       are truncated.
    2. Stack the span offsets and shift them by one for the [CLS] token.
    3. Transform the target into a LongTensor, when labels are present.

    Parameters
    ----------
    batch : list of tuple
        Each item is ``(token_ids, offsets)`` or ``(token_ids, offsets, label)``.
        ``label`` may be ``None`` for unlabeled examples.
    truncate_len : int
        Maximum number of token ids kept per example.

    Returns
    -------
    (token_tensor, offsets, labels)
        ``labels`` is ``None`` when the batch carries no labels.

    Note: offsets are not adjusted for truncation, so a span located past
    ``truncate_len`` would point outside the padded tensor.
    """
    transposed = list(zip(*batch))
    max_len = min(
        max((len(x) for x in transposed[0])),
        truncate_len
    )
    tokens = np.zeros((len(batch), max_len), dtype=np.int64)
    for i, row in enumerate(transposed[0]):
        row = np.array(row[:truncate_len])
        tokens[i, :len(row)] = row
    token_tensor = torch.from_numpy(tokens)
    # Offsets
    offsets = torch.stack([
        torch.LongTensor(x) for x in transposed[1]
    ], dim=0) + 1 # Account for the [CLS] token
    # Labels: unlabeled batches arrive either as 2-tuples or as 3-tuples whose
    # label slot is None; both mean "no labels".
    if len(transposed) == 2 or all(label is None for label in transposed[2]):
        return token_tensor, offsets, None
    labels = torch.LongTensor(transposed[2])
    return token_tensor, offsets, labels

class GAPDataset(Dataset):
    """Dataset of pre-tokenized GAP examples.

    Every row of ``df`` is tokenized once at construction time. Items are
    ``(token_ids, span_offsets, label)`` triples, where ``label`` is ``None``
    when the dataset was built with ``labeled=False``.
    """

    def __init__(self, df, tokenizer, labeled=True):
        self.labeled = labeled
        if self.labeled:
            # Class ids taken from the "target" column.
            self.y = df.target.values.astype("uint8")

        encoded = []
        for _, record in df.iterrows():
            word_pieces, span_offsets = tokenize(record, tokenizer)
            # Wrap the sequence in the special tokens before mapping to ids;
            # span_offsets stay relative to the unwrapped token list.
            token_ids = tokenizer.convert_tokens_to_ids(
                ["[CLS]"] + word_pieces + ["[SEP]"])
            encoded.append((token_ids, span_offsets))
        self.tokens = [token_ids for token_ids, _ in encoded]
        self.offsets = [span_offsets for _, span_offsets in encoded]

    def __len__(self):
        return len(self.tokens)

    def __getitem__(self, idx):
        label = self.y[idx] if self.labeled else None
        return self.tokens[idx], self.offsets[idx], label

In [31]:
# Test set wrapped for batched inference; order is preserved (no shuffling)
# so predictions line up with the rows of df_test.
test_ds = GAPDataset(df_test, tokenizer)
test_loader = DataLoader(
    test_ds,
    batch_size=128,
    shuffle=False,
    num_workers=2,
    pin_memory=True,
    collate_fn=collate_examples,
)

In [32]:
class GAPModel(nn.Module):
    """BERT encoder followed by a classification head.

    ``Head`` is not defined in this block; it has to exist in the notebook
    namespace before the model is instantiated.
    """

    def __init__(self, bert_model: str, device: torch.device):
        super().__init__()
        self.device = device
        # Hidden size of each supported checkpoint family.
        for hidden_size, model_names in (
            (768, ("bert-base-uncased", "bert-base-cased")),
            (1024, ("bert-large-uncased", "bert-large-cased")),
        ):
            if bert_model in model_names:
                self.bert_hidden_size = hidden_size
                break
        else:
            raise ValueError("Unsupported BERT model.")
        self.bert = BertModel.from_pretrained(bert_model).to(device)
        self.head = Head(self.bert_hidden_size).to(device)

    def forward(self, token_tensor, offsets):
        """Encode the token ids with BERT and pass the result to the head."""
        input_ids = token_tensor.to(self.device)
        # Positions holding id 0 are masked out of attention.
        padding_mask = (input_ids > 0).long()
        encoded, _ = self.bert(
            input_ids,
            attention_mask=padding_mask,
            token_type_ids=None,
            output_all_encoded_layers=False,
        )
        return self.head(encoded, offsets.to(self.device))

In [34]:
# 5-fold stratified cross-validation over the training frame.
# shuffle=True is required for random_state to have any effect; without it
# the seed is ignored by older scikit-learn and rejected with a ValueError by
# recent releases.
#
# NOTE: set_trainable, WeightDecayOptimizerWrapper, GAPBot and TriangularLR
# are not defined in the visible cells (the helperbot import at the top is
# commented out); they must be available before this cell is run.
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=191)

# One entry per fold.
val_preds, test_preds, val_ys, val_losses = [], [], [], []
for train_index, valid_index in skf.split(df_train, df_train["target"]):
    print("=" * 20)
    print("Fold " + str(len(val_preds)))
    print("=" * 20)
    # Positional indexing: df_train keeps the per-file row labels, so .iloc
    # (not .loc) is the correct way to select the fold rows.
    train_ds = GAPDataset(df_train.iloc[train_index], tokenizer)
    val_ds = GAPDataset(df_train.iloc[valid_index], tokenizer)
    train_loader = DataLoader(
        train_ds,
        collate_fn = collate_examples,
        batch_size=32,
        num_workers=2,
        pin_memory=True,
        shuffle=True,
        drop_last=True
    )
    val_loader = DataLoader(
        val_ds,
        collate_fn = collate_examples,
        batch_size=128,
        num_workers=2,
        pin_memory=True,
        shuffle=False
    )
    model = GAPModel(BERT_MODEL, torch.device("cuda:0"))
    # Only the head is trained; BERT is used as a frozen feature extractor.
    # You can unfreeze the last layer of bert by calling set_trainable(model.bert.encoder.layer[23], True)
    set_trainable(model.bert, False)
    set_trainable(model.head, True)
    optimizer = WeightDecayOptimizerWrapper(
        torch.optim.Adam(model.parameters(), lr=2e-3),
        0.05
    )
    # optimizer = torch.optim.Adam(model.parameters(), lr=2e-3)
    bot = GAPBot(
        model, train_loader, val_loader,
        optimizer=optimizer, echo=True,
        avg_window=40
    )
    gc.collect()
    steps_per_epoch = len(train_loader)
    # Train for 15 epochs' worth of steps.
    n_steps = steps_per_epoch * 15
    bot.train(
        n_steps,
        log_interval=steps_per_epoch // 2,
        snapshot_interval=steps_per_epoch,
#         scheduler=GradualWarmupScheduler(optimizer, 20, int(steps_per_epoch * 4),
#             after_scheduler=CosineAnnealingLR(
#                 optimizer, n_steps - int(steps_per_epoch * 4)
#             )
#         )
        scheduler=TriangularLR(
            optimizer, 20, ratio=3, steps_per_cycle=n_steps)
    )
    # Load the best checkpoint
    bot.load_model(bot.best_performers[0][1])
    bot.remove_checkpoints(keep=0)
    # Probabilities are clamped away from 0 and 1 before computing log loss.
    val_preds.append(torch.softmax(bot.predict(val_loader), -1).clamp(1e-4, 1-1e-4).cpu().numpy())
    val_ys.append(df_train.iloc[valid_index].target.astype("uint8").values)
    val_losses.append(log_loss(val_ys[-1], val_preds[-1]))
    bot.logger.info("Confirm val loss: %.4f", val_losses[-1])
    test_preds.append(torch.softmax(bot.predict(test_loader), -1).clamp(1e-4, 1-1e-4).cpu().numpy())

Fold 1


100%|██████████| 1248501532/1248501532 [03:07<00:00, 6674702.69B/s]


NameError: name 'Head' is not defined