# Import

In [33]:
import os
import re
import random
import numpy as np
import pandas as pd
import string
from tqdm.autonotebook import tqdm, trange

import torch
import torch.nn as nn
from torch.nn import functional as F
from torch.optim import lr_scheduler

from sklearn import model_selection
from sklearn import metrics
from sklearn.model_selection import StratifiedKFold

import transformers
import tokenizers
from transformers import AdamW, get_linear_schedule_with_warmup

from IPython.core.debugger import set_trace

# Settings

In [34]:
# Run settings kept as a plain dict. The cells shown in this notebook read
# their hyper-parameters from the `config` class instead, so the values here
# are not necessarily the ones in effect during training.
args = dict(
    # Data locations
    data_dir='/data/ai_challenge/tweet_sentiment_extraction/',
    train_file='train.csv',
    test_file='test.csv',
    save_model_dir='trained_model',

    # Model and optimisation settings
    model_name='bert-base-uncased',
    do_eval=False,
    gradient_accumulation_steps=1,
    max_seq_length=128,
    batch_size=8,
    n_epoch=3,
    weight_decay=1e-7,
    adam_epsilon=1e-8,
    warmup_steps=0,
    n_splits=5,
    learning_rate=1e-5,
    seed=1234,
)

# class config:
#     DATA_DIR = '../input/tweet-sentiment-extraction'
#     BERT_PATH = '../input/bert-base-uncased'
#     OUTPUT_DIR = '/kaggle/working'
#     TRAIN_FILE = 'train.csv'
#     TEST_FILE = 'test.csv'
#     SAVE_MODEL_DIR = ''
#     MAX_LEN = 128
#     TRAIN_BATCH_SIZE = 32
#     VALID_BATCH_SIZE = 32
#     EPOCHS = 3
#     N_SPLITS = 5
#     SEED = 1234
#     TOKENIZER = tokenizers.BertWordPieceTokenizer(
#         f"{BERT_PATH}/vocab.txt", 
#         lowercase=True
#     )

class config:
    """Notebook-wide settings read by the dataset, model and training cells."""
    # Locations of the data, the pretrained BERT files and the output root.
    DATA_DIR = '/data/ai_challenge/tweet_sentiment_extraction'
    BERT_PATH = '/data/ai_challenge/tweet_sentiment_extraction/bert-base-uncased'
    OUTPUT_DIR = '/data/ai_challenge/tweet_sentiment_extraction'
    TRAIN_FILE = 'train.csv'
    TEST_FILE = 'test.csv'
    # Sub-directory of OUTPUT_DIR where per-fold checkpoints are written.
    SAVE_MODEL_DIR = 'trained_model'
    # Length every encoded example is padded to.
    MAX_LEN = 128
    TRAIN_BATCH_SIZE = 8
    VALID_BATCH_SIZE = 8
    # The training loop prints running metrics every LOGGING_STEPS batches.
    LOGGING_STEPS = 100
    LEARNING_RATE = 3e-5
    WEIGHT_DECAY = 1e-3
    EPOCHS = 3
    N_SPLITS = 5
    # Number of non-improving epochs tolerated by early stopping.
    PATIENCE = 2
    SEED = 1234
    # Use the GPU when one is available, otherwise fall back to the CPU so the
    # notebook still runs (slowly) on machines without CUDA.
    DEVICE = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    # WordPiece tokenizer built from the vocabulary shipped with the model.
    TOKENIZER = tokenizers.BertWordPieceTokenizer(
        f"{BERT_PATH}/vocab.txt",
        lowercase=True
    )

In [35]:
def seed_everything(seed=1234):
    """Seed the Python, NumPy and PyTorch RNGs and pin the cuDNN flags.

    Args:
        seed: integer seed applied to every generator; it is also exported
            as the ``PYTHONHASHSEED`` environment variable.
    """
    os.environ['PYTHONHASHSEED'] = str(seed)

    # The same seed goes to each library's generator in turn.
    for seed_rng in (random.seed, np.random.seed, torch.manual_seed, torch.cuda.manual_seed):
        seed_rng(seed)

    cudnn = torch.backends.cudnn
    cudnn.deterministic, cudnn.benchmark = True, False
# Seed every RNG with the notebook-wide seed from `config`.
seed_everything(config.SEED)

# Utils

In [36]:
class AverageMeter:
    """
    Running tracker that keeps the latest value and the weighted mean so far.

    Attributes:
        val: most recent value passed to ``update``.
        sum: weighted sum of all values seen since the last ``reset``.
        count: total weight seen since the last ``reset``.
        avg: ``sum / count``.
    """
    def __init__(self):
        self.reset()

    def reset(self):
        """Set every statistic back to zero."""
        self.val = self.avg = self.sum = self.count = 0

    def update(self, val, n=1):
        """Record ``val`` with weight ``n`` and refresh the running mean."""
        self.val = val
        self.count += n
        self.sum += val * n
        self.avg = self.sum / self.count


class EarlyStopping:
    """Stop training after the validation score stops improving.

    Call the instance once per epoch with the epoch's score. The model is
    checkpointed on the first call and whenever the score improves on the
    best one by at least ``delta``; otherwise a counter is incremented and
    ``early_stop`` becomes True once it reaches ``patience``.

    Args:
        patience: number of consecutive non-improving epochs tolerated.
        mode: "max" if a larger score is better, "min" if smaller is better.
        delta: minimum improvement over the best score that counts.
    """
    def __init__(self, patience=7, mode="max", delta=0.001):
        self.patience = patience
        self.counter = 0
        self.mode = mode
        self.best_score = None
        self.early_stop = False
        self.delta = delta
        # Lower-case `np.inf`: the `np.Inf` alias was removed in NumPy 2.0.
        self.val_score = np.inf if self.mode == "min" else -np.inf

    def __call__(self, epoch_score, model, model_path):
        """Register one epoch's score and checkpoint the model on improvement.

        Args:
            epoch_score: validation score for the epoch just finished.
            model: object exposing ``state_dict()``; saved on improvement.
            model_path: destination file for the checkpoint.
        """
        # Internally a larger `score` is always better, so "min" is negated.
        if self.mode == "min":
            score = -1.0 * epoch_score
        else:
            score = np.copy(epoch_score)

        if self.best_score is None:
            self.best_score = score
            self.save_checkpoint(epoch_score, model, model_path)
        elif score < self.best_score + self.delta:
            self.counter += 1
            print('EarlyStopping counter: {} out of {}'.format(self.counter, self.patience))
            if self.counter >= self.patience:
                self.early_stop = True
        else:
            self.best_score = score
            self.save_checkpoint(epoch_score, model, model_path)
            self.counter = 0

    def save_checkpoint(self, epoch_score, model, model_path):
        """Write ``model.state_dict()`` to ``model_path`` if the score is finite.

        ``val_score`` is updated to ``epoch_score`` whether or not a file was
        written.
        """
        # `np.isfinite` rejects +/-inf and every NaN. A membership test against
        # a list containing `np.nan` only matches that exact object, so a NaN
        # produced by a computation would slip through and be checkpointed.
        if np.isfinite(epoch_score):
            print('Validation score improved ({} --> {}). Saving model!'.format(self.val_score, epoch_score))
            torch.save(model.state_dict(), model_path)
        self.val_score = epoch_score


def jaccard(str1, str2):
    """Word-level Jaccard similarity between two strings.

    Both strings are lower-cased and split on whitespace; the score is the
    size of the intersection of the two word sets divided by the size of
    their union.

    Args:
        str1: first string.
        str2: second string.

    Returns:
        A float in [0, 1]. When neither string contains a word the two sets
        are identical, so 1.0 is returned instead of dividing by zero.
    """
    a = set(str1.lower().split())
    b = set(str2.lower().split())
    c = a & b
    union_size = len(a) + len(b) - len(c)
    if union_size == 0:
        return 1.0
    return float(len(c)) / union_size

# Data Processing

In [37]:
def process_data(tweet, selected_text, sentiment, tokenizer, max_len):
    """Encode one tweet and locate its selected span in token space.

    The model input is laid out as
    ``[101, <sentiment id>, 102, <tweet tokens...>, 102]`` and padded with
    zeros up to ``max_len``.

    Args:
        tweet: full tweet text.
        selected_text: substring of ``tweet`` to be predicted.
        sentiment: one of 'positive', 'negative', 'neutral'.
        tokenizer: object whose ``encode(text)`` returns a result with ``ids``
            and ``offsets`` lists; the first and last entries are dropped here
            (presumably the special tokens added by the tokenizer).
        max_len: length of the returned sequences. Tweets with too many
            tokens are truncated so that the result never exceeds it.

    Returns:
        dict with keys 'ids', 'mask', 'token_type_ids', 'targets_start',
        'targets_end', 'orig_tweet', 'orig_selected', 'sentiment', 'offsets'.
        The targets are token indices into 'ids'. If ``selected_text`` is
        empty, is not found in ``tweet``, or lies entirely in the truncated
        part, the targets fall back to the whole (kept) tweet rather than
        raising.

    Raises:
        KeyError: if ``sentiment`` is not one of the three known labels.
    """
    # Character span of the first occurrence of selected_text (-1 if absent).
    idx0 = tweet.find(selected_text) if selected_text else -1

    # Per-character flags marking the selected span.
    char_targets = [0] * len(tweet)
    if idx0 >= 0:
        char_targets[idx0: idx0 + len(selected_text)] = [1] * len(selected_text)

    tok_tweet = tokenizer.encode(tweet)
    input_ids_orig = tok_tweet.ids[1: -1]
    tweet_offsets = tok_tweet.offsets[1: -1]

    # Four positions are taken by the prefix and the closing separator, so cap
    # the tweet tokens to keep every example exactly max_len long after
    # padding. Over-long examples would otherwise give ragged batches.
    max_tweet_tokens = max(max_len - 4, 0)
    input_ids_orig = input_ids_orig[:max_tweet_tokens]
    tweet_offsets = tweet_offsets[:max_tweet_tokens]

    # Tokens whose character range overlaps the selected span.
    target_idx = [
        j for j, (offset1, offset2) in enumerate(tweet_offsets)
        if any(char_targets[offset1: offset2])
    ]

    if target_idx:
        targets_start = target_idx[0]
        targets_end = target_idx[-1]
    else:
        # No token overlaps the span: use the whole tweet as the target.
        targets_start = 0
        targets_end = max(len(input_ids_orig) - 1, 0)

    # Vocabulary ids of the sentiment words used as the prompt token.
    # NOTE(review): these, and 101/102 below, are tied to the vocabulary in
    # use (presumably bert-base-uncased) - verify against vocab.txt.
    sentiment_id = {
        'positive': 3893,
        'negative': 4997,
        'neutral': 8699
    }

    input_ids = [101] + [sentiment_id[sentiment]] + [102] + input_ids_orig + [102]
    token_type_ids = [0, 0, 0] + [1] * (len(input_ids_orig) + 1)
    mask = [1] * len(token_type_ids)
    tweet_offsets = [(0, 0)] * 3 + tweet_offsets + [(0, 0)]
    # Shift the targets past the three prefix tokens.
    targets_start += 3
    targets_end += 3

    padding_length = max_len - len(input_ids)
    if padding_length > 0:
        input_ids = input_ids + ([0] * padding_length)
        mask = mask + ([0] * padding_length)
        token_type_ids = token_type_ids + ([0] * padding_length)
        tweet_offsets = tweet_offsets + ([(0, 0)] * padding_length)

    return {
        'ids': input_ids,
        'mask': mask,
        'token_type_ids': token_type_ids,
        'targets_start': targets_start,
        'targets_end': targets_end,
        'orig_tweet': tweet,
        'orig_selected': selected_text,
        'sentiment': sentiment,
        'offsets': tweet_offsets
    }

# Data Loader

In [38]:
class TweetDataset:
    """Map-style dataset that encodes (tweet, sentiment, selected_text) rows.

    The tokenizer and the padded length are taken from ``config``.
    """
    def __init__(self, tweet, sentiment, selected_text):
        self.tweet = tweet
        self.sentiment = sentiment
        self.selected_text = selected_text
        self.tokenizer = config.TOKENIZER
        self.max_len = config.MAX_LEN

    def __len__(self):
        """Number of examples, i.e. the number of tweets."""
        return len(self.tweet)

    def __getitem__(self, item):
        """Encode example ``item`` and wrap its numeric fields in long tensors."""
        data = process_data(
            self.tweet[item],
            self.selected_text[item],
            self.sentiment[item],
            self.tokenizer,
            self.max_len
        )

        def as_long(key):
            return torch.tensor(data[key], dtype=torch.long)

        sample = {}
        for key in ('ids', 'mask', 'token_type_ids', 'targets_start', 'targets_end'):
            sample[key] = as_long(key)
        # The raw strings are passed through untouched for scoring later on.
        for key in ('orig_tweet', 'orig_selected', 'sentiment'):
            sample[key] = data[key]
        sample['offsets'] = as_long('offsets')
        return sample

# Model

In [39]:
class TweetModel(transformers.BertPreTrainedModel):
    """BERT encoder with a linear head giving per-token start and end logits.

    The head reads the concatenation of the last two hidden-state layers, so
    ``conf.output_hidden_states`` must be True.
    """
    def __init__(self, conf):
        super(TweetModel, self).__init__(conf)
        self.bert = transformers.BertModel.from_pretrained(config.BERT_PATH, config=conf)
        self.drop_out = nn.Dropout(0.1)
        # Two layers are concatenated, hence twice the encoder width. Reading
        # the width from the config keeps the head valid for other BERT sizes.
        self.l0 = nn.Linear(conf.hidden_size * 2, 2)
        nn.init.normal_(self.l0.weight, std=0.02)

    def forward(self, ids, mask, token_type_ids):
        """Return ``(start_logits, end_logits)``, one score per token each.

        Args:
            ids: token ids passed to BERT.
            mask: attention mask for ``ids``.
            token_type_ids: segment ids for ``ids``.
        """
        outputs = self.bert(
            ids,
            attention_mask = mask,
            token_type_ids = token_type_ids
        )
        # Positional indexing works both for the plain tuple returned by older
        # transformers releases and for the ModelOutput object returned by
        # newer ones. Index 2 holds the per-layer hidden states.
        hidden_states = outputs[2]

        out = torch.cat((hidden_states[-1], hidden_states[-2]), dim=-1)
        out = self.drop_out(out)
        logits = self.l0(out)

        # Split the two output channels into separate start/end score maps.
        start_logits, end_logits = logits.split(1, dim=-1)

        start_logits = start_logits.squeeze(-1)
        end_logits = end_logits.squeeze(-1)

        return start_logits, end_logits

In [40]:
def loss_fn(start_logits, end_logits, start_positions, end_positions):
    """Sum of the cross-entropy losses for the start and the end position.

    Args:
        start_logits: scores over positions for the span start.
        end_logits: scores over positions for the span end.
        start_positions: target start index per example.
        end_positions: target end index per example.

    Returns:
        Scalar tensor: start loss plus end loss.
    """
    criterion = nn.CrossEntropyLoss()
    return criterion(start_logits, start_positions) + criterion(end_logits, end_positions)

# Train and Valid Functions

In [41]:
def train_fn(data_loader, model, optimizer, scheduler=None):
    """Train ``model`` for one pass over ``data_loader``.

    Running loss and Jaccard averages are printed, then reset, every
    ``config.LOGGING_STEPS`` batches.

    Args:
        data_loader: iterable of batches as produced by ``TweetDataset``.
        model: callable returning ``(start_logits, end_logits)``.
        optimizer: optimizer stepped once per batch.
        scheduler: optional learning-rate scheduler stepped once per batch;
            skipped when None.
    """
    model.train()
    losses = AverageMeter()
    jaccards = AverageMeter()

    tk0 = tqdm(data_loader, total=len(data_loader), desc='Train')
    for bi, d in enumerate(tk0):
        sentiment = d["sentiment"]
        orig_selected = d["orig_selected"]
        orig_tweet = d["orig_tweet"]
        # Offsets are only used for string slicing on the host, so convert
        # them to NumPy once per batch (same as eval_fn).
        offsets = d["offsets"].numpy()

        ids = d["ids"].to(config.DEVICE, dtype=torch.long)
        token_type_ids = d["token_type_ids"].to(config.DEVICE, dtype=torch.long)
        mask = d["mask"].to(config.DEVICE, dtype=torch.long)
        targets_start = d["targets_start"].to(config.DEVICE, dtype=torch.long)
        targets_end = d["targets_end"].to(config.DEVICE, dtype=torch.long)

        model.zero_grad()
        outputs_start, outputs_end = model(ids=ids, mask=mask, token_type_ids=token_type_ids)

        loss = loss_fn(outputs_start, outputs_end, targets_start, targets_end)
        loss.backward()
        optimizer.step()
        # The scheduler is optional in the signature, so guard the call.
        if scheduler is not None:
            scheduler.step()

        outputs_start = torch.softmax(outputs_start, dim=1).cpu().detach().numpy()
        outputs_end = torch.softmax(outputs_end, dim=1).cpu().detach().numpy()
        jaccard_scores = []

        # Score the most likely start/end token of every example in the batch.
        for px, tweet in enumerate(orig_tweet):
            selected_tweet = orig_selected[px]
            tweet_sentiment = sentiment[px]
            jaccard_score, _ = calculate_jaccard_score(
                original_tweet=tweet,
                target_string=selected_tweet,
                sentiment_val=tweet_sentiment,
                idx_start=np.argmax(outputs_start[px, :]),
                idx_end=np.argmax(outputs_end[px, :]),
                offsets=offsets[px]
            )
            jaccard_scores.append(jaccard_score)

        jaccards.update(np.mean(jaccard_scores), ids.size(0))
        losses.update(loss.item(), ids.size(0))
        if bi >= config.LOGGING_STEPS and bi % config.LOGGING_STEPS == 0:
            print(f"Loss: {losses.avg} - Jaccard: {jaccards.avg}")
            # Reset so each printed line covers only the latest window.
            losses.reset()
            jaccards.reset()

In [42]:
def calculate_jaccard_score(original_tweet, target_string, sentiment_val, idx_start, idx_end, offsets):
    """Decode a predicted token span to text and score it against the target.

    Args:
        original_tweet: full tweet text.
        target_string: ground-truth selected text.
        sentiment_val: sentiment label of the tweet.
        idx_start: predicted start token index.
        idx_end: predicted end token index.
        offsets: per-token (start, end) character offsets into the tweet.

    Returns:
        ``(jaccard, text)``: the word-level Jaccard score and the decoded
        prediction. For 'neutral' tweets and tweets with fewer than two words
        the prediction is the whole tweet.
    """
    # An end that precedes the start collapses to a single-token span.
    idx_end = max(idx_end, idx_start)

    pieces = []
    n_offsets = len(offsets)
    for tok in range(idx_start, idx_end + 1):
        begin, stop = offsets[tok][0], offsets[tok][1]
        pieces.append(original_tweet[begin:stop])
        # Re-insert a space when there is a gap before the next token.
        has_next = (tok + 1) < n_offsets
        if has_next and stop < offsets[tok + 1][0]:
            pieces.append(" ")
    filtered_output = "".join(pieces)

    use_whole_tweet = sentiment_val == 'neutral' or len(original_tweet.split()) < 2
    if use_whole_tweet:
        filtered_output = original_tweet

    jac = jaccard(target_string.strip(), filtered_output.strip())
    return jac, filtered_output

def eval_fn(data_loader, model):
    """Evaluate ``model`` on ``data_loader`` without gradient tracking.

    Args:
        data_loader: iterable of batches as produced by ``TweetDataset``.
        model: callable returning ``(start_logits, end_logits)``.

    Returns:
        The batch-size-weighted average Jaccard score over all batches.
    """
    model.eval()
    loss_meter = AverageMeter()
    jaccard_meter = AverageMeter()

    def on_device(tensor):
        return tensor.to(config.DEVICE, dtype=torch.long)

    with torch.no_grad():
        progress = tqdm(data_loader, total=len(data_loader), desc='Valid')
        for batch in progress:
            tweets = batch["orig_tweet"]
            selected_texts = batch["orig_selected"]
            sentiments = batch["sentiment"]
            offsets = batch["offsets"].numpy()

            ids = on_device(batch["ids"])
            token_type_ids = on_device(batch["token_type_ids"])
            mask = on_device(batch["mask"])
            targets_start = on_device(batch["targets_start"])
            targets_end = on_device(batch["targets_end"])

            start_logits, end_logits = model(
                ids=ids,
                mask=mask,
                token_type_ids=token_type_ids
            )
            loss = loss_fn(start_logits, end_logits, targets_start, targets_end)

            start_probs = torch.softmax(start_logits, dim=1).cpu().detach().numpy()
            end_probs = torch.softmax(end_logits, dim=1).cpu().detach().numpy()

            # Score the most likely start/end token of every example.
            batch_scores = [
                calculate_jaccard_score(
                    original_tweet=tweet,
                    target_string=selected_texts[px],
                    sentiment_val=sentiments[px],
                    idx_start=np.argmax(start_probs[px, :]),
                    idx_end=np.argmax(end_probs[px, :]),
                    offsets=offsets[px]
                )[0]
                for px, tweet in enumerate(tweets)
            ]

            batch_size = ids.size(0)
            jaccard_meter.update(np.mean(batch_scores), batch_size)
            loss_meter.update(loss.item(), batch_size)
            progress.set_postfix(loss=loss_meter.avg, jaccard=jaccard_meter.avg)

    print(f"Validation Jaccard = {jaccard_meter.avg}")
    return jaccard_meter.avg

# Training

In [43]:
def run(fold, train_data_loader, valid_data_loader, train_size, model_config):
    """Train and validate one cross-validation fold with early stopping.

    The best checkpoint of the fold is written to
    ``{OUTPUT_DIR}/{SAVE_MODEL_DIR}/model_fold_{fold}.bin``.

    Args:
        fold: fold number, used in the checkpoint file name.
        train_data_loader: loader over the fold's training examples.
        valid_data_loader: loader over the fold's validation examples.
        train_size: number of training examples, used to size the LR schedule.
        model_config: BERT configuration passed to ``TweetModel``.
    """
    # Create the checkpoint directory before training starts, so a missing
    # folder cannot make torch.save fail after a full epoch of work.
    checkpoint_dir = f"{config.OUTPUT_DIR}/{config.SAVE_MODEL_DIR}"
    os.makedirs(checkpoint_dir, exist_ok=True)
    model_path = f"{checkpoint_dir}/model_fold_{fold}.bin"

    model = TweetModel(conf=model_config)
    model.to(config.DEVICE)

    num_train_steps = int(train_size / config.TRAIN_BATCH_SIZE * config.EPOCHS)
    param_optimizer = list(model.named_parameters())
    # Biases and LayerNorm parameters are excluded from weight decay.
    no_decay = ["bias", "LayerNorm.bias", "LayerNorm.weight"]
    optimizer_parameters = [
        {'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)], 'weight_decay': config.WEIGHT_DECAY},
        {'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)], 'weight_decay': 0.0},
    ]
    optimizer = AdamW(optimizer_parameters, lr=config.LEARNING_RATE)
    scheduler = get_linear_schedule_with_warmup(
        optimizer,
        num_warmup_steps=0,
        num_training_steps=num_train_steps
    )

    es = EarlyStopping(patience=config.PATIENCE, mode="max")

    for epoch in range(config.EPOCHS):
        train_fn(train_data_loader, model, optimizer, scheduler=scheduler)
        # Named so it does not shadow the module-level jaccard() function.
        valid_jaccard = eval_fn(valid_data_loader, model)
        es(valid_jaccard, model, model_path=model_path)
        if es.early_stop:
            print("Early stopping")
            break

In [44]:
# Load the training CSV and discard every row that has a missing value.
train_df = pd.read_csv(os.path.join(config.DATA_DIR, config.TRAIN_FILE)).dropna(how='any', axis=0)

# Folds are stratified on the sentiment label.
fold_splitter = StratifiedKFold(n_splits=config.N_SPLITS, shuffle=True, random_state=config.SEED)
splits = list(fold_splitter.split(train_df, train_df['sentiment']))

# TweetModel reads the per-layer hidden states, so ask BERT to return them.
model_config = transformers.BertConfig.from_pretrained(config.BERT_PATH)
model_config.output_hidden_states = True

for fold, (train_idx, val_idx) in enumerate(splits):
    print('---Train Fold {}---'.format(fold))

    fold_train_df, fold_valid_df = train_df.iloc[train_idx, :], train_df.iloc[val_idx, :]

    # Same dataset construction for the training and the validation rows.
    train_dataset, valid_dataset = (
        TweetDataset(
            tweet=frame.text.values,
            sentiment=frame.sentiment.values,
            selected_text=frame.selected_text.values,
        )
        for frame in (fold_train_df, fold_valid_df)
    )

    # Only the training loader shuffles; validation keeps the row order.
    train_data_loader = torch.utils.data.DataLoader(
        train_dataset, batch_size=config.TRAIN_BATCH_SIZE, shuffle=True, num_workers=8
    )
    valid_data_loader = torch.utils.data.DataLoader(
        valid_dataset, batch_size=config.VALID_BATCH_SIZE, shuffle=False, num_workers=8
    )

    run(
        fold=fold,
        train_data_loader=train_data_loader,
        valid_data_loader=valid_data_loader,
        train_size=len(train_idx),
        model_config=model_config,
    )

I0425 23:21:03.642301 139864187406080 configuration_utils.py:149] loading configuration file /data/ai_challenge/tweet_sentiment_extraction/bert-base-uncased/config.json
I0425 23:21:03.643000 139864187406080 configuration_utils.py:169] Model config {
  "attention_probs_dropout_prob": 0.1,
  "finetuning_task": null,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "is_decoder": false,
  "layer_norm_eps": 1e-12,
  "max_position_embeddings": 512,
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "num_labels": 2,
  "output_attentions": false,
  "output_hidden_states": false,
  "output_past": true,
  "pruned_heads": {},
  "torchscript": false,
  "type_vocab_size": 2,
  "use_bfloat16": false,
  "vocab_size": 30522
}

I0425 23:21:03.646480 139864187406080 modeling_utils.py:384] loading weights file /data/ai_challenge/tweet_sentiment_extraction/bert-base-uncased/pytorch_model.bin


Train Fold 0


HBox(children=(IntProgress(value=0, max=2748), HTML(value='')))

Loss: 4.006890003043826 - Jaccard: 0.5641843551036889
Loss: 2.3982258260250093 - Jaccard: 0.6278097417701818
Loss: 2.1957616502046586 - Jaccard: 0.6605873786543285
Loss: 2.2117417457699777 - Jaccard: 0.6587832252461537
Loss: 2.0294745269417764 - Jaccard: 0.6834536842008329
Loss: 2.000752937793732 - Jaccard: 0.6589861732157594
Loss: 2.0507117620110513 - Jaccard: 0.6529942369698438
Loss: 2.023613204956055 - Jaccard: 0.6436526877577187
Loss: 1.9709315341711044 - Jaccard: 0.6747795378235841
Loss: 2.0475505888462067 - Jaccard: 0.6696222275010871
Loss: 2.0431724429130553 - Jaccard: 0.650858327888883
Loss: 1.9772656041383743 - Jaccard: 0.6585664789287548
Loss: 2.0552616330981253 - Jaccard: 0.6609851099666662
Loss: 1.8740272343158721 - Jaccard: 0.6770454389736977
Loss: 1.765090399980545 - Jaccard: 0.6827987630838354
Loss: 1.9156112080812455 - Jaccard: 0.683015192905799
Loss: 1.91412092640996 - Jaccard: 0.6775231139694191
Loss: 1.8191272270679475 - Jaccard: 0.6838461505192979
Loss: 1.7774360516

HBox(children=(IntProgress(value=0, max=688), HTML(value='')))

Eval Jaccard = 0.6924205063985204
Validation score improved (-inf --> 0.6924205063985204). Saving model!


HBox(children=(IntProgress(value=0, max=2748), HTML(value='')))

Loss: 1.6106412092057785 - Jaccard: 0.7209827439982722
Loss: 1.5524682834744454 - Jaccard: 0.7119186405242446
Loss: 1.438403276503086 - Jaccard: 0.7309713882697799
Loss: 1.451978203356266 - Jaccard: 0.7249674359932761
Loss: 1.552498097717762 - Jaccard: 0.7198944857517
Loss: 1.4662634551525116 - Jaccard: 0.730182983043756
Loss: 1.4619652596116066 - Jaccard: 0.7281056459082029
Loss: 1.3672282102704048 - Jaccard: 0.7402388916354566
Loss: 1.486831073462963 - Jaccard: 0.7379813036483414
Loss: 1.4282226225733756 - Jaccard: 0.7444991814764838
Loss: 1.4905841171741485 - Jaccard: 0.7297055939130245
Loss: 1.520411153435707 - Jaccard: 0.7173051083202706
Loss: 1.4854453736543656 - Jaccard: 0.7275581279192997
Loss: 1.5596366119384766 - Jaccard: 0.7230809959096764
Loss: 1.399848034977913 - Jaccard: 0.7368017627452451
Loss: 1.5916595500707627 - Jaccard: 0.7146032801694497
Loss: 1.5352915725111962 - Jaccard: 0.7212640226350238
Loss: 1.4115150326490402 - Jaccard: 0.7497325916256868
Loss: 1.537011525034

HBox(children=(IntProgress(value=0, max=688), HTML(value='')))

Eval Jaccard = 0.6996168272292659
Validation score improved (0.6924205063985204 --> 0.6996168272292659). Saving model!


HBox(children=(IntProgress(value=0, max=2748), HTML(value='')))

Loss: 1.0487954619497355 - Jaccard: 0.7988509902848977
Loss: 1.1027327516674996 - Jaccard: 0.7969612875441274
Loss: 1.0156351475417613 - Jaccard: 0.7922009015621775
Loss: 1.1125477030873299 - Jaccard: 0.7861659920748478
Loss: 1.1701165410876273 - Jaccard: 0.7886576847795778
Loss: 0.9759951317310334 - Jaccard: 0.8085783529152663
Loss: 1.2069919748604299 - Jaccard: 0.762329272854133
Loss: 1.0497337731719016 - Jaccard: 0.7861751067027604
Loss: 1.060850309729576 - Jaccard: 0.8108737340260563
Loss: 1.07986856803298 - Jaccard: 0.7853398800230219
Loss: 1.0388491544127465 - Jaccard: 0.8018827597319668
Loss: 1.080313804447651 - Jaccard: 0.7962434426565831
Loss: 1.1397690007090568 - Jaccard: 0.7702624389548413
Loss: 1.0449483475089074 - Jaccard: 0.8028653378304498
Loss: 1.1259951019287109 - Jaccard: 0.7773704552083585
Loss: 1.1304967713356018 - Jaccard: 0.7723870945543142
Loss: 1.1517181631922722 - Jaccard: 0.7765852713713275
Loss: 1.109026340842247 - Jaccard: 0.7808896450578148
Loss: 1.07700336

HBox(children=(IntProgress(value=0, max=688), HTML(value='')))

Eval Jaccard = 0.6989393613660492
EarlyStopping counter: 1 out of 2


HBox(children=(IntProgress(value=0, max=2748), HTML(value='')))

Loss: 0.6883146016314479 - Jaccard: 0.8459134290737462
Loss: 0.6483116732537746 - Jaccard: 0.8539020519272233
Loss: 0.6224688391387463 - Jaccard: 0.8648387561284548
Loss: 0.6385715100169181 - Jaccard: 0.8676617607935543
Loss: 0.7442042492330074 - Jaccard: 0.8402445831181015
Loss: 0.7288773839175701 - Jaccard: 0.8569386740166776
Loss: 0.6815322598814965 - Jaccard: 0.8278711281249901
Loss: 0.7766256174445152 - Jaccard: 0.8336424901487404
Loss: 0.6366028183698654 - Jaccard: 0.8706729201770226
Loss: 0.6898996062576771 - Jaccard: 0.8536926574198397
Loss: 0.704504166096449 - Jaccard: 0.851559953214262
Loss: 0.6701661312580108 - Jaccard: 0.8410364529424705
Loss: 0.7222430995106697 - Jaccard: 0.8353002373741443
Loss: 0.6501882456243038 - Jaccard: 0.8636141015954286
Loss: 0.7024543158710003 - Jaccard: 0.8513693450127131
Loss: 0.7169994602352381 - Jaccard: 0.8621633893000481
Loss: 0.6804216329753399 - Jaccard: 0.8496835735883785
Loss: 0.750208730250597 - Jaccard: 0.8358955489487778
Loss: 0.68080

HBox(children=(IntProgress(value=0, max=688), HTML(value='')))

I0426 00:23:55.935435 139864187406080 modeling_utils.py:384] loading weights file /data/ai_challenge/tweet_sentiment_extraction/bert-base-uncased/pytorch_model.bin


Eval Jaccard = 0.6903527387172474
EarlyStopping counter: 2 out of 2
Early stopping
Train Fold 1


HBox(children=(IntProgress(value=0, max=2748), HTML(value='')))

Loss: 3.9250927809441443 - Jaccard: 0.5648147640048645
Loss: 2.2488807797431947 - Jaccard: 0.6365967051317013
Loss: 2.1950877195596696 - Jaccard: 0.6413079554057249
Loss: 2.1762791341543197 - Jaccard: 0.6547034876981824
Loss: 2.1291297167539596 - Jaccard: 0.6579244433677441
Loss: 1.9569959825277328 - Jaccard: 0.6673747601274098
Loss: 2.0354069769382477 - Jaccard: 0.6631110626581954
Loss: 1.9677068692445756 - Jaccard: 0.6759339485578733
Loss: 1.8674519318342209 - Jaccard: 0.679766372172262
Loss: 1.9253307110071183 - Jaccard: 0.6935863341782468
Loss: 1.8641150999069214 - Jaccard: 0.6867079773952899
Loss: 1.7604672312736511 - Jaccard: 0.685902734973668
Loss: 1.8455585324764252 - Jaccard: 0.6750558153002418
Loss: 1.8171142828464508 - Jaccard: 0.6784543575569466
Loss: 1.9242721816897392 - Jaccard: 0.6794612118954185
Loss: 1.7227347409725189 - Jaccard: 0.7114230449603977
Loss: 1.906080068051815 - Jaccard: 0.6706429195369963
Loss: 1.7305699768662453 - Jaccard: 0.6994914250083386
Loss: 1.72889

HBox(children=(IntProgress(value=0, max=688), HTML(value='')))

Eval Jaccard = 0.7027429484610169
Validation score improved (-inf --> 0.7027429484610169). Saving model!


HBox(children=(IntProgress(value=0, max=2748), HTML(value='')))

Loss: 1.456253868223417 - Jaccard: 0.7377256109157847
Loss: 1.432737732231617 - Jaccard: 0.7508758968239602
Loss: 1.419961133003235 - Jaccard: 0.7357621964040577
Loss: 1.5161054664850235 - Jaccard: 0.7304498509939492
Loss: 1.4336817690730095 - Jaccard: 0.7355223079893501
Loss: 1.4963116344809533 - Jaccard: 0.7169645771876769
Loss: 1.4195582857728004 - Jaccard: 0.727467005873169
Loss: 1.484325020313263 - Jaccard: 0.7336636612125463
Loss: 1.483836367726326 - Jaccard: 0.7130708372855363
Loss: 1.499843258559704 - Jaccard: 0.7394320899672205
Loss: 1.4730656361579895 - Jaccard: 0.7311902229330921
Loss: 1.472093179821968 - Jaccard: 0.7337734901713495
Loss: 1.5409895172715187 - Jaccard: 0.7206307811280938
Loss: 1.529994381070137 - Jaccard: 0.7131734943643744
Loss: 1.494701928794384 - Jaccard: 0.7177375697527616
Loss: 1.4377261531352996 - Jaccard: 0.7395234486284699
Loss: 1.4851282092928886 - Jaccard: 0.7424698644291986
Loss: 1.5411504071950912 - Jaccard: 0.7132853859728909
Loss: 1.403349785506

HBox(children=(IntProgress(value=0, max=688), HTML(value='')))

Eval Jaccard = 0.6957868377831454
EarlyStopping counter: 1 out of 2


HBox(children=(IntProgress(value=0, max=2748), HTML(value='')))

Loss: 1.083650127790942 - Jaccard: 0.7899867427707716
Loss: 1.1184110862016678 - Jaccard: 0.7707891515089657
Loss: 1.0239257436990739 - Jaccard: 0.8062241209104792
Loss: 1.1211995008587836 - Jaccard: 0.7827344028009219
Loss: 1.0303364756703377 - Jaccard: 0.7900712644978486
Loss: 1.0412088388204574 - Jaccard: 0.7960908911242696
Loss: 0.9626351654529571 - Jaccard: 0.7967571745616147
Loss: 1.127851814031601 - Jaccard: 0.7671428821959511
Loss: 1.044112186729908 - Jaccard: 0.7910692179409129
Loss: 0.9897830283641815 - Jaccard: 0.7875691607250175
Loss: 1.024745173752308 - Jaccard: 0.7909579386668074
Loss: 1.0912082123756408 - Jaccard: 0.7649691001310447
Loss: 1.1394722107052804 - Jaccard: 0.7725228403823852
Loss: 0.9692498990893363 - Jaccard: 0.8032521642624475
Loss: 1.0680256149172782 - Jaccard: 0.7921167889149424
Loss: 1.040019938275218 - Jaccard: 0.8027529557697487
Loss: 1.0718903666734696 - Jaccard: 0.7901800276092235
Loss: 1.0320831800997257 - Jaccard: 0.7940996807036947
Loss: 1.1239271

HBox(children=(IntProgress(value=0, max=688), HTML(value='')))

I0426 01:10:53.158895 139864187406080 modeling_utils.py:384] loading weights file /data/ai_challenge/tweet_sentiment_extraction/bert-base-uncased/pytorch_model.bin


Eval Jaccard = 0.700268322000635
EarlyStopping counter: 2 out of 2
Early stopping
Train Fold 2


HBox(children=(IntProgress(value=0, max=2749), HTML(value='')))

Loss: 4.131379029538372 - Jaccard: 0.5456695245048565
Loss: 2.4675424838066102 - Jaccard: 0.6202544722406413
Loss: 2.312286288142204 - Jaccard: 0.6383981655289267
Loss: 2.006708064675331 - Jaccard: 0.6774616979091952
Loss: 2.078600796163082 - Jaccard: 0.6576797574881907
Loss: 1.9669051307439804 - Jaccard: 0.6750861036273536
Loss: 1.9062970158457757 - Jaccard: 0.6985514099174069
Loss: 1.8667505809664726 - Jaccard: 0.692750042713667
Loss: 2.019109428524971 - Jaccard: 0.6700419305920051
Loss: 1.947121599316597 - Jaccard: 0.6705616237771386
Loss: 1.8395690619945526 - Jaccard: 0.6922063263475488
Loss: 1.9346526181697845 - Jaccard: 0.6741102043217099
Loss: 1.962025713324547 - Jaccard: 0.6669156982919265
Loss: 1.8171371841430664 - Jaccard: 0.6981547837065165
Loss: 1.8451337176561355 - Jaccard: 0.701321805378443
Loss: 1.880200001001358 - Jaccard: 0.6949488748077671
Loss: 1.7475495153665543 - Jaccard: 0.7015526671745025
Loss: 1.8271014299988746 - Jaccard: 0.6743317880718801
Loss: 1.806049458384

HBox(children=(IntProgress(value=0, max=687), HTML(value='')))

Eval Jaccard = 0.696875335335317
Validation score improved (-inf --> 0.696875335335317). Saving model!


HBox(children=(IntProgress(value=0, max=2749), HTML(value='')))

Loss: 1.4994063521965895 - Jaccard: 0.7325438990462689
Loss: 1.4641456162929536 - Jaccard: 0.7419673041769091
Loss: 1.5068176278471948 - Jaccard: 0.7266337621577823
Loss: 1.4150820636749268 - Jaccard: 0.7367689223784956
Loss: 1.417411946952343 - Jaccard: 0.7246451720049555
Loss: 1.4442904382944106 - Jaccard: 0.7325651628957819
Loss: 1.5625090879201888 - Jaccard: 0.7184263822794869
Loss: 1.457356193959713 - Jaccard: 0.7300801814173191
Loss: 1.5025572502613067 - Jaccard: 0.7292004372060643
Loss: 1.3927627649903298 - Jaccard: 0.7419757418911925
Loss: 1.6507721954584123 - Jaccard: 0.6900033234438421
Loss: 1.4805974617600441 - Jaccard: 0.7394060518085284
Loss: 1.4194888329505921 - Jaccard: 0.7340950518649659
Loss: 1.5119445186853409 - Jaccard: 0.716236854547998
Loss: 1.454479334950447 - Jaccard: 0.7285139313532203
Loss: 1.6197334748506547 - Jaccard: 0.7131072124844118
Loss: 1.5309252420067787 - Jaccard: 0.7433224679732032
Loss: 1.4237790083885193 - Jaccard: 0.7345803307837859
Loss: 1.606993

HBox(children=(IntProgress(value=0, max=687), HTML(value='')))

Eval Jaccard = 0.7011417559041289
Validation score improved (0.696875335335317 --> 0.7011417559041289). Saving model!


HBox(children=(IntProgress(value=0, max=2749), HTML(value='')))

Loss: 1.1004580417186907 - Jaccard: 0.7860456769342651
Loss: 1.011516844779253 - Jaccard: 0.7974109035809467
Loss: 1.0579987615346909 - Jaccard: 0.7931042220675019
Loss: 1.0547941291332246 - Jaccard: 0.7914018551810115
Loss: 1.076277250945568 - Jaccard: 0.7854519856998529
Loss: 0.9699429592490196 - Jaccard: 0.807469717245259
Loss: 1.084708569943905 - Jaccard: 0.7745589409011531
Loss: 1.0417032894492149 - Jaccard: 0.8049156313374205
Loss: 1.0366667664051057 - Jaccard: 0.7753715248005051
Loss: 1.000251711010933 - Jaccard: 0.8052805213117357
Loss: 1.0138977479934692 - Jaccard: 0.803007423523703
Loss: 1.0481782159209252 - Jaccard: 0.7661813793302951
Loss: 1.0631202456355096 - Jaccard: 0.7895974388338535
Loss: 1.1184745854139329 - Jaccard: 0.7809326985429043
Loss: 0.9854915469884873 - Jaccard: 0.799181016816608
Loss: 1.1482181166112424 - Jaccard: 0.7758473235094497
Loss: 1.180724088549614 - Jaccard: 0.7837947407822616
Loss: 1.0938086956739426 - Jaccard: 0.786445047368567
Loss: 1.03435878306

HBox(children=(IntProgress(value=0, max=687), HTML(value='')))

Eval Jaccard = 0.6929554526187361
EarlyStopping counter: 1 out of 2


HBox(children=(IntProgress(value=0, max=2749), HTML(value='')))

Loss: 0.6173136530831309 - Jaccard: 0.8617079775731431
Loss: 0.6375527827441693 - Jaccard: 0.8453013536450651
Loss: 0.7026452292501927 - Jaccard: 0.8541909161775041
Loss: 0.6831801827251911 - Jaccard: 0.8433272542235813
Loss: 0.5769408913701772 - Jaccard: 0.8644301690274403
Loss: 0.6985217029601336 - Jaccard: 0.8389033431105859
Loss: 0.668329411149025 - Jaccard: 0.8489918694651977
Loss: 0.7128353004157543 - Jaccard: 0.8373615921591397
Loss: 0.6797185006737709 - Jaccard: 0.8390173016426032
Loss: 0.7085574121773243 - Jaccard: 0.8540307420676139
Loss: 0.6403227354586124 - Jaccard: 0.8676752415906249
Loss: 0.6542784689366817 - Jaccard: 0.8471978186128643
Loss: 0.6659809416532516 - Jaccard: 0.8442402022701905
Loss: 0.6711973598599434 - Jaccard: 0.8665008465756884
Loss: 0.6137027791142464 - Jaccard: 0.8557391617807063
Loss: 0.7011162474751472 - Jaccard: 0.8463590497631569
Loss: 0.7150163793563843 - Jaccard: 0.8531044562527179
Loss: 0.6307059679925442 - Jaccard: 0.8555657184063077
Loss: 0.665

HBox(children=(IntProgress(value=0, max=687), HTML(value='')))

I0426 02:13:30.891991 139864187406080 modeling_utils.py:384] loading weights file /data/ai_challenge/tweet_sentiment_extraction/bert-base-uncased/pytorch_model.bin


Eval Jaccard = 0.686046587489046
EarlyStopping counter: 2 out of 2
Early stopping
Train Fold 3


HBox(children=(IntProgress(value=0, max=2749), HTML(value='')))

Loss: 3.957115610047142 - Jaccard: 0.5743215172253557
Loss: 2.503328174948692 - Jaccard: 0.6094575514182323
Loss: 2.291693057715893 - Jaccard: 0.6489431036157443
Loss: 2.1018513053655625 - Jaccard: 0.6592310634003005
Loss: 2.0894598829746247 - Jaccard: 0.6763796894365767
Loss: 2.1152956050634386 - Jaccard: 0.640902073416686
Loss: 1.9093018049001693 - Jaccard: 0.6868650719110502
Loss: 1.893374165892601 - Jaccard: 0.668292534403432
Loss: 2.083606230020523 - Jaccard: 0.6609861109953147
Loss: 1.923269934654236 - Jaccard: 0.6849875391116075
Loss: 2.0732654185593127 - Jaccard: 0.6589866293851668
Loss: 1.7381196624040605 - Jaccard: 0.6960415772960011
Loss: 1.9154135650396347 - Jaccard: 0.6857828325532845
Loss: 1.8193187233805657 - Jaccard: 0.6816999635476608
Loss: 1.858154098391533 - Jaccard: 0.6898709910837337
Loss: 1.8192259129881858 - Jaccard: 0.6891580922590296
Loss: 1.905450363755226 - Jaccard: 0.6758405701497111
Loss: 1.7131214880943297 - Jaccard: 0.6897529714271852
Loss: 1.728200637698

HBox(children=(IntProgress(value=0, max=687), HTML(value='')))

Eval Jaccard = 0.7084764475712458
Validation score improved (-inf --> 0.7084764475712458). Saving model!


HBox(children=(IntProgress(value=0, max=2749), HTML(value='')))

Loss: 1.6094354818953145 - Jaccard: 0.7274121120923334
Loss: 1.6032496851682663 - Jaccard: 0.7052773276182304
Loss: 1.4050411561131477 - Jaccard: 0.734766570323755
Loss: 1.4251287716627121 - Jaccard: 0.7295468238091547
Loss: 1.4383855217695236 - Jaccard: 0.7437501096827107
Loss: 1.4706201681494713 - Jaccard: 0.7320091160617328
Loss: 1.4365833279490472 - Jaccard: 0.7334819612400938
Loss: 1.4868589219450952 - Jaccard: 0.7353368056478192
Loss: 1.5378033024072648 - Jaccard: 0.7299970155210284
Loss: 1.5529287469387054 - Jaccard: 0.6994966001217623
Loss: 1.544970475435257 - Jaccard: 0.7255443568348636
Loss: 1.6520982247591018 - Jaccard: 0.7138669540051487
Loss: 1.5203256672620773 - Jaccard: 0.733757909067944
Loss: 1.6108260598778725 - Jaccard: 0.7114563781744326
Loss: 1.4418796774744989 - Jaccard: 0.7162282096378648
Loss: 1.5036164888739585 - Jaccard: 0.7120927793541115
Loss: 1.5117526349425316 - Jaccard: 0.727427734836113
Loss: 1.537513672709465 - Jaccard: 0.72412111351635
Loss: 1.560505313

HBox(children=(IntProgress(value=0, max=687), HTML(value='')))

Eval Jaccard = 0.7141036364033579
Validation score improved (0.7084764475712458 --> 0.7141036364033579). Saving model!


HBox(children=(IntProgress(value=0, max=2749), HTML(value='')))

Loss: 1.0203930924434472 - Jaccard: 0.7917715209510015
Loss: 1.1206414696574212 - Jaccard: 0.7767008640990645
Loss: 1.1288139626383782 - Jaccard: 0.7741833277544257
Loss: 1.129126259982586 - Jaccard: 0.7780497246045541
Loss: 1.0494600482285024 - Jaccard: 0.7790325219059838
Loss: 1.0748820962011814 - Jaccard: 0.7995052742651433
Loss: 1.109474211037159 - Jaccard: 0.7902961895929917
Loss: 1.1659309920668601 - Jaccard: 0.7732570242163423
Loss: 1.1301618553698063 - Jaccard: 0.7850112259788655
Loss: 1.1143588626384735 - Jaccard: 0.7702336494977686
Loss: 1.1575595581531524 - Jaccard: 0.7787054319543766
Loss: 1.1071906965970992 - Jaccard: 0.7750260586109369
Loss: 1.0518892189860345 - Jaccard: 0.791218295677108
Loss: 1.068937139213085 - Jaccard: 0.7894719815712756
Loss: 1.1188599161803723 - Jaccard: 0.7867410019391065
Loss: 1.0741998073458672 - Jaccard: 0.7828952828121813
Loss: 1.0513454385101795 - Jaccard: 0.7870636356665243
Loss: 1.1402086895704269 - Jaccard: 0.7694540690101671
Loss: 1.061158

HBox(children=(IntProgress(value=0, max=687), HTML(value='')))

Eval Jaccard = 0.6963811122408767
EarlyStopping counter: 1 out of 2


HBox(children=(IntProgress(value=0, max=2749), HTML(value='')))

Loss: 0.6915544146063304 - Jaccard: 0.8452065194260058
Loss: 0.6318357204645872 - Jaccard: 0.8495486543979557
Loss: 0.6962790650129318 - Jaccard: 0.8479150900116171
Loss: 0.7342992876470089 - Jaccard: 0.8351676872710722
Loss: 0.7190980030596257 - Jaccard: 0.8508523255564486
Loss: 0.6296802043914795 - Jaccard: 0.8552999328147419
Loss: 0.6700930424034596 - Jaccard: 0.8538562687321091
Loss: 0.6836408899724483 - Jaccard: 0.858186597170413
Loss: 0.6845359553396702 - Jaccard: 0.835900368649818
Loss: 0.6361523762345314 - Jaccard: 0.8644377763007589
Loss: 0.7083011068403721 - Jaccard: 0.8425824673601273
Loss: 0.7246220940351487 - Jaccard: 0.8438182384857725
Loss: 0.715951601266861 - Jaccard: 0.8393948745545596
Loss: 0.7301467809081078 - Jaccard: 0.8289989447196082
Loss: 0.6646251973882318 - Jaccard: 0.8498355306555837
Loss: 0.7230915129184723 - Jaccard: 0.8383146412744961
Loss: 0.6578884265571833 - Jaccard: 0.8438575715996002
Loss: 0.6284659255295992 - Jaccard: 0.8624067610655168
Loss: 0.75290

HBox(children=(IntProgress(value=0, max=687), HTML(value='')))

I0426 03:16:08.853911 139864187406080 modeling_utils.py:384] loading weights file /data/ai_challenge/tweet_sentiment_extraction/bert-base-uncased/pytorch_model.bin


Eval Jaccard = 0.6957977689536566
EarlyStopping counter: 2 out of 2
Early stopping
Train Fold 4


HBox(children=(IntProgress(value=0, max=2749), HTML(value='')))

Loss: 3.866982638245762 - Jaccard: 0.5832969384941282
Loss: 2.512176737189293 - Jaccard: 0.6044478649997962
Loss: 2.2072829037904738 - Jaccard: 0.6478503691584966
Loss: 2.0806332713365556 - Jaccard: 0.659156920909496
Loss: 2.171147363781929 - Jaccard: 0.6439349483115171
Loss: 2.0125829535722732 - Jaccard: 0.6729104815028158
Loss: 1.9713353776931763 - Jaccard: 0.6772864351082802
Loss: 1.9924335139989853 - Jaccard: 0.6563896212385741
Loss: 1.9599557876586915 - Jaccard: 0.6621429308729422
Loss: 1.8652702990174292 - Jaccard: 0.6969418129777913
Loss: 1.7828771328926087 - Jaccard: 0.6925310130545818
Loss: 1.8293436223268509 - Jaccard: 0.6744049708033067
Loss: 1.993977781534195 - Jaccard: 0.6582195708621192
Loss: 1.9055063605308533 - Jaccard: 0.662875180303368
Loss: 1.865543693304062 - Jaccard: 0.6729118560237473
Loss: 1.87390487074852 - Jaccard: 0.685677717123152
Loss: 1.893412288427353 - Jaccard: 0.6841445965382883
Loss: 1.8366708087921142 - Jaccard: 0.6711764202158088
Loss: 1.8271589779853

HBox(children=(IntProgress(value=0, max=687), HTML(value='')))

Eval Jaccard = 0.6972649499515994
Validation score improved (-inf --> 0.6972649499515994). Saving model!


HBox(children=(IntProgress(value=0, max=2749), HTML(value='')))

Loss: 1.4557868652414567 - Jaccard: 0.7489423385609864
Loss: 1.511812302172184 - Jaccard: 0.7227037581151647
Loss: 1.2974335381388664 - Jaccard: 0.7570847673353479
Loss: 1.5244905769824981 - Jaccard: 0.7371176018051548
Loss: 1.5194003239274025 - Jaccard: 0.7334389141695135
Loss: 1.49105104804039 - Jaccard: 0.7192172699700697
Loss: 1.5214613628387452 - Jaccard: 0.7137459086039292
Loss: 1.4389581969380378 - Jaccard: 0.729086784588671
Loss: 1.5028460919857025 - Jaccard: 0.733141986306906
Loss: 1.5228749388456344 - Jaccard: 0.7265329354076848
Loss: 1.4801782149076461 - Jaccard: 0.7439439599057631
Loss: 1.4979138001799583 - Jaccard: 0.7310023423958952
Loss: 1.6523909589648247 - Jaccard: 0.7188076184620619
Loss: 1.5042451202869416 - Jaccard: 0.7245283624362451
Loss: 1.482791654765606 - Jaccard: 0.7428477544869171
Loss: 1.4092244058847427 - Jaccard: 0.7511603639728041
Loss: 1.3878144717216492 - Jaccard: 0.7401971154602565
Loss: 1.4409482270479201 - Jaccard: 0.7402760711775456
Loss: 1.52189621

HBox(children=(IntProgress(value=0, max=687), HTML(value='')))

Eval Jaccard = 0.6946055779990511
EarlyStopping counter: 1 out of 2


HBox(children=(IntProgress(value=0, max=2749), HTML(value='')))

Loss: 1.0429256942012521 - Jaccard: 0.7934992678614856
Loss: 1.062065132856369 - Jaccard: 0.8007779229648839
Loss: 1.0403044363856315 - Jaccard: 0.7876143928350798
Loss: 0.987800139784813 - Jaccard: 0.7998503529670739
Loss: 1.1669888666272163 - Jaccard: 0.7700310562935353
Loss: 1.0783107428252696 - Jaccard: 0.7854987109534993
Loss: 1.0647250846028329 - Jaccard: 0.800814799658415
Loss: 1.0023440396785737 - Jaccard: 0.7885185737218147
Loss: 1.0596109688282014 - Jaccard: 0.8044258572653699
Loss: 1.0654757928848266 - Jaccard: 0.7946464921392191
Loss: 1.0451233641803264 - Jaccard: 0.8004563033059955
Loss: 1.076701924800873 - Jaccard: 0.784338012910927
Loss: 0.8958338136970997 - Jaccard: 0.8008538094118139
Loss: 1.0498445466160775 - Jaccard: 0.7985606045217132
Loss: 1.106883365213871 - Jaccard: 0.7906696442602396
Loss: 1.0904978999495507 - Jaccard: 0.7819779729714789
Loss: 1.0959253200888635 - Jaccard: 0.781870105732107
Loss: 1.0570912951231002 - Jaccard: 0.7832638525208289
Loss: 1.144183732

HBox(children=(IntProgress(value=0, max=687), HTML(value='')))

Eval Jaccard = 0.6907326772750885
EarlyStopping counter: 2 out of 2
Early stopping


# Testing

In [12]:
# Read the tweets we have to produce predictions for.
test_csv_path = os.path.join(config.DATA_DIR, config.TEST_FILE)
test_df = pd.read_csv(test_csv_path)

# The inference cell hands a `selected_text` array to TweetDataset, so create
# that column here, filled with the full tweet text as a placeholder.
test_df.loc[:, 'selected_text'] = test_df['text'].values

In [13]:
# Load the BERT configuration (config.json) from the local pretrained-model directory.
model_config = transformers.BertConfig.from_pretrained(config.BERT_PATH)
# Request the hidden states from the encoder as well. The flag is flipped after
# loading, which is why the config dump logged by from_pretrained still reports
# "output_hidden_states": false.
model_config.output_hidden_states = True

I0424 22:58:20.596534 139864187406080 configuration_utils.py:149] loading configuration file /data/ai_challenge/tweet_sentiment_extraction/bert-base-uncased/config.json
I0424 22:58:20.600911 139864187406080 configuration_utils.py:169] Model config {
  "attention_probs_dropout_prob": 0.1,
  "finetuning_task": null,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "is_decoder": false,
  "layer_norm_eps": 1e-12,
  "max_position_embeddings": 512,
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "num_labels": 2,
  "output_attentions": false,
  "output_hidden_states": false,
  "output_past": true,
  "pruned_heads": {},
  "torchscript": false,
  "type_vocab_size": 2,
  "use_bfloat16": false,
  "vocab_size": 30522
}



In [None]:
def load_fold_model(fold):
    """Build a TweetModel and restore the checkpoint saved for one CV fold.

    Args:
        fold: Zero-based fold index; the checkpoint is read from
            ``{config.OUTPUT_DIR}/{config.SAVE_MODEL_DIR}/model_fold_{fold}.bin``.

    Returns:
        The model, moved to ``config.DEVICE`` and switched to eval mode.
    """
    weights_path = f"{config.OUTPUT_DIR}/{config.SAVE_MODEL_DIR}/model_fold_{fold}.bin"
    fold_model = TweetModel(conf=model_config)
    fold_model.to(config.DEVICE)
    # map_location remaps the stored tensors onto config.DEVICE, so checkpoints
    # written on a GPU can also be restored on a CPU-only (or different GPU) machine.
    fold_model.load_state_dict(torch.load(weights_path, map_location=config.DEVICE))
    # eval() disables dropout so the predictions are deterministic.
    fold_model.eval()
    return fold_model


# One model per cross-validation fold (folds 0-4). The individual names are
# kept because the inference cell refers to model1 ... model5 directly.
model1, model2, model3, model4, model5 = (load_fold_model(fold) for fold in range(5))

In [17]:
# Predicted text span for every test tweet, in the same order as test_df.
final_output = []

test_dataset = TweetDataset(
    tweet=test_df.text.values,
    sentiment=test_df.sentiment.values,
    selected_text=test_df.selected_text.values
)

# shuffle=False keeps predictions aligned with the rows of test_df.
test_data_loader = torch.utils.data.DataLoader(
    test_dataset,
    batch_size=config.VALID_BATCH_SIZE,
    shuffle=False,
    num_workers=8
)

# The five fold models whose outputs are averaged for every batch.
ensemble = (model1, model2, model3, model4, model5)

with torch.no_grad():
    tk0 = tqdm(test_data_loader, total=len(test_data_loader))
    for d in tk0:
        sentiment = d["sentiment"]
        orig_selected = d["orig_selected"]
        orig_tweet = d["orig_tweet"]
        offsets = d["offsets"].numpy()

        # Only the model inputs are moved to the device; the start/end targets
        # in the batch are not needed at inference time and are left untouched.
        ids = d["ids"].to(config.DEVICE, dtype=torch.long)
        token_type_ids = d["token_type_ids"].to(config.DEVICE, dtype=torch.long)
        mask = d["mask"].to(config.DEVICE, dtype=torch.long)

        # Each model returns a (start, end) pair of per-token scores.
        fold_outputs = [
            fold_model(ids=ids, mask=mask, token_type_ids=token_type_ids)
            for fold_model in ensemble
        ]

        # Average the raw model outputs across folds, then normalise with softmax.
        outputs_start = sum(start for start, _ in fold_outputs) / len(ensemble)
        outputs_end = sum(end for _, end in fold_outputs) / len(ensemble)

        outputs_start = torch.softmax(outputs_start, dim=1).cpu().detach().numpy()
        outputs_end = torch.softmax(outputs_end, dim=1).cpu().detach().numpy()

        for px, tweet in enumerate(orig_tweet):
            # The score is discarded (the target is only a placeholder here);
            # only the decoded span is kept.
            _, output_sentence = calculate_jaccard_score(
                original_tweet=tweet,
                target_string=orig_selected[px],
                sentiment_val=sentiment[px],
                idx_start=np.argmax(outputs_start[px, :]),
                idx_end=np.argmax(outputs_end[px, :]),
                offsets=offsets[px]
            )

            final_output.append(output_sentence)

I0424 23:18:49.669809 139864187406080 modeling_utils.py:384] loading weights file /data/ai_challenge/tweet_sentiment_extraction/bert-base-uncased/pytorch_model.bin


HBox(children=(IntProgress(value=0, max=442), HTML(value='')))

> [0;32m<ipython-input-17-b83926e6aba2>[0m(26)[0;36m<module>[0;34m()[0m
[0;32m     24 [0;31m    [0;32mwith[0m [0mtorch[0m[0;34m.[0m[0mno_grad[0m[0;34m([0m[0;34m)[0m[0;34m:[0m[0;34m[0m[0;34m[0m[0m
[0m[0;32m     25 [0;31m        [0mtk0[0m [0;34m=[0m [0mtqdm[0m[0;34m([0m[0mtest_data_loader[0m[0;34m,[0m [0mtotal[0m[0;34m=[0m[0mlen[0m[0;34m([0m[0mtest_data_loader[0m[0;34m)[0m[0;34m)[0m[0;34m[0m[0;34m[0m[0m
[0m[0;32m---> 26 [0;31m        [0;32mfor[0m [0mbi[0m[0;34m,[0m [0md[0m [0;32min[0m [0menumerate[0m[0;34m([0m[0mtk0[0m[0;34m)[0m[0;34m:[0m[0;34m[0m[0;34m[0m[0m
[0m[0;32m     27 [0;31m            [0mids[0m [0;34m=[0m [0md[0m[0;34m[[0m[0;34m"ids"[0m[0;34m][0m[0;34m[0m[0;34m[0m[0m
[0m[0;32m     28 [0;31m            [0mtoken_type_ids[0m [0;34m=[0m [0md[0m[0;34m[[0m[0;34m"token_type_ids"[0m[0;34m][0m[0;34m[0m[0;34m[0m[0m
[0m
ipdb> outputs_start.shape
(8, 128)
ipdb> np

BdbQuit: 

In [23]:
# Start from the competition's submission template.
sample = pd.read_csv(os.path.join(config.DATA_DIR, "sample_submission.csv"))

# Guard against an incomplete inference run (e.g. the prediction loop was
# interrupted): there must be exactly one prediction per template row.
assert len(final_output) == len(sample), (
    f"expected {len(sample)} predictions, got {len(final_output)}; "
    "re-run the inference cell to completion before writing the submission"
)

sample.loc[:, 'selected_text'] = final_output
sample.to_csv(os.path.join(config.OUTPUT_DIR, "submission.csv"), index=False)

sample.head()

Unnamed: 0,textID,selected_text
0,f87dea47db,http://twitpic.com/67ezh last session day the of
1,96d74cb729,good
2,eee518ae67,shame! a such
3,01082688c6,happy bday!
4,33987a8ee5,it!! like i
