# Import

In [70]:
import os
import re
import random
import numpy as np
import pandas as pd
import string
from tqdm import tqdm, trange, tqdm_notebook

import torch
import torch.nn as nn
from torch.nn import functional as F
from torch.optim import lr_scheduler

from sklearn import model_selection
from sklearn import metrics
from sklearn.model_selection import StratifiedKFold

import transformers
import tokenizers
from transformers import AdamW, get_linear_schedule_with_warmup

from IPython.core.debugger import set_trace

# Settings

In [71]:
class config:
    DATA_DIR = '../tweet_sentiment_extraction'
    ROBERTA_PATH = '../tweet_sentiment_extraction/roberta-base'
    OUTPUT_DIR = '../tweet_sentiment_extraction'
    TRAIN_FILE = 'train.csv'
    TEST_FILE = 'test.csv'
    SAVE_MODEL_DIR = 'trained_model'
    MAX_LEN = 128
    TRAIN_BATCH_SIZE = 32
    VALID_BATCH_SIZE = 32
    LOGGING_STEPS = 100
    LEARNING_RATE = 3e-5
    WEIGHT_DECAY = 1e-3
    WARM_UP_STEPS = 100
    GRADIENT_ACCUMULATION_STEPS = 1
    EPOCHS = 3
    N_SPLITS = 5
    PATIENCE = 2
    SEED = 1111
    DEVICE = torch.device('cuda')
    TOKENIZER = tokenizers.ByteLevelBPETokenizer(
        vocab_file=f"{ROBERTA_PATH}/vocab.json", 
        merges_file=f"{ROBERTA_PATH}/merges.txt", 
        lowercase=True,
        add_prefix_space=True
    )

In [72]:
def seed_everything(seed=1234):
    random.seed(seed)
    os.environ['PYTHONHASHSEED'] = str(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = False
seed_everything(config.SEED)

# Utils

In [73]:
class AverageMeter:
    """
    Computes and stores the average and current value
    """
    def __init__(self):
        self.reset()

    def reset(self):
        self.val = 0
        self.avg = 0
        self.sum = 0
        self.count = 0

    def update(self, val, n=1):
        self.val = val
        self.sum += val * n
        self.count += n
        self.avg = self.sum / self.count


class EarlyStopping:
    def __init__(self, patience=7, mode="max", delta=0.001):
        self.patience = patience
        self.counter = 0
        self.mode = mode
        self.best_score = None
        self.early_stop = False
        self.delta = delta
        if self.mode == "min":
            self.val_score = np.Inf
        else:
            self.val_score = -np.Inf

    def __call__(self, epoch_score, model, model_path):

        if self.mode == "min":
            score = -1.0 * epoch_score
        else:
            score = np.copy(epoch_score)

        if self.best_score is None:
            self.best_score = score
            self.save_checkpoint(epoch_score, model, model_path)
        elif score < self.best_score + self.delta:
            self.counter += 1
            print('EarlyStopping counter: {} out of {}'.format(self.counter, self.patience))
            if self.counter >= self.patience:
                self.early_stop = True
        else:
            self.best_score = score
            self.save_checkpoint(epoch_score, model, model_path)
            self.counter = 0

    def save_checkpoint(self, epoch_score, model, model_path):
        if epoch_score not in [-np.inf, np.inf, -np.nan, np.nan]:
            print('Validation score improved ({} --> {}). Saving model!'.format(self.val_score, epoch_score))
            torch.save(model.state_dict(), model_path)
        self.val_score = epoch_score

def jaccard(str1, str2): 
    a = set(str1.lower().split()) 
    b = set(str2.lower().split())
    c = a.intersection(b)
    return float(len(c)) / (len(a) + len(b) - len(c))

# def remove_URL(text):
#     url = re.compile(r'https?://[a-zA-Z0-9]+\.[a-zA-Z0-9]+/[a-zA-Z0-9]*')
#     return url.sub(r'URL', text)

# def data_aug(s):
#     text, selected_text = s.text, s.selected_text
#     text_split = text.split(selected_text)
#     text_begin, text_end = text_split[0], text_split[1]
#     text_begin_cut = " ".join(text_begin.split()[1:])
#     text_end_cut = " ".join(text_end.split()[:-1])
    
#     return " ".join([text_begin_cut, selected_text, text_end_cut])

# Data Processing

In [74]:
def process_data(tweet, selected_text, sentiment, tokenizer, max_len):
    tweet = " " + " ".join(str(tweet).split())
    selected_text = " " + " ".join(str(selected_text).split())

    len_st = len(selected_text) - 1
    idx0 = None
    idx1 = None

    for ind in (i for i, e in enumerate(tweet) if e == selected_text[1]):
        if " " + tweet[ind: ind+len_st] == selected_text:
            idx0 = ind
            idx1 = ind + len_st - 1
            break
            
    if idx0 == None:
        print(f'Cannot find selected text "{selected_text}" in "{tweet}"')

    char_targets = [0] * len(tweet)
    if idx0 != None and idx1 != None:
        for ct in range(idx0, idx1 + 1):
            char_targets[ct] = 1
    
    tok_tweet = tokenizer.encode(tweet)
    input_ids_orig = tok_tweet.ids
    tweet_offsets = tok_tweet.offsets
    
    target_idx = []
    for j, (offset1, offset2) in enumerate(tweet_offsets):
        if sum(char_targets[offset1: offset2]) > 0:
            target_idx.append(j)
    
    targets_start = target_idx[0]
    targets_end = target_idx[-1]

    sentiment_id = {
        'positive': 1313,
        'negative': 2430,
        'neutral': 7974
    }
    
    input_ids = [0] + tokenizer.encode(sentiment).ids + [2] + [2] + input_ids_orig + [2]
    token_type_ids = [0, 0, 0, 0] + [0] * (len(input_ids_orig) + 1)
    mask = [1] * len(token_type_ids)
    tweet_offsets = [(0, 0)] * 4 + tweet_offsets + [(0, 0)]
    targets_start += 4
    targets_end += 4

    padding_length = max_len - len(input_ids)
    if padding_length > 0:
        input_ids = input_ids + ([1] * padding_length)
        mask = mask + ([0] * padding_length)
        token_type_ids = token_type_ids + ([0] * padding_length)
        tweet_offsets = tweet_offsets + ([(0, 0)] * padding_length)
    
    return {
        'ids': input_ids,
        'mask': mask,
        'token_type_ids': token_type_ids,
        'targets_start': targets_start,
        'targets_end': targets_end,
        'orig_tweet': tweet,
        'orig_selected': selected_text,
        'sentiment': sentiment,
        'offsets': tweet_offsets
    }

In [75]:
# def process_data(tweet, selected_text, sentiment, tokenizer, max_len):
#     tweet = " " + " ".join(str(tweet).split())
#     selected_text = " " + " ".join(str(selected_text).split())

#     len_st = len(selected_text) - 1
#     idx0 = None
#     idx1 = None

#     for ind in (i for i, e in enumerate(tweet) if e == selected_text[1]):
#         if " " + tweet[ind: ind+len_st] == selected_text:
#             idx0 = ind
#             idx1 = ind + len_st - 1
#             break
            
#     if idx0 == None:
#         print(f'Cannot find selected text "{selected_text}" in "{tweet}"')

#     char_targets = [0] * len(tweet)
#     if idx0 != None and idx1 != None:
#         for ct in range(idx0, idx1 + 1):
#             char_targets[ct] = 1
    
#     tok_tweet = tokenizer.encode(tweet)
#     input_ids_orig = tok_tweet.ids
#     tweet_offsets = tok_tweet.offsets
    
#     target_idx = []
#     for j, (offset1, offset2) in enumerate(tweet_offsets):
#         if sum(char_targets[offset1: offset2]) > 0:
#             target_idx.append(j)
    
#     if len(target_idx) == 0:
#         print("### Alert")
    
# #     if len(target_idx) == 0 and len(selected_text.split()) > 1:
# # #         if len(selected_text.split()) > 1
# #         print(f'text: {tweet}\nselected text: {selected_text}')
# #         print(tok_tweet.tokens)
# #         print(tweet_offsets)
# #         print(target_idx)


In [76]:
# # Test process data
# train_df = pd.read_csv(os.path.join(config.DATA_DIR, config.TRAIN_FILE))
# train_df = train_df.dropna(how='any', axis=0)

# def remove_URL(text):
#     url = re.compile(r'https?://[a-zA-Z0-9]+\.[a-zA-Z0-9]+/[a-zA-Z0-9]*')
#     return url.sub(r'',text)

# train_df['text'] = train_df['text'].apply(remove_URL)
# train_df['selected_text'] = train_df['selected_text'].apply(remove_URL)

# for i, row in train_df.iterrows():
#     process_data(row['text'], row['selected_text'], row['sentiment'], config.TOKENIZER, config.MAX_LEN)

In [77]:
# process_data('On the way to Malaysia...no internet access to Twit', '.no internet', 'negative', config.TOKENIZER, config.MAX_LEN)

In [78]:
# train_df = pd.read_csv(os.path.join(config.DATA_DIR, config.TRAIN_FILE))
# train_df = train_df.dropna(how='any', axis=0)

# splits = list(StratifiedKFold(n_splits=config.N_SPLITS, shuffle=True, random_state=config.SEED).split(train_df, train_df['sentiment']))

# train_df['kfold'] = 0
# for fold, (train_idx, val_idx) in enumerate(splits):
#     train_df['kfold'].iloc[val_idx] = fold
    
# train_df.to_csv(os.path.join(config.DATA_DIR, 'train_folds.csv'), index=False)

# Data Loader

In [79]:
train_df = pd.read_csv(os.path.join(config.DATA_DIR, config.TRAIN_FILE))
train_df = train_df.dropna(how='any', axis=0)
# test_df = pd.read_csv(os.path.join(config.DATA_DIR, config.TEST_FILE))

In [80]:
class TweetDataset :
    def __init__(self, tweet, sentiment, selected_text):
        self.tweet = tweet
        self.sentiment = sentiment
        self.selected_text = selected_text
        self.tokenizer = config.TOKENIZER
        self.max_len = config.MAX_LEN
    
    def __len__(self):
        return len(self.tweet)
    
    def __getitem__(self, item):
        data = process_data(
            self.tweet[item],
            self.selected_text[item],
            self.sentiment[item],
            self.tokenizer,
            self.max_len
        )
        
        return {
            'ids': torch.tensor(data['ids'], dtype=torch.long),
            'mask': torch.tensor(data['mask'], dtype=torch.long),
            'token_type_ids': torch.tensor(data["token_type_ids"], dtype=torch.long),
            'targets_start': torch.tensor(data["targets_start"], dtype=torch.long),
            'targets_end': torch.tensor(data["targets_end"], dtype=torch.long),
            'orig_tweet': data["orig_tweet"],
            'orig_selected': data["orig_selected"],
            'sentiment': data["sentiment"],
            'offsets': torch.tensor(data["offsets"], dtype=torch.long)
        }

# Model

In [81]:
class TweetModel(transformers.BertPreTrainedModel):
    def __init__(self, conf):
        super(TweetModel, self).__init__(conf)
        self.roberta = transformers.RobertaModel.from_pretrained(config.ROBERTA_PATH, config=conf)
        self.drop_out = nn.Dropout(0.1)
        self.l0 = nn.Linear(768 * 2, 200)
        torch.nn.init.normal_(self.l0.weight, std=0.02)
        self.l1 = nn.Linear(200, 2)
        torch.nn.init.normal_(self.l1.weight, std=0.02)
    
    def forward(self, ids, mask, token_type_ids):
        _, _, out = self.roberta(
            ids,
            attention_mask=mask,
            token_type_ids=token_type_ids
        )

        out = torch.cat((out[-1], out[-2]), dim=-1)
        out = self.drop_out(out)
        logits = self.l0(out)
        logits = self.l1(logits)

        start_logits, end_logits = logits.split(1, dim=-1)

        start_logits = start_logits.squeeze(-1)
        end_logits = end_logits.squeeze(-1)

        return start_logits, end_logits
    
    
# class TweetModel(transformers.BertPreTrainedModel):
#     def __init__(self, conf):
#         super(TweetModel, self).__init__(conf)
#         self.roberta = transformers.RobertaModel.from_pretrained(config.ROBERTA_PATH, config=conf)
#         self.dropout = nn.Dropout(0.1)
        
#         self.qa_outputs1c = torch.nn.Conv1d(768, config.MAX_LEN, 2)
#         self.qa_outputs2c = torch.nn.Conv1d(768, config.MAX_LEN, 2)

#         self.qa_outputs1 = nn.Linear(config.MAX_LEN, 1)
#         self.qa_outputs2 = nn.Linear(config.MAX_LEN, 1)
    
#     def forward(self, ids, mask, token_type_ids):
#         _, _, out = self.roberta(
#             ids,
#             attention_mask=mask,
#             token_type_ids=token_type_ids
#         )

# #         out = torch.cat((out[-1], out[-2]), dim=-1)
#         s_out = self.dropout(out[-1])
#         s_out = torch.nn.functional.pad(s_out.transpose(1,2), (1, 0))

#         out1 = self.qa_outputs1c(s_out).transpose(1,2)
#         out2 = self.qa_outputs2c(s_out).transpose(1,2)

#         start_logits = self.qa_outputs1(self.dropout(out1)).squeeze(-1)
#         end_logits = self.qa_outputs2(self.dropout(out2)).squeeze(-1)

#         return start_logits, end_logits

In [82]:
def loss_fn(start_logits, end_logits, start_positions, end_positions):
    loss_fct = nn.CrossEntropyLoss()
    start_loss = loss_fct(start_logits, start_positions)
    end_loss = loss_fct(end_logits, end_positions)
    total_loss = start_loss + end_loss
    
    return total_loss
    
#     if is_train:
#         start_logprobs = nn.functional.log_softmax(start_logits, dim=-1)
#         start_smooth_loss = -start_logprobs.mean(dim=-1)
        
#         end_logprobs = nn.functional.log_softmax(end_logits, dim=-1)
#         end_smooth_loss = -end_logprobs.mean(dim=-1)
        
#         smooth_loss = start_smooth_loss.mean() + end_smooth_loss.mean()
#         return total_loss * (1 - smoothing) + smooth_loss * smoothing
        

def calculate_jaccard_score(original_tweet, target_string, sentiment_val, idx_start, idx_end, offsets):
    if idx_end < idx_start:
        idx_end = idx_start
#         idx_start = 0
#         idx_end = len(offsets) - 1
        
    filtered_output = ""
    for ix in range(idx_start, idx_end + 1):
        filtered_output += original_tweet[offsets[ix][0]: offsets[ix][1]]
        if (ix+1) < len(offsets) and offsets[ix][1] < offsets[ix+1][0]:
            filtered_output += " "
          
    jac = jaccard(target_string.strip(), filtered_output.strip())
    jac_1 = jac_2 = jac_3 = jac_4 = jac
    
    if len(original_tweet.split()) < 2:
        filtered_output_1 = original_tweet
        jac_1 = jaccard(target_string.strip(), filtered_output_1.strip())
    
    if sentiment_val == 'neutral' or len(original_tweet.split()) < 2:
        filtered_output_2 = original_tweet
        jac_2 = jaccard(target_string.strip(), filtered_output_2.strip())
        
    if len(filtered_output.split()) == 1:
        filtered_output_3 = filtered_output.replace('!!!!', '!')
        filtered_output_3 = filtered_output_3.replace('..', '.')
        filtered_output_3 = filtered_output_3.replace('...', '.')
        jac_3 = jaccard(target_string.strip(), filtered_output_3.strip())
        
    if sentiment_val == 'neutral' or len(original_tweet.split()) < 2:
        filtered_output_4 = original_tweet
        jac_4 = jaccard(target_string.strip(), filtered_output_4.strip())

        if len(filtered_output_4.split()) == 1:
            filtered_output_4 = filtered_output_4.replace('!!!!', '!')
            filtered_output_4 = filtered_output_4.replace('..', '.')
            filtered_output_4 = filtered_output_4.replace('...', '.')
            jac_4 = jaccard(target_string.strip(), filtered_output_4.strip())

#     jac = jaccard(target_string.strip(), filtered_output.strip())
    
    return (jac_1, jac_2, jac_3, jac_4), filtered_output

# Train and Valid Functions

In [83]:
def train_fn(epoch, data_loader, model, optimizer, scheduler=None):
    model.train()
    model.zero_grad()
    losses = AverageMeter()
    jaccards = AverageMeter()
    
    tk0 = tqdm_notebook(data_loader, total=len(data_loader), desc=f'Train Epoch {epoch}')
    for bi, d in enumerate(tk0):
        ids = d["ids"]
        token_type_ids = d["token_type_ids"]
        mask = d["mask"]
        targets_start = d["targets_start"]
        targets_end = d["targets_end"]
        sentiment = d["sentiment"]
        orig_selected = d["orig_selected"]
        orig_tweet = d["orig_tweet"]
        targets_start = d["targets_start"]
        targets_end = d["targets_end"]
        offsets = d["offsets"]

        ids = ids.to(config.DEVICE, dtype=torch.long)
        token_type_ids = token_type_ids.to(config.DEVICE, dtype=torch.long)
        mask = mask.to(config.DEVICE, dtype=torch.long)
        targets_start = targets_start.to(config.DEVICE, dtype=torch.long)
        targets_end = targets_end.to(config.DEVICE, dtype=torch.long)
        
        outputs_start, outputs_end = model(ids=ids, mask=mask, token_type_ids=token_type_ids)
        
        loss = loss_fn(outputs_start, outputs_end, targets_start, targets_end)
        loss = loss / config.GRADIENT_ACCUMULATION_STEPS
        loss.backward()
        
        if (bi + 1) % config.GRADIENT_ACCUMULATION_STEPS == 0:
            optimizer.step()
            scheduler.step()
            model.zero_grad()
        
        outputs_start = torch.softmax(outputs_start, dim=1).cpu().detach().numpy()
        outputs_end = torch.softmax(outputs_end, dim=1).cpu().detach().numpy()
        jaccard_scores = []
        
        for px, tweet in enumerate(orig_tweet):
            selected_tweet = orig_selected[px]
            tweet_sentiment = sentiment[px]
            jaccard_score, _ = calculate_jaccard_score(
                original_tweet=tweet,
                target_string=selected_tweet,
                sentiment_val=tweet_sentiment,
                idx_start=np.argmax(outputs_start[px, :]),
                idx_end=np.argmax(outputs_end[px, :]),
                offsets=offsets[px]
            )
            jaccard_scores.append(jaccard_score)
        
        jaccards.update(np.mean(jaccard_scores, axis=0), ids.size(0))
        losses.update(loss.item(), ids.size(0))
        if bi >= config.LOGGING_STEPS and bi % config.LOGGING_STEPS == 0:
            print(f"Epoch {epoch} - Loss: {losses.avg} - Jaccard: {jaccards.avg}")
            losses.reset()
            jaccards.reset()

In [84]:
def eval_fn(epoch, data_loader, model):
    model.eval()
    losses = AverageMeter()
    jaccards = AverageMeter()
    
    with torch.no_grad():
        tk0 = tqdm_notebook(data_loader, total=len(data_loader), desc=f'Valid Epoch {epoch}')
        for bi, d in enumerate(tk0):
            ids = d["ids"]
            token_type_ids = d["token_type_ids"]
            mask = d["mask"]
            sentiment = d["sentiment"]
            orig_selected = d["orig_selected"]
            orig_tweet = d["orig_tweet"]
            targets_start = d["targets_start"]
            targets_end = d["targets_end"]
            offsets = d["offsets"].numpy()

            ids = ids.to(config.DEVICE, dtype=torch.long)
            token_type_ids = token_type_ids.to(config.DEVICE, dtype=torch.long)
            mask = mask.to(config.DEVICE, dtype=torch.long)
            targets_start = targets_start.to(config.DEVICE, dtype=torch.long)
            targets_end = targets_end.to(config.DEVICE, dtype=torch.long)

            outputs_start, outputs_end = model(
                ids=ids,
                mask=mask,
                token_type_ids=token_type_ids
            )
            loss = loss_fn(outputs_start, outputs_end, targets_start, targets_end)
            outputs_start = torch.softmax(outputs_start, dim=1).cpu().detach().numpy()
            outputs_end = torch.softmax(outputs_end, dim=1).cpu().detach().numpy()
            jaccard_scores = []
            for px, tweet in enumerate(orig_tweet):
                selected_tweet = orig_selected[px]
                tweet_sentiment = sentiment[px]
                jaccard_score, _ = calculate_jaccard_score(
                    original_tweet=tweet,
                    target_string=selected_tweet,
                    sentiment_val=tweet_sentiment,
                    idx_start=np.argmax(outputs_start[px, :]),
                    idx_end=np.argmax(outputs_end[px, :]),
                    offsets=offsets[px]
                )
                jaccard_scores.append(jaccard_score)

            jaccards.update(np.mean(jaccard_scores, axis=0), ids.size(0))
            losses.update(loss.item(), ids.size(0))
            tk0.set_postfix(loss=losses.avg, jaccard=jaccards.avg)
    
    print(f"Epoch {epoch} - Validation Jaccard = {jaccards.avg}")
    return jaccards.avg

# Training

In [85]:
def run(fold, train_data_loader, valid_data_loader, train_size, model_config):
    model = TweetModel(conf=model_config)
#     model = nn.DataParallel(model)
    model.to(config.DEVICE)
    
    num_train_steps = int(train_size / (config.TRAIN_BATCH_SIZE * config.GRADIENT_ACCUMULATION_STEPS) * config.EPOCHS)
    param_optimizer = list(model.named_parameters())
    no_decay = ["bias", "LayerNorm.bias", "LayerNorm.weight"]
    optimizer_parameters = [
        {'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)], 'weight_decay': config.WEIGHT_DECAY},
        {'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)], 'weight_decay': 0.0},
    ]
    optimizer = AdamW(optimizer_parameters, lr=config.LEARNING_RATE)
    scheduler = get_linear_schedule_with_warmup(
        optimizer, 
        num_warmup_steps=config.WARM_UP_STEPS, 
        num_training_steps=num_train_steps
    )
    
    es = EarlyStopping(patience=config.PATIENCE, mode="max")
    
    for epoch in range(config.EPOCHS):
        train_fn(epoch, train_data_loader, model, optimizer, scheduler=scheduler)
        jaccard = eval_fn(epoch, valid_data_loader, model)
        es(jaccard[0], model, model_path=f"{config.OUTPUT_DIR}/{config.SAVE_MODEL_DIR}/model_fold_{fold}.bin")
        if es.early_stop:
            print("### Early stopping")
            break
            
    return es.best_score

In [86]:
splits = list(StratifiedKFold(n_splits=config.N_SPLITS, shuffle=True, random_state=config.SEED).split(train_df, train_df['sentiment']))
fold_valid_scores = []

model_config = transformers.RobertaConfig.from_pretrained(config.ROBERTA_PATH)
model_config.output_hidden_states = True

for fold, (train_idx, val_idx) in enumerate(splits):
# for fold in range(5):
    print('###########################')
    print('### Train Fold {}'.format(fold))
    print('###########################')
    
    fold_train_df = train_df.iloc[train_idx, :]
    fold_valid_df = train_df.iloc[val_idx, :]
#     fold_train_df = train_df[train_df.kfold != fold].reset_index(drop=True)
#     fold_valid_df = train_df[train_df.kfold == fold].reset_index(drop=True)
    
#     Augument data
#     fold_train_df_copy = fold_train_df.copy()
#     fold_train_df_copy['text'] = fold_train_df.apply(data_aug, axis=1)
#     fold_train_df_copy = fold_train_df_copy[fold_train_df_copy.sentiment != 'neutral']
#     fold_train_df = pd.concat([fold_train_df, fold_train_df_copy])
#     fold_train_df = fold_train_df.sample(frac=1).reset_index(drop=True)
    
    train_dataset = TweetDataset(
        tweet = fold_train_df.text.values,
        sentiment = fold_train_df.sentiment.values,
        selected_text = fold_train_df.selected_text.values
    )
    
    valid_dataset = TweetDataset(
        tweet = fold_valid_df.text.values,
        sentiment = fold_valid_df.sentiment.values,
        selected_text = fold_valid_df.selected_text.values
    )
    
    train_data_loader = torch.utils.data.DataLoader(
        train_dataset,
        batch_size = config.TRAIN_BATCH_SIZE,
        shuffle = True,
        num_workers = 8
    )
    
    valid_data_loader = torch.utils.data.DataLoader(
        valid_dataset,
        batch_size = config.VALID_BATCH_SIZE,
        shuffle = False,
        num_workers = 8
    )
    
    valid_score = run(fold=fold, 
        train_data_loader=train_data_loader, 
        valid_data_loader=valid_data_loader, 
        train_size=fold_train_df.shape[0],
        model_config=model_config
    )
    
    fold_valid_scores.append(valid_score)

print(f'\n### Fold Valid Scores {fold_valid_scores}\n### Valid Score Average {np.mean(fold_valid_scores)}')

###########################
### Train Fold 0
###########################


Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  import sys


HBox(children=(FloatProgress(value=0.0, description='Train Epoch 0', max=687.0, style=ProgressStyle(descriptio…

Epoch 0 - Loss: 6.016593038445652 - Jaccard: [0.39006538 0.53721878 0.38638345 0.53598115]
Epoch 0 - Loss: 2.4843676042556764 - Jaccard: [0.59842531 0.62689323 0.59920656 0.62689323]
Epoch 0 - Loss: 1.9858428931236267 - Jaccard: [0.66533907 0.66776318 0.66641719 0.66776318]
Epoch 0 - Loss: 1.9049552595615387 - Jaccard: [0.66103924 0.66153229 0.66263646 0.66121979]
Epoch 0 - Loss: 1.7226816475391389 - Jaccard: [0.68818642 0.68831512 0.68775673 0.68800262]
Epoch 0 - Loss: 1.7063206267356872 - Jaccard: [0.70090379 0.70172067 0.70160432 0.70140817]



HBox(children=(FloatProgress(value=0.0, description='Valid Epoch 0', max=172.0, style=ProgressStyle(descriptio…


Epoch 0 - Validation Jaccard = [0.69750367 0.69757899 0.69783885 0.69721509]
Validation score improved (-inf --> 0.6975036749529769). Saving model!


HBox(children=(FloatProgress(value=0.0, description='Train Epoch 1', max=687.0, style=ProgressStyle(descriptio…

Epoch 1 - Loss: 1.5482139534289294 - Jaccard: [0.71900871 0.71852821 0.72139801 0.7182188 ]
Epoch 1 - Loss: 1.5699192988872528 - Jaccard: [0.71097413 0.71140973 0.70920702 0.71078473]
Epoch 1 - Loss: 1.5758606165647506 - Jaccard: [0.71284653 0.71354911 0.71361786 0.71323661]
Epoch 1 - Loss: 1.546956517100334 - Jaccard: [0.70941977 0.7097961  0.70867223 0.7088586 ]
Epoch 1 - Loss: 1.5525723111629486 - Jaccard: [0.70653942 0.70631244 0.70865661 0.70631244]
Epoch 1 - Loss: 1.536770858168602 - Jaccard: [0.72132162 0.72131558 0.72177735 0.72131558]



HBox(children=(FloatProgress(value=0.0, description='Valid Epoch 1', max=172.0, style=ProgressStyle(descriptio…


Epoch 1 - Validation Jaccard = [0.709192   0.70940267 0.71025696 0.70903876]
Validation score improved (0.6975036749529769 --> 0.7091919959286956). Saving model!


HBox(children=(FloatProgress(value=0.0, description='Train Epoch 2', max=687.0, style=ProgressStyle(descriptio…

Epoch 2 - Loss: 1.4022045259428497 - Jaccard: [0.72655404 0.72654435 0.72709771 0.72623494]
Epoch 2 - Loss: 1.448340744972229 - Jaccard: [0.72405481 0.72394798 0.72694951 0.72363548]
Epoch 2 - Loss: 1.362673934698105 - Jaccard: [0.74486204 0.74466894 0.74518086 0.74404394]
Epoch 2 - Loss: 1.4362106263637542 - Jaccard: [0.72293689 0.72314755 0.72255061 0.72252255]
Epoch 2 - Loss: 1.4127839386463166 - Jaccard: [0.73127463 0.73101947 0.73108106 0.73070697]
Epoch 2 - Loss: 1.3653763955831528 - Jaccard: [0.7401986  0.7391689  0.74129235 0.7391689 ]



HBox(children=(FloatProgress(value=0.0, description='Valid Epoch 2', max=172.0, style=ProgressStyle(descriptio…


Epoch 2 - Validation Jaccard = [0.70984323 0.71020199 0.71059386 0.70983809]
EarlyStopping counter: 1 out of 2
###########################
### Train Fold 1
###########################


HBox(children=(FloatProgress(value=0.0, description='Train Epoch 0', max=687.0, style=ProgressStyle(descriptio…

Epoch 0 - Loss: 5.926185275068377 - Jaccard: [0.40253872 0.54163143 0.4006204  0.54132203]
Epoch 0 - Loss: 2.527734043598175 - Jaccard: [0.58114782 0.61826128 0.58066548 0.61763628]
Epoch 0 - Loss: 2.0282514762878416 - Jaccard: [0.66135794 0.67123191 0.65964663 0.67123191]
Epoch 0 - Loss: 1.9149127894639968 - Jaccard: [0.67812034 0.68154842 0.67914638 0.68123592]
Epoch 0 - Loss: 1.844926779270172 - Jaccard: [0.6765322  0.68076551 0.67771746 0.67982801]
Epoch 0 - Loss: 1.775816603899002 - Jaccard: [0.68240654 0.68281372 0.68481973 0.68281372]



HBox(children=(FloatProgress(value=0.0, description='Valid Epoch 0', max=172.0, style=ProgressStyle(descriptio…


Epoch 0 - Validation Jaccard = [0.69884862 0.69916388 0.70108328 0.69898192]
Validation score improved (-inf --> 0.6988486246272586). Saving model!


HBox(children=(FloatProgress(value=0.0, description='Train Epoch 1', max=687.0, style=ProgressStyle(descriptio…

Epoch 1 - Loss: 1.644740364929237 - Jaccard: [0.70786561 0.70774812 0.70821627 0.70743872]
Epoch 1 - Loss: 1.5747883194684982 - Jaccard: [0.71020326 0.71081716 0.70892364 0.71019216]
Epoch 1 - Loss: 1.565198746919632 - Jaccard: [0.70269855 0.70285592 0.70357728 0.70254342]
Epoch 1 - Loss: 1.5299443221092224 - Jaccard: [0.71351636 0.7132716  0.71422335 0.7126466 ]
Epoch 1 - Loss: 1.600006195306778 - Jaccard: [0.70687562 0.7079076  0.70735739 0.7072826 ]
Epoch 1 - Loss: 1.5491163945198059 - Jaccard: [0.71798497 0.71992314 0.71687634 0.71992314]



HBox(children=(FloatProgress(value=0.0, description='Valid Epoch 1', max=172.0, style=ProgressStyle(descriptio…


Epoch 1 - Validation Jaccard = [0.69757448 0.69779228 0.69885247 0.69761033]
EarlyStopping counter: 1 out of 2


HBox(children=(FloatProgress(value=0.0, description='Train Epoch 2', max=687.0, style=ProgressStyle(descriptio…

Epoch 2 - Loss: 1.4697023582930613 - Jaccard: [0.72156916 0.72156233 0.72181411 0.72094352]
Epoch 2 - Loss: 1.5017379873991012 - Jaccard: [0.71911583 0.72013796 0.71925906 0.71982546]
Epoch 2 - Loss: 1.4799117439985274 - Jaccard: [0.72413631 0.72441557 0.72388333 0.72347807]
Epoch 2 - Loss: 1.4782028931379319 - Jaccard: [0.72485904 0.72525292 0.72492861 0.72525292]
Epoch 2 - Loss: 1.4000633227825166 - Jaccard: [0.73562051 0.73545063 0.73594343 0.73545063]
Epoch 2 - Loss: 1.412388613820076 - Jaccard: [0.73418964 0.73387894 0.73335035 0.73325394]



HBox(children=(FloatProgress(value=0.0, description='Valid Epoch 2', max=172.0, style=ProgressStyle(descriptio…


Epoch 2 - Validation Jaccard = [0.70021001 0.70049841 0.70195803 0.70031646]
Validation score improved (0.6988486246272586 --> 0.7002100093269767). Saving model!
###########################
### Train Fold 2
###########################


HBox(children=(FloatProgress(value=0.0, description='Train Epoch 0', max=687.0, style=ProgressStyle(descriptio…

Epoch 0 - Loss: 6.004250483937783 - Jaccard: [0.37976137 0.52909817 0.37787915 0.52909817]
Epoch 0 - Loss: 2.513495606184006 - Jaccard: [0.59935685 0.62822842 0.59677352 0.62729092]
Epoch 0 - Loss: 1.97205681681633 - Jaccard: [0.66110982 0.66620982 0.66136852 0.66558482]
Epoch 0 - Loss: 1.9612564253807068 - Jaccard: [0.66457373 0.66700928 0.66421045 0.66638428]
Epoch 0 - Loss: 1.7746469140052796 - Jaccard: [0.69747979 0.69873243 0.69974616 0.69873243]
Epoch 0 - Loss: 1.7098988127708434 - Jaccard: [0.70005775 0.70028961 0.69946511 0.69997711]



HBox(children=(FloatProgress(value=0.0, description='Valid Epoch 0', max=172.0, style=ProgressStyle(descriptio…


Epoch 0 - Validation Jaccard = [0.69698056 0.69702616 0.69746618 0.69684421]
Validation score improved (-inf --> 0.6969805585023761). Saving model!


HBox(children=(FloatProgress(value=0.0, description='Train Epoch 1', max=687.0, style=ProgressStyle(descriptio…

Epoch 1 - Loss: 1.6126836001282872 - Jaccard: [0.70472567 0.70503485 0.70601486 0.70472544]
Epoch 1 - Loss: 1.6475758045911788 - Jaccard: [0.69059941 0.69060026 0.69070358 0.68997526]
Epoch 1 - Loss: 1.5377421438694001 - Jaccard: [0.71601971 0.7160026  0.71537387 0.7150651 ]
Epoch 1 - Loss: 1.552536860704422 - Jaccard: [0.71506617 0.71544512 0.71351111 0.71513262]
Epoch 1 - Loss: 1.5579511511325836 - Jaccard: [0.71189926 0.71187907 0.7137352  0.71187907]
Epoch 1 - Loss: 1.5655226659774781 - Jaccard: [0.71576692 0.71583218 0.71647763 0.71551968]



HBox(children=(FloatProgress(value=0.0, description='Valid Epoch 1', max=172.0, style=ProgressStyle(descriptio…


Epoch 1 - Validation Jaccard = [0.70486582 0.70528797 0.70563194 0.70510602]
Validation score improved (0.6969805585023761 --> 0.7048658202191611). Saving model!


HBox(children=(FloatProgress(value=0.0, description='Train Epoch 2', max=687.0, style=ProgressStyle(descriptio…

Epoch 2 - Loss: 1.5168849459969171 - Jaccard: [0.71303506 0.71603052 0.71443475 0.71572111]
Epoch 2 - Loss: 1.441731538772583 - Jaccard: [0.72541921 0.73001434 0.7255364  0.73001434]
Epoch 2 - Loss: 1.4796555066108703 - Jaccard: [0.72023406 0.72240017 0.72156739 0.72177517]
Epoch 2 - Loss: 1.4728939336538316 - Jaccard: [0.73374401 0.7341584  0.73516142 0.7338459 ]
Epoch 2 - Loss: 1.4464493483304977 - Jaccard: [0.72416564 0.7243127  0.7250585  0.7236877 ]
Epoch 2 - Loss: 1.4259234046936036 - Jaccard: [0.72911997 0.72945176 0.73004259 0.72913926]



HBox(children=(FloatProgress(value=0.0, description='Valid Epoch 2', max=172.0, style=ProgressStyle(descriptio…


Epoch 2 - Validation Jaccard = [0.70371463 0.70426669 0.70420581 0.70408474]
EarlyStopping counter: 1 out of 2
###########################
### Train Fold 3
###########################


HBox(children=(FloatProgress(value=0.0, description='Train Epoch 0', max=687.0, style=ProgressStyle(descriptio…

Epoch 0 - Loss: 5.974871165681593 - Jaccard: [0.41777574 0.54194349 0.41516298 0.54132468]
Epoch 0 - Loss: 2.460195233821869 - Jaccard: [0.60712106 0.63233844 0.60709055 0.63233844]
Epoch 0 - Loss: 1.9012952983379363 - Jaccard: [0.67146442 0.67269998 0.67251725 0.67238748]
Epoch 0 - Loss: 1.81413978099823 - Jaccard: [0.67930552 0.68017336 0.68062434 0.67986086]
Epoch 0 - Loss: 1.7708830207586288 - Jaccard: [0.69869612 0.69905525 0.69772737 0.69905525]
Epoch 0 - Loss: 1.7450721591711045 - Jaccard: [0.68195511 0.68229419 0.68207044 0.68229419]



HBox(children=(FloatProgress(value=0.0, description='Valid Epoch 0', max=172.0, style=ProgressStyle(descriptio…


Epoch 0 - Validation Jaccard = [0.69178681 0.69172198 0.69175476 0.69081222]
Validation score improved (-inf --> 0.6917868134868372). Saving model!


HBox(children=(FloatProgress(value=0.0, description='Train Epoch 1', max=687.0, style=ProgressStyle(descriptio…

Epoch 1 - Loss: 1.5749005999895607 - Jaccard: [0.71160652 0.71143207 0.71214827 0.71112267]
Epoch 1 - Loss: 1.5345609337091446 - Jaccard: [0.71791692 0.71780689 0.71839869 0.71749439]
Epoch 1 - Loss: 1.5889868360757828 - Jaccard: [0.70624552 0.70644061 0.70904115 0.70612811]
Epoch 1 - Loss: 1.5652212381362915 - Jaccard: [0.71040053 0.71061594 0.71071935 0.71030344]
Epoch 1 - Loss: 1.5707155990600585 - Jaccard: [0.70702864 0.7078628  0.70772509 0.7078628 ]
Epoch 1 - Loss: 1.5700376439094543 - Jaccard: [0.71188558 0.71189081 0.71256341 0.71157831]



HBox(children=(FloatProgress(value=0.0, description='Valid Epoch 1', max=172.0, style=ProgressStyle(descriptio…


Epoch 1 - Validation Jaccard = [0.70852043 0.70849761 0.70876563 0.70758786]
Validation score improved (0.6917868134868372 --> 0.7085204261179079). Saving model!


HBox(children=(FloatProgress(value=0.0, description='Train Epoch 2', max=687.0, style=ProgressStyle(descriptio…

Epoch 2 - Loss: 1.3852065506547984 - Jaccard: [0.73035627 0.73032355 0.73235908 0.73032355]
Epoch 2 - Loss: 1.4152123218774795 - Jaccard: [0.73968194 0.73942608 0.73995971 0.73911358]
Epoch 2 - Loss: 1.4513030529022217 - Jaccard: [0.72018486 0.72044055 0.71966826 0.72044055]
Epoch 2 - Loss: 1.392531515955925 - Jaccard: [0.72957213 0.7294438  0.73160586 0.7294438 ]
Epoch 2 - Loss: 1.3761050295829773 - Jaccard: [0.7348204  0.73467746 0.73409383 0.73436496]
Epoch 2 - Loss: 1.395619170665741 - Jaccard: [0.73867123 0.73803001 0.73991289 0.73740501]



HBox(children=(FloatProgress(value=0.0, description='Valid Epoch 2', max=172.0, style=ProgressStyle(descriptio…


Epoch 2 - Validation Jaccard = [0.71292156 0.71265962 0.71261788 0.71174987]
Validation score improved (0.7085204261179079 --> 0.7129215600535136). Saving model!
###########################
### Train Fold 4
###########################


HBox(children=(FloatProgress(value=0.0, description='Train Epoch 0', max=687.0, style=ProgressStyle(descriptio…

Epoch 0 - Loss: 6.023427866473057 - Jaccard: [0.4272746  0.56248975 0.42494116 0.56218034]
Epoch 0 - Loss: 2.316786688566208 - Jaccard: [0.61626873 0.63481776 0.61659425 0.63481776]
Epoch 0 - Loss: 1.9128926992416382 - Jaccard: [0.67625993 0.6790722  0.67725733 0.6787597 ]
Epoch 0 - Loss: 1.8186072397232056 - Jaccard: [0.68798426 0.68794064 0.68841495 0.68762814]
Epoch 0 - Loss: 1.7625179582834243 - Jaccard: [0.68663204 0.68704148 0.6881685  0.68704148]
Epoch 0 - Loss: 1.78386106133461 - Jaccard: [0.686785   0.68731209 0.68750206 0.68637459]



HBox(children=(FloatProgress(value=0.0, description='Valid Epoch 0', max=172.0, style=ProgressStyle(descriptio…


Epoch 0 - Validation Jaccard = [0.69728888 0.69737259 0.69679401 0.69719064]
Validation score improved (-inf --> 0.6972888800562065). Saving model!


HBox(children=(FloatProgress(value=0.0, description='Train Epoch 1', max=687.0, style=ProgressStyle(descriptio…

Epoch 1 - Loss: 1.626807165027845 - Jaccard: [0.70365716 0.70484053 0.70434301 0.70453113]
Epoch 1 - Loss: 1.5802716451883316 - Jaccard: [0.71918713 0.71975511 0.72057738 0.71975511]
Epoch 1 - Loss: 1.585286653637886 - Jaccard: [0.70799824 0.70784849 0.70819563 0.70691099]
Epoch 1 - Loss: 1.5350817334651947 - Jaccard: [0.71927566 0.72005181 0.71974441 0.71973931]
Epoch 1 - Loss: 1.6185606276988984 - Jaccard: [0.70191733 0.70232719 0.70220304 0.70201469]
Epoch 1 - Loss: 1.5555801427364349 - Jaccard: [0.70811264 0.70832558 0.71056056 0.70770058]



HBox(children=(FloatProgress(value=0.0, description='Valid Epoch 1', max=172.0, style=ProgressStyle(descriptio…


Epoch 1 - Validation Jaccard = [0.71308997 0.71323019 0.71177676 0.71304824]
Validation score improved (0.6972888800562065 --> 0.7130899663504775). Saving model!


HBox(children=(FloatProgress(value=0.0, description='Train Epoch 2', max=687.0, style=ProgressStyle(descriptio…

Epoch 2 - Loss: 1.463117305595096 - Jaccard: [0.72110884 0.72143769 0.72114567 0.72081888]
Epoch 2 - Loss: 1.4730858212709428 - Jaccard: [0.71386792 0.71356344 0.71544344 0.71293844]
Epoch 2 - Loss: 1.4330804497003555 - Jaccard: [0.72717707 0.72638809 0.72685031 0.72545059]
Epoch 2 - Loss: 1.4480472385883332 - Jaccard: [0.72903563 0.72907675 0.73120937 0.72907675]
Epoch 2 - Loss: 1.4149633795022964 - Jaccard: [0.73630958 0.73652771 0.73772327 0.73652771]
Epoch 2 - Loss: 1.465336445569992 - Jaccard: [0.72784488 0.72722797 0.72889138 0.72691547]



HBox(children=(FloatProgress(value=0.0, description='Valid Epoch 2', max=172.0, style=ProgressStyle(descriptio…


Epoch 2 - Validation Jaccard = [0.71124863 0.71154469 0.71017759 0.71136274]
EarlyStopping counter: 1 out of 2

### Fold Valid Scores [array(0.709192), array(0.70021001), array(0.70486582), array(0.71292156), array(0.71308997)]
### Valid Score Average 0.7080558703757649


In [13]:
### Fold Valid Scores [array(0.70857465), array(0.70933893), array(0.70598435), array(0.70872072), array(0.70730684)]
### Average 0.7079850985349567

### Fold Valid Scores [array(0.70952816), array(0.7069516), array(0.70330911), array(0.70835745), array(0.71030896)]
### Valid Score Average 0.7076910558826741

L1: 200 ### Fold Valid Scores [array(0.71360075), array(0.7115196), array(0.70646786), array(0.71183899), array(0.70908395)]
### Valid Score Average 0.710502229454183

L1: 400 ### Fold Valid Scores [array(0.71273853), array(0.70800568), array(0.70623233), array(0.70959698), array(0.70644186)]
### Valid Score Average 0.7086030747462625

L1: 100 ### Fold Valid Scores [array(0.71101688), array(0.71193292), array(0.69802207), array(0.70490326), array(0.70705004)]
### Valid Score Average 0.7065850347029923

L1: 200 + relu ### Fold Valid Scores [array(0.71175533), array(0.70503955), array(0.70795357), array(0.70640962), array(0.71039699)]
### Valid Score Average 0.7083110118006811

Dropout ### Fold Valid Scores [array(0.71108905), array(0.70961412), array(0.7035667), array(0.70853809), array(0.70744561)]
### Valid Score Average 0.7080507151707145

Distance Loss ### Fold Valid Scores [array(0.71389534), array(0.71070689), array(0.70406148), array(0.7086584), array(0.70803468)]
### Valid Score Average 0.7090713561243934

Remove punctuation ### Fold Valid Scores [array(0.71394893), array(0.71187192), array(0.70876343), array(0.70869398), array(0.71475133)]
### Valid Score Average 0.7116059166845755

Seed 1111 ### Fold Valid Scores [array(0.71215477), array(0.70377209), array(0.706983), array(0.7146253), array(0.71572213)]
### Valid Score Average 0.7106514581294852 - 0.715
### Fold Valid Scores [array(0.71191979), array(0.70360961), array(0.70702528), array(0.71460327), array(0.71597365)]
### Valid Score Average 0.7106263206022948

Seed 1111, remove URL ### Fold Valid Scores [array(0.71431862), array(0.7056475), array(0.70800331), array(0.71604839), array(0.71440704)]
### Valid Score Average 0.711684969336701 - 0.712

Dropout 0.2 ### Fold Valid Scores [array(0.71312366), array(0.70360245), array(0.70588042), array(0.71115171), array(0.71487151)]
### Valid Score Average 0.7097259490487787 - 0.715

Dont set full text with Neutral ### Fold Valid Scores [array(0.71191979), array(0.70360961), array(0.70702528), array(0.71460327), array(0.71597365)]
### Valid Score Average 0.7106263206022948 - 0.715

Seperate words in sentence by two spaces ### Fold Valid Scores [array(0.71472236), array(0.70395218), array(0.70167148), array(0.70907122), array(0.71733808)]
### Valid Score Average 0.7093510636787899

### Fold Valid Scores [array(0.71282134), array(0.70215061), array(0.7082946), array(0.71306205), array(0.71531596)]
### Valid Score Average 0.7103289118753705

Gradient accumulation step == 2 ### Fold Valid Scores [array(0.7129531), array(0.7031783), array(0.70544356), array(0.7115031), array(0.71163413)]
### Valid Score Average 0.7089424360991764

Loss 0.45/0.55 ### Fold Valid Scores [array(0.71553568), array(0.70258901), array(0.70767322), array(0.71024796), array(0.71366115)]
### Valid Score Average 0.7099414022583754

Loss 0.4/0.6 ### Fold Valid Scores [array(0.71097718), array(0.70036318), array(0.70536396), array(0.71544554), array(0.71403322)]
### Valid Score Average 0.7092366146621474

Replace url by "URL" string ### Fold Valid Scores [array(0.71128741), array(0.7010848), array(0.7050311), array(0.7151821), array(0.71327337)]
### Valid Score Average 0.7091717570366891

Learning rate 5e-5 ### Fold Valid Scores [array(0.71060388), array(0.70234062), array(0.70975842), array(0.71367711), array(0.71244896)]
### Valid Score Average 0.7097657967743797

Warm up steps 100 ### Fold Valid Scores [array(0.7121743), array(0.70574296), array(0.70752267), array(0.71541347), array(0.71487715)]
### Valid Score Average 0.7111461102863055/0.71124 - 0.717
### Fold Valid Scores [array(0.71088258), array(0.7055451), array(0.70748563), array(0.71542542), array(0.71473709)]
### Valid Score Average 0.710815163841038

Warm up steps 200 ### Fold Valid Scores [array(0.7142003), array(0.7020768), array(0.70581585), array(0.71400421), array(0.71357059)]
### Valid Score Average 0.7099335480925978

Warm up steps 128 ### Fold Valid Scores [array(0.7131979), array(0.70396897), array(0.7069105), array(0.71623603), array(0.71503632)]
### Valid Score Average 0.7110699428033749

Warm up steps 80 ### Fold Valid Scores [array(0.71381001), array(0.70383435), array(0.70870028), array(0.71356599), array(0.71006091)]
### Valid Score Average 0.7099943089401244

Warm up steps 110 ### Fold Valid Scores [array(0.71367481), array(0.70218424), array(0.70514319), array(0.71393211), array(0.71012181)]
### Valid Score Average 0.7090112310740804

Warm up steps 100 + Loss 0.45/0.55 ### Fold Valid Scores [array(0.71090445), array(0.7033069), array(0.70759749), array(0.70934932), array(0.71212442)]
### Valid Score Average 0.7086565172742907

##### New Jaccard

Distance Loss ### Fold Valid Scores [array(0.70784799), array(0.70339139), array(0.70684697), array(0.7129423), array(0.71347322)]
### Valid Score Average 0.7089003747892983

Label Smoothing 0.1 epoch 3 ### Fold Valid Scores [array(0.71439071), array(0.70098441), array(0.69517371), array(0.71655344), array(0.71508791)]
### Valid Score Average 0.7084380391000078

Label Smoothing 0.1 epoch 5 ### Fold Valid Scores [array(0.71142339), array(0.7048489), array(0.70959277), array(0.7124854), array(0.71789047)]
### Valid Score Average 0.7112481863284884 - 0.711

Label Smoothing 0.15 ### Fold Valid Scores [array(0.71298349), array(0.70379821), array(0.70762449), array(0.70895829), array(0.71638138)]
### Valid Score Average 0.7099491723213273

Conv1D ### Fold Valid Scores [array(0.71250557), array(0.70219231), array(0.7088155), array(0.71189476), array(0.71703853)]
### Valid Score Average 0.7104893364550021

Loss 0.55/0.45 ### Fold Valid Scores [array(0.71217874), array(0.70071567), array(0.70391095), array(0.71271005), array(0.71238488)]
### Valid Score Average 0.7083800595106661

start index > end index### Fold Valid Scores [array(0.71182255), array(0.70536097), array(0.70830436), array(0.71588593), array(0.71516879)]
### Valid Score Average 0.7113085194985139

Tweat the loss function a bit ### Fold Valid Scores [array(0.711305), array(0.69687138), array(0.70561886), array(0.71241906), array(0.70912093)]
### Valid Score Average 0.7070670451766619

Batch size 16 ### Fold Valid Scores [array(0.70967621), array(0.70625739), array(0.70894466), array(0.71357104), array(0.7164231)]
### Valid Score Average 0.7109744799794366

# Validating

In [17]:
model_config = transformers.RobertaConfig.from_pretrained(config.ROBERTA_PATH)
model_config.output_hidden_states = True

model = TweetModel(conf=model_config)
model.to(config.DEVICE)
model.load_state_dict(torch.load(f"{config.OUTPUT_DIR}/{config.SAVE_MODEL_DIR}/model_fold_0.bin"))
model.eval()

TweetModel(
  (roberta): RobertaModel(
    (embeddings): RobertaEmbeddings(
      (word_embeddings): Embedding(50265, 768, padding_idx=1)
      (position_embeddings): Embedding(514, 768, padding_idx=1)
      (token_type_embeddings): Embedding(1, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0): BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-05, el

In [20]:
splits = list(StratifiedKFold(n_splits=config.N_SPLITS, shuffle=True, random_state=config.SEED).split(train_df, train_df['sentiment']))
train_idx, val_idx = splits[0]

valid_dataset = TweetDataset(
    tweet = fold_valid_df.text.values,
    sentiment = fold_valid_df.sentiment.values,
    selected_text = fold_valid_df.selected_text.values
)

valid_data_loader = torch.utils.data.DataLoader(
    valid_dataset,
    batch_size = config.VALID_BATCH_SIZE,
    shuffle = False,
    num_workers = 8
)

with torch.no_grad():
    tk0 = tqdm_notebook(valid_data_loader, total=len(valid_data_loader))
    for bi, d in enumerate(tk0):
        ids = d["ids"]
        token_type_ids = d["token_type_ids"]
        mask = d["mask"]
        sentiment = d["sentiment"]
        orig_selected = d["orig_selected"]
        orig_tweet = d["orig_tweet"]
        targets_start = d["targets_start"]
        targets_end = d["targets_end"]
        offsets = d["offsets"].numpy()

        ids = ids.to(config.DEVICE, dtype=torch.long)
        token_type_ids = token_type_ids.to(config.DEVICE, dtype=torch.long)
        mask = mask.to(config.DEVICE, dtype=torch.long)
        targets_start = targets_start.to(config.DEVICE, dtype=torch.long)
        targets_end = targets_end.to(config.DEVICE, dtype=torch.long)

        outputs_start, outputs_end = model(
            ids=ids,
            mask=mask,
            token_type_ids=token_type_ids
        )
        
        outputs_start = torch.softmax(outputs_start, dim=1).cpu().detach().numpy()
        outputs_end = torch.softmax(outputs_end, dim=1).cpu().detach().numpy()

        for px, tweet in enumerate(orig_tweet):
            selected_tweet = orig_selected[px]
            tweet_sentiment = sentiment[px]
            
            _, output_sentence = calculate_jaccard_score(
                original_tweet=tweet,
                target_string=selected_tweet,
                sentiment_val=tweet_sentiment,
                idx_start=np.argmax(outputs_start[px, :]),
                idx_end=np.argmax(outputs_end[px, :]),
                offsets=offsets[px]
            )
            
            if tweet_sentiment != 'neutral':
                print(f'{tweet} - {selected_tweet} - {output_sentence}')

Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`


HBox(children=(FloatProgress(value=0.0, max=172.0), HTML(value='')))

 i want to go to music tonight but i lost my voice. -  lost -  lost

 I`m going home now. Have you seen my new twitter design? Quite....heavenly isn`****? -  Quite....heavenly -  Quite....heavenly

 i donbt like to peel prawns, i also dont like going shopping, running out of money and crawling round the car looking for more -  dont like go -  i donbt like

 He`s awesome... Have you worked with him before? He`s a good friend. -  s awesome -  awesome...

 Chilliin -  Chilliin -  Chilliin

 lucky kid...i so wanna see loserville pity im in oz.... -  lucky -  lucky

 Happy Mothers day to all you Mums out there -  Happy Mothers day to all you Mums out there -  Happy

 CASEY`S GONE?!?! BUT WHY?! So, she piddled a little on the carpet. She`s prolly freaked cause it`s new. Can we get her back? -  freaked -  freaked

 We saw that in none 3D - the baddie`s the best -  best -  best

 Awesome. I`m down in Ocean Beach (if you know where that is.) By the way. 'YourBiggestFan' I`m a re-al big fan of y

 Thanks Amy! That video is so awesome! Did you see TMH? He`s amazing in that too!! Bouncy Bouncy Bouncy!!! -  e`s amazi -  Thanks Amy! That video is so awesome!

 I don`t think I am, my sisters refusn to get me a ticket now. what you doing next week? -  I don`t think I am, my sisters refusn to get me a ticket now. -  I don`t think I am,

 i have no idea what im doing and i am completely lost. -  i am completely lost. -  lost.

 awww hes too cute!!wish i could`ve gone -  cute! -  cute!!

 I just spent 2 hours looking for a blog topic and ended up inventing my own. grrrr http://bit.ly/QRz6y -  ended up inventing my own. -  grrrr

 _KittyKat hello new follower haha!! how are ya? -  haha! -  haha!!

 _Lay aww well I just randomly woke up and now I can`t sleep! Too many things on my mind -  aww -  I can`t sleep!

 That`s it? It`s done already? This is one proof that there`s nothing fair in this world. http://bit.ly/10UEMq -  nothing fair -  nothing fair

 _3 U know - kids do what we DO - no

 She doesn`t believe spending many (many) thousands on a perfect wedding makes a happy couple.....A happy couple makes a perfect wedding -  happy couple. -  happy couple.....A happy couple makes a perfect wedding

 mmmm lauren conrad eeeeep im going to miss the hills -  miss -  miss

 her son is 7 and captured it outside...THANK GOD I HAVE A LITTLE GIRL -  .THANK GOD - THANK GOD

 Whom do yu wanna kill ?? -  kill -  Whom do yu wanna kill

 I love being able to run my tongue along my teeth -  I love being able to run my tongue along my teeth -  love

 I dont like seeing my best friend cry, it breaks my heart and I dont know what to do or say -  it breaks my heart -  breaks my heart

 Thanks, I just found a article that say i cannot join you because i am in the UK though -  Thanks, -  Thanks,

 Just drove by fisher and I feel so sad -  sad -  sad

 I think my cheap sunglasses are falling apart. Oh well -  falling apart. -  falling apart.

 Slept at my parents the bed was hard as a rock n

 LOVING the hot weather forecast for the rest of the week!!!! Summer is almost heeeeere -  LOVING -  LOVING

 _watkins Oh my gosh ian i always miss you when your on -  miss -  miss

 painting my nails green in an attempt to look like an army person. annoyed that everyone seems to tan apart from me! freckles and sunburn -  annoyed that everyone seems to tan apart from me! freckles and sunburn -  annoyed

 watching Jon & Kate plus 8..can`t believe they`re divorcing -  can`t believe they`re divorcing - can`t believe they`re divorcing

 I work for a man that is so bad at his job that the whole team want to leave, and all believe we would make more money if he wasn`t here -  bad -  bad

 **** you and your hot weather - it`s freezing in Oz at the moment I miss summer.. -  **** -  **** you and your hot weather - it`s freezing in Oz at the moment I miss summer..

 Uh-Oooh my throat is scratchy -  scratchy -  scratchy

 Bad day, just got worse... -  Bad -  Bad

 I guess you are not interested !

 may gray, coldplay, and nice showers...work at 2 -  nice showers... -  nice

 good weekend -  good weekend -  good weekend

 say hello to Marina Green ... couldn`t participate in Barcelona this weeken -  couldn`t participate -  couldn`t participate

 Just got finished cleaning and putting out my mom`s presents. Happy mother`s day. I`m going to sleeeeep. -  Happy -  Happy

 isnt going to the movies!! i got grounded -  i got grounded -  grounded

 `erocka the ruler` i called you, but i see i gets the no love whats up with that? -  i see i gets the no love whats up with that -  no love

 This is truly enlightening for me -  s truly enlightening -  enlightening

 thanks, I deserve it. -  thanks, -  thanks,

 it sucks no matter where you are! I`m gonna freakin be late for work! -  t sucks -  sucks

 shakalohana week two of flat wavez no surfin -  shakalohana week two of flat wavez no surfin -  flat wavez no surfin

 Sitting with sabbeth in first period. buhahaha we are so cool -sabbeth lma

 Ja Deze is interessanter - Why text messages are limited to 160 characters - http://tr.im/kpgg -  s interessanter -  interessanter

 all the best for your IB exams Carl. I hope you don`t find them too difficult and that they go well for you. -  hope -  all the best

 thanks lady.... bummer for sure. -  thanks -  thanks

 says HAPPY MOTHER`S DAY! http://plurk.com/p/sv70e -  HAPPY -  HAPPY

 hotness personified -  hotness personified -  hotness personified

 My purple pusrse is pretty someone tell Katie Holmes! -  pretty -  pretty

 _asher better. That wasn`t one of my better summaries -  better. -  better.

 Finally going home! Its been a long night.. Ready to crash and have awesome dreams -  awesome -  awesome dreams

 lol after this weekend yea ur right text me cause im goin to 3rd and they dont have computers there bye bye -  lol -  lol after this weekend yea ur right

 i`m not being mean -  i`m not being mean -  i`m not being mean

 my teeth and head hurts -  my teeth and head hurt

 Britains got Talent just gets better every week -  Britains got Talent just gets better every week -  better

 sorry but I`m not impressed in the slightest b/c u don`t eat ****..more like saddened and confused -  not impressed -  sorry

 got in a fist fight with a old biker guy.......and how dare you even ask, yes i kicked his **** -  got in a fist fight with a old biker guy.......and how dare you even ask, yes i kicked his **** -  kicked his ****

 heading out to the park with the kids - hope it does not rain -looking cloudy? -  hope -  hope

 _radio yeah :S i feel all funny cause i haven`t slept enough i woke my mum up cause i was singing she`s not impressed :S you? -  i was singing she`s not impressed -  she`s not impressed

 I told andrew that JT is going to cameo @ MTV movie awards. He said, 'he`s pissing me off.' guess they`re in a fight -  he`s pissing me off.' -  pissing me off.'

 Mom, where ever you are; Happy Mothers day -  Happy -  Happy

 de wereld need more ppl like you!

 This chair is not comfortable at all -  not comfortable -  not comfortable at all

 The battle at Minas Tirith is still very impressive. Return of the Jedi is the best Lord of the Rings movie IMO. -  impressive. -  impressive.

 unless you`re gretel killeen apparently. you did look pretty **** good -  you did look pretty **** good -  you did look pretty **** good

 no I wish. Just a van that comes round. We missed him -  missed -  missed him

 I wish yesterday was Friday -  I wish -  wish

 Finally picked up some handwraps, but struggling to wrap my stronger hand with the other! Defo should have got some a lot sooner though -  struggling -  struggling

 What do you mean by your portfolio is sad? No luck here. I`ve applied about 30 places so far. -  ? No luck -  sad?

 lol sweet!!! ...i still have yet to watch the 4th movie hope ur having a lovely weekend! happy mother`s day from aus! =P haha -  hope ur having a lovely weekend! -  happy

 Same to your mom too pril -  Same to your mom t

 I`m awake. Anybody else awake? Wish I lived in the US since all the fun happens when I`m asleep! -  since all the fun happens when I`m asleep! -  Wish

 thanks to follow -  thanks -  thanks

 _benson You are an absolute legend! Love love love it -  You are an absolute legend! Love love love it -  Love love love it

 _pina_14 Not good!!He wouldnt like his girl flirting with his colleagues,would he? -  Not good! -  Not good!!

 just woke up, having coffee, listening to Music, reading RSS...Sunday feels great -  Sunday feels great -  great

 my mom wants to lay. We`ll be there later. Probably a little after 10. And I`m sorry you`re sick -  I`m sorry you`re sick -  sorry you`re sick

 binstruct suffers from an update of the mib package... and the developer of that package is on holidays -  suffers -  suffers

 I also think that I`m talking to myself now. Okay I need to get to bed. BTW, I miss my brothers. -  I miss my brothers. -  miss

 Happy Mother`s Day ~ and for single dads who plays 

 ~ Happy Star Wars Day.every one ~ ...may the fourth be with you! -  Happy Star Wars Day.every one -  Happy

 Weekend was quiet, just planning new websites. Today, research and choosing hats. Coffee hat sounds good though. Kettle on -  sounds good though. -  good

 aww hope uve hada good day xxxxx -  w hope uve hada good day -  hope

 Ugh worried about my math test -  Ugh worried about my math test -  worried

 I fall asleep and didnt get to see the jonas brothers web cast ... Still tierd -  tierd -  I fall asleep and didnt get to see the jonas brothers web cast ... Still tierd

 No, seriously you guys, I /wanted/ to kick Monday off with #starwarswithaddedpants spam. Really -  No, seriously you guys, I /wanted/ to kick Monday off wit -  spam. Really

 Good morning MiaMiaDC My weekend was great and I just had my Monday, which was also great I hope you have a wonderful day! -  a wonderfu -  Good morning MiaMiaDC My weekend was great

 My cable signal is all messed up!!! I`m missing Ghost

 ii DON`T HAVE ANY EiTHER -  ii DON`T HAVE ANY EiTHER -  ii DON`T HAVE ANY EiTHER

 Tweeting from the tarmac at Cork Airport. Delayed -  Delayed -  Delayed

 tell zach & jer I said happy birthday! they seem like cool brothers, youre lucky haha -  e lucky -  happy

 Then back to **** school -  Then back to **** school -  **** school

 Just spoke to Keith Urban`s record company, we can`t get him on the show till we`re back from holidays sorry Jack, we`re trying mate! -  sorry -  sorry

 Ugh..just starting another 2 hours of behind the wheel for drivers ed -  Ugh.. -  Ugh..

 had a fun day at the theatre... glad to be back in town for a while -  had a fun day -  fun day at the theatre... glad

 I wish our didn`t close -  I wish our didn`t close -  I wish our didn`t close

 Thanks for the link, ive voted & i`ll send that out too. -  Thanks for the link -  Thanks

 is also very excited for the BLAZIN SQUAD revival! im rooting for `em-- their new song http://tinyurl.com/dz7tms -  is also ver

 I don`t believe it, my puppy likes brussels sprouts! -  likes -  likes

 i already got my tickets to your concert here in the philippines! im so excited! -  excited! -  excited!

 Think of the prize at the end. So sorry to hear that though. -  So sorry to hear that though. -  sorry

 4 fillings to the good(?); two more appointments to go -  4 fillings to the good(? -  good(?);

 this is very true about ! but you do have to admit it was pret-ty funny! im bout to go you tube it! lol -  funny! -  funny!

 I was sleeping soo good but just woke up like 10 mins ago n got sick not feeling so great. I wanna go back to sleep but I`m wide awake. -  got sick not feeling so great. -  sick not feeling so great.

 you will be great! Have a wonderful first day -  you will be great! Have a wonderful first day -  you will be great! Have a wonderful

 Awww my lovies! Yes I love it all. And I`m totally finishing my buffalo sammich after a few hits off the new bongie. <3 -  Awww my lovies! Yes I love it 

 learn to fly higher...! http://tinyurl.com/30tools -  learn to fly higher...! -  learn

 Double rainbow above the Organs. Pretty, but doesn`t take the edge off my $460 grocery tab. -  Pretty, -  Pretty,

 thanks for agreeing with me -  thanks -  thanks

 Mind telling me which book cuz i was at barnes and nobles today and found 2 books that looked promising -  d found 2 books that looked promising -  promising

 cannot go, cannot refused to feel the pain -  cannot refused to feel the pain -  cannot go, cannot refused to feel the pain

 Thanks for the follow back -  Thanks -  Thanks

 back from the yoga retreat. I recommend this to everyone -  I recommend this to everyone -  I recommend

 Sorry I can not reach either URL -  Sorry I can not reach either URL -  Sorry

 Congrats on the Invisalign! I need to get refitted for mine - I got lazy and stopped wearing them. -  I got lazy and stopped -  lazy

 have you read Angels & Demons? What do you think of it if you have? oh and beautiful sun

 I wanna go to the extra show really bad -  really bad -  bad

 Ahhh I am sooo happy Ashley Tisdale is in Germany but I am not in Oberhausen... But i will show it in the TV at 8 o` clock -  happy -  happy

 I can`t keep it -  I can`t keep it -  I can`t keep it

 argh! color me jealous! That rain I asked for the other day still hasn`t gone away -  jealous! -  argh! color me jealous!

 Thanks!!!! Like I said on facebook, you just made me awesomely happy. Thanks. -  Thanks!!!! Like I said on facebook, you just made me awesomely happy. Thanks -  Thanks!!!!

 confused on why and how people pose as others on twitter. I mean they really do their homework on peoples lives to imitate them. it`s sad -  confused -  it`s sad

 Great... I`ll check it when I get off work... They block Sims at work... -  Great. -  Great...

 gives you a Haiku status (inspired by Michelle Yuen) the sun is shining a perfect day glorious day outside my office -  perfect -  perfect

 Frank from UPS` last day today... sad

 yay cheerleading when im sick. its gonna be a fun night peoples -  gonna be a fun night -  fun

 i can`t i can`t i can`t i`m sad.... i`m from venezuela! -  m sad -  sad....

 yeah well a deadline is in T-9 hours, that`s architecture for you, oh well... mmm that coffee sounds like a good idea -  good -  good idea

 Just got bullied by Dillah. HELP! -  bullied -  bullied

 Word. Yayy twitter after dark lol. -  . Yayy -  Yayy

 time to leave a passive agressive note to the owners. It`s not the dog`s fault... it`s their **** owners -  time to leave a passive agressive note to the owners. It`s not the dog`s fault... it`s their **** owne -  passive agressive note to the owners. It`s not the dog`s fault... it`s their **** owners

 Trust me, that`s a GOOD thing. Your 40-something self will thank you. -  GOOD -  GOOD

 a headache once again ugh -  ugh -  headache once again ugh

 Know exactly what you mean I`ve lost too many friends. I do feel for you all -  I`ve lost too many friends. I do fe

 missing on his birthday -  missing on his birthday -  missing

 good luck going to sleep. i`m up working on a stupid paper. no worries. ur not alone. so u ready to record that album or what? -  good luck -  good luck

 wish i was rollin with ya -  wish -  wish

 had an awesome day at the zoo yes2dy!!!!! now gettin ready 4 church yay!!! -  had an awesome day -  awesome

 @_AislinnTighee bhaha, its a teenage nightclub and i am at home when im supposed to be there. they had no **** license. no more touch -  they had no **** license. -  they had no **** license. no more touch

 Bye!! Great meeting you! -  Great meeting you! -  Great

 I asked mum bout going out tommorow, she laughed in my face lmao -  she laughed in my face -  laughed

 [Wrong!] #liesboystell Your the only one, I love (they really have several women) http://tinyurl.com/nl6pct -  liesboystell -  [Wrong!]

 Omigoodness I feel like a popsicle -  Omigoodness -  Omigoodness

 goodmorning ! -  goodmorning ! -  goodmorning

 Emi

 I`m Kinda sleepy.. I Was up too late texting a nice boy making date plans for next weekend.. -  nice bo -  nice

 Yup I stayed until the very very end Exciting! -  I stayed until the very very end Exciting! -  Exciting!

 Bad day! Work`s TOO stressful...been involved in a minor accident, but everything`s ok so far. Have to cut down immediately! -  Bad day! -  Bad day! Work`s TOO stressful...

 Oh no adult school again -  Oh no -  Oh no adult school again

 Took a short nap now Im ready for work. My sun burn hurts -  hurts -  hurts

 _lopez I love you kiss me! -  I love you kiss me! -  I love

 I do hope many of my new followers are from around Sydney Australia Welcome to my tweets anyway. -  hope -  hope

 Ha I got another #followfriday. take that ! oh... your listed too... thanks a lot ya jerk. -  thanks -  thanks

 Michelle, I slept for 11 hours last night. I`m still stick with this fever. -  stick with this fever. -  I`m still stick with this fever.

 Getting ready to go to sleeep!

 _at_work agree totally, think though if we can take a point off Everton and Liverpool beat Man City we are happy days -  happy days -  happy days

 wtf kinda best friend am I? I *still* haven`t met the hubby. That depresses me -  wtf kinda best friend am I? I *still* haven`t met the hubby. That depresses me -  wtf kinda best friend am I? I *still* haven`t met the hubby. That depresses me

 _Dubai Enjoy sounds idyllic AND Geordiebird has lost her bikini...perfect holiday! -  perfect holiday! -  Enjoy sounds idyllic AND Geordiebird has lost her bikini...perfect

 I have 3 computers all going now. IE 7 on XP and IE 8 on Vista all are still NO Shows! -  re still NO Shows -  NO Shows!

 Yes, people skills and social manners are quite nice. If you a few billion, guess you need MORE love to gently guide you. -  nice. -  nice.

 Miss you my dear -  Miss you my dear -  Miss you my dear

 gasp-- 10 followers! i feel almost famous. i used to think i would be famous when i grew up one day LOL. oh

 lonely -  lonely -  lonely

 omg i realy can`t sleep ughh -  realy can`t sleep -  omg i realy can`t sleep ughh

 Played with FontStruct http://is.gd/ejE uploaded to dafont, 16k downloads & top of its category http://is.gd/wyyp Very wtf moment for me -  top -  Very wtf moment for me

 yes but 75% are on the wrong wireless plan -  wrong -  wrong

 aaahhh, showers are great -  aaahhh, showers are great -  great

 Decode by Paramore is a great song... Love it... -  great -  great

 It`s great to be home! Temp is chill, cat is great and I feel awesome -  great -  great to be home! Temp is chill, cat is great and I feel awesome

 people r weird -  people r weird -  weird

 happy birthday and congrats wish you were here for your lady..ill take care of her;) -  happy birthday and congrats wish you were here for your lady..ill take care of her -  happy birthday and congrats

 Not looking forward to next wednesday at all -  Not looking forward -  Not looking forward to next wednesday at all

 g

 oh that makes sense! Well you are officially my first from a celebrity!! -  oh that makes sense! Well you are officially my first from a celebrity!! -  oh that makes sense!

 Gloomy day can`t stop my bliss. I hope Ronaldo will score in derby match today. -  Gloomy day can`t stop my bliss. -  bliss. I hope

 WOW i just drank a drink of water - 12 ice cubes that took ages to melt. i now have brian freeze -  i now have brian freeze -  WOW i just drank a drink of water - 12 ice cubes that took ages to melt.

 ya know why today sucks? its been raining, we have no $, & no possibility of a magic friday. so whats goin down tonight? -  ya know why today sucks? -  sucks?

 Trued a rim! I`m getting good at this! o_O -  I`m getting good -  good

 I know! I loved it. -  I loved -  loved

 told you, you would sweep haha :-p -  d sweep haha :-p -  haha

 my riding time has been dismal too during these rainy weeks. haven`t been able to make myself to go to the gym instead. -  haven`t been able to -  

 dont ya know? people love the human society -  people love the human society -  love

 Thanks to: New followers! -  Thanks -  Thanks

 well what about tomorrow? I miss you -  I miss you -  miss you

 BOOK NOW & SAVE:SUMMER 2009 * THE AMAZONES VILLAGE SUITES****-CRETE-GREECE! THE BEST PLACE TO BE! -  BEST PLACE TO BE! -  BEST

 being surrounded by student houses having barbecues and playing **** music is hardly conducive to a good job-applying frame of mind -  good -  good

 _aureole Oh my gosh, so cute!!! -  cute!! -  cute!!!

 Swimming party at my brothers tonight. I had an AWESOME time. Since when am I a sissy about cold water? Who am I? LOVED TODAY!!! -  AWESOME ti -  AWESOME

 out enjoying the weather before i have to go to work.. last shift with -  out enjoying the weather be -  enjoying

 sad that school is over gonna miss all my friends and teachers -  sad -  sad

 I`d love to, but i`m all the way in India. -  love -  love

 Watching the Pianist with my dad great movie. -  grea

 looking at all my old myspace status` oh mann. Skyrockets in flight! afternoon delight! AAAAAAAAfternoon delight! -  delight! -  afternoon delight! AAAAAAAAfternoon delight!

 I`m out of town next week We`ll have to party when I get back. Happy early Birthday! -  k. Happ -  Happy

 Just got to work. Only today and three days left. my heart is breaking. -  breaking. -  my heart is breaking.

 10) I`m allergic to hot wax -  allergic -  allergic

 Not feeling very well. -  Not feeling very well. -  Not feeling very well.

 If I knew it was gonna be this kind of party, I would`ve stuck my **** in the mashed potatoes! -  If I knew it was gonna be this kind of party, I would`ve stuck my **** in the mashed potatoes! -  stuck my ****

 hahahahahahahahahahahahaha that could be interesting -  hahahahahahahahahahahahaha that could be interesting -  hahahahahahahahahahahahaha that could be interesting

 revising as uni exams are looming -  e looming -  looming

 bah! bk on reception comps aint wo

 exhausted and sick... my face is greenish -  sick.. -  exhausted and sick...

 im gone miss ya`ll Lol.don laugh im serious.its bittersweet. lookin forward 2goin home but cant wait till nx semester! -  im gone miss ya`ll Lol.don laugh im serious.its bittersweet. -  miss ya`ll Lol.don laugh im serious.its bittersweet.

 #Happy Mother`s Day euch allen -  Happy - Happy

 read twilight and new moon keen to read eclipse and breaking dawn hmm twilight was better then new moon but there still awesome -  there still awesome -  awesome

 shawty next to me like hella good oowwwww -  good oo -  good

 Gonna nap n chill then probably go to the movie later. Ugh i have a headache this sux ****. Cloudy day too -  sux ****. -  Ugh i have a headache this sux ****.

 I`m the same way, but with Backstreet Boys. I remember gasping when they used 1 song on Chuck I was like O.O OMFG NO WAY -  G NO WAY -  gasping when they used 1 song on Chuck I was like O.O OMFG NO WAY

 HAPPY MOTHERS DAY MARK -  HAPPY MOTH

 http://twitpic.com/67jq9 - i`m really missing this place my grandparents are living there, on the calm country! but im coming to them ... -  missing -  missing

 I don`t seem to have any photos of me and my Grandma together for tomorrow. It is heart breaking -  heart breaking -  It is heart breaking

 Welcome to my network buddies -  Welcome to my network buddies -  Welcome

 Done! Finally.. Yay.. Now I can relax for.. Well one day ;D haha.. -  Yay.. -  Yay..

 Its good to have an old friend at ur new job. Another good day at work. Paycheck day will be even better -  Another good day at work. -  Its good

 Shouldn`t sit weirdly at the pc, I know Im going to hit the deck once I get up -  Shouldn`t sit weirdly at the pc, I know Im going to hit the deck once I get up -  Shouldn`t sit weirdly

 Then me disliking you is a rumor! cyndi! What made it look like i didn`t like you? -  disliking -  disliking

 I`m sure it was amazing Wish I could have been there :] You`re an incredible, phenomen

 won`t be going to the Oxford Internet Institute Summer Doctoral Programme in Brisbane: lack of funding -  lack of funding -  lack of funding

 Math was not fun at all. Oh well, i get to cook as soon as i get home. -  not fun -  not fun at all.

 I have to choose between and _FC on Sunday and wins. I`m shattered. Why can`t I be healthy enough to go to both? -  s. I`m shattered -  shattered.

 Mmmm holiday commercials really ARE nice -  nice -  nice

 One downside of the nice weather: It brings of the chavs... -  downside -  One downside

 Fallen in love with enter shikari again. Might go for a walk with the lady later. now though -  love -  Fallen in love

 about to head to Starbucks. was gonna take the bus to Paradise and Starbucks, but missed it by 2 minutes. -  missed -  missed

 haha I love Dnt Regret It Now with Tyga <3 But just randomly, DeLeon sings on Tifanny Blews! Haha like one line.. XD -  I love -  love

 I totally did go! and he was AMAZING. He`s is the reason I bought the

 i know. i need to get their cd somewhere.. hopefully they sell it here in finland. -  hopefully -  hopefully

 I will tweet you sometime tomorrow, anothe busy day! Goodnight **** Chickie! LOL! -  Goodnight -  Goodnight

 mufasa!!!! warriors or the OCEAN! hahahahahaha -  hahahahahaha -  hahahahahaha

 urgh, over slept for work, still done no revision and im SO snappy today. having a total fat day too -  snappy -  urgh,

 Had fun early night Vegas because Pool tomorrow -  Had fun -  Had fun

 Our kids are both in a Derbyshire schools string concert at the Buxton Opera House today. They`re both quite excited -  excited -  excited

 going for lunch soon with my fave cuzs -  fave -  fave

 Good night all... Just set this twitter thing up. I`m very new at this, but I expect it to come in handy. -  Good -  Good

 happy that Google Wave is trending. Can`t watch the video for it... certain flash videos are blocked by my company`s network. -  happy -  happy

 Where`s yummiest Pan Mee? For me it

 Hahahahahahahahahah! That tickled me so much! -  Hahahahahahahahahah! -  Hahahahahahahahahah! That tickled me so much!

 Just read McDonald`s is actually running more ads than before the economic slowdown. Great. I`m lovin it. -  Great. I`m lovin it. -  Great. I`m lovin it.

 had a nice time with juno http://plurk.com/p/sv71z -  nice -  nice

 Just saw up with my favorites and surprisingly it was way too sad -  it was way too sad -  sad

 Happy Mother`s Day to all your wonderful moms! -  Happy Mother`s Day to all your wonderful moms! -  Happy

 If any one is looking for he is now at (And if you don`t know him yet, follow him anyways, good guy) -  s, good gu -  good

 right on! i`m 29 myself... i turn 30 in october. i think that pretty much makes us awesome -  s awesome -  awesome

 laughed so much today over that picture of lauren, that my chest hurts. -  hurts. -  hurts.

 Lets get rich and give everyone nice sweaters and teach them how to dance... -  nice sw -  nice

 _Morris Congra

 _bop and that marisa mauro can go use a banana as a ****. srsly, some people are such arseholes -  such arseholes -  ****. srsly, some people are such arseholes

 the columbus blue jackes may be movieing to anew city to play at thats sad news -  s sad news -  sad news

 i miss everyone... i need faces , not witty situational updates, but these will do... -  i miss everyone... i need faces , not witty situational updates, but these will do... -  i miss everyone...

 _reality morning trish, have fun today -  have fun today -  have fun

 ever been in a pointless argument with drunk mum, drunk nan and drunk mums bf while having dinner??? i have.... fun times all round hahah -  ever been in a pointless argument with drunk mum, drunk nan and drunk mums bf while having dinner??? i have.... fun times all round hahah -  fun

 i love you twitskies -  i love you twitskies -  i love

 answer my really cool questions -  really cool questions -  cool questions

 Dropping my mum at the station I`ll 

 my stupid tooooth hurts -  hurts -  my stupid tooooth hurts

 Goodnight and Goodbye ? -  Goodnight and Goodbye ? -  Goodnight

 Guilt trips, feeling sick, pressure. stress and too much drama -  sick, -  Guilt trips, feeling sick,

 I`m in belleville at my parents and someone offered me a bus ride to orillia for 10$ at the mall. Thought of you -  offered -  Thought of you

 is being a horrible twitter-er. Moved in to my Houston apartment and awaiting work starting on Monday. Real world=now -  horrible -  horrible

 I miss you too!! -  I miss you too!! -  I miss you too!!

 This is scary they hooked me up to 9 wires n it printed sum graph ohhhhhh -  This is scary -  scary

 Its so dead -  dead -  dead

 ughhh rejected from the 09 mediation program. SUCKSSSS. -  ughhh rejected from the 09 mediation program. SUCKSSSS. -  SUCKSSSS.

 Having a hectic day travelling from PJ to UNITEN, back to PJ. Working now at CC office. -  hectic day -  Having a hectic day

 RAM upgrade=done! FF still slow

 I`ve got a headache !!! -  headache -  headache

 Nice to meet you toooo Good to know another one of my followers ACTUALLY speak! -  Nice to meet you toooo Good to know another one of my followers ACTUALLY speak! -  Nice

 iloveyoumoreeee -  iloveyoumoreeee -  iloveyoumoreeee

 Someone just admitted to having a crush on me.. That`s cool -  That`s cool -  That`s cool

 Thinkin` Twitter is interesting. -  interesting. -  interesting.

 Okay, cool. Hope you had better dreams than you had last week. -  . Hope -  Hope

 I LOVE TWILIGHT !!! -  I LOVE TWILIGHT !!! -  I LOVE

 _Bee same here : / Coincidently, my friend just cancelled our movie date -  t cancelled -  cancelled

 trying to NOT fall asleep while doing ancient assignment. Drinking some water with ice and lime! yum -  yum -  yum

 at freddies having a cotch and LADYHAWKE is here, just missing you my bajan beauty -  missing -  missing

 _ ay beezy! finally lol its time to hookah so get off twiiter...love ya -  .love ya - ...love

 

 saying goodbye to my parents at the airport. it was great having them around. -  it was great -  great

 Last free Friday. -  Last free Friday. -  Last free Friday.

 Is sat in her PJs drinking tea and watching the Politics Show. Lovely -  Lovely -  Lovely

 _MEXICO Hey hey. No problem. -  No problem. -  No problem.

 Watching NHL playoff Game1 tomorrow night if anyone is interested, and I know that none of you are. -  know that none of you are. -  none of you are.

 Dayum, tweets r coming fast, so likely missing a lot. Plz DM, k? Oh, is _stewart 4 real? -  missing -  missing a lot.

 Goodnight all -  Goodnight all -  Goodnight

 _craig Tried to follow one of your #FF recommendations but 'have been blocked from following by request of the user' -  have been blocked -  blocked

 feeling the need for more advil. -  feeling the need for more advil. -  feeling the need for more advil.

 is lonely in need of company -  lonely -  lonely

 Zzzz... I`m taking my mom out for breakfast tomorrow

 and its over now watch on now? hmmm -  and its over now -  and its over

 i`m still crossing my fingers for a fun group -  for a fun group -  fun

 I`d help you if I weren`t poorer than you lol! I`m mexican living in .mx, which automatically makes me 13 times poorer -  I`d help you if I weren`t poorer than you lol! I`m mexican living in .mx, which automatically makes me 13 times poorer -  poorer

 Up earlier because of a stupid orthadontist appointment -  stupid -  stupid

 I just got my JAGK shirt in the mail! Omg I love it!!! see you saturday!! -  g I love it! -  love it!!!

 Join the biggest and bestest group on facebook http://bit.ly/cDrbt -  bestest -  bestest

 Experiencing pain with paginating ASP.NET ListView controls -  pain -  Experiencing pain

 So so happy to be with - she even makes doing laundry wonderful -  So so happy to be with - she even makes doing laundry wonderful -  happy

 is NOT watching Star Trek tonight. But is heading to a lovely dinner and fun board games n

 Just got my marks... BCIT is the death of me I swear! -  e death of me I swear! -  death of me I swear!

 I WISH I LITERALLY COULD **** JUST ABOUT EVERY **** IN THE WORLD...IMA NYMPH -  **** -  I WISH I LITERALLY COULD **** JUST ABOUT EVERY **** IN THE WORLD...

 is so glad the weekend is here. Only one more week left of school with my kids. -  is so glad the weekend is here. -  glad

 wow I just had a two hour conversation with someone on omegle. it was amazing -  amazing -  amazing

 such a dissapointment hhaha -  dissapointment -  dissapointment

 Being involved in requirements and architecture is nice and all, but now I`m looking forward to writing some code! -  architecture is nice an -  nice and all, but now I`m looking forward

 Waiting for the dang pizza to cook. It`s almost 9 and we still have not eaten wifey fail. Did I mention I feel like **** -  Did I mention I feel like **** -  fail. Did I mention I feel like ****

 just bought a good chocolate and a magazine... Later I`l

 Computer remains dead -  dead -  dead

 Happy Star Wars Day. May the 4th be with you! Nice that we get a holiday to celebrate #fb -  Nice -  Happy

 aww homesick i feel you! im homesick for my 2nd home, campp -  homesick -  aww homesick

 excited about CWPM tomorrow.. only one member is going but still , its a good start -  excited about CWPM tomorrow.. only one member is going but still , its a good start -  excited

 i think you`d look cute in the beanie hat -  cute -  cute

 sheboygan / teekay / other kids / fight / angus / ryan / mall / mcdonalds / high / party / beer pong / drink / drive home. fun night -  fun night -  fun

 that was flippin` sweet, dudes. thanks for sharing -  es. thank -  thanks

 have to return my sideways. **** meds and bills. -  **** -  ****

 Just saw my boo he went back to work now time to do my hair but it`s going to rain WTF -  WTF -  WTF

 i seriously need to live somewhere fabulously queer. i miss being around **** people -  i seriously need to live so

 oh darn i`m not in london -  darn -  oh darn

 Disappointment really sucks! I`m getting used to it. -  Disappointment really sucks! -  Disappointment really sucks!

 Just got back from the grocery store. Now I`m starving and can`t find anything to eat! -  Now I`m starving and can`t find anything to eat! -  starving

 Iï¿½m JBobsessed xD I miss them soooooo much! They should have a live web cast on bookface ? EVERY thursday xD -  obsessed xD I miss them soooooo much! - miss th

 , Whered you end up going? I stayed in & watched SNL, one of the funnier shows theyve done this season. -  Whered you end up going? I stayed in & watched SNL, one of the funnier shows theyve done this season. -  funnier

 I need to cancel my appt for 1 have to be home from1-5 for washing machine repair man can I do on the am? So sorry -  So sorry -  So sorry

 'arrest her' or, anything ending in -est that can be done to her. -  'arrest her' -  'arrest her' or, anything ending in -est that can be done to her.

 

 hm seems to have been because my blog was marked as a phishing site -  my blog was marked as a phishing site -  phishing site

 _a_michael i plan on it! Goodnight -  Goodnight -  Goodnight

 all my files got deleted -  all my files got deleted -  deleted

 you and the guys should come down here, we are **** freezing out here! -  **** freezing out here! -  **** freezing out here!

 You must be scared to be trolling here. You guys are so sad. #tcot #right -  sad. -  scared to be trolling here. You guys are so sad.

 I read them a couple of weeks ago They work really well with the movie. -  y well -  well

 oooh sounds yummy. If you get a chance to take some pics please add them to the website as we don`t have many pies on there -  yummy. -  yummy.

 that would be lovely, alas the cubs would not be gracious enough to stop wrecking the house while we were soaking our bones -  that would be lovely, alas the cubs would not be gracious enough to stop wrecking the house while we were soaking 

 I had bad net issues on Weds so couldn`t broadcast Am on tonight tho... tune in for new anthems and bad mixing! -  d bad -  bad

 _C I can`t find the original on blip.fm and the Ex Models version, not so much... -  _C I can`t find the original on blip.fm and the Ex Models version, not so much... -  not so much...

 is bored. my BFF doesn`t want to hang out -  bored. -  bored.

 Yes please and check out your position on the locations map when added - http://bit.ly/ttVn2 -  please -  Yes please

 thanks for the retweet, man having a quiet Sunday morning... how`s yours? -  thanks for the retweet, -  thanks

 OH NO! MY FAN BROKE NOOOOOOOOOOOO! great now i have to swelter in the heat. i like heat-ish but it hot! my laptop warm as well. -  NOOOOOOOOOOOO! -  BROKE

 _E Aw. well im sorry you don`t like July for those reasons -  sorry -  sorry

 says ptour2 tom. hello school again. ****! http://plurk.com/p/x15fh -  ****! -  ****!

 I Hate It There Should Be A Endless Supply Of Hot Water! I Put

 I would totally take you to prom...if i hadnt already gone sorry lol -  sorry -  sorry

 Good Morning BTW - A public holiday in UK, love it and dinner tonoght with 2 special people. -  special people. -  Good Morning BTW - A public holiday in UK, love

 Work, work, work. Finally not sick, though. -  not sick, -  Finally not sick, though.

 _Henrie haha i WISH i coudl meet you.. you should stop by seattle some time home of the STARBUKS ;) I LOVE YOU DAVID!! -  I LOVE -  I LOVE

 Good morning world. -  Good morning world. -  Good

 _EaredPages Yay! I can`t wait to come in the bookstore and gets some new books -  Yay! -  Yay!

 thinks that the limit of 140 letters is really not fair. 300 + would be better -  not fair. -  not fair.

 Being slammed with spam followers today. Is it just me? Or is that all I can attract. Get a life people. -  Being slammed wi -  slammed

 _whitex gratz on your tix. sux u will be poor tho I am trying to get pj harvey tix too but debating my $$s too -  sux u w

# Testing

In [14]:
model_config = transformers.RobertaConfig.from_pretrained(config.ROBERTA_PATH)
model_config.output_hidden_states = True

In [15]:
model1 = TweetModel(conf=model_config)
model1.to(config.DEVICE)
model1.load_state_dict(torch.load(f"{config.OUTPUT_DIR}/{config.SAVE_MODEL_DIR}/model_fold_0.bin"))
model1.eval()

model2 = TweetModel(conf=model_config)
model2.to(config.DEVICE)
model2.load_state_dict(torch.load(f"{config.OUTPUT_DIR}/{config.SAVE_MODEL_DIR}/model_fold_1.bin"))
model2.eval()

model3 = TweetModel(conf=model_config)
model3.to(config.DEVICE)
model3.load_state_dict(torch.load(f"{config.OUTPUT_DIR}/{config.SAVE_MODEL_DIR}/model_fold_2.bin"))
model3.eval()

model4 = TweetModel(conf=model_config)
model4.to(config.DEVICE)
model4.load_state_dict(torch.load(f"{config.OUTPUT_DIR}/{config.SAVE_MODEL_DIR}/model_fold_3.bin"))
model4.eval()

model5 = TweetModel(conf=model_config)
model5.to(config.DEVICE)
model5.load_state_dict(torch.load(f"{config.OUTPUT_DIR}/{config.SAVE_MODEL_DIR}/model_fold_4.bin"))
model5.eval()

TweetModel(
  (roberta): RobertaModel(
    (embeddings): RobertaEmbeddings(
      (word_embeddings): Embedding(50265, 768, padding_idx=1)
      (position_embeddings): Embedding(514, 768, padding_idx=1)
      (token_type_embeddings): Embedding(1, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0): BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-05, el

In [16]:
test_df.loc[:, 'selected_text'] = test_df.text.values
final_output = []

test_dataset = TweetDataset(
        tweet = test_df.text.values,
        sentiment = test_df.sentiment.values,
        selected_text = test_df.selected_text.values
)

test_data_loader = torch.utils.data.DataLoader(
    test_dataset,
    batch_size = config.VALID_BATCH_SIZE,
    shuffle = False,
    num_workers = 8
)

with torch.no_grad():
    tk0 = tqdm_notebook(test_data_loader, total=len(test_data_loader))
    for bi, d in enumerate(tk0):
        ids = d["ids"]
        token_type_ids = d["token_type_ids"]
        mask = d["mask"]
        sentiment = d["sentiment"]
        orig_selected = d["orig_selected"]
        orig_tweet = d["orig_tweet"]
        targets_start = d["targets_start"]
        targets_end = d["targets_end"]
        offsets = d["offsets"].numpy()

        ids = ids.to(config.DEVICE, dtype=torch.long)
        token_type_ids = token_type_ids.to(config.DEVICE, dtype=torch.long)
        mask = mask.to(config.DEVICE, dtype=torch.long)
        targets_start = targets_start.to(config.DEVICE, dtype=torch.long)
        targets_end = targets_end.to(config.DEVICE, dtype=torch.long)

        outputs_start1, outputs_end1 = model1(
            ids=ids,
            mask=mask,
            token_type_ids=token_type_ids
        )
        
        outputs_start2, outputs_end2 = model2(
            ids=ids,
            mask=mask,
            token_type_ids=token_type_ids
        )
        
        outputs_start3, outputs_end3 = model3(
            ids=ids,
            mask=mask,
            token_type_ids=token_type_ids
        )
        
        outputs_start4, outputs_end4 = model4(
            ids=ids,
            mask=mask,
            token_type_ids=token_type_ids
        )
        
        outputs_start5, outputs_end5 = model5(
            ids=ids,
            mask=mask,
            token_type_ids=token_type_ids
        )
        
        outputs_start = (
            outputs_start1 
            + outputs_start2 
            + outputs_start3 
            + outputs_start4 
            + outputs_start5
        ) / 5
        
        outputs_end = (
            outputs_end1 
            + outputs_end2 
            + outputs_end3 
            + outputs_end4 
            + outputs_end5
        ) / 5
        
        outputs_start = torch.softmax(outputs_start, dim=1).cpu().detach().numpy()
        outputs_end = torch.softmax(outputs_end, dim=1).cpu().detach().numpy()

        for px, tweet in enumerate(orig_tweet):
            selected_tweet = orig_selected[px]
            tweet_sentiment = sentiment[px]
            
            _, output_sentence = calculate_jaccard_score(
                original_tweet=tweet,
                target_string=selected_tweet,
                sentiment_val=tweet_sentiment,
                idx_start=np.argmax(outputs_start[px, :]),
                idx_end=np.argmax(outputs_end[px, :]),
                offsets=offsets[px]
            )
            
            final_output.append(output_sentence)

Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`


HBox(children=(FloatProgress(value=0.0, max=111.0), HTML(value='')))




In [17]:
sample = pd.read_csv(os.path.join(config.DATA_DIR, "sample_submission.csv"))
sample.loc[:, 'selected_text'] = final_output

sample['selected_text'] = sample['selected_text'].apply(lambda x: x.replace('!!!!', '!') if len(x.split())==1 else x)
sample['selected_text'] = sample['selected_text'].apply(lambda x: x.replace('..', '.') if len(x.split())==1 else x)
sample['selected_text'] = sample['selected_text'].apply(lambda x: x.replace('...', '.') if len(x.split())==1 else x)

sample.to_csv(os.path.join(config.OUTPUT_DIR, "submission.csv"), index=False)

sample.head()

Unnamed: 0,textID,selected_text
0,f87dea47db,Last session of the day http://twitpic.com/67ezh
1,96d74cb729,exciting
2,eee518ae67,such a shame!
3,01082688c6,happy bday!
4,33987a8ee5,I like it!!
