In [0]:
# Print NVIDIA GPU/driver status for this runtime (sanity check before training).
!nvidia-smi

In [0]:
# Mount Google Drive in the Colab runtime; the data/model/output paths in
# `config` all live under a "drive/My Drive" directory.
from google.colab import drive
drive.mount('/content/drive')

In [0]:
# Environment setup: remove TensorFlow, install transformers from the GitHub
# default branch, then list the installed transformers/tokenizers versions.
# NOTE(review): the install is unpinned, so a re-run may pull a different
# transformers release than the one this notebook was written against —
# consider pinning a version/commit.
!pip uninstall -y tensorflow
!pip install git+https://github.com/huggingface/transformers
!pip list | grep -E 'transformers|tokenizers'

In [0]:
import copy
import numpy as np
import pandas as pd

from tqdm.autonotebook import tqdm

from scipy.stats import norm
from scipy import stats
from sklearn.model_selection import KFold, StratifiedKFold

import torch
import torch.nn as nn
from torch.nn import functional as F
import torch.utils.data

from transformers import *
from transformers import BertTokenizer
import tokenizers

import os
import re
import math
import random
from matplotlib import pyplot as plt
from math import floor, ceil
from sklearn.metrics import confusion_matrix
import seaborn as sns

# Plot styling: select the matplotlib style sheet, then apply seaborn's defaults.
plt.style.use("fivethirtyeight")
sns.set()
# Device used for the model and all batches; hard-coded to CUDA (no CPU fallback).
device = torch.device('cuda')


def seed_everything(seed):
    """Seed every random number generator used in this notebook.

    Seeds Python's ``random``, NumPy and PyTorch (CPU and all CUDA devices),
    exports ``PYTHONHASHSEED`` and configures cuDNN for deterministic runs.

    Args:
        seed: Integer seed shared by all generators.
    """
    random.seed(seed)
    # Only affects interpreters started after this point; the hash seed of the
    # running interpreter was fixed at start-up.
    os.environ['PYTHONHASHSEED'] = str(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    # Also cover every GPU, not just the current one.
    torch.cuda.manual_seed_all(seed)
    torch.backends.cudnn.deterministic = True
    # Auto-tuning may pick different kernels between runs, so switch it off.
    torch.backends.cudnn.benchmark = False

def jaccard(str1, str2):
    """Word-level Jaccard similarity between two strings.

    Both strings are lower-cased and split on whitespace; the score is
    ``|A ∩ B| / |A ∪ B|`` over the resulting word sets.

    Args:
        str1: First string.
        str2: Second string.

    Returns:
        A float in ``[0, 1]``. Two strings that both contain no words are
        treated as identical and score ``1.0`` (this case previously raised
        ``ZeroDivisionError``).
    """
    a = set(str1.lower().split())
    b = set(str2.lower().split())
    if not a and not b:
        # Empty union: avoid dividing by zero.
        return 1.0
    c = a & b
    return float(len(c)) / (len(a) + len(b) - len(c))

def get_best_start_end_idxs(_start_logits, _end_logits):
    """Pick the (start, end) index pair with the highest combined score.

    Every pair with ``end >= start`` is scored as
    ``_start_logits[start] + _end_logits[end]``. On ties the first pair
    encountered (smallest start, then smallest end) is kept.

    Args:
        _start_logits: 1-D sequence of start scores (tensor, NumPy array or
            plain floats).
        _end_logits: 1-D sequence of end scores, same length.

    Returns:
        ``(start_idx, end_idx)``, or ``None`` when the input is empty or no
        pair has a score greater than negative infinity.
    """
    # Start from -inf so that strongly negative raw logits still yield a pair
    # (a finite sentinel would silently return None for them).
    best_logit = float("-inf")
    best_idxs = None
    for start_idx, start_logit in enumerate(_start_logits):
        # Only ends at or after the start are valid spans.
        for end_offset, end_logit in enumerate(_end_logits[start_idx:]):
            logit_sum = float(start_logit + end_logit)
            if logit_sum > best_logit:
                best_logit = logit_sum
                best_idxs = (start_idx, start_idx + end_offset)
    return best_idxs

def calculate_jaccard_score(
    original_tweet,
    target_string,
    sentiment_val,
    idx_start,
    idx_end,
    offsets,
    verbose=False):
    """Decode a predicted token span and score it against the target text.

    Args:
        original_tweet: Tweet text that ``offsets`` index into.
        target_string: Ground-truth selected text.
        sentiment_val: Sentiment string; ``"neutral"`` makes the whole tweet
            the prediction.
        idx_start: Predicted start token index.
        idx_end: Predicted end token index (raised to ``idx_start`` if smaller).
        offsets: Per-token ``(char_start, char_end)`` pairs.
        verbose: Accepted but not used.

    Returns:
        ``(jaccard_score, predicted_text)``.
    """
    last_idx = max(idx_start, idx_end)
    n_offsets = len(offsets)

    pieces = []
    for pos in range(idx_start, last_idx + 1):
        char_begin, char_end = offsets[pos][0], offsets[pos][1]
        pieces.append(original_tweet[char_begin:char_end])
        # Re-insert a space when the next token does not start where this one ends.
        has_next = (pos + 1) < n_offsets
        if has_next and char_end < offsets[pos + 1][0]:
            pieces.append(" ")

    # Neutral tweets and tweets with fewer than two words use the full text.
    use_whole_tweet = sentiment_val == "neutral" or len(original_tweet.split()) < 2
    filtered_output = original_tweet if use_whole_tweet else "".join(pieces)

    score = jaccard(target_string.strip(), filtered_output.strip())
    return score, filtered_output


# Central run configuration. Attributes are read as `config.NAME` throughout
# the notebook; later attributes are built from earlier ones, so order matters.
class config:
    # Base seed: used for the StratifiedKFold split and, offset by the fold
    # index, for seed_everything().
    SEED = 43875210
    # Length every example is padded to in process_data().
    MAX_LEN = 128
    # DataLoader batch sizes for training and validation.
    TRAIN_BATCH_SIZE = 32
    VALID_BATCH_SIZE = 8
    # Learning rate passed to AdamW.
    LR = 3e-5
    # Number of cross-validation folds and epochs per fold.
    N_FOLDS = 5
    EPOCHS = 3
    # Not referenced by the visible code; weights are loaded from MODEL_PATH.
    PRETRAINED_WEIGHTS = "roberta-base"
    # Root directory (relative to the working directory) for data, model and outputs.
    DRIVE_ROOT = "./drive/My Drive/Study/Tweet"
    TRAIN_PATH = f"{DRIVE_ROOT}/input/all_tweet_data.csv"
    MODEL_PATH = f"{DRIVE_ROOT}/model/roberta_base"
    # Byte-level BPE tokenizer built from the vocab/merges files under MODEL_PATH
    # (constructed at class-definition time, so the files must exist).
    TOKENIZER = tokenizers.ByteLevelBPETokenizer(vocab_file=f"{MODEL_PATH}/vocab.json", merges_file=f"{MODEL_PATH}/merges.txt", lowercase=True, add_prefix_space=True)
    # Run identifier; names the output directory.
    UUID = "RoBERTa_base_21"
    # Note: ends with a trailing slash.
    OUT_DIR = f"{DRIVE_ROOT}/output/{UUID}/"
    # Not used by any active code path in the visible notebook.
    THR = 0.5

# Create the output directory for this run (IPython shell escape; `{...}` is
# interpolated from the Python namespace).
!mkdir -p "{config.OUT_DIR}"

# First token id the tokenizer produces for each sentiment word;
# process_data() places this id in the input prefix.
sentiment_id = {s: config.TOKENIZER.encode(s).ids[0] for s in ["positive", "negative", "neutral"]}
print(sentiment_id)

# Id substituted for masked tokens in TweetDataset. The commented-out snippet
# looks up "<mask>" in vocab.json — presumably how the value below was
# obtained; TODO confirm it matches the vocab in MODEL_PATH.
#import json
#with open(f"{config.MODEL_PATH}/vocab.json", "r") as f:
#    d = json.load(f)
#d["<mask>"]
MASK_ID = 50264

# Training set: rows with data_type == "train", restricted to rows flagged is_keep.
all_tweet_data = pd.read_csv(config.TRAIN_PATH)
train = all_tweet_data[all_tweet_data["data_type"] == "train"].reset_index(drop=True)
print("origin", len(train))
train = train[train["is_keep"]].reset_index(drop=True)
print("del duplicates", len(train))
# "new_sentiment" becomes the working "sentiment"; the original column is kept as "org_sentiment".
train = train[["text", "selected_text", "new_sentiment", "sentiment"]]
train.columns = ["text", "selected_text", "sentiment", "org_sentiment"]

# Remaining rows (data_type != "train"): keep is_keep rows, drop texts of 3
# characters or fewer and texts with 2 or fewer distinct characters once
# spaces and . ! ? _ are ignored.
test = all_tweet_data[all_tweet_data["data_type"] != "train"].reset_index(drop=True)
print(test.shape[0])
test = test[test["is_keep"]].reset_index(drop=True)
test = test[test["text"].map(lambda x: len(x)) > 3].reset_index(drop=True)
test = test[test["text"].map(lambda x: len(set([xx for xx in x if xx not in [" ", ".", "!", "?", "_"]])) > 2 )].reset_index(drop=True)
print(test.shape[0])
# "text" is selected twice on purpose: the second copy is renamed to
# "selected_text", so the whole tweet acts as the selection for these rows.
test = test[["text", "text", "assumed_sentiment", "sentiment"]]
test.columns = ["text", "selected_text", "sentiment", "org_sentiment"]

# Maps each original sentiment value to a one-element list holding its first token id.
sentiment_label = {v: [config.TOKENIZER.encode(v).ids[0]] for v in train["org_sentiment"].unique()}
sentiment_label

In [0]:
def process_data(tweet, selected_text, sentiment, tokenizer, max_len, org_sentiment):
    """Turn one (tweet, selected_text, sentiment) example into model inputs.

    The tweet and selected text are whitespace-normalised and prefixed with a
    single space. The first occurrence of the selected text in the tweet is
    marked at character level and mapped to token indices through the
    tokenizer offsets. If no token overlaps the selection (text not found,
    or empty selection) the target span falls back to the whole tweet.

    Args:
        tweet: Raw tweet text.
        selected_text: Raw selected span text.
        sentiment: Key into the module-level ``sentiment_id`` mapping.
        tokenizer: Object whose ``encode(text)`` result exposes ``ids`` and
            ``offsets``.
        max_len: Length to pad to. Longer inputs are NOT truncated.
        org_sentiment: Key into the module-level ``sentiment_label`` mapping.

    Returns:
        dict with keys ``ids``, ``mask``, ``token_type_ids``,
        ``targets_start``, ``targets_end``, ``orig_tweet``, ``orig_selected``,
        ``sentiment``, ``offsets`` and ``sentiment_label``. Target indices and
        offsets refer to positions in the final sequence (4 prefix tokens
        included).
    """
    tweet = " " + " ".join(str(tweet).split())
    selected_text = " " + " ".join(str(selected_text).split())

    # First occurrence of the selection (without its leading space) in the tweet.
    # An empty selection is treated as "not found" instead of raising IndexError.
    needle = selected_text[1:]
    idx0 = tweet.find(needle) if needle else -1

    char_targets = [0] * len(tweet)
    if idx0 >= 0:
        for ct in range(idx0, idx0 + len(needle)):
            char_targets[ct] = 1

    tok_tweet = tokenizer.encode(tweet)
    input_ids_orig = tok_tweet.ids
    tweet_offsets = tok_tweet.offsets

    # Tokens that overlap at least one selected character.
    target_idx = [
        j for j, (offset1, offset2) in enumerate(tweet_offsets)
        if sum(char_targets[offset1: offset2]) > 0
    ]

    if target_idx:
        targets_start = target_idx[0]
        targets_end = target_idx[-1]
    else:
        # Fall back to the whole tweet. The end must be a TOKEN index: using
        # the character length here could point past the end of the sequence.
        targets_start = 0
        targets_end = max(len(input_ids_orig) - 1, 0)

    # Prefix layout: [0, sentiment token, 2, 2] + tweet tokens + [2]
    # (0/2/1 are presumably the <s>/</s>/<pad> ids of the vocab — TODO confirm).
    input_ids = [0] + [sentiment_id[sentiment]] + [2] + [2] + input_ids_orig + [2]
    token_type_ids = [0, 0, 0, 0] + [0] * (len(input_ids_orig) + 1)
    mask = [1] * len(token_type_ids)
    tweet_offsets = [(0, 0)] * 4 + tweet_offsets + [(0, 0)]
    # Shift targets by the 4 prefix tokens.
    targets_start += 4
    targets_end += 4

    padding_length = max_len - len(input_ids)
    if padding_length > 0:
        input_ids = input_ids + ([1] * padding_length)
        mask = mask + ([0] * padding_length)
        token_type_ids = token_type_ids + ([0] * padding_length)
        tweet_offsets = tweet_offsets + ([(0, 0)] * padding_length)

    return {
        'ids': input_ids,
        'mask': mask,
        'token_type_ids': token_type_ids,
        'targets_start': targets_start,
        'targets_end': targets_end,
        'orig_tweet': tweet,
        'orig_selected': selected_text,
        'sentiment': sentiment,
        'offsets': tweet_offsets,
        'sentiment_label': sentiment_label[org_sentiment]
    }

class TweetDataset:
    """Map-style dataset that yields tensorised examples built by process_data().

    Besides the span-extraction inputs/targets, each item carries:
      * ``dummy_ids``: the input ids with up to two randomly chosen positions
        (index 4 up to the last unpadded token) replaced by ``MASK_ID``;
      * ``dummy_target``: -1 everywhere except the masked positions (original
        token id) and position 1 (the ``sentiment_label`` token id);
      * ``bce_target``: a 0/1 vector of length ``max_len`` marking the target span.
    """

    def __init__(self, tweet, sentiment, selected_text, org_sentiment):
        """Store the parallel, index-aligned example sequences."""
        self.tweet = tweet
        self.sentiment = sentiment
        self.selected_text = selected_text
        self.tokenizer = config.TOKENIZER
        self.max_len = config.MAX_LEN
        self.org_sentiment = org_sentiment

    def __len__(self):
        """Number of examples."""
        return len(self.tweet)

    def __getitem__(self, item):
        """Build the tensor dict for example ``item`` (masking is random per call)."""
        data = process_data(
            self.tweet[item],
            self.selected_text[item],
            self.sentiment[item],
            self.tokenizer,
            self.max_len,
            self.org_sentiment[item]
        )

        # Number of unpadded tokens. Counting the ones (instead of searching
        # for the first 0) also works when the sequence has no padding.
        n_real = sum(data["mask"])
        candidates = list(range(4, n_real))
        # Mask two positions when possible; never ask for more than exist.
        masked_idx = set(random.sample(candidates, min(2, len(candidates))))

        dummy_ids = [MASK_ID if i in masked_idx else v for i, v in enumerate(data["ids"])]
        dummy_target = [v if i in masked_idx else -1 for i, v in enumerate(data["ids"])]

        # Position 1 holds the sentiment token; its target is the original sentiment's token id.
        dummy_target[1] = data["sentiment_label"][0]

        bce_target = torch.zeros(self.max_len, dtype=torch.float)
        bce_target[data["targets_start"]:data["targets_end"]+1] = 1.0

        return {
            'ids': torch.tensor(data["ids"], dtype=torch.long),
            'mask': torch.tensor(data["mask"], dtype=torch.long),
            'token_type_ids': torch.tensor(data["token_type_ids"], dtype=torch.long),
            'targets_start': torch.tensor(data["targets_start"], dtype=torch.long),
            'targets_end': torch.tensor(data["targets_end"], dtype=torch.long),
            'orig_tweet': data["orig_tweet"],
            'orig_selected': data["orig_selected"],
            'sentiment': data["sentiment"],
            'offsets': torch.tensor(data["offsets"], dtype=torch.long),
            'sentiment_label': torch.tensor(data["sentiment_label"], dtype=torch.long),
            'dummy_ids': torch.tensor(dummy_ids, dtype=torch.long),
            'dummy_target': torch.tensor(dummy_target, dtype=torch.long),
            # Already a float tensor; re-wrapping it in torch.tensor() only
            # triggers a copy-construct warning.
            'bce_target': bce_target,
        }

In [0]:
def train_fn(data_loader, model, optimizer, device, scheduler=None):
    """Run one training epoch.

    Args:
        data_loader: Iterable of batches as produced by TweetDataset.
        model: Model called with ``ids``, ``mask``, ``token_type_ids``,
            ``is_train=True`` and ``dummy_ids``; must return
            ``(start_logits, end_logits, span_logits, lm_logits)``.
        optimizer: Optimizer stepped once per batch.
        device: Device the batch tensors are moved to.
        scheduler: Optional LR scheduler, stepped once per batch when given.

    Returns:
        ``(losses, lrs)``: per-batch loss values and the mean learning rate
        over the optimizer's parameter groups, recorded after each step.
    """
    model.train()
    losses, lrs = [], []
    tk0 = tqdm(data_loader, total=len(data_loader), desc="Training")
    for d in tk0:
        ids = d["ids"].to(device, dtype=torch.long)
        token_type_ids = d["token_type_ids"].to(device, dtype=torch.long)
        mask = d["mask"].to(device, dtype=torch.long)
        targets_start = d["targets_start"].to(device, dtype=torch.long)
        targets_end = d["targets_end"].to(device, dtype=torch.long)
        dummy_ids = d["dummy_ids"].to(device, dtype=torch.long)
        dummy_target = d["dummy_target"].to(device, dtype=torch.long)
        bce_target = d["bce_target"].to(device, dtype=torch.float)

        model.zero_grad()
        outputs_start, outputs_end, outputs, lm = model(
            ids=ids,
            mask=mask,
            token_type_ids=token_type_ids,
            is_train=True,
            dummy_ids=dummy_ids,
        )
        loss = loss_fn(outputs_start, outputs_end, targets_start, targets_end, outputs, bce_target, lm, dummy_target)
        loss.backward()
        optimizer.step()
        # The scheduler is optional (default None); only step it when provided.
        if scheduler is not None:
            scheduler.step()

        loss_value = loss.item()
        tk0.set_postfix(loss=loss_value)

        losses.append(loss_value)
        lrs.append(np.array([param_group["lr"] for param_group in optimizer.param_groups]).mean())

    return losses, lrs


def eval_fn(data_loader, model, device):
    """Evaluate span predictions with the word-level Jaccard score.

    For each example the start/end logits are soft-maxed, the best
    ``start <= end`` pair is chosen with get_best_start_end_idxs(), and the
    decoded text is scored by calculate_jaccard_score().

    Args:
        data_loader: Iterable of batches as produced by TweetDataset.
        model: Model returning ``(start_logits, end_logits)`` when called
            with ``ids``, ``mask`` and ``token_type_ids``.
        device: Device the batch tensors are moved to.

    Returns:
        ``(mean_loss, mean_jaccard)``. No loss is computed during evaluation,
        so ``mean_loss`` is always 0.0. ``mean_jaccard`` is averaged over
        examples, so a smaller final batch is not over-weighted.
    """
    model.eval()
    losses = []
    jaccards = []

    with torch.no_grad():
        tk0 = tqdm(data_loader, total=len(data_loader))
        for d in tk0:
            sentiment = d["sentiment"]
            orig_selected = d["orig_selected"]
            orig_tweet = d["orig_tweet"]
            offsets = d["offsets"].numpy()

            ids = d["ids"].to(device, dtype=torch.long)
            token_type_ids = d["token_type_ids"].to(device, dtype=torch.long)
            mask = d["mask"].to(device, dtype=torch.long)

            outputs_start, outputs_end = model(
                ids=ids,
                mask=mask,
                token_type_ids=token_type_ids
            )

            outputs_start = torch.softmax(outputs_start, dim=1).cpu().detach().numpy()
            outputs_end = torch.softmax(outputs_end, dim=1).cpu().detach().numpy()

            for px, tweet in enumerate(orig_tweet):
                idx_start, idx_end = get_best_start_end_idxs(outputs_start[px, :], outputs_end[px, :])
                jaccard_score, _ = calculate_jaccard_score(
                    original_tweet=tweet,
                    target_string=orig_selected[px],
                    sentiment_val=sentiment[px],
                    idx_start=idx_start,
                    idx_end=idx_end,
                    offsets=offsets[px]
                )
                # Collect per-example scores; averaging per-batch means would
                # bias the result when the last batch is smaller.
                jaccards.append(jaccard_score)

            # Placeholder: validation loss is not computed.
            losses.append(0.0)

    return np.mean(losses), np.mean(jaccards)

In [0]:
def loss_fn(start_logits, end_logits, start_positions, end_positions, outputs, bce_target, lm, ids):
    """Sum of the four training objectives.

    Args:
        start_logits, end_logits: Per-position logits for span start and end.
        start_positions, end_positions: Target indices for start and end.
        outputs: Per-position logits for span membership.
        bce_target: 0/1 float targets with the same shape as ``outputs``.
        lm: Language-model logits, one row per flattened position.
        ids: Language-model targets; flattened here, entries equal to -1 are ignored.

    Returns:
        Scalar tensor: span BCE + masked-LM CE + start CE + end CE.
    """
    span_membership = F.binary_cross_entropy_with_logits(outputs, bce_target)
    masked_lm = F.cross_entropy(lm, ids.view(-1), ignore_index=-1)
    span_start = F.cross_entropy(start_logits, start_positions)
    span_end = F.cross_entropy(end_logits, end_positions)
    return span_membership + masked_lm + span_start + span_end
    
class RoBERTaForTweet(nn.Module):
    """RoBERTa encoder with span start/end, span-membership and masked-LM heads.

    The token representation is a learned softmax-weighted sum of all hidden
    states returned by the encoder. Each head is applied under several
    independent dropout masks and the results are averaged.
    """

    def __init__(self):
        super(RoBERTaForTweet, self).__init__()

        conf = RobertaConfig.from_pretrained(config.MODEL_PATH)
        conf.output_hidden_states = True

        self.n_vocab = conf.vocab_size

        self.roberta = RobertaModel.from_pretrained(config.MODEL_PATH, config=conf)
        self.dropout = nn.Dropout(0.2)

        # One weight per hidden state (embedding output + every layer). All
        # but the last start at -3, so after softmax the final layer dominates
        # at initialisation.
        n_weights = conf.num_hidden_layers + 1
        weights_init = torch.zeros(n_weights).float()
        weights_init.data[:-1] = -3
        self.layer_weights = torch.nn.Parameter(weights_init)

        self.dropouts = nn.ModuleList([nn.Dropout(0.2) for _ in range(5)])
        # Head widths follow the encoder config instead of a hard-coded 768.
        self.qa_outputs1 = nn.Linear(conf.hidden_size, 2)
        self.qa_outputs2 = nn.Linear(conf.hidden_size, 1)

        self.lm_outputs = modeling_roberta.RobertaLMHead(conf)

    def _multi_sample(self, head, features):
        """Average ``head`` applied to ``features`` under each dropout in ``self.dropouts``."""
        return sum([head(dropout(features)) for dropout in self.dropouts]) / len(self.dropouts)

    def forward(self, ids, mask, token_type_ids, is_train=False, dummy_ids=None):
        """Compute span logits and, when training, membership and LM logits.

        Args:
            ids: Input token ids.
            mask: Attention mask.
            token_type_ids: Token type ids.
            is_train: When True, also run the encoder on ``dummy_ids`` and
                return the extra outputs.
            dummy_ids: Masked copy of ``ids``; required when ``is_train`` is True.

        Returns:
            ``(start_logits, end_logits)`` when ``is_train`` is False, otherwise
            ``(start_logits, end_logits, span_logits, lm_logits)`` where
            ``lm_logits`` is flattened to ``(-1, vocab_size)``.

        Raises:
            ValueError: If ``is_train`` is True and ``dummy_ids`` is None.
        """
        if is_train and dummy_ids is None:
            raise ValueError("dummy_ids must be provided when is_train=True")

        # Assumes the encoder returns a 3-tuple whose last element is the tuple
        # of all hidden states (output_hidden_states=True) — TODO confirm for
        # the installed transformers version.
        _, _, out = self.roberta(ids, attention_mask=mask, token_type_ids=token_type_ids)

        # Stack hidden states on a new last axis and mix them with the learned weights.
        sequence_output = torch.stack([self.dropout(layer) for layer in out], dim=3)
        sequence_output = (torch.softmax(self.layer_weights, dim=0) * sequence_output).sum(-1)

        logits = self._multi_sample(self.qa_outputs1, sequence_output)
        logits2 = self._multi_sample(self.qa_outputs2, sequence_output).squeeze(-1)

        start_logits, end_logits = logits.split(1, dim=-1)
        start_logits = start_logits.squeeze(-1)
        end_logits = end_logits.squeeze(-1)

        if is_train:
            # Second encoder pass on the masked inputs feeds the LM head with
            # the first element of the encoder output.
            lm_outputs, _, _ = self.roberta(dummy_ids, attention_mask=mask, token_type_ids=token_type_ids)
            lm = self.lm_outputs(lm_outputs)
            lm = lm.view(-1, self.n_vocab)

            return start_logits, end_logits, logits2, lm

        return start_logits, end_logits

In [0]:
# Stratified K-fold cross-validation over the working sentiment label.
# For every fold: build datasets/loaders, train a fresh model for
# config.EPOCHS epochs, and save the weights whenever validation Jaccard improves.
skf = StratifiedKFold(n_splits=config.N_FOLDS, shuffle=True, random_state=config.SEED).split(train["sentiment"].values, train["sentiment"].values)

train_log_dfs, valid_log_dfs = [], []
for fold, (train_idx, valid_idx) in enumerate(skf):
    print(f"### Fold-{fold} ###")
    # Uncomment to skip folds, e.g. when resuming a partially finished run.
    #if fold in [0, 1, 2]:
    #    continue
    seed_everything(config.SEED + fold)

    df_train = train.iloc[train_idx].reset_index(drop=True)
    df_valid = train.iloc[valid_idx].reset_index(drop=True)

    train_dataset = TweetDataset(
        tweet=df_train.text.values,
        sentiment=df_train.sentiment.values,
        selected_text=df_train.selected_text.values,
        org_sentiment=df_train.org_sentiment.values
    )

    train_data_loader = torch.utils.data.DataLoader(
        train_dataset,
        batch_size=config.TRAIN_BATCH_SIZE,
        num_workers=0,
        shuffle=True
    )

    valid_dataset = TweetDataset(
        tweet=df_valid.text.values,
        sentiment=df_valid.sentiment.values,
        selected_text=df_valid.selected_text.values,
        org_sentiment=df_valid.org_sentiment.values
    )

    valid_data_loader = torch.utils.data.DataLoader(
        valid_dataset,
        batch_size=config.VALID_BATCH_SIZE,
        num_workers=0,
        shuffle=False
    )

    model = RoBERTaForTweet()
    model.to(device)

    # Biases and LayerNorm parameters are excluded from weight decay.
    param_optimizer = list(model.named_parameters())
    no_decay = ["bias", "LayerNorm.bias", "LayerNorm.weight"]
    optimizer_parameters = [
        {'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)], 'weight_decay': 0.001},
        {'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)], 'weight_decay': 0.0},
    ]
    optimizer = AdamW(optimizer_parameters, lr=config.LR)

    # The scheduler is stepped once per batch, so the schedule length must be
    # the real number of batches (the loader keeps the last partial batch),
    # not len(df_train) / batch_size truncated.
    num_train_steps = len(train_data_loader) * config.EPOCHS
    scheduler = get_cosine_schedule_with_warmup(optimizer, num_warmup_steps=0, num_training_steps=num_train_steps)

    t_res, v_res = [], []
    best_score = 0
    for epoch in range(config.EPOCHS):
        train_loss, lrs = train_fn(train_data_loader, model, optimizer, device, scheduler=scheduler)
        valid_loss, valid_jaccard = eval_fn(valid_data_loader, model, device)
        print(f"{epoch} epoch, jaccard={valid_jaccard}")
        if best_score < valid_jaccard:
            print("  --> Best Model Update!!")
            best_score = valid_jaccard
            torch.save(model.state_dict(), f"{config.OUT_DIR}/roberta_f{fold}_best.bin")
        t_res.append(pd.DataFrame(zip(train_loss, lrs), columns=["train_loss", "lrs"]))
        v_res.append((valid_loss, valid_jaccard))

    # Per-fold logs: per-batch train loss/LR and per-epoch validation metrics.
    train_log_dfs.append(pd.concat(t_res, axis=0).reset_index(drop=True))
    valid_log_dfs.append(pd.DataFrame(v_res, columns=["valid_loss", "jaccard_score"]))

In [0]:
# List the contents of the current working directory.
!ls