In [0]:
!nvidia-smi

In [0]:
from google.colab import drive
drive.mount('/content/drive')

In [0]:
!pip install git+https://github.com/huggingface/transformers
!pip list | grep -E 'transformers|tokenizers'

In [0]:
import copy
import numpy as np
import pandas as pd

from tqdm.autonotebook import tqdm

from scipy.stats import norm
from scipy import stats
from sklearn.model_selection import KFold, StratifiedKFold

import torch
import torch.nn as nn
from torch.nn import functional as F
import torch.utils.data

from transformers import *
from transformers import BertTokenizer
import tokenizers

import os
import re
import math
import random
from matplotlib import pyplot as plt
from math import floor, ceil
from sklearn.metrics import confusion_matrix
import seaborn as sns

# Plot styling: matplotlib's "fivethirtyeight" sheet, then seaborn's default theme.
plt.style.use("fivethirtyeight")
sns.set()
# Prefer the GPU, but fall back to the CPU so the notebook can still be run
# (slowly) for debugging on a runtime without CUDA instead of failing at the
# first `.to(device)` call.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

class config:
    """Run-wide settings: hyper-parameters, Drive paths and the shared tokenizer.

    Everything is a class attribute, so the values are read as ``config.NAME``
    without instantiating the class. Later attributes are built from earlier
    ones through f-strings, so their order matters.
    """
    # Base seed; the training loop adds the fold number to it.
    SEED = 43875210
    # Length every encoded example is padded to (see process_data).
    MAX_LEN = 128
    TRAIN_BATCH_SIZE = 64
    VALID_BATCH_SIZE = 32
    # Learning rate handed to AdamW.
    LR = 3e-5
    N_FOLDS = 5
    EPOCHS = 3
    # NOTE(review): not referenced elsewhere in the visible notebook; the model
    # and tokenizer are loaded from MODEL_PATH instead.
    PRETRAINED_WEIGHTS = "roberta-base"
    # Root of the mounted Google Drive project folder.
    DRIVE_ROOT = "./drive/My Drive/Study/Tweet"
    TRAIN_PATH = f"{DRIVE_ROOT}/input/all_tweet_data.csv"
    # Directory holding the RoBERTa weights/config plus vocab.json and merges.txt.
    MODEL_PATH = f"{DRIVE_ROOT}/model/roberta_base"
    # Byte-level BPE tokenizer built once at class-definition time and shared by all datasets.
    TOKENIZER = tokenizers.ByteLevelBPETokenizer(vocab_file=f"{MODEL_PATH}/vocab.json", merges_file=f"{MODEL_PATH}/merges.txt", lowercase=True, add_prefix_space=True)
    # Experiment identifier; names the output directory below.
    UUID = "RoBERTa_base_41"
    # Checkpoints and the out-of-fold CSV are written here (note the trailing slash).
    OUT_DIR = f"{DRIVE_ROOT}/output/{UUID}/"

!mkdir -p "{config.OUT_DIR}"

In [0]:
def seed_everything(seed):
    """Seed every random number generator used by this notebook.

    Args:
        seed: Integer seed applied to ``random``, NumPy and PyTorch (CPU and
            the current CUDA device). It is also exported as ``PYTHONHASHSEED``.

    Side effects:
        Sets ``torch.backends.cudnn.deterministic`` to ``True``.
    """
    os.environ['PYTHONHASHSEED'] = str(seed)
    # The individual generators are independent, so one loop seeds them all.
    for seeder in (random.seed, np.random.seed, torch.manual_seed, torch.cuda.manual_seed):
        seeder(seed)
    torch.backends.cudnn.deterministic = True

def jaccard(str1, str2):
    """Word-level Jaccard similarity between two strings.

    Both strings are lower-cased and split on whitespace; the score is
    ``|A ∩ B| / |A ∪ B|`` over the resulting sets of words.

    Args:
        str1: First string.
        str2: Second string.

    Returns:
        A float in ``[0, 1]``. When neither string contains a word the two
        (empty) sets are identical, so ``1.0`` is returned instead of
        dividing by zero.
    """
    a = set(str1.lower().split())
    b = set(str2.lower().split())
    if not a and not b:
        # Union is empty: the formula below would raise ZeroDivisionError.
        return 1.0
    c = a.intersection(b)
    return float(len(c)) / (len(a) + len(b) - len(c))

def get_best_start_end_idxs(_start_logits, _end_logits):
    """Pick the (start, end) pair with ``end >= start`` that maximises the summed score.

    Args:
        _start_logits: 1-D array (NumPy or torch) of per-position start scores.
        _end_logits: 1-D array (NumPy or torch) of per-position end scores.

    Returns:
        ``(start_idx, end_idx)`` as Python ints, or ``None`` when there is no
        candidate pair (empty input, or every candidate sum is NaN). Ties are
        resolved in favour of the smallest start index, then the smallest end
        index, which is the order a nested start/end loop would visit them in.
    """
    def as_vector(scores):
        # Accept torch tensors as well as anything NumPy can convert.
        if hasattr(scores, "detach"):
            scores = scores.detach().cpu().numpy()
        return np.asarray(scores).reshape(-1)

    start_scores = as_vector(_start_logits)
    end_scores = as_vector(_end_logits)
    if start_scores.size == 0 or end_scores.size == 0:
        return None

    # pair_scores[s, e] = start score at s + end score at e.
    pair_scores = start_scores[:, None] + end_scores[None, :]
    # Only spans that do not end before they start are candidates; NaN never wins.
    candidates = np.triu(np.ones(pair_scores.shape, dtype=bool)) & ~np.isnan(pair_scores)
    flat_candidates = np.flatnonzero(candidates)
    if flat_candidates.size == 0:
        return None

    # No fixed lower bound on the score: arbitrarily negative logits still
    # yield the best available pair. argmax returns the first maximum in
    # row-major order, which gives the tie-breaking described above.
    best_flat = flat_candidates[np.argmax(pair_scores.ravel()[flat_candidates])]
    start_idx, end_idx = np.unravel_index(best_flat, pair_scores.shape)
    return int(start_idx), int(end_idx)

def calculate_jaccard_score(
    original_tweet, 
    target_string, 
    sentiment_val, 
    idx_start, 
    idx_end, 
    offsets,
    verbose=False):
    """Rebuild the predicted span from token offsets and score it against the target.

    Args:
        original_tweet: Tweet text that ``offsets`` index into.
        target_string: Reference selected text.
        sentiment_val: Not used by this function.
        idx_start: Index of the first predicted token.
        idx_end: Index of the last predicted token; treated as ``idx_start``
            when it is smaller.
        offsets: Per-token ``(char_start, char_end)`` pairs.
        verbose: Not used by this function.

    Returns:
        ``(jaccard, predicted_text)``.
    """
    last_token = max(idx_start, idx_end)

    pieces = []
    for pos in range(idx_start, last_token + 1):
        char_begin, char_stop = offsets[pos][0], offsets[pos][1]
        pieces.append(original_tweet[char_begin:char_stop])
        # A gap between this token and the next one is rendered as one space.
        has_next = (pos + 1) < len(offsets)
        if has_next and char_stop < offsets[pos + 1][0]:
            pieces.append(" ")

    # Tweets with fewer than two words are returned whole.
    if len(original_tweet.split()) >= 2:
        predicted = "".join(pieces)
    else:
        predicted = original_tweet

    for doubled, single in (("¿¿", "¿"), ("ïï", "ï")):
        predicted = predicted.replace(doubled, single)

    score = jaccard(target_string.strip(), predicted.strip())
    return score, predicted


# First token id the tokenizer produces for each sentiment word; process_data()
# inserts this id into the model input to condition it on the sentiment.
sentiment_id = {s: config.TOKENIZER.encode(s).ids[0] for s in ["positive", "negative", "neutral"]}
print(sentiment_id)


# Load the combined tweet table and keep only the rows marked as training data.
all_tweet_data = pd.read_csv(config.TRAIN_PATH)
train = all_tweet_data[all_tweet_data["data_type"] == "train"].reset_index(drop=True)
print("origin", len(train))
# Keep rows where `is_keep` is truthy (presumably a de-duplication flag, per the printed label).
train = train[train["is_keep"]].reset_index(drop=True)
print("del duplicates", len(train))
# From here on `sentiment` holds the `new_sentiment` column and the original
# `sentiment` column is exposed as `org_sentiment`.
train = train[["text", "selected_text", "new_sentiment", "sentiment"]]
train.columns = ["text", "selected_text", "sentiment", "org_sentiment"]

#sentiment_label = {v: i for i, v in enumerate(train["org_sentiment"].unique())}
#sentiment_label = {v: config.TOKENIZER.encode(v).ids[0] for v in train["org_sentiment"].unique()}
#sentiment_label = {v: i for i, v in enumerate(train["sentiment"].unique())}
#sentiment_label

In [0]:
"""
def filter(text):
    # "RT @user:"を削除
    if "RT " in text:
        text = text.split(":", 1)[1]
    # "@user"を削除
    if "@" in text and " " in text:
        text = text.split(" ", text.count("@"))[-1]
    # "#tag"を削除
    if "#" in text:
        text = text.split("#", 1)[0]
    # "URL"を削除
    if "http" in text:
        text = text.split("http", 1)[0]
    return text
    
big_tweet_data = pd.read_csv("./drive/My Drive/Study/Tweet/input/training.1600000.processed.noemoticon.csv",
                             encoding="ISO-8859-1", names=["target", "ids", "date", "flag", "user", "text"])
big_tweet_preds = pd.read_csv("./drive/My Drive/Study/Tweet/input/sentiment140_0_10000.csv")

decode_map = {0: "negative", 2: "neutral", 4: "positive"}
big_tweet_data["sentiment"] = big_tweet_data["target"].map(decode_map)
big_tweet_data = big_tweet_data[["ids", "text", "sentiment"]]
big_tweet_data.columns = ["textID", "text", "sentiment"]

nega_df = big_tweet_data[big_tweet_data["sentiment"] == "negative"].head(10000).tail(10000)
posi_df = big_tweet_data[big_tweet_data["sentiment"] == "positive"].head(10000).tail(10000)
nega_df["text"] = nega_df["text"].map(filter)
posi_df["text"] = posi_df["text"].map(filter)
df = pd.concat([posi_df, nega_df], axis=0).reset_index(drop=True)

#big_data_df = pd.merge(big_tweet_preds, df, on='textID')
big_data_df = pd.concat([big_tweet_preds, df], axis=1)
big_data_df["org_sentiment"] = "empty"

big_data_df = big_data_df[big_data_df["text"].map(lambda x: len(x) > 4)].reset_index(drop=True)

big_data_df = big_data_df.sample(10000, random_state=config.SEED)

print(big_data_df.shape)
big_data_df.head()"""

In [0]:
#blob_train = pd.read_csv("./drive/My Drive/Study/Tweet/input/blob_train.csv")
#print(blob_train.shape, train.shape)
#display(blob_train.head())
#train = blob_train

#pseudo = pd.read_csv("./drive/My Drive/Study/Tweet/input/lb0.715_submission.csv")
#test = pd.read_csv("./drive/My Drive/Study/Tweet/input/test.csv")
#pseudo = pd.merge(test, pseudo, on='textID')
#senti_dict = {k: v for k, v in all_tweet_data[["aux_id", "sentiment"]].values}
#pseudo["org_sentiment"] = pseudo["textID"].map(senti_dict)

# Pseudo-label table: span predictions (presumably from an earlier submission,
# judging by the file name) for the rows that are not training data.
pseudo = pd.read_csv("./drive/My Drive/Study/Tweet/input/lb0.715_submission_private.csv")
private = all_tweet_data[all_tweet_data["data_type"] != "train"].reset_index(drop=True)
# Positional rename: assumes the CSV has exactly two columns (id, predicted span) — TODO confirm.
pseudo.columns = ["aux_id", "selected_text"]
# Attach text and assumed sentiment to each prediction; the default inner join
# drops ids that are missing on either side.
pseudo = pd.merge(private[["aux_id", "assumed_sentiment", "text"]], pseudo, on='aux_id')
# Look up the original sentiment column by id across the whole table.
senti_dict = {k: v for k, v in all_tweet_data[["aux_id", "sentiment"]].values}
pseudo["org_sentiment"] = pseudo["aux_id"].map(senti_dict)
# Positional rename to the training schema: aux_id -> textID, assumed_sentiment -> sentiment.
pseudo.columns = ["textID", "sentiment", "text", "selected_text", "org_sentiment"]

print(pseudo.shape)
pseudo.head()

In [0]:
def process_data(tweet, selected_text, sentiment, tokenizer, max_len, org_sentiment):
    """Encode one tweet and locate its selected span at token level.

    The model input is laid out as
    ``[0] [sentiment token] [2] [2] <tweet tokens> [2]`` and right-padded with
    id ``1`` up to ``max_len``.

    Args:
        tweet: Raw tweet text (converted with ``str`` and whitespace-normalised).
        selected_text: Raw target span (same normalisation).
        sentiment: Key into the module-level ``sentiment_id`` mapping.
        tokenizer: Object whose ``encode(text)`` result exposes ``ids`` and
            ``offsets``.
        max_len: Length the sequences are padded to. Longer encodings are
            returned as they are (neither padded nor truncated).
        org_sentiment: Unused; kept so existing callers keep working.

    Returns:
        dict with ``ids``, ``mask``, ``token_type_ids``, ``targets_start``,
        ``targets_end``, ``orig_tweet``, ``orig_selected``, ``sentiment`` and
        ``offsets``. Target indices refer to positions in ``ids``.
    """
    tweet = " " + " ".join(str(tweet).split())
    selected_text = " " + " ".join(str(selected_text).split())

    # The selection without its artificial leading space. An empty selection
    # has nothing to search for and is handled by the fallback further down.
    needle = selected_text[1:]
    span_start = tweet.find(needle) if needle else -1

    # Character mask: 1 for every character covered by the first occurrence.
    char_targets = [0] * len(tweet)
    if span_start >= 0:
        char_targets[span_start: span_start + len(needle)] = [1] * len(needle)

    tok_tweet = tokenizer.encode(tweet)
    input_ids_orig = tok_tweet.ids
    tweet_offsets = tok_tweet.offsets

    # Tokens overlapping at least one selected character.
    target_idx = [
        j for j, (offset1, offset2) in enumerate(tweet_offsets)
        if sum(char_targets[offset1: offset2]) > 0
    ]

    if target_idx:
        targets_start = target_idx[0]
        targets_end = target_idx[-1]
    else:
        # Selection not found: fall back to the whole tweet. The end index is
        # a *token* position, so it is derived from the number of tokens, not
        # from the character length of the tweet.
        targets_start = 0
        targets_end = max(len(input_ids_orig) - 1, 0)

    # Four prefix positions (<s>, sentiment, </s>, </s>) precede the tweet tokens.
    input_ids = [0] + [sentiment_id[sentiment]] + [2] + [2] + input_ids_orig + [2]
    token_type_ids = [0, 0, 0, 0] + [0] * (len(input_ids_orig) + 1)
    mask = [1] * len(token_type_ids)
    tweet_offsets = [(0, 0)] * 4 + tweet_offsets + [(0, 0)]
    targets_start += 4
    targets_end += 4

    padding_length = max_len - len(input_ids)
    if padding_length > 0:
        input_ids = input_ids + ([1] * padding_length)
        mask = mask + ([0] * padding_length)
        token_type_ids = token_type_ids + ([0] * padding_length)
        tweet_offsets = tweet_offsets + ([(0, 0)] * padding_length)

    return {
        'ids': input_ids,
        'mask': mask,
        'token_type_ids': token_type_ids,
        'targets_start': targets_start,
        'targets_end': targets_end,
        'orig_tweet': tweet,
        'orig_selected': selected_text,
        'sentiment': sentiment,
        'offsets': tweet_offsets,
    }

class TweetDataset:
    """Map-style dataset that encodes one tweet per index via ``process_data``.

    Args:
        tweet: Indexable collection of tweet texts.
        sentiment: Indexable collection of sentiment labels.
        selected_text: Indexable collection of target spans.
        org_sentiment: Indexable collection of original sentiment labels.
    """

    def __init__(self, tweet, sentiment, selected_text, org_sentiment):
        # Per-example columns.
        self.tweet = tweet
        self.sentiment = sentiment
        self.selected_text = selected_text
        # Shared encoding settings from the notebook configuration.
        self.tokenizer = config.TOKENIZER
        self.max_len = config.MAX_LEN
        self.org_sentiment = org_sentiment

    def __len__(self):
        """Number of examples (length of the tweet collection)."""
        return len(self.tweet)

    def __getitem__(self, item):
        """Encode example ``item``: numeric fields become long tensors, text fields stay strings."""
        encoded = process_data(
            tweet=self.tweet[item],
            selected_text=self.selected_text[item],
            sentiment=self.sentiment[item],
            tokenizer=self.tokenizer,
            max_len=self.max_len,
            org_sentiment=self.org_sentiment[item],
        )

        def as_long(key):
            return torch.tensor(encoded[key], dtype=torch.long)

        return {
            'ids': as_long("ids"),
            'mask': as_long("mask"),
            'token_type_ids': as_long("token_type_ids"),
            'targets_start': as_long("targets_start"),
            'targets_end': as_long("targets_end"),
            'orig_tweet': encoded["orig_tweet"],
            'orig_selected': encoded["orig_selected"],
            'sentiment': encoded["sentiment"],
            'offsets': as_long("offsets"),
        }

In [0]:
def train_fn(data_loader, model, optimizer, device, scheduler=None):
    """Run one training epoch.

    Args:
        data_loader: Iterable of batches as produced by ``TweetDataset``
            (dicts with ``ids``, ``mask``, ``token_type_ids``,
            ``targets_start`` and ``targets_end`` tensors).
        model: Module returning ``(start_logits, end_logits)``.
        optimizer: Optimizer stepped once per batch.
        device: Device the batch tensors are moved to.
        scheduler: Optional learning-rate scheduler, stepped once per batch
            when given.

    Returns:
        ``(losses, lrs)``: per-batch loss values and the mean learning rate
        across parameter groups, recorded after each step.
    """
    model.train()
    losses, lrs = [], []
    tk0 = tqdm(data_loader, total=len(data_loader), desc="Training")
    for d in tk0:
        ids = d["ids"].to(device, dtype=torch.long)
        token_type_ids = d["token_type_ids"].to(device, dtype=torch.long)
        mask = d["mask"].to(device, dtype=torch.long)
        targets_start = d["targets_start"].to(device, dtype=torch.long)
        targets_end = d["targets_end"].to(device, dtype=torch.long)

        model.zero_grad()
        outputs_start, outputs_end = model(
            ids=ids,
            mask=mask,
            token_type_ids=token_type_ids,
        )
        loss = loss_fn(outputs_start, outputs_end, targets_start, targets_end)
        loss.backward()
        optimizer.step()
        # The scheduler is optional (default None), so only step it when present.
        if scheduler is not None:
            scheduler.step()

        loss_value = loss.item()
        tk0.set_postfix(loss=loss_value)

        losses.append(loss_value)
        lrs.append(np.mean([param_group["lr"] for param_group in optimizer.param_groups]))

    return losses, lrs


def eval_fn(data_loader, model, device):
    """Predict spans for every validation example and score them with Jaccard.

    Args:
        data_loader: Iterable of batches as produced by ``TweetDataset``.
        model: Module returning ``(start_logits, end_logits)``.
        device: Device the input tensors are moved to.

    Returns:
        ``(valid_loss, mean_jaccard, final_output)``. ``valid_loss`` is always
        ``0.0`` — no validation loss is computed, the slot only keeps the
        return shape stable. ``mean_jaccard`` is the mean over *examples*
        (``0.0`` when the loader is empty). ``final_output`` lists the
        predicted text per example in loader order.
    """
    model.eval()
    jaccards = []
    final_output = []

    with torch.no_grad():
        tk0 = tqdm(data_loader, total=len(data_loader))
        for d in tk0:
            sentiment = d["sentiment"]
            orig_selected = d["orig_selected"]
            orig_tweet = d["orig_tweet"]
            offsets = d["offsets"].numpy()

            ids = d["ids"].to(device, dtype=torch.long)
            token_type_ids = d["token_type_ids"].to(device, dtype=torch.long)
            mask = d["mask"].to(device, dtype=torch.long)

            outputs_start, outputs_end = model(
                ids=ids,
                mask=mask,
                token_type_ids=token_type_ids
            )

            outputs_start = torch.softmax(outputs_start, dim=1).cpu().detach().numpy()
            outputs_end = torch.softmax(outputs_end, dim=1).cpu().detach().numpy()

            for px, tweet in enumerate(orig_tweet):
                idx_start, idx_end = get_best_start_end_idxs(outputs_start[px, :], outputs_end[px, :])
                jaccard_score, output_sentence = calculate_jaccard_score(
                    original_tweet=tweet,
                    target_string=orig_selected[px],
                    sentiment_val=sentiment[px],
                    idx_start=idx_start,
                    idx_end=idx_end,
                    offsets=offsets[px]
                )
                # Scores are collected per example. Averaging per-batch means
                # would give the (usually smaller) last batch too much weight.
                jaccards.append(jaccard_score)
                final_output.append(output_sentence)

    valid_loss = 0.0
    mean_jaccard = np.mean(jaccards) if jaccards else 0.0
    return valid_loss, mean_jaccard, final_output

In [0]:
class LabelSmoothingLoss(nn.Module):
    """Cross-entropy against a label-smoothed target distribution.

    The true class receives probability ``1 - smoothing``; every other
    position receives ``smoothing / (classes - 1)``.

    Args:
        classes: Number of classes used to spread the smoothing mass.
        smoothing: Probability mass taken away from the true class.
        dim: Dimension the log-softmax and the final sum run over.
    """

    def __init__(self, classes, smoothing=0.0, dim=-1):
        super().__init__()
        self.smoothing = smoothing
        self.confidence = 1.0 - smoothing
        self.cls = classes
        self.dim = dim

    def forward(self, pred, target):
        """Return the batch-mean smoothed loss for logits ``pred`` and class indices ``target``."""
        log_probs = pred.log_softmax(dim=self.dim)
        with torch.no_grad():
            # Start from the uniform "off-target" mass, then overwrite the true class.
            off_value = self.smoothing / (self.cls - 1)
            soft_targets = torch.full_like(log_probs, off_value)
            soft_targets.scatter_(1, target.data.unsqueeze(1), self.confidence)
        per_example = torch.sum(-soft_targets * log_probs, dim=self.dim)
        return per_example.mean()

def loss_fn(start_logits, end_logits, start_positions, end_positions):
    """Sum of the label-smoothed losses for the start and end positions.

    Args:
        start_logits: Logits over positions for the span start.
        end_logits: Logits over positions for the span end.
        start_positions: Target start indices.
        end_positions: Target end indices.

    Returns:
        Scalar tensor ``start_loss + end_loss``.
    """
    # One criterion (smoothing 0.2 over config.MAX_LEN classes) serves both heads.
    criterion = LabelSmoothingLoss(config.MAX_LEN, 0.2)
    return criterion(start_logits, start_positions) + criterion(end_logits, end_positions)


class RoBERTaForTweet(nn.Module):
    """RoBERTa encoder with convolutional span heads and multi-sample dropout.

    The last hidden layer is fed to three ``Conv1d`` heads (kernel sizes 1, 2
    and 3). Each head is evaluated under every dropout module and the results
    are averaged; the heads' outputs are then summed into start/end logits.
    """

    def __init__(self):
        super().__init__()

        conf = RobertaConfig.from_pretrained(config.MODEL_PATH)
        # The forward pass reads the per-layer hidden states.
        conf.output_hidden_states = True

        self.ksizes = [1, 2, 3]

        self.roberta = RobertaModel.from_pretrained(config.MODEL_PATH, config=conf)
        self.dropouts = nn.ModuleList([nn.Dropout(0.2) for _ in range(5)])
        # Input channels follow the encoder width (768 for roberta-base).
        self.qa_outputs = nn.ModuleList([nn.Conv1d(conf.hidden_size, 2, k) for k in self.ksizes])

    def forward(self, ids, mask, token_type_ids):
        """Return ``(start_logits, end_logits)``, each with one score per input position."""
        outputs = self.roberta(ids, attention_mask=mask, token_type_ids=token_type_ids)
        # Works for both return styles: plain tuples (hidden states are the
        # third element) and ModelOutput objects (attribute access). Unpacking
        # a ModelOutput would iterate over its keys, not its tensors.
        hidden_states = outputs.hidden_states if hasattr(outputs, "hidden_states") else outputs[2]

        # Conv1d expects (batch, channels, length); computed once for all heads.
        features = hidden_states[-1].transpose(1, 2)
        n_samples = len(self.dropouts)

        head_logits = []
        for k, qa_outputs in zip(self.ksizes, self.qa_outputs):
            # Right-pad by k-1 so a kernel of size k keeps the sequence length.
            sequence_output = F.pad(features, (0, k - 1)) if k > 1 else features
            averaged = sum(
                qa_outputs(dropout(sequence_output)).transpose(1, 2) for dropout in self.dropouts
            ) / n_samples
            head_logits.append(averaged)
        logits = sum(head_logits)

        start_logits, end_logits = logits.split(1, dim=-1)
        start_logits = start_logits.squeeze(-1)
        end_logits = end_logits.squeeze(-1)

        return start_logits, end_logits

In [0]:
import nltk
nltk.download('wordnet')
from nltk.corpus import wordnet

def get_synonyms(word):
    """Return the lower-cased WordNet lemma names for ``word``, excluding ``word`` itself.

    Args:
        word: Word to look up in WordNet.

    Returns:
        List of distinct lemma names taken from every synset of ``word``.
    """
    lemma_names = {
        name
        for synset in wordnet.synsets(word)
        for name in (lemma.name().lower() for lemma in synset.lemmas())
        if name != word
    }
    return list(lemma_names)

def get_augument_df(df, k, n):
    """Build augmented rows by swapping ``n`` words per sampled tweet for synonyms.

    Args:
        df: Frame with ``text``, ``selected_text``, ``sentiment`` and
            ``org_sentiment`` columns, addressed by label ``0..len(df)-1``.
        k: Number of rows drawn (with replacement) as augmentation candidates.
        n: Number of distinct words that must be replaced for a candidate to
            be kept.

    Returns:
        DataFrame with the same four columns. Candidates with fewer than ``n``
        words, or for which fewer than ``n`` replacements were found, are
        skipped.
    """
    def swap_words(sentence, mapping):
        # Replace whitespace-separated tokens whose lower-cased form is mapped.
        return " ".join(mapping.get(token.lower(), token) for token in sentence.split())

    rows = []
    for idx in random.choices(list(range(len(df))), k=k):
        text = df.loc[idx, "text"]
        selected_text = df.loc[idx, "selected_text"]
        sentiment = df.loc[idx, "sentiment"]
        org_sentiment = df.loc[idx, "org_sentiment"]

        try:
            picked = random.sample(text.split(), n)
        except ValueError:
            # Tweet has fewer than n words.
            continue

        replace_dict = {}
        for picked_word in picked:
            key = picked_word.lower()
            synonyms = get_synonyms(key)
            if synonyms:
                replace_dict[key] = random.sample(synonyms, 1)[0]
        if len(replace_dict) < n:
            continue

        rows.append((
            swap_words(text, replace_dict),
            swap_words(selected_text, replace_dict),
            sentiment,
            org_sentiment,
        ))

    return pd.DataFrame(rows, columns=["text", "selected_text", "sentiment", "org_sentiment"])

In [0]:
# One-character words that are legitimate at the start of a selection.
accept_one_words_h = ["i", "a", "u"]
def fix_selected_text_header(selected_text):
    """Drop a stray one-character first token from ``selected_text``.

    The first token is removed when it is a single character that is neither
    a digit nor one of ``accept_one_words_h`` (compared case-insensitively).

    Args:
        selected_text: Selection to clean.

    Returns:
        The selection with its tokens re-joined by single spaces. Empty or
        whitespace-only input yields ``""``.
    """
    tokens = selected_text.split()
    if not tokens:
        # Nothing to inspect; avoids indexing an empty token list.
        return ""
    head = tokens[0].lower()
    drop_head = len(head) == 1 and head not in accept_one_words_h and not head.isdigit()
    return " ".join(tokens[1:] if drop_head else tokens)

In [0]:
def fix_selected_text_tail(text, selected_text):
    """Complete a selection whose last word was cut off in the middle.

    When ``selected_text`` ends inside a word of ``text``, the remaining
    alphabetic characters of that word are appended. If the completed last
    token contains dots (and is not a URL), everything after the first
    dot-separated piece is removed again.

    Args:
        text: Full tweet.
        selected_text: Selection, expected to be a substring of ``text``.

    Returns:
        The repaired selection, or ``selected_text`` unchanged when no repair
        applies (empty selection, selection not found in ``text``, selection
        already ending on a word boundary, non-alphabetic remainder).
    """
    selected_tokens = selected_text.split()
    if not selected_tokens:
        # Empty/whitespace-only selection: nothing to complete.
        return selected_text

    text_tokens = text.split()
    if text_tokens == selected_tokens:
        return selected_text

    # The last token is already a complete word of the tweet.
    if selected_tokens[-1] in text_tokens:
        return selected_text

    # Without an occurrence in the tweet there is no "text after the
    # selection"; the split below would otherwise return the whole tweet and
    # its first word would be glued onto the selection.
    if selected_text not in text:
        return selected_text

    back_text = text.split(selected_text)[-1]
    # Followed by nothing or by whitespace: the selection ends on a boundary.
    if back_text == "" or back_text[0].isspace():
        return selected_text

    tail_token = back_text.split()[0]
    if not tail_token.isalpha():
        return selected_text

    completed = selected_text + tail_token

    last_token = completed.split()[-1]
    if last_token.isalpha():
        return completed
    if last_token[:4] == "http" or "www." in last_token:
        return completed

    pieces = [w for w in last_token.split(".") if w != ""]
    if len(pieces) == 1:
        return completed

    # NOTE(review): replace() removes every occurrence of the piece in the
    # whole selection, not only inside the last token — confirm this is intended.
    for piece in pieces[1:]:
        completed = completed.replace(piece, "")

    return completed

In [0]:
def fix_header_token_2(text, selected_text):
    """Drop the first token of a selection that starts in the middle of a word.

    The first token counts as a word fragment when it is not a whole word of
    ``text`` and the character right before its first occurrence in ``text``
    is not a space. Single-token selections are never shortened.

    Args:
        text: Full tweet.
        selected_text: Selection to repair.

    Returns:
        The selection without its leading fragment (tokens re-joined by single
        spaces), or ``selected_text`` unchanged.
    """
    tokens = selected_text.split()
    if not tokens:
        # Empty selection: there is no header token to examine.
        return selected_text

    header_token = tokens[0]
    # Text in front of the first occurrence; the added space makes a match at
    # the very start of the tweet look like a word boundary.
    split_header = (" " + text).split(header_token)[0]
    starts_mid_word = (
        split_header != " "
        and header_token not in text.split()
        and split_header[-1] != " "
    )
    # The checks operate on `selected_text` itself (the previous code referred
    # to an undefined name here).
    if starts_mid_word and len(tokens) != 1:
        return " ".join(tokens[1:])
    return selected_text

In [0]:
# Stratified K-fold cross-validation over the sentiment label. One model is
# trained per fold; the best epoch (by validation Jaccard) is checkpointed and
# its validation predictions are kept for the out-of-fold table.
skf = StratifiedKFold(n_splits=config.N_FOLDS, shuffle=True, random_state=config.SEED).split(train["sentiment"].values, train["sentiment"].values)

train_log_dfs, valid_log_dfs = [], []
final_outputs = []
for fold, (train_idx, valid_idx) in enumerate(skf):
    print(f"### Fold-{fold} ###")
    #if fold in [0, 1, 2, 3]:
    #    continue
    seed_everything(config.SEED + fold)

    df_train = train.iloc[train_idx].reset_index(drop=True)
    df_valid = train.iloc[valid_idx].reset_index(drop=True)

    # Label clean-up is applied to the training part only; validation keeps
    # the original selections so the score stays comparable.
    df_train["selected_text"] = df_train.apply(lambda x: fix_selected_text_tail(x["text"], x["selected_text"]), axis=1)
    df_train["selected_text"] = df_train["selected_text"].map(fix_selected_text_header)
    df_train = df_train[df_train["selected_text"] != ""].reset_index(drop=True)
    # Strip a single trailing comma (selections are non-empty at this point).
    df_train["selected_text"] = df_train["selected_text"].map(lambda x: x[:-1] if x[-1] == "," else x)
    df_train["selected_text"] = df_train.apply(lambda x: fix_header_token_2(x["text"], x["selected_text"]), axis=1)

    # Optional extra training data; only the pseudo-labelled rows are enabled.
    #df_aug = get_augument_df(df_train, 20000, 2)
    #df_train = pd.concat([df_train, df_aug]).reset_index(drop=True)
    df_train = pd.concat([df_train, pseudo[["text", "selected_text", "sentiment", "org_sentiment"]]], axis=0).reset_index(drop=True)
    #df_train = pd.concat([df_train, big_data_df[["text", "selected_text", "sentiment", "org_sentiment"]]], axis=0).reset_index(drop=True)

    train_dataset = TweetDataset(
        tweet=df_train.text.values,
        sentiment=df_train.sentiment.values,
        selected_text=df_train.selected_text.values,
        org_sentiment=df_train.org_sentiment.values
    )

    train_data_loader = torch.utils.data.DataLoader(
        train_dataset,
        batch_size=config.TRAIN_BATCH_SIZE,
        num_workers=0,
        shuffle=True
    )

    valid_dataset = TweetDataset(
        tweet=df_valid.text.values,
        sentiment=df_valid.sentiment.values,
        selected_text=df_valid.selected_text.values,
        org_sentiment=df_valid.org_sentiment.values
    )

    valid_data_loader = torch.utils.data.DataLoader(
        valid_dataset,
        batch_size=config.VALID_BATCH_SIZE,
        num_workers=0,
        shuffle=False
    )

    model = RoBERTaForTweet()
    model.to(device)

    # No weight decay for biases and LayerNorm parameters.
    param_optimizer = list(model.named_parameters())
    no_decay = ["bias", "LayerNorm.bias", "LayerNorm.weight"]
    optimizer_parameters = [
        {'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)], 'weight_decay': 0.001},
        {'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)], 'weight_decay': 0.0},
    ]
    optimizer = AdamW(optimizer_parameters, lr=config.LR)

    # The scheduler is stepped once per batch, so its horizon must equal the
    # number of batches actually run. len(loader) counts the final partial
    # batch; len(df) / batch_size truncated would fall short and let the
    # cosine schedule run past its end.
    num_train_steps = len(train_data_loader) * config.EPOCHS
    scheduler = get_cosine_schedule_with_warmup(optimizer, num_warmup_steps=0, num_training_steps=num_train_steps)

    t_res, v_res = [], []
    best_score = 0
    best_final_output = []
    for epoch in range(config.EPOCHS):
        train_loss, lrs = train_fn(train_data_loader, model, optimizer, device, scheduler=scheduler)
        valid_loss, valid_jaccard, final_output = eval_fn(valid_data_loader, model, device)
        print(f"{epoch} epoch, jaccard={valid_jaccard}")
        if best_score < valid_jaccard:
            print(f"  --> Best Model Update!!")
            best_score = valid_jaccard
            best_final_output = final_output
            torch.save(model.state_dict(), f"{config.OUT_DIR}/roberta_f{fold}_best.bin")
        t_res.append(pd.DataFrame(zip(train_loss, lrs), columns=["train_loss", "lrs"]))
        v_res.append((valid_loss, valid_jaccard))

    # Keep the predictions of the best epoch plus the per-step / per-epoch logs.
    final_outputs.append(best_final_output)
    train_log_dfs.append(pd.concat(t_res, axis=0).reset_index(drop=True))
    valid_log_dfs.append(pd.DataFrame(v_res, columns=["valid_loss", "jaccard_score"]))

In [0]:
"""skf = StratifiedKFold(n_splits=config.N_FOLDS, shuffle=True, random_state=config.SEED).split(train["sentiment"].values, train["sentiment"].values)

final_outputs = []
for fold, (train_idx, valid_idx) in enumerate(skf):
    print(f"### Fold-{fold} ###")
    seed_everything(config.SEED + fold)

    df_valid = train.iloc[valid_idx].reset_index(drop=True)

    valid_dataset = TweetDataset(
        tweet=df_valid.text.values,
        sentiment=df_valid.sentiment.values,
        selected_text=df_valid.selected_text.values,
        org_sentiment=df_valid.org_sentiment.values
    )

    valid_data_loader = torch.utils.data.DataLoader(
        valid_dataset,
        batch_size=config.VALID_BATCH_SIZE,
        num_workers=0,
        shuffle=False
    )

    model = RoBERTaForTweet()
    model.to(device)

    model.load_state_dict(torch.load(f"{config.OUT_DIR}/roberta_f{fold}_best.bin"))

    valid_loss, valid_jaccard, final_output = eval_fn(valid_data_loader, model, device)
    final_outputs.append(final_output)"""

In [0]:
# Assemble the out-of-fold table: one row per training tweet with the
# prediction made by the fold model that did not train on it.
oof = []
# Re-create the splitter with the same settings and seed as the training cell
# so that fold `i` here yields the same validation rows as fold `i` there.
skf = StratifiedKFold(n_splits=config.N_FOLDS, shuffle=True, random_state=config.SEED).split(train["sentiment"].values, train["sentiment"].values)
for fold, (train_idx, valid_idx) in enumerate(skf):
    df_valid = train.iloc[valid_idx].reset_index(drop=True)

    # final_outputs[fold] is in validation-loader order (shuffle=False), i.e. row order of df_valid.
    df_valid["predict"] = final_outputs[fold]
    oof.append(df_valid)
oof = pd.concat(oof, axis=0).reset_index(drop=True)

# index=None is falsy, so the row index is not written to the CSV.
oof.to_csv(f"{config.OUT_DIR}/oof.csv", index=None)
oof.head()

In [0]:
!ls "{config.OUT_DIR}"