In [1]:
import os
import gc
import re
import nltk
import time
import random
import numpy as np
import pandas as pd
from datetime import datetime
from tqdm import tqdm
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
from sklearn.model_selection import train_test_split, StratifiedKFold
from torch.utils.data.sampler import SequentialSampler
from transformers import BertModel, BertTokenizer, BertConfig
from transformers import RobertaModel, RobertaTokenizer, RobertaConfig
# from transformers import AdamW, get_linear_schedule_with_warmup, get_constant_schedule
from tokenizers import BertWordPieceTokenizer, ByteLevelBPETokenizer

In [2]:
# config

In [3]:
class config:
    """Global paths and run settings shared by the rest of the notebook.

    Evaluating this class body has a side effect: the checkpoint directory
    ``model_path`` is created if it does not exist yet.
    """
    # All paths hang off the parent of the current working directory.
    work_path = os.path.dirname(os.path.abspath('.'))
    input_path = os.path.join(work_path, "input")
    data_path = os.path.join(input_path, "tweet-sentiment-extraction")
    train_path = os.path.join(data_path, 'train.csv')
    test_path = os.path.join(data_path, 'test.csv')
    sample_submission_path = os.path.join(data_path, 'sample_submission.csv')
    model_path = os.path.join(work_path, "model")
    # exist_ok=True replaces the former isdir()-then-makedirs() pair, which could
    # fail if the directory appeared between the check and the creation.
    for path in [model_path]:
        os.makedirs(path, exist_ok=True)
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    # Sequences are not truncated for now; tweets contain many symbols, so the
    # tokenized length can exceed 110.
    # max_seq_len = 110
    max_seq_len = 128
    n_splits = 5
    patience_epoch = 2

    batch_size = 32
    epochs_num = 1
    train_print_step = 50

In [4]:
# model_config

In [5]:
# class model_config:
#     model_name = 'bert'
#     pretrain_model_name = 'bert-base-uncased'
#     pretrain_model_path = os.path.join(config.input_path, pretrain_model_name)
#     tokenizer = BertWordPieceTokenizer('{}/vocab.txt'.format(pretrain_model_path), lowercase=True)
#     learning_rate = 5e-5
#     adjust_lr_num = 0

class model_config:
    """Settings for the RoBERTa backbone and its byte-level BPE tokenizer.

    The tokenizer is built while the class body is evaluated, reading
    ``vocab.json`` and ``merges.txt`` from ``pretrain_model_path``.
    """
    model_name = 'roberta'
    pretrain_model_name = 'roberta-base'
    pretrain_model_path = os.path.join(config.input_path, pretrain_model_name)

    # Optimizer-related knobs.
    learning_rate = 5e-5
    adjust_lr_num = 0

    # Lower-cases its input and prepends a space before encoding.
    tokenizer = ByteLevelBPETokenizer(
        add_prefix_space=True,
        lowercase=True,
        merges_file='{}/merges.txt'.format(pretrain_model_path),
        vocab_file='{}/vocab.json'.format(pretrain_model_path),
    )

In [6]:
# utils

In [7]:
def set_seed(seed=0):
    """Seed every random number generator the notebook relies on.

    Args:
        seed: Integer seed applied to Python's ``random``, NumPy and PyTorch
            (CPU and all CUDA devices). Defaults to 0.
    """
    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed_all(seed)
    # Deterministic cuDNN kernels only help when the autotuner is off: with
    # benchmark=True cuDNN may select a different algorithm from run to run.
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = False
    
def jaccard(str1, str2):
    """Word-level Jaccard similarity between two strings.

    Both strings are lower-cased and split on whitespace; the score is
    ``|A ∩ B| / |A ∪ B|`` over the resulting word sets.

    Args:
        str1: First string.
        str2: Second string.

    Returns:
        A float in ``[0, 1]``. When neither string contains a word the sets
        are identical (both empty), so the score is defined as 1.0 instead of
        raising ``ZeroDivisionError``.
    """
    a = set(str1.lower().split())
    b = set(str2.lower().split())
    union_size = len(a | b)
    if union_size == 0:
        return 1.0
    return float(len(a & b)) / union_size


def get_selected_text(text, sentiment, start_idx, end_idx, offsets):
    """Decode a token span back into a substring of ``text``.

    Args:
        text: The tweet the offsets refer to.
        sentiment: Sentiment label of the tweet.
        start_idx: Index of the first token of the span (inclusive).
        end_idx: Index of the last token of the span (inclusive).
        offsets: Per-token ``(char_start, char_end)`` pairs into ``text``.

    Returns:
        ``text`` itself for "neutral" tweets and for tweets with fewer than two
        words; otherwise the concatenated token texts, with a single space
        inserted wherever there is a character gap before the next token.
        An empty string when ``end_idx < start_idx``.
    """
    is_trivial = sentiment == "neutral" or len(text.split()) < 2
    if is_trivial:
        return text

    last_position = len(offsets) - 1
    pieces = []
    for ix in range(start_idx, end_idx + 1):
        begin, finish = offsets[ix][0], offsets[ix][1]
        pieces.append(text[begin:finish])
        # A gap between this token's end and the next token's start means the
        # two were separated in the original text.
        if ix < last_position and finish < offsets[ix + 1][0]:
            pieces.append(" ")
    return "".join(pieces)


def calculate_jaccard_score(text, true_selected_text, sentiment, pred_start_idx, pred_end_idx, offsets):
    """
    Score one predicted token span against the ground-truth text of a tweet.

    The predicted span is decoded with ``get_selected_text`` and compared to
    ``true_selected_text`` with the word-level ``jaccard`` metric; both strings
    are stripped of surrounding whitespace first.
    """
    # An inverted span (end before start) collapses onto the start token, so
    # the decoded span always covers at least one token.
    clamped_end_idx = pred_start_idx if pred_end_idx < pred_start_idx else pred_end_idx
    predicted = get_selected_text(text, sentiment, pred_start_idx, clamped_end_idx, offsets)
    return jaccard(true_selected_text.strip(), predicted.strip())


def get_score(y_true_list, y_pred_list):
    """Mean Jaccard score over paired ground-truth and prediction entries.

    Args:
        y_true_list: Sequence whose items are indexed as
            ``(text, selected_text, sentiment, offsets)``.
        y_pred_list: Sequence whose items are indexed as
            ``(pred_start_idx, pred_end_idx)``.

    Returns:
        The summed per-tweet scores divided by ``len(y_true_list)``.
    """
    per_tweet_scores = (
        calculate_jaccard_score(
            truth[0],   # text
            truth[1],   # selected_text
            truth[2],   # sentiment
            guess[0],   # predicted start token index
            guess[1],   # predicted end token index
            truth[3],   # offsets
        )
        for truth, guess in zip(y_true_list, y_pred_list)
    )
    return sum(per_tweet_scores) / len(y_true_list)



# Seed the random number generators once, before the dataset and model cells run.
set_seed()

In [8]:
# data_utils

In [9]:
class MyDataset(Dataset):
    """Tokenized tweet examples, fully materialized in memory at construction.

    Every item is a dict with the keys ``tweet``, ``ids``, ``attention_mask``,
    ``token_type_ids``, ``offsets`` and ``sentiment``. In ``'train'``/``'val'``
    mode it additionally carries ``selected_text``, ``start_idx`` and
    ``end_idx`` (the token span covering the selected text).

    Args:
        df: DataFrame with ``text`` and ``sentiment`` columns, plus
            ``selected_text`` in ``'train'``/``'val'`` mode.
        tokenizer: Object whose ``encode(str)`` returns an encoding exposing
            ``.ids`` and ``.offsets``.
        mode: ``'train'`` or ``'val'`` to compute span targets; any other value
            skips them.
        max_seq_len: Length every sequence is padded to. Defaults to
            ``config.max_seq_len``. Longer sequences are NOT truncated; they
            are only reported on stdout.
    """

    # Special-token ids used by the input layout below
    # (presumably RoBERTa's <s>=0, <pad>=1, </s>=2 — TODO confirm against the vocab).
    _BOS_ID = 0
    _PAD_ID = 1
    _EOS_ID = 2

    def __init__(self, df, tokenizer, mode='train', max_seq_len=None):
        self.mode = mode
        self.df = df
        self.tokenizer = tokenizer
        # Resolved here so the global config is only consulted for the default.
        self.max_seq_len = config.max_seq_len if max_seq_len is None else max_seq_len
        self.data = []
        for _, row in df.iterrows():
            data = self.row_to_tensor(row)
            self._report_length_mismatch(data)
            if self.mode in ('train', 'val'):
                start_idx, end_idx = self.get_target_idx(row, data['tweet'], data['offsets'])
                data['selected_text'] = row['selected_text']
                data['start_idx'] = start_idx
                data['end_idx'] = end_idx
            data['sentiment'] = row['sentiment']
            self.data.append(data)

    def _report_length_mismatch(self, data):
        """Print diagnostics when any per-token field is not ``max_seq_len`` long."""
        fields = ("ids", "attention_mask", "token_type_ids", "offsets")
        if all(len(data[name]) == self.max_seq_len for name in fields):
            return
        # Best-effort diagnostics only: the example is still kept by the caller.
        print(data)
        print('ids:{}'.format(len(data["ids"])))
        print('attention_mask:{}'.format(len(data["attention_mask"])))
        print('token_type_ids:{}'.format(len(data["token_type_ids"])))
        print('offsets:{}'.format(len(data["offsets"])))

    def get_target_idx(self, row, tweet, offsets):
        """Locate the token span that covers ``row.selected_text`` inside ``tweet``.

        Args:
            row: Object with a ``selected_text`` string attribute.
            tweet: Normalized tweet text the offsets refer to.
            offsets: Per-token ``(char_start, char_end)`` pairs into ``tweet``.

        Returns:
            ``(start_idx, end_idx)``: indices of the first and last token that
            overlap the first occurrence of the normalized selected text.

        Raises:
            IndexError: If the selected text is empty, does not occur in
                ``tweet``, or no token overlaps it.
        """
        # Same normalization as the tweet: lower-case, single spaces.
        selected_text = " ".join(row.selected_text.lower().split())
        char_start = tweet.find(selected_text) if selected_text else -1

        target_idx = []
        if char_start >= 0:
            char_end = char_start + len(selected_text)
            for j, (offset1, offset2) in enumerate(offsets):
                # int() accepts both plain ints and 0-dim tensors.
                # A token is part of the span when its character range
                # intersects [char_start, char_end).
                if max(int(offset1), char_start) < min(int(offset2), char_end):
                    target_idx.append(j)

        if not target_idx:
            raise IndexError(
                'no token overlaps selected_text {!r} in tweet {!r}'.format(
                    row.selected_text, tweet))
        return target_idx[0], target_idx[-1]

    def row_to_tensor(self, row):
        """Tokenize one row into padded tensors.

        Args:
            row: Object with ``text`` and ``sentiment`` string attributes.

        Returns:
            Dict with ``tweet`` (normalized text with a leading space) and the
            tensors ``ids``, ``attention_mask``, ``token_type_ids`` and
            ``offsets``.
        """
        tweet = " " + " ".join(row.text.lower().split())
        encoding = self.tokenizer.encode(tweet)
        sentiment_ids = list(self.tokenizer.encode(row.sentiment).ids)

        # Layout: [0] + sentiment tokens + [2, 2] + tweet tokens + [2].
        prefix = [self._BOS_ID] + sentiment_ids + [self._EOS_ID, self._EOS_ID]
        ids = prefix + list(encoding.ids) + [self._EOS_ID]
        # Non-tweet positions get the (0, 0) offset; sizing this from the prefix
        # keeps all fields aligned even if the sentiment is more than one token.
        offsets = [(0, 0)] * len(prefix) + list(encoding.offsets) + [(0, 0)]

        pad_len = self.max_seq_len - len(ids)
        if pad_len > 0:
            ids += [self._PAD_ID] * pad_len
            offsets += [(0, 0)] * pad_len
        token_type_ids = [0] * len(ids)

        ids = torch.tensor(ids)
        # 1 for every non-padding position, 0 for padding.
        attention_mask = (ids != self._PAD_ID).long()
        token_type_ids = torch.tensor(token_type_ids)
        offsets = torch.tensor(offsets)

        data = dict()
        data['tweet'] = tweet
        data['ids'] = ids
        data['attention_mask'] = attention_mask
        data['token_type_ids'] = token_type_ids
        data['offsets'] = offsets
        return data

    def __getitem__(self, index):
        return self.data[index]

    def __len__(self):
        return len(self.data)


In [10]:
# model/bert

In [12]:
# model/roberta

In [13]:
from transformers import RobertaModel, RobertaTokenizer, RobertaConfig
class Model(nn.Module):
    """RoBERTa encoder with a linear head that emits per-token start/end logits."""

    def __init__(self):
        super().__init__()
        roberta_config = RobertaConfig.from_pretrained(model_config.pretrain_model_path)
        # forward() reads the per-layer hidden states, so the encoder must return them.
        roberta_config.output_hidden_states = True
        self.config = roberta_config
        self.model = RobertaModel.from_pretrained(model_config.pretrain_model_path, config=self.config)
        self.dropout = nn.Dropout(self.config.hidden_dropout_prob)
        # Two outputs per token: one start logit and one end logit.
        self.classifier = nn.Linear(self.config.hidden_size, 2)
        torch.nn.init.normal_(self.classifier.weight, std=0.02)

    def forward(self, input_ids, attention_mask=None, token_type_ids=None, labels=None,
                position_ids=None, head_mask=None):
        """Return ``(start_logits, end_logits)`` for a batch of token ids.

        ``labels``, ``position_ids`` and ``head_mask`` are accepted but unused.
        """
        encoder_outputs = self.model(
            input_ids,
            attention_mask=attention_mask,
            token_type_ids=token_type_ids,
        )
        # Index 2 of the encoder output is read as the tuple of per-layer hidden states.
        layer_states = encoder_outputs[2]
        # Average the last three layers element-wise.
        last_three = torch.stack([layer_states[-1], layer_states[-2], layer_states[-3]])
        features = self.dropout(last_three.mean(dim=0))
        logits = self.classifier(features)

        # Split the trailing size-2 axis into separate start and end scores.
        start_logits, end_logits = (part.squeeze(-1) for part in logits.split(1, dim=-1))
        return start_logits, end_logits

In [None]:
# predict

In [None]:
# Module-level aliases read by predict(): model_name is used to build the
# checkpoint file names, tokenizer is handed to MyDataset.
model_name = model_config.model_name
tokenizer = model_config.tokenizer

def model_predict(model, test_loader):
    """Run ``model`` over ``test_loader`` and collect span probabilities.

    Args:
        model: Callable returning ``(start_logits, end_logits)`` when given
            ``input_ids``, ``attention_mask`` and ``token_type_ids``.
        test_loader: Iterable of batches (dicts) with the keys ``tweet``,
            ``sentiment``, ``ids``, ``token_type_ids``, ``attention_mask``
            and ``offsets``.

    Returns:
        ``(y_true_list, y_pred_list)``: per example, a
        ``(tweet, sentiment, offsets)`` tuple and a
        ``(start_probs, end_probs)`` tuple of NumPy arrays.
    """
    y_true_list, y_pred_list = [], []
    model.eval()
    with torch.no_grad():
        for batch in test_loader:
            tweets = batch["tweet"]
            sentiments = batch["sentiment"]
            offsets = batch["offsets"]

            # Move the three model inputs to the configured device as int64.
            model_inputs = {
                "input_ids": batch["ids"],
                "attention_mask": batch["attention_mask"],
                "token_type_ids": batch["token_type_ids"],
            }
            model_inputs = {
                key: value.to(config.device, dtype=torch.long)
                for key, value in model_inputs.items()
            }
            start_logits, end_logits = model(**model_inputs)

            # Softmax over the sequence axis (dim=1) turns logits into a
            # distribution over token positions.
            pred_start_probs = torch.softmax(start_logits, dim=1).cpu().numpy()
            pred_end_probs = torch.softmax(end_logits, dim=1).cpu().numpy()

            for i in range(len(tweets)):
                y_true_list.append((tweets[i], sentiments[i], offsets[i]))
                y_pred_list.append((pred_start_probs[i], pred_end_probs[i]))
    return y_true_list, y_pred_list
    
def get_pred_list(y_true_list, preds_dict):
    """Ensemble the per-fold span probabilities and decode one text per tweet.

    For every tweet the start (and end) probability arrays of all folds are
    summed and the arg-max position is taken as the span boundary.

    Args:
        y_true_list: One ``(text, sentiment, offsets)`` entry per tweet.
        preds_dict: Maps ``str(fold_idx)``, for every fold in
            ``range(config.n_splits)``, to a list holding one
            ``(start_probs, end_probs)`` pair of arrays per tweet. It is not
            modified.

    Returns:
        List of predicted selected-text strings, aligned with ``y_true_list``.
    """
    pred_list = []
    for i in range(len(y_true_list)):
        start_sum, end_sum = None, None
        for fold_idx in range(config.n_splits):
            fold_start, fold_end = preds_dict['{}'.format(fold_idx)][i]
            if start_sum is None:
                # Copy the first fold's arrays: accumulating with += directly on
                # them would overwrite the caller's fold-0 predictions in place.
                start_sum = np.array(fold_start)
                end_sum = np.array(fold_end)
            else:
                start_sum += fold_start
                end_sum += fold_end

        pred_start_idx = np.argmax(start_sum)
        pred_end_idx = np.argmax(end_sum)
        # Same rule as calculate_jaccard_score: an inverted span collapses onto
        # the start token instead of decoding to an empty string.
        if pred_end_idx < pred_start_idx:
            pred_end_idx = pred_start_idx

        y_true = y_true_list[i]
        text = y_true[0]
        sentiment = y_true[1]
        offsets = y_true[2]
        pred_list.append(get_selected_text(text, sentiment, pred_start_idx, pred_end_idx, offsets))
    return pred_list

def predict(max_rows=100):
    """Run every fold's checkpoint on the test set and write ``submission.csv``.

    Reads ``config.test_path``, loads one checkpoint per fold from
    ``config.model_path``, ensembles the folds with ``get_pred_list`` and
    writes the result, using ``config.sample_submission_path`` as the template,
    to ``submission.csv`` in the current working directory.

    Args:
        max_rows: Number of leading test rows (and sample-submission rows) to
            process. Defaults to 100, the limit that used to be hard-coded
            here; pass ``None`` to predict the whole test set.
    """
    test_df = pd.read_csv(config.test_path)
    if max_rows is not None:
        test_df = test_df[:max_rows]
    # assign() returns a new frame, so no column is written into a slice.
    test_df = test_df.assign(selected_text=test_df.text.values)

    test_dataset = MyDataset(test_df, tokenizer, 'test')
    test_loader = DataLoader(test_dataset, batch_size=config.batch_size)

    preds_dict = dict()
    y_true_list = []
    for fold_idx in range(config.n_splits):
        model = Model().to(config.device)
        model_save_path = os.path.join(config.model_path, '{}_fold{}.bin'.format(model_name, fold_idx))
        # map_location lets a checkpoint saved on GPU load on a CPU-only machine.
        model.load_state_dict(torch.load(model_save_path, map_location=config.device))
        y_true_list, y_pred_list = model_predict(model, test_loader)
        preds_dict['{}'.format(fold_idx)] = y_pred_list
    pred_list = get_pred_list(y_true_list, preds_dict)

    submission = pd.read_csv(config.sample_submission_path)
    if max_rows is not None:
        # Copy so the column assignment below does not target a slice view.
        submission = submission[:max_rows].copy()
    # NOTE(review): assumes sample_submission.csv lists rows in the same order
    # as test.csv — confirm.
    submission['selected_text'] = pred_list
    submission.to_csv('submission.csv', index=False)

# Generate the predictions and write submission.csv.
predict()