# 一、项目背景
    由于每秒钟都有大量的推文在传播，所以很难判断一个特定推文背后的情感是否会影响一个人或一个公司的品牌，可能因为它具有积极性或者因为它带有负面的基调。用语言捕捉情绪是很重要的，尤其是当几秒钟内就需要做出决策或判断的时候。但是，到底是哪些词语主导了情感的表达呢? 在这次比赛中，数据集名为“情绪分析:带有情感标签的推文中的情感”。目标是建立一个模型来做同样的事情——查看给定tweet的情感标记，并挑选出推文中最能支撑这个情感的单词或短语。

In [5]:
import torch
import re
import string
import nltk
from nltk.corpus import stopwords
import transformers
from transformers import BertModel, BertTokenizer, AdamW, get_linear_schedule_with_warmup
import numpy as np
import pandas as pd
import seaborn as sns
from pylab import rcParams
import matplotlib.pyplot as plt
from matplotlib import rc
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, classification_report
from collections import defaultdict
from textwrap import wrap
from torch import nn, optim
from torch.utils.data import Dataset, DataLoader
import tokenizers
from tqdm.autonotebook import tqdm
from nltk.tokenize import word_tokenize

In [6]:
# Seed NumPy and PyTorch so random draws are repeatable between runs.
RANDOM_SEED = 42
np.random.seed(RANDOM_SEED)
torch.manual_seed(RANDOM_SEED)
# Hyper-parameters shared by the cells below.
EPOCHS = 5
BATCH_SIZE = 64
MAX_LEN = 140  # length every encoded example is padded to

<torch._C.Generator at 0x7fd394019bf0>

# 二、数据前处理
## 1.数据清洗
     查看数据集，包含文本ID（textID）、文本内容（text）、代表情感的文本（selected_text）以及该文本所对应的情感（sentiment）。总共包含27481条训练集以及3534条需要预测的测试集。将所有的文本小写化，并且删去所有空值（个数极少）。

In [7]:
# Load the training and test tables from the Kaggle input directories
# and show their shapes plus the first two training rows.
ds_train = pd.read_csv('../input/mydata/tw_train2.csv')
ds_test = pd.read_csv('../input/tweet-sentiment-extraction/test.csv')
print(ds_train.shape,ds_test.shape)
ds_train.head(2)

(27481, 4) (3534, 3)


Unnamed: 0,textID,text,selected_text,sentiment
0,cb774db0d1,"I`d have responded, if I were going","I`d have responded, if I were going",neutral
1,549e992a42,Sooo SAD I will miss you here in San Diego!!!,Sooo SAD,negative


In [8]:
def clean_text(text):
    """Normalise a raw tweet by stripping markup, links, punctuation and digits.

    Steps, applied in this order: lowercase; remove ``[...]`` spans; remove
    ``http(s)://`` and ``www.`` links; remove ``<...>`` tags; remove ASCII
    punctuation; remove newlines; remove digit characters (letters next to
    a digit are kept).

    Args:
        text: The tweet as a ``str``.

    Returns:
        The cleaned string. Whitespace is not collapsed, so removed spans
        can leave runs of spaces behind.
    """
    text = text.lower()
    # Patterns are raw strings so that `\[`, `\S` and `\d` reach the regex
    # engine verbatim instead of being invalid string escapes.
    text = re.sub(r'\[.*?\]', '', text)  # shortest text enclosed in square brackets
    # A link runs until the next whitespace character.
    text = re.sub(r'https?://\S+|www\.\S+', '', text)
    text = re.sub(r'<.*?>+', '', text)
    text = re.sub('[%s]' % re.escape(string.punctuation), '', text)
    text = re.sub('\n', '', text)
    text = re.sub(r'\d', '', text)  # digits only; surrounding letters stay
    return text

def text_preprocessing(text):
    """Clean *text* and return its word tokens joined by single spaces.

    The text goes through ``clean_text`` and is then split into runs of
    word characters; stop-words are kept.
    """
    word_splitter = nltk.tokenize.RegexpTokenizer(r'\w+')
    words = word_splitter.tokenize(clean_text(text))
    return ' '.join(words)

In [9]:
# Lowercase the text columns (punctuation and links are left in place here).
# NOTE(review): `.apply(str)` turns a missing value into the literal string
# 'nan', so the `dropna` below presumably cannot remove rows that were NaN
# in the CSV -- confirm against the raw data.
ds_train['text'] = ds_train['text'].apply(str).apply(lambda x :x.lower())
ds_test['text'] = ds_test['text'].apply(str).apply(lambda x :x.lower())
ds_train['selected_text'] = ds_train['selected_text'].apply(str).apply(lambda x :x.lower())
# Rows whose selected_text is '#name?' are marked as missing.
ds_train.loc[ds_train['selected_text']=='#name?', 'selected_text'] = None
# Many cells are empty strings rather than NaN; convert them so dropna sees them.
#pd.set_option('display.max_colwidth', 200)
ds_train.replace('', np.nan, inplace=True)
#ds_test.replace('', np.nan,inplace=True)
ds_train.dropna(axis=0, how ='any',inplace=True) 
ds_train.head()
#ds_test.dropna(axis=0, how='any',inplace=True)
# Tweets containing URLs were noisy and frequently raised errors; the
# filter that dropped them is kept below but disabled.
#ds_web = ds_train.loc[ds_train.text.str.contains('www\.|http?//|https//')]
#ds_dropweb = ds_train[~ds_train.text.str.contains('www\.|http|https//')]
#ds_train = ds_dropweb
#ds_train.loc[ds_train.text.str.contains('http|www\.'), 'text']

Unnamed: 0,textID,text,selected_text,sentiment
0,cb774db0d1,"i`d have responded, if i were going","i`d have responded, if i were going",neutral
1,549e992a42,sooo sad i will miss you here in san diego!!!,sooo sad,negative
2,088c60f138,my boss is bullying me...,bullying me,negative
3,9642c003ef,what interview! leave me alone,leave me alone,negative
4,358bd9e861,"sons of ****, why couldn`t they put them on t...","sons of ****,",negative


In [13]:
# Byte-level BPE tokenizer built from the local RoBERTa vocabulary and merges
# files; it lowercases its input and adds a prefix space.
tokenizer = tokenizers.ByteLevelBPETokenizer(vocab_file='../input/roberta/roberta-base-vocab.json',
                      merges_file='../input/roberta/roberta-base-merges.txt',
                      lowercase=True,
                      add_prefix_space=True
                      )

## 2.Data Loader
    将每条文本转化为token，并把任务视为问答（Question Answering, QA）问题：将sentiment作为question，在文本tweet中寻找答案selected_text，并标记出答案在文本中的起始和结束token位置。其中需要将sentiment和tweet连接，中间用两个分隔符（token id 为 2，相当于【SEP】）分隔，并将合并后的序列用填充符（token id 为 1）填充至同一长度MAX_LEN。

In [16]:
class TweetDataset(Dataset):
    """Dataset encoding each tweet as ``[0, sentiment_id, 2, 2] + tweet_ids + [2]``.

    The sentiment plays the role of the question and the tweet the context
    of an extractive question-answering task. In training mode every item
    also carries the indices of the first and last token that overlap
    ``selected_text``.

    Args:
        ds: Table with ``text`` and ``sentiment`` columns, plus
            ``selected_text`` unless ``mod == 'test'``.
        tokenizer: Object whose ``encode(str)`` result exposes ``ids`` and
            ``offsets`` (character start/end of every token).
        max_len: Length every encoded example is padded to. Sequences that
            are already longer are NOT truncated.
        mod: ``'test'`` selects inference encoding; any other value selects
            training encoding.
    """

    # Sentiment word -> id of the token used as the "question". A single
    # table keeps the training and test encodings identical.
    _SENTIMENT_ID = {
        'positive': 1313,
        'negative': 2430,
        'neutral': 7974,
    }
    # Tokens placed in front of the tweet: start token, sentiment, two separators.
    _PREFIX_LEN = 4

    def __init__(self, ds, tokenizer, max_len, mod):
        self.ds = ds
        self.tweets = np.array(self.ds['text'])
        self.tokenizer = tokenizer
        self.max_len = max_len
        self.sentiments = np.array(self.ds['sentiment'])
        self.mod = mod
        if self.mod != 'test':
            self.selected_texts = np.array(self.ds['selected_text'])

    def __len__(self):
        return len(self.tweets)

    def __getitem__(self, item):
        """Return the encoded example at position *item* as a dict of tensors/strings."""
        tweet = str(self.tweets[item])
        sentiment = str(self.sentiments[item])
        if self.mod == 'test':
            return self.get_test_data(tweet, self.tokenizer, self.max_len, sentiment)
        selected_text = str(self.selected_texts[item])
        return self.process_data(tweet, selected_text, self.tokenizer, self.max_len, sentiment)

    @staticmethod
    def _encode_with_prefix(input_ids_orig, max_len, sentiment):
        """Wrap tweet ids with the question prefix/suffix and pad up to *max_len*.

        Returns ``(input_ids, mask, token_type_ids, padding_length)``. Padding
        positions use id 1 in ``input_ids`` and 0 in ``mask``;
        ``token_type_ids`` is all zeros.

        Raises:
            KeyError: If *sentiment* is not positive/negative/neutral.
        """
        input_ids = (
            [0, TweetDataset._SENTIMENT_ID[sentiment], 2, 2]
            + list(input_ids_orig)
            + [2]
        )
        real_len = len(input_ids)
        padding_length = max(max_len - real_len, 0)
        input_ids = input_ids + [1] * padding_length
        mask = [1] * real_len + [0] * padding_length
        token_type_ids = [0] * (real_len + padding_length)
        return input_ids, mask, token_type_ids, padding_length

    def process_data(self, tweet, selected_text, tokenizer, max_len, sentiment):
        """Encode one training example and locate the answer span in tokens.

        The first occurrence of ``selected_text`` inside the
        whitespace-normalised tweet is marked character by character; every
        token whose character range overlaps a marked character belongs to
        the answer.

        Returns:
            Dict with ``ids``, ``mask``, ``token_type_ids``, ``targets_start``,
            ``targets_end`` and ``offsets`` tensors plus the normalised
            ``tweet_text``, ``selected_text`` and ``sentiment`` strings.

        Raises:
            IndexError: If no token overlaps ``selected_text`` (the text is
                empty or does not occur in the tweet).
        """
        # Collapse whitespace; the leading space matches the tokenizer's
        # prefix-space convention.
        tweet = " " + " ".join(str(tweet).split())
        selected_text = " " + " ".join(str(selected_text).split())

        needle = selected_text[1:]
        idx0 = tweet.find(needle) if needle else -1

        # 1 for every tweet character that belongs to the answer, else 0.
        char_targets = [0] * len(tweet)
        if idx0 >= 0:
            char_targets[idx0:idx0 + len(needle)] = [1] * len(needle)

        tok_tweet = tokenizer.encode(tweet)
        input_ids_orig = tok_tweet.ids
        tweet_offsets = list(tok_tweet.offsets)  # (char_start, char_end) per token

        target_idx = [
            j for j, (offset1, offset2) in enumerate(tweet_offsets)
            if any(char_targets[offset1:offset2])
        ]
        if not target_idx:
            raise IndexError(
                f"selected_text {selected_text!r} was not found in tweet {tweet!r}"
            )

        input_ids, mask, token_type_ids, padding_length = self._encode_with_prefix(
            input_ids_orig, max_len, sentiment
        )
        # Prefix, suffix and padding positions get an empty (0, 0) offset.
        tweet_offsets = (
            [(0, 0)] * self._PREFIX_LEN
            + tweet_offsets
            + [(0, 0)]
            + [(0, 0)] * padding_length
        )
        # Shift the span by the number of prefix tokens.
        targets_start = target_idx[0] + self._PREFIX_LEN
        targets_end = target_idx[-1] + self._PREFIX_LEN

        return {
            'ids': torch.tensor(input_ids, dtype=torch.long),
            'mask': torch.tensor(mask, dtype=torch.long),
            'token_type_ids': torch.tensor(token_type_ids, dtype=torch.long),
            'targets_start': torch.tensor(targets_start, dtype=torch.long),
            'targets_end': torch.tensor(targets_end, dtype=torch.long),
            'tweet_text': tweet,
            'selected_text': selected_text,
            'sentiment': sentiment,
            'offsets': torch.tensor(tweet_offsets, dtype=torch.long)
        }

    def get_test_data(self, tweet, tokenizer, max_len, sentiment):
        """Encode one inference example (no answer span, no offsets).

        Returns:
            Dict with ``ids``, ``mask`` and ``token_type_ids`` tensors plus the
            normalised ``tweet_text`` and the ``sentiment`` string.
        """
        tweet = " " + " ".join(str(tweet).split())
        input_ids_orig = tokenizer.encode(tweet).ids
        input_ids, mask, token_type_ids, _ = self._encode_with_prefix(
            input_ids_orig, max_len, sentiment
        )
        return {
            'ids': torch.tensor(input_ids, dtype=torch.long),
            'mask': torch.tensor(mask, dtype=torch.long),
            'token_type_ids': torch.tensor(token_type_ids, dtype=torch.long),
            'tweet_text': tweet,
            'sentiment': sentiment
        }
def create_data_loader(df, tokenizer, max_len, batch_size, mod=None):
    """Wrap *df* in a ``TweetDataset`` and return a ``DataLoader`` over it.

    Args:
        df: Table handed to ``TweetDataset``.
        tokenizer: Tokenizer handed to ``TweetDataset``.
        max_len: Padding length handed to ``TweetDataset``.
        batch_size: Number of examples per batch.
        mod: Dataset mode; ``'test'`` selects inference encoding.

    Returns:
        A ``DataLoader`` that loads in the main process (``num_workers=0``);
        no shuffling is requested.
    """
    dataset = TweetDataset(df, tokenizer, max_len, mod)
    return DataLoader(dataset, batch_size=batch_size, num_workers=0)

In [19]:
# Loaders over the full training table (training encoding) and the test table
# (inference encoding, selected via mod='test').
train_data_loader = create_data_loader(ds_train, tokenizer, MAX_LEN, BATCH_SIZE)
#val_data_loader = create_data_loader(eval, tokenizer, MAX_LEN, BATCH_SIZE)
test_data_loader = create_data_loader(ds_test, tokenizer, MAX_LEN, BATCH_SIZE, mod='test')

# 三、Roberta-base Models
## 1. Model_1
    使用roberta模型的最后两层输出，拼接后大小为(batch_size,MAX_LEN,2*hidden_size)。将拼接结果进行dropout(rate=0.3)之后连接一个全连接层，全连接层的大小为(2*hidden_size, 2)，得到每个位置作为起始位置和结束位置的两个分数，大小皆为(batch_size,MAX_LEN)。

In [20]:
class Roberta_model_1(nn.Module):
    """RoBERTa span head over the concatenation of the last two hidden-state layers.

    Args:
        config: Transformer configuration passed to ``RobertaModel.from_pretrained``.
            The encoder must return per-layer hidden states as its third output.
    """

    def __init__(self,config):
        super().__init__()
        self.bert = transformers.RobertaModel.from_pretrained(
            '../input/roberta/roberta-base-pytorch_model.bin', config=config)
        self.drop = nn.Dropout(p=0.3)
        # Two concatenated layers in, one start score and one end score out.
        self.start_end = nn.Linear(self.bert.config.hidden_size*2, 2)

    def forward(self, ids, mask, token_type_ids):
        """Return ``(start, end)`` score tensors, one score per input position."""
        encoder_outputs = self.bert(
            input_ids=ids,
            attention_mask=mask,
            token_type_ids=token_type_ids,
        )
        # Third output: sequence of per-layer hidden states.
        _, _, layers = encoder_outputs
        features = torch.cat((layers[-1], layers[-2]), dim=-1)
        logits = self.start_end(self.drop(features))
        # Split the size-2 last axis into the start and end scores.
        start, end = logits.split(1, dim=-1)
        return start.squeeze(-1), end.squeeze(-1)

## 2.Model_2
    将roberta模型的倒数四层输出做平均处理，大小为(batch_size,MAX_LEN,hidden_size)，将模型的输出进行dropout(rate=0.5)之后连接一个全连接层，大小为(hidden_size, 2)，输出的两个元素分别代表答案在推特文本中的起始位置和结束位置，大小皆为(batch_size,MAX_LEN).

In [21]:
class Roberta_model_2(nn.Module):
    """RoBERTa span head over the mean of the last four hidden-state layers.

    Args:
        config: Transformer configuration passed to ``RobertaModel.from_pretrained``;
            its ``hidden_size`` sets the input width of the output layer.
    """

    def __init__(self, config):
        super().__init__()
        self.roberta = transformers.RobertaModel.from_pretrained(
            '../input/roberta/roberta-base-pytorch_model.bin', config=config)
        self.dropout = nn.Dropout(0.5)
        self.fc = nn.Linear(config.hidden_size, 2)
        nn.init.normal_(self.fc.weight, std=0.02)
        # NOTE(review): the positional 0 is the mean, so the bias is drawn with
        # the default std -- confirm a constant-zero bias was not intended.
        nn.init.normal_(self.fc.bias, 0)

    def forward(self, input_ids, attention_mask):
        """Return ``(start_logits, end_logits)``, one score per input position."""
        # Third output: sequence of per-layer hidden states.
        _, _, hs = self.roberta(input_ids, attention_mask)
        last_four = torch.stack((hs[-1], hs[-2], hs[-3], hs[-4]))
        averaged = last_four.mean(dim=0)
        logits = self.fc(self.dropout(averaged))
        start_logits, end_logits = logits.split(1, dim=-1)
        return start_logits.squeeze(-1), end_logits.squeeze(-1)


## 3.Model_3:连接cnn的模型
    使用Roberta最后一层学到的hidden_state，大小为（batch_size,sequence_length,768),Dropout(rate=0.3),然后连接kernel_size为128的1D卷积层，对768维的hidden_state进行卷积，stride为1，目的是提取出能代表文本起始与结束位置的有效信息。1D-cnn输入的channel和输出的channel大小皆为句子长度，意味着对每个位置的信息都进行筛选。将cnn的输出进行Maxpool之后连接一个全连接层，全连接层输出的大小为(batch_size,sequence_length,2),这样每个位置都得到一个代表起始位置和结束位置的分数。

In [22]:
class Cnn_roberta_models(nn.Module):
    """RoBERTa followed by a 1-D convolution, max-pooling and a linear span head.

    The convolution treats the 140 sequence positions as channels and slides
    over the hidden dimension. ``start_end`` and ``cnn_256`` are not used by
    ``forward`` but remain part of the module's parameters.

    Args:
        config: Transformer configuration passed to ``RobertaModel.from_pretrained``.
    """

    def __init__(self,config):
        super().__init__()
        self.bert = transformers.RobertaModel.from_pretrained(
            '../input/roberta/roberta-base-pytorch_model.bin', config=config)
        self.drop = nn.Dropout(p=0.3)
        self.start_end = nn.Linear(self.bert.config.hidden_size*2, 2)
        self.cnn_input = nn.Conv1d(in_channels=140, out_channels=140, kernel_size=128, stride=1)
        self.Linear = nn.Linear(in_features=320, out_features=2)
        self.maxpool = nn.MaxPool1d(kernel_size=2, stride=2)
        torch.nn.init.normal_(self.Linear.weight, std=0.02)
        self.cnn_256 = nn.Conv1d(in_channels=32, out_channels=16, kernel_size=256, stride=1)

    def forward(self, ids, mask, token_type_ids):
        """Return ``(start, end)`` score tensors, one score per input position."""
        # Only the first encoder output (last-layer hidden state) is used.
        hidden_state, _, _ = self.bert(
            input_ids=ids,
            attention_mask=mask,
            token_type_ids=token_type_ids,
        )
        # Assuming hidden size 768: conv -> (batch, 140, 641), pool -> (batch, 140, 320).
        pooled = self.maxpool(self.cnn_input(self.drop(hidden_state)))
        logits = self.Linear(pooled)  # (batch, 140, 2)
        start, end = logits.split(1, dim=-1)
        return start.squeeze(-1), end.squeeze(-1)

# 四、模型训练与评估
## 1.Loss 
    模型的输出为每个位置作为起始位置和结束位置的分数，将其softmax得到归一化的分数，训练的时候与已知的target一起，用CrossEntropy作为损失函数，总的Loss为起始位置与结束位置的Loss总和。

In [24]:
def loss_fn(start_logits, end_logits, start_positions, end_positions):
    """Return the sum of the start-position and end-position cross-entropy losses.

    Args:
        start_logits: Per-position start scores, one row per example.
        end_logits: Per-position end scores, one row per example.
        start_positions: Target start index of every example.
        end_positions: Target end index of every example.
    """
    criterion = torch.nn.CrossEntropyLoss()  # applies log-softmax internally
    return criterion(start_logits, start_positions) + criterion(end_logits, end_positions)

## 2.评估函数
    使用Jaccard similarity作为评估标准：即两个句子所含单词的交集与并集的比值

In [25]:
def calculate_jaccard_score(
    original_tweet,
    target_string,
    sentiment_val,
    idx_start,
    idx_end,
    offsets,
    verbose=False):
    """Turn a predicted token span into text and score it against the target.

    Args:
        original_tweet: Tweet string that the offsets index into.
        target_string: Ground-truth selected text.
        sentiment_val: Sentiment label; ``"neutral"`` forces the whole tweet
            to be used as the prediction.
        idx_start: Index of the first predicted token.
        idx_end: Index of the last predicted token.
        offsets: Per-token ``(char_start, char_end)`` pairs.
        verbose: Unused.

    Returns:
        ``(jaccard_score, predicted_text)``.
    """
    # An end that lies before the start collapses to a single-token span.
    last = idx_start if idx_end < idx_start else idx_end

    pieces = []
    for ix in range(idx_start, last + 1):
        pieces.append(original_tweet[offsets[ix][0]: offsets[ix][1]])
        has_next = (ix + 1) < len(offsets)
        # Insert a space when there is a character gap before the next token.
        if has_next and offsets[ix][1] < offsets[ix + 1][0]:
            pieces.append(" ")
    filtered_output = "".join(pieces)

    # Neutral tweets and tweets with fewer than two words use the full tweet.
    if sentiment_val == "neutral" or len(original_tweet.split()) < 2:
        filtered_output = original_tweet

    return jaccard(target_string, filtered_output), filtered_output
def jaccard(str1, str2):
    """Return the word-level Jaccard similarity of two strings.

    Both strings are lowercased and split on whitespace; the score is the
    size of the intersection of the two word sets divided by the size of
    their union.

    Returns:
        A float in ``[0, 1]``. Two strings without any words are identical
        sets, so they score ``1.0`` instead of dividing by zero.
    """
    a = set(str1.lower().split())
    b = set(str2.lower().split())
    union_size = len(a | b)
    if union_size == 0:
        return 1.0
    return float(len(a & b)) / union_size

## 3.优化函数

In [26]:
def scheduler(train_data, optimizer, EPOCHS):
    """Build a linear learning-rate schedule without warm-up.

    The schedule spans ``len(train_data) * EPOCHS`` steps.
    NOTE(review): if ``train_data`` is a table rather than a batch loader,
    its length counts rows, not optimiser steps -- confirm with the caller.

    Reference: https://huggingface.co/transformers/main_classes/optimizer_schedules.html#learning-rate-schedules-pytorch
    """
    step_budget = EPOCHS * len(train_data)
    return get_linear_schedule_with_warmup(
        optimizer,
        num_warmup_steps=0,
        num_training_steps=step_budget,
    )
def optimizer(model):
    """Return an ``AdamW`` optimiser over all parameters of *model* (lr 2e-5, no bias correction)."""
    trainable = model.parameters()
    return AdamW(trainable, lr=2e-5, correct_bias=False)

## 4.训练

In [28]:
def train_epoch(
      model,
      data_loader,
      loss_fn,
      optimizer,
      device,
      scheduler,
      train_data
     ):
    """Run one training pass over *data_loader*.

    Args:
        model: Network called as ``model(ids=, mask=, token_type_ids=)``,
            returning start and end scores.
        data_loader: Iterable of batch dicts with training fields.
        loss_fn: Callable taking ``start_logits``, ``end_logits``,
            ``start_positions`` and ``end_positions``.
        optimizer: Factory called as ``optimizer(model)``.
        device: Device the batch tensors are moved to.
        scheduler: Factory called as ``scheduler(train_data, optimizer, EPOCHS)``.
        train_data: Object handed to the scheduler factory.

    Returns:
        ``(mean of per-batch Jaccard means, mean batch loss)``.

    NOTE(review): the optimiser and scheduler are rebuilt on every call, so
    their state restarts each time this function runs -- confirm this is intended.
    """
    model = model.train()
    batch_losses = []
    batch_jaccards = []
    progress = tqdm(data_loader)
    optim_obj = optimizer(model)
    lr_schedule = scheduler(train_data, optim_obj, EPOCHS)

    for batch in progress:
        input_ids = batch["ids"].to(device)
        attention_mask = batch["mask"].to(device)
        token_type_ids = batch["token_type_ids"].to(device)
        target_start = batch['targets_start'].to(device)
        target_end = batch['targets_end'].to(device)

        tweets = batch['tweet_text']
        targets = batch['selected_text']
        offsets = batch['offsets']
        sentiments = batch['sentiment']

        start, end = model(
            ids=input_ids,
            mask=attention_mask,
            token_type_ids=token_type_ids,
        )
        loss = loss_fn(
            start_logits=start,
            end_logits=end,
            start_positions=target_start,
            end_positions=target_end,
        )
        batch_losses.append(loss.item())

        loss.backward()
        nn.utils.clip_grad_norm_(model.parameters(), max_norm=1.0)
        optim_obj.step()
        lr_schedule.step()
        optim_obj.zero_grad()

        # Per-example probabilities over positions; argmax is taken row by row.
        start_probs = torch.nn.functional.softmax(start, dim=1).cpu().detach().numpy()
        end_probs = torch.nn.functional.softmax(end, dim=1).cpu().detach().numpy()
        scores = [
            calculate_jaccard_score(
                original_tweet=tweet,
                target_string=targets[row],
                sentiment_val=sentiments[row],
                idx_start=np.argmax(start_probs[row, :]),
                idx_end=np.argmax(end_probs[row, :]),
                offsets=offsets[row],
            )[0]
            for row, tweet in enumerate(tweets)
        ]
        batch_jaccards.append(np.mean(scores))

    return np.mean(batch_jaccards), np.mean(batch_losses)

In [29]:
def eval_model(model, data_loader, loss_fn, device):
    """Evaluate *model* on *data_loader* without gradient tracking.

    Args:
        model: Network called as ``model(ids=, mask=, token_type_ids=)``,
            returning start and end scores.
        data_loader: Iterable of batch dicts with training fields.
        loss_fn: Callable taking ``start_logits``, ``end_logits``,
            ``start_positions`` and ``end_positions``.
        device: Device the batch tensors are moved to.

    Returns:
        ``(mean of per-batch Jaccard means, mean batch loss)``.
    """
    model = model.eval()
    batch_losses = []
    batch_jaccards = []

    with torch.no_grad():
        for batch in tqdm(data_loader):
            input_ids = batch["ids"].to(device)
            attention_mask = batch["mask"].to(device)
            token_type_ids = batch["token_type_ids"].to(device)
            target_start = batch['targets_start'].to(device)
            target_end = batch['targets_end'].to(device)

            tweets = batch['tweet_text']
            targets = batch['selected_text']
            offsets = batch['offsets']
            sentiments = batch['sentiment']

            start, end = model(
                ids=input_ids,
                mask=attention_mask,
                token_type_ids=token_type_ids,
            )
            loss = loss_fn(
                start_logits=start,
                end_logits=end,
                start_positions=target_start,
                end_positions=target_end,
            )
            batch_losses.append(loss.item())

            # Per-example probabilities over positions; argmax is taken row by row.
            start_probs = torch.nn.functional.softmax(start, dim=1).cpu().detach().numpy()
            end_probs = torch.nn.functional.softmax(end, dim=1).cpu().detach().numpy()
            scores = [
                calculate_jaccard_score(
                    original_tweet=tweet,
                    target_string=targets[row],
                    sentiment_val=sentiments[row],
                    idx_start=np.argmax(start_probs[row, :]),
                    idx_end=np.argmax(end_probs[row, :]),
                    offsets=offsets[row],
                )[0]
                for row, tweet in enumerate(tweets)
            ]
            batch_jaccards.append(np.mean(scores))

    return np.mean(batch_jaccards), np.mean(batch_losses)

In [30]:
def run(fold_num,model):
    """Train *model* with fold *fold_num* held out for validation.

    The five fold tables ``train_1`` .. ``train_5`` and ``device`` are
    module-level names defined elsewhere. The remaining four folds are
    concatenated and shuffled for training. After every epoch the weights
    are written to ``best_model_{fold_num}.bin`` when the validation Jaccard
    score improves; the final weights go to ``model_{fold_num}.bin``.

    Args:
        fold_num: Index (0-4) of the fold used for validation.
        model: Network to train; it is updated in place.

    Returns:
        Dict of per-epoch lists under ``train_jaccard_score``, ``train_loss``,
        ``val_jaccard_score`` and ``val_loss``.
    """
    history = defaultdict(list)
    best_accuracy = 0
    folds = [train_1, train_2, train_3, train_4, train_5]
    print(len(folds))
    eval_data = folds.pop(fold_num)
    train_data = pd.concat(folds).sample(frac=1)  # shuffle the training rows
    train_data_loader = create_data_loader(
        train_data,
        tokenizer=tokenizer,
        max_len=MAX_LEN,
        batch_size=BATCH_SIZE,
    )
    eval_data_loader = create_data_loader(
        eval_data,
        tokenizer=tokenizer,
        max_len=MAX_LEN,
        batch_size=BATCH_SIZE,
    )

    print(f'Fold {fold_num}')
    for epoch in range(EPOCHS):
        print(f'Epoch {epoch + 1}/{EPOCHS}')
        print('-' * 10)

        train_jaccard_score, train_loss = train_epoch(
            model,
            train_data_loader,
            loss_fn,
            optimizer,
            device,
            scheduler,
            train_data,
        )
        print(f'Train loss:{train_loss} Train jaccard score:{train_jaccard_score}')

        val_jaccard_score, val_loss = eval_model(
            model,
            eval_data_loader,
            loss_fn,
            device,
        )
        print(f'Val   loss:{val_loss}  Val jaccard score:{val_jaccard_score}')
        print()

        history['train_jaccard_score'].append(train_jaccard_score)
        history['train_loss'].append(train_loss)
        history['val_jaccard_score'].append(val_jaccard_score)
        history['val_loss'].append(val_loss)

        # Keep a checkpoint of the best validation score seen so far.
        if val_jaccard_score > best_accuracy:
            torch.save(model.state_dict(), f'best_model_{fold_num}.bin')
            best_accuracy = val_jaccard_score
    torch.save(model.state_dict(),f'model_{fold_num}.bin')
    # Hand the collected curves back instead of discarding them.
    return history

In [31]:
# `models` holds networks called with (ids, mask, token_type_ids);
# `models2` holds networks called with (input_ids, attention_mask).
models = []
models2 = []
config = transformers.BertConfig.from_pretrained('../input/roberta/roberta-base-config.json')
# Every model class unpacks the per-layer hidden states as a third encoder
# output, which is controlled by `output_hidden_states` (plural).
config.output_hidden_states = True
for fold in range(5):
    model = Roberta_model_1(config)
    model.to(device)
    # map_location lets checkpoints saved on another device load onto `device`.
    model.load_state_dict(torch.load(f'../input/5fold-roberta/model_{fold}.bin', map_location=device))
    model.eval()
    models.append(model)
for fold in range(5):
    model = Roberta_model_1(config)
    model.to(device)
    model.load_state_dict(torch.load(f'../input/lr0-5-fold0/5lr-3out-init-best_model_{fold}.bin', map_location=device))
    model.eval()
    models.append(model)
for fold in range(5):
    model = Cnn_roberta_models(config)
    model.to(device)
    model.load_state_dict(torch.load(f'../input/cnn01final/model_{fold}.bin', map_location=device))
    model.eval()
    models.append(model)
for fold in range(10):
    model = Roberta_model_2(config)
    # Same placement as the other models so inputs and weights share a device.
    model.to(device)
    model.load_state_dict(torch.load(f'../input/tweet-sentiment-roberta-pytorch/roberta_fold{fold+1}.pth', map_location=device))
    model.eval()
    models2.append(model)


<All keys matched successfully>

# 五、后处理与预测

In [37]:
def get_predictions(models,models2,data_loader):
    """Run every model on *data_loader* and decode one ensembled span per tweet.

    Args:
        models: Networks called as ``model(ids=, mask=, token_type_ids=)``.
        models2: Networks called as ``model(input_ids, attention_mask)``.
        data_loader: Iterable of batch dicts with ``ids``, ``mask``,
            ``token_type_ids``, ``tweet_text`` and ``sentiment``.

    Returns:
        ``(tweets, predicted_texts)`` as two parallel lists.
    """
    predict_selects = []
    ori_tweet = []
    # Weight lists handed to `predict`: first for start scores, second for end scores.
    coef =[[0.8, 1, 0.5, 2], [0.8, 1, 0.5, 2]]

    for d in tqdm(data_loader):
        input_ids = d["ids"].to(device)
        attention_mask = d["mask"].to(device)
        token_type_ids = d["token_type_ids"].to(device)
        tweet_text = d['tweet_text']
        sentiment = d['sentiment']

        start_stack = []
        end_stack = []
        with torch.no_grad():
            for model in models:
                model.eval()
                start, end = model(
                    ids=input_ids,
                    mask=attention_mask,
                    token_type_ids=token_type_ids,
                )
                start_stack.append(start.unsqueeze(0))
                end_stack.append(end.unsqueeze(0))

            for model in models2:
                model.eval()
                start, end = model(input_ids, attention_mask)
                start_stack.append(start.unsqueeze(0))
                end_stack.append(end.unsqueeze(0))

            # Stacked along a new leading model axis.
            batch_starts = torch.cat(start_stack)
            batch_ends = torch.cat(end_stack)

        # For every tweet of the batch, combine the per-model probabilities.
        for i, irow_tweet in enumerate(tweet_text):
            starts_pp = [
                torch.nn.functional.softmax(row, dim=-1).cpu().detach().numpy()
                for row in batch_starts[:, i, :]
            ]
            ends_pp = [
                torch.nn.functional.softmax(row, dim=-1).cpu().detach().numpy()
                for row in batch_ends[:, i, :]
            ]
            i_start = predict(starts_pp, coef[0])
            i_end = predict(ends_pp, coef[1])

            predict_selects.append(
                get_predict_text(
                    tweet=irow_tweet,
                    sentiment=sentiment[i],
                    start=i_start,
                    end=i_end,
                )
            )
            ori_tweet.append(irow_tweet)

    return ori_tweet, predict_selects
def get_predict_text(tweet, sentiment, start, end):
    """Decode the predicted token span back into text.

    ``start`` and ``end`` index the full model input, i.e. four prefix
    tokens (start token, sentiment, two separators), the tweet tokens and a
    trailing separator. The span is clamped to the tweet tokens so the
    sentiment word and the separators can never leak into the prediction.

    Args:
        tweet: Tweet string exactly as it was encoded for the model.
        sentiment: Sentiment label of the tweet (kept for interface
            compatibility; the decoded text does not depend on it).
        start: Index of the first predicted token (inclusive).
        end: Index of the last predicted token (inclusive).

    Returns:
        The decoded text, or whatever the tokenizer decodes an empty id list
        to when the span contains no tweet token (for example ``end < start``).
    """
    prefix_len = 4
    tweet_ids = tokenizer.encode(tweet).ids
    lo = max(int(start) - prefix_len, 0)
    hi = max(int(end) + 1 - prefix_len, 0)
    # Slicing past the last tweet token simply stops there, which also drops
    # the trailing separator.
    return tokenizer.decode(tweet_ids[lo:hi])
def predict( start_pp, coef):
    """Combine per-model position probabilities into one predicted index.

    For every model the probabilities below that model's median are zeroed,
    the rest is scaled by the weight of the model's group, and the weighted
    arrays are averaged. Models are grouped by their position in
    ``start_pp``: 0-4, 5-9, 10-14, and 15 onwards.

    Args:
        start_pp: Sequence of 1-D probability arrays, one per model, all of
            the same length. The arrays are not modified.
        coef: Four weights, one per model group.

    Returns:
        Index of the highest combined score (``0`` when ``start_pp`` is empty).
    """
    if len(start_pp) == 0:
        return 0

    group_size = 5
    last_group = 3
    start_final = np.zeros(len(start_pp[0]))
    for j, start in enumerate(start_pp):
        probs = np.array(start)  # copy so the caller's array stays untouched
        probs[probs < np.median(probs)] = 0
        # Every model from index 15 on belongs to the last group.
        weight = coef[min(j // group_size, last_group)]
        start_final = start_final + probs * 10000000 * weight
    start_final = start_final / len(start_pp)
    return np.argmax(start_final)




In [40]:
def post_process(s):
    """Expand a token that carries three or more '.', '!' or '?' characters.

    With ``core`` being *s* stripped of every character outside
    ``A-Za-z0-9``, the result is ``"core core. core.."`` for dots,
    ``"core core! core!! "`` for exclamation marks and
    ``"core core? core?? "`` for question marks, checked in that order.
    Any other input is returned unchanged.

    Args:
        s: The predicted text.

    Returns:
        The expanded string, or *s* itself when no rule applies.
    """
    marks = re.findall('[^A-Za-z0-9]', s)
    core = re.sub('[^A-Za-z0-9]+', '', s)

    if marks.count('.') >= 3:
        return core + " " + core + '. ' + core + '..'
    if marks.count('!') >= 3:
        return core + " " + core + '! ' + core + '!! '
    if marks.count('?') >= 3:
        return core + " " + core + '? ' + core + '?? '
    return s

In [41]:
# Attach the predicted spans to the test table.
ds_test.loc[:,'selected_text'] = pre_selects
# Empty predictions fall back to the full tweet text. The column is assigned
# back explicitly: an in-place replace on `ds_test.selected_text` is not
# guaranteed to write through to the frame.
ds_test['selected_text'] = ds_test['selected_text'].replace('', np.nan)
ds_test['selected_text'] = ds_test['selected_text'].fillna(ds_test['text'])
# Single-word predictions get the punctuation expansion. One pass suffices:
# the result is either unchanged or no longer a single word.
ds_test['selected_text'] = ds_test['selected_text'].apply(lambda x: post_process(x) if len(x.split())==1 else x)
submit = pd.concat((ds_test.textID, ds_test.selected_text), axis=1)


In [42]:
# Write the submission table to submission.csv without the index column.
submit.to_csv('submission.csv', index=False)