# 情感分析项目总结
感谢官方Baseline：[http://aistudio.baidu.com/aistudio/projectdetail/2085599](http://aistudio.baidu.com/aistudio/projectdetail/2085599)

感谢江流同学观点抽取任务的代码：[https://aistudio.baidu.com/aistudio/projectdetail/2107758](https://aistudio.baidu.com/aistudio/projectdetail/2107758)

感谢Hansen0506同学预测结果融合的思路：[https://aistudio.baidu.com/aistudio/projectdetail/2115867](https://aistudio.baidu.com/aistudio/projectdetail/2115867)


In [None]:
!pip install --upgrade paddlenlp -i https://pypi.org/simple 

## 1. 句子级情感分析
由于调参越调越糟，这里只改了Baseline的epochs数。

改为epochs=10后，对生成的10个预测结果进行融合，得到最终预测结果。

### 1.0 官方Baseline

In [None]:
import paddlenlp
from paddlenlp.transformers import SkepForSequenceClassification, SkepTokenizer

In [None]:
# 解压数据
!unzip -o datasets/ChnSentiCorp
!unzip -o datasets/NLPCC14-SC

In [None]:
# Build the dataset dictionary
def open_func(file_path):
    """Read a TSV file and return its usable data rows.

    The first line (the header) is skipped. Every remaining line is stripped
    of surrounding whitespace and kept only when it still contains at least
    two tab-separated columns.

    Args:
        file_path: Path of the UTF-8 encoded TSV file.

    Returns:
        A list of stripped row strings (columns still joined by tabs).
    """
    # `with` guarantees the handle is closed; the original left it open.
    with open(file_path, 'r', encoding='utf8') as f:
        next(f, None)  # skip the header; tolerate a completely empty file
        rows = (line.strip() for line in f)
        return [row for row in rows if len(row.split('\t')) >= 2]

# Map each dataset name to its splits; every split is the list of raw
# tab-separated rows returned by open_func. Only ChnSentiCorp gets a 'dev'
# entry here.
data_dict = {'chnsenticorp': {'test': open_func('ChnSentiCorp/test.tsv'),
                              'dev': open_func('ChnSentiCorp/dev.tsv'),
                              'train': open_func('ChnSentiCorp/train.tsv')},
             'nlpcc14sc': {'test': open_func('NLPCC14-SC/test.tsv'),
                           'train': open_func('NLPCC14-SC/train.tsv')}}

In [None]:
# 定义数据集
from paddle.io import Dataset, DataLoader
from paddlenlp.data import Pad, Stack, Tuple
import numpy as np
label_list = [0, 1]

# NOTE: token type ids play no role in this task, so they are not produced
# here and the model fills them in by itself.
class MyDataset(Dataset):
    """Single-sentence classification dataset over raw TSV rows.

    Each row is a tab-separated string whose last column is the text. For
    training rows the second-to-last column is the integer label.

    Args:
        data: List of tab-separated row strings.
        tokenizer: Tokenizer whose ``encode`` returns a dict with ``input_ids``.
        max_len: Maximum sequence length passed to the tokenizer.
        for_test: When True, items carry no label.
    """

    def __init__(self, data, tokenizer, max_len=512, for_test=False):
        super().__init__()
        self._data = data
        self._tokenizer = tokenizer
        self._max_len = max_len
        self._for_test = for_test

    def __len__(self):
        return len(self._data)

    def __getitem__(self, idx):
        """Return ``input_ids`` (test) or ``(input_ids, label)`` (train) as int64 arrays."""
        columns = self._data[idx].split('\t')
        encoded = self._tokenizer.encode(columns[-1], max_seq_len=self._max_len)
        input_ids = np.array(encoded['input_ids'], dtype='int64')
        if self._for_test:
            # The label column is not parsed for test rows: the value in that
            # position is never used there, so it must not be forced through int().
            return input_ids
        return input_ids, np.array(int(columns[-2]), dtype='int64')

def batchify_fn(for_test=False):
    """Build the collate function used by the DataLoader.

    Args:
        for_test: When True the samples are bare id sequences; otherwise they
            are ``(ids, label)`` pairs.

    Returns:
        A callable taking a list of samples. In test mode it returns the
        padded ids stacked row-wise; in train mode it returns a list with the
        padded ids and the stacked labels.
    """
    # The padders are created once per batchify_fn() call, reading the
    # module-level `tokenizer` at that moment (same timing as the original
    # lambda default arguments).
    if for_test:
        pad_ids = Pad(axis=0, pad_val=tokenizer.pad_token_id)

        def collate_test(samples, fn=pad_ids):
            # np.vstack replaces np.row_stack, a plain alias that is
            # deprecated as of NumPy 2.0.
            return np.vstack([data for data in fn(samples)])

        return collate_test

    pad_and_stack = Tuple(Pad(axis=0, pad_val=tokenizer.pad_token_id),
                          Stack())

    def collate_train(samples, fn=pad_and_stack):
        return [data for data in fn(samples)]

    return collate_train


def get_data_loader(data, tokenizer, batch_size=32, max_len=512, for_test=False):
    """Wrap raw rows in a MyDataset and return a DataLoader over it.

    Training loaders are shuffled; test loaders keep the row order.
    """
    return DataLoader(
        dataset=MyDataset(data, tokenizer, max_len, for_test),
        batch_size=batch_size,
        collate_fn=batchify_fn(for_test),
        shuffle=not for_test,
    )

In [None]:
import paddle
from paddle.static import InputSpec

# Model and tokenizer
model = SkepForSequenceClassification.from_pretrained('skep_ernie_1.0_large_ch', num_classes=2)
tokenizer = SkepTokenizer.from_pretrained('skep_ernie_1.0_large_ch')

# Settings
data_name = 'nlpcc14sc'  # change this value to switch datasets

## Training hyper-parameters
epochs = 10
learning_rate = 2e-5
batch_size = 8
max_len = 512

## Data loaders
train_dataloader = get_data_loader(data_dict[data_name]['train'], tokenizer, batch_size, max_len, for_test=False)
# Only 'chnsenticorp' has a 'dev' split in data_dict; other datasets train without one.
if data_name == 'chnsenticorp':
    dev_dataloader = get_data_loader(data_dict[data_name]['dev'], tokenizer, batch_size, max_len, for_test=False)
else:
    dev_dataloader = None

# NOTE(review): `input` shadows the builtin of the same name from here on.
input = InputSpec((-1, -1), dtype='int64', name='input')
label = InputSpec((-1, 2), dtype='int64', name='label')
# Rebinds `model` from the bare network to the high-level paddle.Model wrapper.
model = paddle.Model(model, [input], [label])

# Prepare the model: optimizer, loss and metric

optimizer = paddle.optimizer.Adam(learning_rate=learning_rate, parameters=model.parameters())
model.prepare(optimizer, loss=paddle.nn.CrossEntropyLoss(), metrics=[paddle.metric.Accuracy()])

In [None]:
# Start training with save_freq set to 1; the prediction cell below loads one
# checkpoint per epoch from ./checkpoints/<epoch index>.
model.fit(train_dataloader, dev_dataloader, batch_size, epochs, eval_freq=5, save_freq=1, save_dir='./checkpoints', log_freq=200)

In [None]:
# Produce one prediction file per saved epoch checkpoint (10 in total).
import os

# open(..., 'w') does not create missing directories, so make sure the output
# folder exists before the first file is written.
os.makedirs('./submission', exist_ok=True)

# The test split is the same for every checkpoint, so build its loader once.
test_dataloader = get_data_loader(data_dict[data_name]['test'], tokenizer, batch_size, max_len, for_test=True)

for i in range(epochs):
    # Load the checkpoint saved for epoch i
    checkpoint_path = './checkpoints/{}'.format(i)  # path the checkpoint was saved under

    model = SkepForSequenceClassification.from_pretrained('skep_ernie_1.0_large_ch', num_classes=2)
    input = InputSpec((-1, -1), dtype='int64', name='input')
    model = paddle.Model(model, input)
    model.load(checkpoint_path)

    # Predict the class index for every test row, in file order
    save_file = {'chnsenticorp': './submission/ChnSentiCorp{}.tsv'.format(i), 'nlpcc14sc': './submission/NLPCC14-SC{}.tsv'.format(i)}
    predicts = []
    for batch in test_dataloader:
        predict = model.predict_batch(batch)
        predicts += predict[0].argmax(axis=-1).tolist()

    # Write "<id>\t<prediction>" rows; the with-block closes the file itself.
    with open(save_file[data_name], 'w', encoding='utf8') as f:
        f.write("index\tprediction\n")
        for idx, sample in enumerate(data_dict[data_name]['test']):
            qid = sample.split('\t')[0]
            f.write(qid + '\t' + str(predicts[idx]) + '\n')


### 1.1 结果融合
对生成的10个epochs的预测结果求平均数，取整得到最终预测结果

In [None]:
import os

path = "./submission"  # folder holding the per-epoch prediction files
files = os.listdir(path)
d2 = {}  # sample index -> sum of the predicted labels over all files

# ./submission is shared by every task in this notebook, so only merge the
# files that belong to the current sentence-level dataset.
file_prefix = {'chnsenticorp': 'ChnSentiCorp', 'nlpcc14sc': 'NLPCC14-SC'}[data_name]

n_files = 0
for file in files:
    full_path = os.path.join(path, file)
    # The directory test must use the joined path; testing the bare name
    # checked it against the current working directory instead.
    if os.path.isdir(full_path) or not file.startswith(file_prefix):
        continue
    with open(full_path, 'r', encoding='utf-8') as f2:
        next(f2, None)  # skip the header row
        for line in f2:
            item = line.strip().split()
            if len(item) < 2:
                continue  # blank or malformed row
            # int() instead of eval(): file contents must never be executed.
            key = int(item[0])
            d2[key] = d2.get(key, 0) + int(item[1])
    n_files += 1
    print(d2)

with open(file_prefix + '.tsv', 'w', encoding='utf8') as f:
    f.write("index\tprediction\n")
    for key in d2:
        # Average over the files actually merged. The tiny offset pushes an
        # exact 0.5 mean up to 1, because round() would otherwise send it to 0.
        f.write(str(key) + '\t' + str(round(d2[key] / n_files + 0.0000001)) + '\n')

## 2. 目标级情感分析
这里将官方Baseline的预训练模型改为RoBERTa模型，并对SE-ABSA16_PHNS和SE-ABSA16_CAME的训练集进行了人工清洗与合并。

合并后的setrain.tsv已上传至目录内。

### 2.0 使用RoBERTa预训练模型

In [None]:
import paddlenlp
from paddlenlp.transformers import RobertaForSequenceClassification, RobertaTokenizer

In [None]:
# 解压数据
!unzip -o datasets/SE-ABSA16_CAME
!unzip -o datasets/SE-ABSA16_PHNS

In [None]:
# Build the dataset dictionary
def open_func(file_path):
    """Return the data rows of a UTF-8 TSV file.

    The header line is dropped, each remaining line is stripped, and only
    rows that still have at least two tab-separated columns are returned.

    Args:
        file_path: Path of the TSV file to read.

    Returns:
        A list of stripped row strings.
    """
    # Read inside a with-block so the handle is always closed.
    with open(file_path, 'r', encoding='utf8') as f:
        next(f, None)  # header; the default avoids StopIteration on an empty file
        stripped = [line.strip() for line in f]
    return [row for row in stripped if len(row.split('\t')) >= 2]

# Map each dataset name to its splits. Both datasets read the same merged
# training file 'setrain.tsv'; only their test files differ.
data_dict = {'seabsa16phns': {'test': open_func('SE-ABSA16_PHNS/test.tsv'),
                              'train': open_func('setrain.tsv')},
             'seabsa16came': {'test': open_func('SE-ABSA16_CAME/test.tsv'),
                              'train': open_func('setrain.tsv')}}

In [None]:
# 定义数据集
from paddle.io import Dataset, DataLoader
from paddlenlp.data import Pad, Stack, Tuple
import numpy as np
label_list = [0, 1]

# Sentence-pair variant: token type ids are produced as well.
class MyDataset(Dataset):
    """Sentence-pair classification dataset over raw TSV rows.

    The last two columns of each row are ``text_a`` and ``text_b``. For
    training rows the third-to-last column is the integer label.

    Args:
        data: List of tab-separated row strings.
        tokenizer: Tokenizer whose ``encode`` returns a dict with
            ``input_ids`` and ``token_type_ids``.
        max_len: Maximum sequence length passed to the tokenizer.
        for_test: When True, items carry no label.
    """

    def __init__(self, data, tokenizer, max_len=512, for_test=False):
        super().__init__()
        self._data = data
        self._tokenizer = tokenizer
        self._max_len = max_len
        self._for_test = for_test

    def __len__(self):
        return len(self._data)

    def __getitem__(self, idx):
        """Return ``(input_ids, token_type_ids[, label])`` as int64 arrays."""
        columns = self._data[idx].split('\t')
        text_a, text_b = columns[-2], columns[-1]
        encoded = self._tokenizer.encode(text_a, text_b, max_seq_len=self._max_len)
        input_ids = np.array(encoded['input_ids'], dtype='int64')
        token_type_ids = np.array(encoded['token_type_ids'], dtype='int64')
        if self._for_test:
            # The label column is not parsed for test rows: the value in that
            # position is never used there, so it must not be forced through int().
            return input_ids, token_type_ids
        return input_ids, token_type_ids, np.array(int(columns[-3]), dtype='int64')

def batchify_fn(for_test=False):
    """Build the collate function for sentence-pair batches.

    Input ids and token type ids are each padded with the tokenizer's own pad
    values; in training mode the labels are stacked as a third field.
    """
    # Field processors are assembled once per call, reading the module-level
    # `tokenizer` at that moment.
    processors = [Pad(axis=0, pad_val=tokenizer.pad_token_id),
                  Pad(axis=0, pad_val=tokenizer.pad_token_type_id)]
    if not for_test:
        processors.append(Stack())
    batcher = Tuple(*processors)

    def collate(samples, fn=batcher):
        return list(fn(samples))

    return collate


def get_data_loader(data, tokenizer, batch_size=32, max_len=512, for_test=False):
    """Return a DataLoader over `data`; shuffled for training, ordered for test."""
    wrapped = MyDataset(data, tokenizer, max_len, for_test)
    collate = batchify_fn(for_test)
    if for_test:
        do_shuffle = False
    else:
        do_shuffle = True
    return DataLoader(dataset=wrapped, batch_size=batch_size, collate_fn=collate, shuffle=do_shuffle)

经调参，learning_rate取1e-5, epochs取10

In [None]:
import paddle
from paddle.static import InputSpec

# Model and tokenizer
model = RobertaForSequenceClassification.from_pretrained('roberta-wwm-ext-large', num_classes=2)
tokenizer = RobertaTokenizer.from_pretrained('roberta-wwm-ext-large')

# Settings
data_name = 'seabsa16came'  # change this value to switch datasets

## Training hyper-parameters
epochs = 10
learning_rate = 1e-5
batch_size = 8
max_len = 512

## Data loader (no dev split is defined for these datasets in data_dict)
train_dataloader = get_data_loader(data_dict[data_name]['train'], tokenizer, batch_size, max_len, for_test=False)

# NOTE(review): `input` shadows the builtin of the same name from here on.
input = InputSpec((-1, -1), dtype='int64', name='input')
token_type = InputSpec((-1, -1), dtype='int64', name='token_type')
label = InputSpec((-1, 2), dtype='int64', name='label')
# Rebinds `model` from the bare network to the high-level paddle.Model wrapper.
model = paddle.Model(model, [input, token_type], [label])

# Show the last and first training rows as a quick sanity check of the data
print(data_dict[data_name]['train'][-1])
print(data_dict[data_name]['train'][0])

# Prepare the model: optimizer, loss and metric

optimizer = paddle.optimizer.Adam(learning_rate=learning_rate, parameters=model.parameters())
model.prepare(optimizer, loss=paddle.nn.CrossEntropyLoss(), metrics=[paddle.metric.Accuracy()])

In [None]:
# Start training with save_freq set to 1; the prediction cell below loads one
# checkpoint per epoch from ./checkpoints/<epoch index>.
model.fit(train_dataloader, batch_size=batch_size, epochs=epochs, save_freq=1, save_dir='./checkpoints', log_freq=200)

In [None]:
# Produce one prediction file per saved epoch checkpoint (10 in total).
import os

# open(..., 'w') does not create missing directories, so make sure the output
# folder exists before the first file is written.
os.makedirs('./submission', exist_ok=True)

data_name = 'seabsa16phns'  # switch the test set here

# Neither the test set nor its loader depends on the checkpoint, so both are
# set up once instead of on every iteration.
test_dataloader = get_data_loader(data_dict[data_name]['test'], tokenizer, batch_size, max_len, for_test=True)

for i in range(epochs):
    # Load the checkpoint saved for epoch i
    checkpoint_path = './checkpoints/{}'.format(i)  # path the checkpoint was saved under

    model = RobertaForSequenceClassification.from_pretrained('roberta-wwm-ext-large', num_classes=2)
    input = InputSpec((-1, -1), dtype='int64', name='input')
    token_type = InputSpec((-1, -1), dtype='int64', name='token_type')
    model = paddle.Model(model, [input, token_type])
    model.load(checkpoint_path)

    # Predict the class index for every test row, in file order
    save_file = {'seabsa16phns': './submission/SE-ABSA16_PHNS{}.tsv'.format(i), 'seabsa16came': './submission/SE-ABSA16_CAME{}.tsv'.format(i)}
    predicts = []
    for batch in test_dataloader:
        predict = model.predict_batch(batch)
        predicts += predict[0].argmax(axis=-1).tolist()

    # Write "<id>\t<prediction>" rows; the with-block closes the file itself.
    with open(save_file[data_name], 'w', encoding='utf8') as f:
        f.write("index\tprediction\n")
        for idx, sample in enumerate(data_dict[data_name]['test']):
            qid = sample.split('\t')[0]
            f.write(qid + '\t' + str(predicts[idx]) + '\n')

**经提交测试，两个数据集都在epochs=5时预测结果最佳，且高于10个epochs合并后的结果**

## 3. 观点抽取
### 3.0 使用ErnieGram预训练模型
这部分代码基本来自江流同学

In [None]:
import paddlenlp
from paddlenlp.transformers import ErnieGramTokenizer, ErnieGramForTokenClassification

In [None]:
# 解压数据
!unzip -o datasets/COTE-BD
!unzip -o datasets/COTE-DP
!unzip -o datasets/COTE-MFW

Archive:  datasets/COTE-BD.zip
   creating: COTE-BD/
  inflating: COTE-BD/train.tsv       
   creating: __MACOSX/
   creating: __MACOSX/COTE-BD/
  inflating: __MACOSX/COTE-BD/._train.tsv  
  inflating: COTE-BD/License.pdf     
  inflating: __MACOSX/COTE-BD/._License.pdf  
  inflating: COTE-BD/test.tsv        
  inflating: __MACOSX/COTE-BD/._test.tsv  
  inflating: __MACOSX/._COTE-BD      
Archive:  datasets/COTE-DP.zip
   creating: COTE-DP/
  inflating: COTE-DP/train.tsv       
   creating: __MACOSX/COTE-DP/
  inflating: __MACOSX/COTE-DP/._train.tsv  
  inflating: COTE-DP/License.pdf     
  inflating: __MACOSX/COTE-DP/._License.pdf  
  inflating: COTE-DP/test.tsv        
  inflating: __MACOSX/COTE-DP/._test.tsv  
  inflating: __MACOSX/._COTE-DP      
Archive:  datasets/COTE-MFW.zip
   creating: COTE-MFW/
  inflating: COTE-MFW/train.tsv      
   creating: __MACOSX/COTE-MFW/
  inflating: __MACOSX/COTE-MFW/._train.tsv  
  inflating: COTE-MFW/License.pdf    
  inflating: __MACOSX/COTE-MFW/

In [None]:
# Build the dataset dictionary
def open_func(file_path):
    """Load the non-header rows of a UTF-8 TSV file.

    Lines are stripped of surrounding whitespace; a row is returned only if
    it has at least two tab-separated columns after stripping.

    Args:
        file_path: Path of the TSV file to read.

    Returns:
        A list of stripped row strings.
    """
    kept = []
    # The with-block closes the handle; the original never closed it.
    with open(file_path, 'r', encoding='utf8') as f:
        for line_no, line in enumerate(f):
            if line_no == 0:
                continue  # header row
            row = line.strip()
            if len(row.split('\t')) >= 2:
                kept.append(row)
    return kept

# Map each COTE dataset name to its 'test' and 'train' splits; every split is
# the list of raw tab-separated rows returned by open_func.
data_dict = {'cotebd': {'test': open_func('COTE-BD/test.tsv'),
                        'train': open_func('COTE-BD/train.tsv')},
             'cotedp': {'test': open_func('COTE-DP/test.tsv'),
                        'train': open_func('COTE-DP/train.tsv')},
             'cotemfw': {'test': open_func('COTE-MFW/test.tsv'),
                        'train': open_func('COTE-MFW/train.tsv')}}

In [None]:
# 定义数据集
from paddle.io import Subset, Dataset, DataLoader
from paddlenlp.data import Pad, Stack, Tuple
import numpy as np
label_list = {'B': 0, 'I': 1, 'O': 2}
index2label = {0: 'B', 1: 'I', 2: 'O'}

# Token-classification dataset producing B/I/O tags for the opinion target.
class MyDataset(Dataset):
    """Opinion-target extraction dataset over raw TSV rows.

    The last column of a row is the sentence and the second-to-last column
    is the target string. Test items are the encoded sentence only; training
    items are ``(token ids, tag ids)`` with tags taken from ``label_list``.
    """

    def __init__(self, data, tokenizer, max_len=512, for_test=False):
        super().__init__()
        self._data, self._tokenizer = data, tokenizer
        self._max_len, self._for_test = max_len, for_test

    def __len__(self):
        return len(self._data)

    def __getitem__(self, idx):
        fields = self._data[idx].split('\t')
        target, sentence = fields[-2], fields[-1]

        if self._for_test:
            ids = self._tokenizer.encode(sentence, max_seq_len=self._max_len)['input_ids']
            return np.array(ids, dtype='int64')

        # Characters and tokens are not one-to-one, so the target and the text
        # around it are encoded separately and then concatenated.
        pieces = sentence.split(target)
        target_full = self._tokenizer.encode(target)['input_ids']
        cls_id, sep_id = target_full[0], target_full[-1]  # special tokens added by encode
        target_ids = target_full[1:-1]
        target_tags = [label_list['B']] + [label_list['I']] * (len(target_ids) - 1)

        token_ids = [cls_id]
        tag_ids = [label_list['O']]
        last_piece = len(pieces) - 1
        for pos, piece in enumerate(pieces):
            piece_ids = self._tokenizer.encode(piece)['input_ids'][1:-1]
            token_ids.extend(piece_ids)
            tag_ids.extend([label_list['O']] * len(piece_ids))
            if pos != last_piece:
                # A target occurrence sits between every two consecutive pieces.
                token_ids.extend(target_ids)
                tag_ids.extend(target_tags)
        token_ids.append(sep_id)
        tag_ids.append(label_list['O'])

        # Truncate to max_len while keeping the final (separator) position.
        if len(token_ids) > self._max_len:
            head = self._max_len - 1
            token_ids = token_ids[:head] + token_ids[-1:]
            tag_ids = tag_ids[:head] + tag_ids[-1:]
        return np.array(token_ids, dtype='int64'), np.array(tag_ids, dtype='int64')


def batchify_fn(for_test=False):
    """Build the collate function for token-classification batches.

    Args:
        for_test: When True the samples are bare id sequences; otherwise they
            are ``(ids, tag ids)`` pairs.

    Returns:
        A callable taking a list of samples. In test mode it returns the
        padded ids stacked row-wise; in train mode it returns a list with the
        padded ids and the padded tag ids (tags are padded with the 'O' id).
    """
    if for_test:
        pad_ids = Pad(axis=0, pad_val=tokenizer.pad_token_id)

        def collate_test(samples, fn=pad_ids):
            # np.vstack replaces np.row_stack, a plain alias that is
            # deprecated as of NumPy 2.0.
            return np.vstack([data for data in fn(samples)])

        return collate_test

    pad_pair = Tuple(Pad(axis=0, pad_val=tokenizer.pad_token_id),
                     Pad(axis=0, pad_val=label_list['O']))

    def collate_train(samples, fn=pad_pair):
        return [data for data in fn(samples)]

    return collate_train


def get_data_loader(data, tokenizer, batch_size=32, max_len=512, for_test=False, k=0):
    """Build the data loader(s) for the extraction task.

    Args:
        data: List of tab-separated row strings.
        tokenizer: Tokenizer handed to MyDataset.
        batch_size: Batch size of every returned loader.
        max_len: Maximum sequence length handed to MyDataset.
        for_test: When true, return a single ordered test loader.
        k: Fold selector; samples whose index satisfies ``index % 10 == k``
            form the dev split, all others form the train split.

    Returns:
        ``test_loader`` when ``for_test`` is true, otherwise the tuple
        ``(train_loader, dev_loader)``, both shuffled.
    """
    dataset = MyDataset(data, tokenizer, max_len, for_test)
    collate = batchify_fn(for_test)

    if for_test:
        return DataLoader(dataset=dataset, batch_size=batch_size, collate_fn=collate, shuffle=False)

    # Split the indices in a single pass instead of scanning the range twice.
    train_indices, dev_indices = [], []
    for i in range(len(dataset)):
        (dev_indices if i % 10 == k else train_indices).append(i)

    train_ds = Subset(dataset=dataset, indices=train_indices)
    dev_ds = Subset(dataset=dataset, indices=dev_indices)
    train_loader = DataLoader(dataset=train_ds, batch_size=batch_size, collate_fn=collate, shuffle=True)
    dev_loader = DataLoader(dataset=dev_ds, batch_size=batch_size, collate_fn=collate, shuffle=True)
    return train_loader, dev_loader

In [None]:
import paddle
from paddle.static import InputSpec
from paddlenlp.metrics import Perplexity

# Model and tokenizer

model = ErnieGramForTokenClassification.from_pretrained("ernie-gram-zh", num_classes=3)
tokenizer = ErnieGramTokenizer.from_pretrained('ernie-gram-zh')

# Settings
data_name = 'cotebd'  # change this value to switch datasets

## Training hyper-parameters
epochs = 10
learning_rate = 2e-5
batch_size = 8
max_len = 256

## Data loaders: get_data_loader returns a (train, dev) pair when for_test=False
train_dataloader, dev_dataloader = get_data_loader(data_dict[data_name]['train'], tokenizer, batch_size, max_len, for_test=False)

# NOTE(review): `input` shadows the builtin of the same name from here on.
input = InputSpec((-1, -1), dtype='int64', name='input')
label = InputSpec((-1, -1, 3), dtype='int64', name='label')
# Rebinds `model` from the bare network to the high-level paddle.Model wrapper.
model = paddle.Model(model, [input], [label])
# Prepare the model: optimizer, loss and metric
optimizer = paddle.optimizer.Adam(learning_rate=learning_rate, parameters=model.parameters())
model.prepare(optimizer, loss=paddle.nn.CrossEntropyLoss(), metrics=[Perplexity()])

NameError: name 'ErnieGramForTokenClassification' is not defined

In [None]:
# Start training with save_freq set to 1; the prediction cell below loads one
# checkpoint per epoch from ./checkpoints/<epoch index>.
# NOTE(review): dev_dataloader is built by the setup cell but is not passed here — confirm this is intended.
model.fit(train_dataloader, batch_size=batch_size, epochs=epochs, save_freq=1, save_dir='./checkpoints', log_freq=200)

In [None]:
import os
import re


def find_entity(prediction, input_ids):
    """Decode the predicted tag ids of one sample into entity strings.

    A 'B' tag starts a new span, following 'I' tags extend it, and an 'O' tag
    closes the open span and emits its tokens joined into one string.

    Args:
        prediction: Sequence of predicted tag ids (values of ``label_list``).
        input_ids: Token ids aligned position-by-position with ``prediction``.

    Returns:
        A list of entity strings in order of appearance (may contain duplicates).
    """
    entity = []
    entity_ids = []
    for index, idx in enumerate(prediction):
        if idx == label_list['B']:
            entity_ids = [input_ids[index]]
        elif idx == label_list['I']:
            if entity_ids:
                entity_ids.append(input_ids[index])
        elif idx == label_list['O']:
            if entity_ids:
                entity_s = ''
                for token_id in entity_ids:
                    try:
                        token = tokenizer.convert_ids_to_tokens(token_id)
                    except Exception:
                        # Ids the tokenizer cannot map are skipped, as before;
                        # only the bare `except:` was narrowed.
                        continue
                    entity_s += token
                entity.append(entity_s)
                entity_ids = []
    return entity


# open(..., 'w') does not create missing directories.
os.makedirs('./submission', exist_ok=True)

# The test split is the same for every checkpoint, so build its loader once.
test_dataloader = get_data_loader(data_dict[data_name]['test'], tokenizer, batch_size, max_len, for_test=True)

# Produce one prediction file per saved epoch checkpoint (10 in total).
for i in range(epochs):
    # Load the checkpoint saved for epoch i. The original called .format()
    # without an argument, which raises IndexError.
    checkpoint_path = './checkpoints/{}'.format(i)  # path the checkpoint was saved under

    model = ErnieGramForTokenClassification.from_pretrained("ernie-gram-zh", num_classes=3)

    input = InputSpec((-1, -1), dtype='int64', name='input')
    model = paddle.Model(model, [input])
    model.load(checkpoint_path)

    # Collect the predicted tag ids together with the token ids they refer to
    predicts = []
    input_ids = []
    for batch in test_dataloader:
        predict = model.predict_batch(batch)
        predicts += predict[0].argmax(axis=-1).tolist()
        input_ids += batch.numpy().tolist()

    save_file = {'cotebd': './submission/COTE_BD{}.tsv'.format(i), 'cotedp': './submission/COTE_DP{}.tsv'.format(i),
                 'cotemfw': './submission/COTE_MFW{}.tsv'.format(i)}
    with open(save_file[data_name], 'w', encoding='utf8') as f:
        f.write("index\tprediction\n")
        for idx, sample in enumerate(data_dict[data_name]['test']):
            qid = sample.split('\t')[0]
            entity = find_entity(predicts[idx], input_ids[idx])
            entity = [re.sub('##', '', e) for e in entity]  # drop WordPiece continuation markers
            # The brackets must be escaped: the unescaped pattern '[UNK]' is a
            # character class that deletes every 'U', 'N' and 'K' instead.
            entity = [re.sub(r'\[UNK\]', '', e) for e in entity]  # drop unknown-token markers
            # De-duplicate after cleaning, keep first-seen order so the output
            # is deterministic, and drop entities that became empty.
            entity = [e for e in dict.fromkeys(entity) if e]
            f.write(qid + '\t' + '\x01'.join(entity) + '\n')

### 3.1 结果融合
对生成的10个epochs的预测结果进行投票，取出现频率最高的观点作为最终预测结果

In [None]:
data_name = 'cotebd'
import os
path = "./submission"  # folder holding the per-epoch prediction files
files = os.listdir(path)

# ./submission is shared by every task in this notebook, so only read the
# files written for the current dataset.
file_prefix = {'cotebd': 'COTE_BD', 'cotedp': 'COTE_DP', 'cotemfw': 'COTE_MFW'}[data_name]

d2 = {}  # sample index -> one list of predicted entities per file
for i in range(len(data_dict[data_name]['test'])):
    d2[i] = []

for file in files:
    full_path = os.path.join(path, file)
    if os.path.isdir(full_path) or not file.startswith(file_prefix):
        continue
    with open(full_path, 'r', encoding='utf-8') as f2:
        next(f2, None)  # skip the header row
        for line in f2:
            # Split on the tab only: a whitespace split would cut an entity
            # that contains a space.
            item = line.rstrip('\n').split('\t', 1)
            if len(item) < 2 or not item[1].strip():
                continue  # this file produced no entity for the row
            try:
                key = int(item[0])  # int() instead of eval(): never execute file contents
            except ValueError:
                continue
            if key not in d2:
                continue  # index outside the current test set
            # One row may hold several entities separated by '\x01'.
            d2[key].append(item[1].split('\x01'))
    print(d2)

# Flatten to: sample index -> every candidate entity from all files
dfinal = {}
for i in range(len(data_dict[data_name]['test'])):
    dfinal[i] = []
for key in d2:
    for item in d2[key]:
        dfinal[key].extend(item)
print(dfinal)

# Pick the most frequent candidate
def maxElement(listA):
    """Return the most frequent element of ``listA``.

    Ties are resolved in favour of the element that appears first in the
    list. An empty list yields the empty string.

    Args:
        listA: List of hashable candidates (strings in this notebook).

    Returns:
        The most frequent element, or ``''`` when ``listA`` is empty.
    """
    from collections import Counter

    if not listA:
        return ''
    # Counter keeps first-seen order and max() returns the first maximal key,
    # which reproduces the original tie-break without the O(n^2) count() loop.
    counts = Counter(listA)
    return max(counts, key=counts.get)

# Write the fused result: one row per test sample with its most frequent
# entity. The original file-name literal was unterminated (a SyntaxError);
# the with-block already closes the file, so no explicit close() is needed.
with open('COTE_BD.tsv', 'w', encoding='utf8') as f:
    f.write("index\tprediction\n")
    for i in range(len(data_dict[data_name]['test'])):
        f.write(str(i) + '\t' + maxElement(dfinal[i]) + '\n')