## 代码参考了https://www.jianshu.com/p/fba7df3a76fa

### 这是基于gensim和keras的简单baseline，直接对文本进行情感分类，并且没有对实体进行筛选

In [9]:
import os
from os.path import join
import numpy as np
from gensim.models.word2vec import Word2Vec
from gensim.corpora.dictionary import Dictionary
import keras
from keras import Sequential
from keras.preprocessing.sequence import pad_sequences
from keras.layers import Bidirectional, LSTM, Dense, Embedding, Dropout, Activation, Softmax
from sklearn.model_selection import train_test_split
from keras.utils import np_utils
import pickle as pkl
from os.path import join
import pandas as pd
from conf import conf
import gensim
import jieba
from tqdm import tqdm
# Point DATA_DIR at the directory that holds your own copy of the data.
DATA_DIR = '****'
print(os.listdir(DATA_DIR))
# Both CSV files live directly inside DATA_DIR.
TrainDataPath, TestDataPath = (
    join(DATA_DIR, csv_name) for csv_name in ('Train_Data.csv', 'Test_Data.csv')
)



['Train_Data.csv', 'Test_Data.csv']


In [10]:
# Load the raw text that the gensim word vectors are trained on.
def read_data(data_path):
    """Return the ``text`` column of the CSV file at ``data_path`` as a list."""
    frame = pd.read_csv(data_path)
    text_column = frame['text']
    return text_column.to_list()

# Train the gensim word2vec model.
def train_word2vec(sentences, save_path):
    """Segment ``sentences`` with jieba, fit a Word2Vec model and save it.

    All sentences are joined with newlines and segmented in a single jieba
    call. The space-joined tokens are then split back on newlines, and each
    resulting line is split on whitespace into one token list.

    Args:
        sentences: list of raw text strings.
        save_path: where the trained model is written via ``model.save``.

    Returns:
        The trained Word2Vec model.
    """
    joined_text = "\n".join(sentences)
    spaced_tokens = " ".join(jieba.lcut(joined_text))
    sentences_seg = [line.split() for line in spaced_tokens.split("\n")]
    print("开始训练词向量")
    w2v_options = {
        'size': 100,     # word-vector dimensionality
        'min_count': 2,  # word-frequency threshold
        'window': 5,     # context window size
    }
    model = Word2Vec(sentences_seg, **w2v_options)
    model.save(save_path)
    return model

# Convert a gensim model into a word -> id mapping and an embedding matrix.
def generate_id2wec(word2vec_model):
    """Build the vocabulary index and the matching embedding matrix.

    Args:
        word2vec_model: trained model exposing ``wv.vocab`` (the known
            words), ``wv[word]`` (the vector of a word) and
            ``wv.vector_size`` (the vector dimensionality).

    Returns:
        A ``(w2id, embedding_weights)`` tuple. ``w2id`` maps each word to an
        integer id; ids are assigned in sorted word order starting at 1.
        ``embedding_weights`` has shape ``(len(w2id) + 1, vector_size)``;
        row ``i`` holds the vector of the word with id ``i`` and row 0 is
        left as zeros.
    """
    # Read everything from the model that was passed in, not from a
    # module-level global, so the function works for any model.
    keyed_vectors = word2vec_model.wv
    # Ids start at 1 so that 0 stays free for words that are not in the
    # vocabulary (and for padding).
    w2id = {word: idx for idx, word in enumerate(sorted(keyed_vectors.vocab), start=1)}
    embedding_weights = np.zeros((len(w2id) + 1, keyed_vectors.vector_size))
    for word, idx in w2id.items():
        embedding_weights[idx, :] = keyed_vectors[word]
    return w2id, embedding_weights

# Convert words to their integer indices.
def text_to_array(w2index, senlist):
    """Map every token of every sentence to its index in ``w2index``.

    Args:
        w2index: dict mapping a token to its integer index.
        senlist: iterable of sentences, each an iterable of tokens.

    Returns:
        A NumPy array of the index sequences. Tokens missing from
        ``w2index`` become 0. When all sentences have the same length the
        result is a regular 2-D array; otherwise it is a 1-D object array
        holding one list of indices per sentence.
    """
    sentences_array = [[w2index.get(word, 0) for word in sen] for sen in senlist]
    # Sentences usually differ in length. NumPy >= 1.24 raises ValueError when
    # asked to build an array from ragged nested lists unless dtype=object is
    # given explicitly (earlier releases only emitted a deprecation warning).
    if len({len(ids) for ids in sentences_array}) > 1:
        return np.array(sentences_array, dtype=object)
    return np.array(sentences_array)

# Prepare the data used for training and validation.
def prepare_data(w2id, max_len=200):
    """Load the training CSV and turn it into padded index arrays.

    Reads the ``text`` and ``negative`` columns from ``TrainDataPath``,
    segments each text with jieba, holds out 20% of the rows for validation,
    maps tokens to indices with ``text_to_array`` and pads every sequence to
    ``max_len``.

    Args:
        w2id: dict mapping a word to its integer index.
        max_len: length every index sequence is padded/truncated to.

    Returns:
        ``(X_train, y_train, X_val, y_val)`` where the ``X`` arrays come from
        ``pad_sequences`` and the ``y`` arrays are one-hot encoded labels.
    """
    df = pd.read_csv(TrainDataPath)[['text', 'negative']]
    # Segment into words before the lookup: w2id is keyed by jieba tokens and
    # Sentiment.predict segments its input the same way. Iterating the raw
    # string would look up single characters instead. str() guards against
    # non-string cells such as NaN.
    sentences = [jieba.lcut(str(text)) for text in df['text'].to_list()]
    labels = df['negative'].to_list()
    X_train, X_val, y_train, y_val = train_test_split(sentences, labels, test_size=0.2)
    X_train = pad_sequences(text_to_array(w2id, X_train), maxlen=max_len)
    X_val = pad_sequences(text_to_array(w2id, X_val), maxlen=max_len)
    return X_train, np_utils.to_categorical(y_train), X_val, np_utils.to_categorical(y_val)

# Prepare the data used for prediction.
def prepare_predict_data(w2id, max_len=200):
    """Return the test-set texts (as strings) and their entity column.

    Reads the ``text`` and ``entity`` columns from ``TestDataPath``.
    ``w2id`` and ``max_len`` are accepted but not used here.
    """
    frame = pd.read_csv(TestDataPath)[['text', 'entity']]
    texts = list(map(str, frame['text'].to_list()))
    return texts, frame['entity'].to_list()



In [15]:
# Model definition.
class Sentiment:
    """Bidirectional-LSTM text classifier built on pre-trained word vectors.

    Attributes (all taken from the constructor arguments):
        vocab: word -> integer index mapping; words not in it are looked up
            as 0 in ``predict``.
        embedding_weights: initial weight matrix for the Embedding layer.
        Embedding_dim: output dimension of the Embedding layer.
        maxlen: sequence length the inputs are padded/truncated to.
        labels_category: number of output classes.
        model: the compiled Keras model returned by ``build_model``.
    """

    def __init__(self, w2id, embedding_weights, Embedding_dim, maxlen, labels_category):
        self.Embedding_dim = Embedding_dim
        self.embedding_weights = embedding_weights
        self.vocab = w2id
        self.labels_category = labels_category
        self.maxlen = maxlen
        self.model = self.build_model()

    def build_model(self):
        """Build, compile and return the Keras model (also prints its summary)."""
        model = Sequential()
        # Integer ids of shape (batch, maxlen) -> vectors of shape
        # (batch, maxlen, Embedding_dim). The extra input row is for index 0,
        # assuming the ids in vocab start at 1.
        model.add(Embedding(output_dim=self.Embedding_dim,
                            input_dim=len(self.vocab) + 1,
                            weights=[self.embedding_weights],
                            input_length=self.maxlen))
        # Forward and backward LSTM outputs (50 units each) are concatenated.
        model.add(Bidirectional(LSTM(50), merge_mode='concat'))
        model.add(Dropout(0.5))
        model.add(Dense(self.labels_category))
        model.add(Activation('softmax'))
        model.compile(loss='categorical_crossentropy',
                      optimizer='adam',
                      metrics=['accuracy'])
        model.summary()
        return model

    def train(self, X_train, y_train, X_test, y_test, batch_size=128, n_epoch=35,
              model_path='sentiment.h5'):
        """Fit the model with early stopping and save it to ``model_path``.

        Args:
            X_train, y_train: training inputs and one-hot labels.
            X_test, y_test: data passed to Keras as ``validation_data``.
            batch_size: mini-batch size. Note this is the fifth positional
                argument, ahead of ``n_epoch``.
            n_epoch: maximum number of epochs.
            model_path: file the trained model is saved to; pass the same
                path to ``predict``. Defaults to ``'sentiment.h5'``.
        """
        # Stop once val_loss has not improved for 5 consecutive epochs. The
        # weights of the last epoch are kept (restore_best_weights=False).
        early_stop = keras.callbacks.EarlyStopping(monitor='val_loss', min_delta=0, patience=5, verbose=0, mode='auto',
                                                   baseline=None, restore_best_weights=False)

        self.model.fit(X_train, y_train, batch_size=batch_size, epochs=n_epoch,
                       validation_data=(X_test, y_test), callbacks=[early_stop])
        self.model.save(model_path)

    def predict(self, model_path, sents):
        """Return the model output for the raw text strings in ``sents``.

        Loads the weights stored at ``model_path`` into ``self.model``,
        segments each sentence with jieba, maps tokens to ids (0 when a token
        is not in ``self.vocab``), pads to ``self.maxlen`` and runs the model.
        """
        model = self.model
        model.load_weights(model_path)
        new_sen_lists = [jieba.lcut(new_sen) for new_sen in sents]
        sen2id = [[self.vocab.get(word, 0) for word in new_sen_list] for new_sen_list in new_sen_lists]
        sen_input = pad_sequences(sen2id, maxlen=self.maxlen)
        res = model.predict(sen_input)
        return res



In [18]:

# Train the word vectors on the combined train + test text.
texts1 = read_data(TrainDataPath)
texts2 = read_data(TestDataPath)
texts = texts1 + texts2
texts = list(map(str, texts))
model = train_word2vec(texts, save_path='word2vec.model')

w2id, embedding_weights = generate_id2wec(model)
x_train, y_trian, x_val, y_val = prepare_data(w2id, max_len=100)

# Train the Keras text-classification model.
senti = Sentiment(w2id, embedding_weights, Embedding_dim=100, maxlen=100, labels_category=2)
# The fifth positional parameter of Sentiment.train is batch_size, so this
# trains with batch_size=30 and the default number of epochs.
senti.train(x_train, y_trian, x_val, y_val, batch_size=30)




开始训练词向量


  'See the migration notes for details: %s' % _MIGRATION_NOTES_URL


_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_4 (Embedding)      (None, 100, 100)          2511000   
_________________________________________________________________
bidirectional_4 (Bidirection (None, 100)               60400     
_________________________________________________________________
dropout_4 (Dropout)          (None, 100)               0         
_________________________________________________________________
dense_4 (Dense)              (None, 2)                 202       
_________________________________________________________________
activation_4 (Activation)    (None, 2)                 0         
Total params: 2,571,602
Trainable params: 2,571,602
Non-trainable params: 0
_________________________________________________________________
Train on 3999 samples, validate on 1000 samples
Epoch 1/35
Epoch 2/35
Epoch 3/35
Epoch 4/35
Epoch 5/35
Epoch 6/35
Epoch 7/35
Epoch 8

In [22]:
# Run the trained model on the test set.
X_test, entities = prepare_predict_data(w2id, max_len=100)
rs = senti.predict(model_path='sentiment.h5', sents=X_test)

# Build the submission file.
df = pd.read_csv(TestDataPath)[['id', 'entity']]
df['negative'] = np.argmax(rs, axis=1)

# Rename the entity column and put the columns in submission order.
df = df.rename(columns={'entity': 'key_entity'})
df = df[['id', 'negative', 'key_entity']]
# Rows predicted as class 0 get no key entity.
predicted_zero = df['negative'] == 0
df.loc[predicted_zero, 'key_entity'] = np.nan
df.to_csv('basic_baseline.csv', index=False)

In [23]:
# Inspect the first rows of the submission frame.
df.head()

Unnamed: 0,id,negative,key_entity
0,f3b61b38,1,小资钱包;资易贷
1,84b12bae,0,
2,6abf4a82,1,嘉石榴
3,8d076785,1,宜贷网(沪);易捷金融;宜贷网
4,d65a1577,1,贵金属
