# 基于TextCNN模型的豆瓣影评情感分析

## 导入项目需要的相关python包

In [23]:
import pandas as pd
from sklearn.model_selection import train_test_split
from tqdm import tqdm_notebook
import re
import jieba  #分词工具包

## 读取豆瓣影评数据
评分大于3星的评论设定为积极情绪，小于等于3星的评论设定为消极情绪。<br>
打印出前五行数据，这样可以看到加载的数据到底长什么样子。

In [24]:
# Load the Douban short-comment dataset; the first CSV column becomes the index.
data = pd.read_csv('DMSC.csv', index_col=0)
# Binarise the rating: 0 when the rating is <= 3 stars, 1 otherwise.
data = data.assign(
    Star=data['Star'].map(lambda rating: 1 if not rating <= 3 else 0)
)
# Show the first five rows to see what the loaded data looks like.
data.head()

Unnamed: 0_level_0,Movie_Name_EN,Movie_Name_CN,Crawl_Date,Number,Username,Date,Star,Comment,Like
ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
0,Avengers Age of Ultron,复仇者联盟2,2017-01-22,1,然潘,2015-05-13,0,连奥创都知道整容要去韩国。,2404
1,Avengers Age of Ultron,复仇者联盟2,2017-01-22,2,更深的白色,2015-04-24,0,非常失望，剧本完全敷衍了事，主线剧情没突破大家可以理解，可所有的人物都缺乏动机，正邪之间、...,1231
2,Avengers Age of Ultron,复仇者联盟2,2017-01-22,3,有意识的贱民,2015-04-26,0,2015年度最失望作品。以为面面俱到，实则画蛇添足；以为主题深刻，实则老调重弹；以为推陈出...,1052
3,Avengers Age of Ultron,复仇者联盟2,2017-01-22,4,不老的李大爷耶,2015-04-23,1,《铁人2》中勾引钢铁侠，《妇联1》中勾引鹰眼，《美队2》中勾引美国队长，在《妇联2》中终于...,1045
4,Avengers Age of Ultron,复仇者联盟2,2017-01-22,5,ZephyrO,2015-04-22,0,虽然从头打到尾，但是真的很无聊啊。,723


## 对影评数据进行简单的分析与采样

In [25]:
data["Star"].value_counts()  # count comments per binary label (positive/negative balance)

1    1279892
0     845164
Name: Star, dtype: int64

In [26]:
data["Movie_Name_CN"].value_counts()  # count comments per movie

疯狂动物城     137511
大圣归来      133393
后会无期      120200
寻龙诀       113687
你的名字      113260
夏洛特烦恼     109162
釜山行       102876
爱乐之城       96620
西游伏妖篇      91452
小时代1       88903
泰囧         85677
大鱼海棠       83692
长城         83173
西游降魔篇      79962
复仇者联盟      78281
美人鱼        73882
七月与安生      68359
美国队长3      64410
变形金刚4      58746
复仇者联盟2     54153
十二生肖       46233
九层妖塔       44366
小时代3       41152
左耳         39802
湄公河行动      35093
栀子花开       30475
何以笙箫默      26797
钢铁侠1       23739
Name: Movie_Name_CN, dtype: int64

In [27]:
# The full dataset is large and slow to train on, so a subset is drawn to
# speed up training. (With enough time the full data can be used instead.)
# A fixed number of rows is drawn, with replacement, from every
# (movie, label) group; the first sampled rows are displayed afterwards.
rows_per_group = int(2125056 / (28 * 200))


def _draw_group_sample(group):
    """Draw ``rows_per_group`` rows from one group with a fixed seed."""
    return group.sample(n=rows_per_group, replace=True, random_state=0)


sample_df = data.groupby(['Movie_Name_CN', 'Star']).apply(_draw_group_sample)
sample_df.head(3)

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Movie_Name_EN,Movie_Name_CN,Crawl_Date,Number,Username,Date,Star,Comment,Like
Movie_Name_CN,Star,ID,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
七月与安生,0,1012184,Soulmate,七月与安生,2017-01-05,7152,陈信宏大过天,2016-10-13,0,也许是七月 也许是安生 总有一颗不想稳定下来的心,0
七月与安生,0,1034976,Soulmate,七月与安生,2017-01-05,30357,砂糖的砂,2016-11-07,0,跟全世界路过差不多水平吧 摄影也有很大问题 全片抓不出来一帧称得上电影构图的画面 剧情理解...,0
七月与安生,0,1032011,Soulmate,七月与安生,2017-01-05,27332,马边疆,2016-11-08,0,不喜欢，不过马思纯很好看，给两星。,1


## 数据预处理阶段
1. 需要先把数据分成训练集和验证集，训练集用来训练模型，验证集用来检验模型学习的阶段性成果。
2. 使用正则表达式清理非中文字符。
3. 对影评文本进行分词。
4. 分词的同时加载停用词词典，过滤影评中的停用词。
5. 制作词典，对影评语料的每一个词赋予一个整型编号。

In [28]:
# Select text and labels by column name instead of by position: positional
# indices silently break when the column layout changes, and `.values` on the
# whole frame builds a full object-dtype copy just to read one column.
comments = sample_df['Comment'].values
star = sample_df['Star'].values

# Hold out 20% of the sampled comments for validation; fixed seed for
# reproducibility.
x_train, x_test, y_train, y_test = train_test_split(
    comments, star, test_size=0.2, random_state=0)

len(y_train), len(y_test), len(x_train), len(x_test)

(16979, 4245, 16979, 4245)

In [29]:
# Peek at the first two raw training comments.
x_train[:2]

array([' 看完我也没有热泪盈眶，故事大概讲的是梦想与儿女情长（这是句废话，大部分电影都讲这些），情说的多了，我看的就有点累。好故事与讲好一个故事，它的好更偏向于后者（我觉得）。最后的蒙太奇堪比黄粱一梦。关于我们未发生的故事，都在我的琴键下，当最后一个音弥散于梦里，消失在空气里，你我都得醒。',
       ' 硬伤不少，但是这样强大的阵容和欢脱的情节我实在是不想说什么了！ps. 习惯用男演员加标签的我加的好崩溃！ IMAX 3D 2012.5.7'],
      dtype=object)

In [30]:
# Peek at the labels of the first two training comments.
y_train[:2]

array([1, 1], dtype=object)

In [31]:
# Remove every non-Chinese character.
def clean_str(line):
    """Return ``line`` reduced to its CJK Unified Ideographs.

    Every character outside the range U+4E00..U+9FFF (digits, Latin letters,
    whitespace, ASCII and full-width punctuation, ...) is deleted, so no
    further punctuation stripping is required afterwards.

    Args:
        line: The raw comment text. Non-string values (e.g. ``None`` or the
            float NaN pandas uses for missing text) are treated as empty.

    Returns:
        The filtered string; ``""`` when nothing remains or the input is not
        a string.
    """
    if not isinstance(line, str):
        return ""
    return re.sub(r"[^\u4e00-\u9fff]", "", line)

In [32]:
# Load the stop-word list (one word per line).
# The encoding is given explicitly so the Chinese text is decoded the same way
# on every platform (assumes the file is UTF-8 — confirm), and the words are
# kept in a set because they are only ever used for membership tests.
with open('stopWord.txt', encoding='utf-8') as f:
    stopwords = {line.strip('\n') for line in f}

FileNotFoundError: [Errno 2] No such file or directory: 'stopWord.txt'

In [None]:
def cut(text_data, labels, stopwords):
    """Clean and tokenise comments, dropping those with too little content.

    Each comment is passed through ``clean_str`` and segmented with
    ``jieba.cut``. Tokens that are stop words or a single character long are
    discarded. A comment is kept only if more than one token survives.

    Args:
        text_data: Indexable sequence of raw comment strings.
        labels: Sequence of labels, aligned with ``text_data`` by position.
        stopwords: Iterable of stop words to filter out.

    Returns:
        A ``(result, new_labels)`` tuple: the token lists of the retained
        comments and the labels belonging to them, in the original order.
    """
    # Membership is tested once per token, so build the lookup set once
    # instead of scanning a list every time.
    stopword_set = set(stopwords)
    result = []
    new_labels = []
    for index, raw_comment in enumerate(tqdm_notebook(text_data)):
        comment = clean_str(raw_comment)
        # Word segmentation.
        tokens = [
            token.strip('\n')
            for token in jieba.cut(comment, cut_all=False, HMM=True)
            if token not in stopword_set and len(token) > 1
        ]
        if len(tokens) > 1:
            result.append(tokens)
            new_labels.append(labels[index])
    # Return the token lists together with their matching labels.
    return result, new_labels

In [None]:
# Tokenise the training data and the test data separately.
train_cut_result, train_labels = cut(x_train, y_train, stopwords)
test_cut_result, test_labels = cut(x_test, y_test, stopwords)

In [None]:
# Number of retained comments and labels for train and test (each pair should match).
len(train_cut_result), len(train_labels), len(test_cut_result), len(test_labels)

In [None]:
# Inspect two tokenised training comments.
train_cut_result[2:4]

In [None]:
# Labels of the same two training comments.
train_labels[2:4]

In [None]:
# Inspect two tokenised test comments.
test_cut_result[2:4]

In [None]:
# Labels of the same two test comments.
test_labels[2:4]

In [None]:
# Build the word -> integer id dictionary over the train and test tokens.
vocab = train_cut_result + test_cut_result
word2index = {}
for sent in vocab:
    for v in sent:
        if v not in word2index:
            # Ids start at 1. Id 0 is the value sequences are padded with, so
            # giving it to a real word would make that word indistinguishable
            # from padding.
            word2index[v] = len(word2index) + 1
# Number of distinct ids including the reserved padding id 0; this is the
# size the embedding layer needs (largest id + 1).
vocab_count = len(word2index) + 1
print("vocabulary size: {}".format(vocab_count))

In [None]:
# Print the first ten dictionary entries as "word: id".
for k, word_id in list(word2index.items())[:10]:
    print("{}: {}".format(k, word_id))

## 训练数据和验证数据准备阶段
1. 对每一条训练数据分词，去停用词。
2. 将词映射为其在词典中对应的编号。

In [None]:
# Replace every training token with its integer id from the dictionary.
x_train = [[word2index[v] for v in sent] for sent in train_cut_result]
x_train[2:4]

In [None]:
# Encoded training samples and training labels must have the same length.
len(x_train), len(train_labels)

In [None]:
# Replace every test token with its integer id from the dictionary.
x_test = [[word2index[v] for v in sent] for sent in test_cut_result]
x_test[2:4]

In [None]:
# Encoded test samples and test labels must have the same length.
len(x_test), len(test_labels)

## 导入训练模型需要的深度学习框架

In [None]:
from keras.callbacks import EarlyStopping, ModelCheckpoint
from keras.datasets import imdb
from keras.preprocessing import sequence

from text_cnn import TextCNN

## 训练模型

In [None]:
# Model / training hyper-parameters.
maxlen = 80  # length every id sequence is padded or truncated to
batch_size = 32  # batch size passed to model.fit
embedding_dims = 50  # passed to TextCNN; presumably the embedding vector size — confirm in text_cnn
epochs = 10  # maximum number of training epochs passed to model.fit

In [None]:
import numpy as np

# Bring every id sequence to exactly `maxlen` entries.
print('Pad sequences (samples x time)...')
x_train = sequence.pad_sequences(x_train, maxlen=maxlen)
x_test = sequence.pad_sequences(x_test, maxlen=maxlen)
print('x_train shape:', x_train.shape)
print('x_test shape:', x_test.shape)

# The labels are plain Python lists; convert them to arrays because
# model.fit expects array-like targets and recent Keras versions reject a
# NumPy input paired with a list of scalars.
y_train_arr = np.asarray(train_labels, dtype='float32')
y_test_arr = np.asarray(test_labels, dtype='float32')

print('Build model...')
model = TextCNN(maxlen, vocab_count, embedding_dims).get_model()
# The metric is requested as 'acc' so that its validation log key is
# 'val_acc', the key EarlyStopping monitors below. With 'accuracy', newer
# Keras versions log 'val_accuracy' and early stopping would never trigger.
model.compile('adam', 'binary_crossentropy', metrics=['acc'])

print('Train...')
# Stop when validation accuracy has not improved for 3 consecutive epochs.
early_stopping = EarlyStopping(monitor='val_acc', patience=3, mode='max')
# File name under which the best model seen during training is saved.
checkpointer = ModelCheckpoint('montion_best.h5', verbose=1, save_best_only=True)
print(len(x_test), len(test_labels))
model.fit(x_train, y_train_arr,
          batch_size=batch_size,
          epochs=epochs,
          callbacks=[early_stopping, checkpointer],
          validation_data=(x_test, y_test_arr))

# Also keep the model as it is after the last epoch that ran.
model.save("montion_final.h5")

## 预测阶段

In [None]:
# Run one comment through the same cleaning, segmentation and filtering
# steps that were applied to the training data.
sample_text = "不喜欢，不过马思纯很好看，给两星。"
tmp_text = clean_str(sample_text)
seg_list = [
    token.strip('\n')
    for token in jieba.cut(tmp_text, cut_all=False, HMM=True)
    if not (token in stopwords or len(token) <= 1)
]
seg_list

In [None]:
# Map tokens to their ids. Words that never occurred while the dictionary was
# built have no id and are skipped instead of raising KeyError.
seg_list = [word2index[v] for v in seg_list if v in word2index]
seg_list

In [None]:
import numpy as np

In [None]:
# Pad/truncate with the same pad_sequences call and `maxlen` used for the
# training data, so the model sees the layout it was trained on. Padding by
# hand at the end of the sequence (and with a hard-coded length) would not
# match pad_sequences' default behaviour.
seg_list = sequence.pad_sequences([seg_list], maxlen=maxlen)
print(seg_list.shape)
result = model.predict(seg_list)
print(result)
# Scores above 0.5 are reported as positive sentiment, otherwise negative.
if result[0] > 0.5:
    print("积极")
else:
    print("消极")