In [18]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import re
import jieba.posseg as psg

# word = pd.read_csv("./word.csv")

reviews = pd.read_csv('data/reviews.csv')

reviews = reviews.drop_duplicates(subset=['content', 'content_type'])
content = reviews["content"]
# 去除英文、数字、京东、美的、电热水器等词语,pattern
strinfo = re.compile('[0-9a-zA-Z]|京东|美的|电热水器|热水器|')
content = content.apply(lambda x: strinfo.sub('', x))
# 分词
worker = lambda s: [(x.word, x.flag) for x in psg.cut(s)]  # 自定义简单分词函数
seg_word = content.apply(worker)
# 删除停用词
stop_path = open("data/stoplist.txt", 'r', encoding='UTF-8')
stop = stop_path.readlines()
stop = [x.replace('\n', '') for x in stop]
# 遍历所有词，取出停用词并选出名词，统计词频
word_posneg = pd.DataFrame(columns=['index_content', 'word', 'nature', 'content_type', 'index_word'])
index_content = 0
for word_set in seg_word:
    index_content += 1
    index_word = 0
    for w in word_set:
        index_word += 1
#         if w[0] not in stop and 'n' in w[1]:
            # DataFrame每行要添加的Series
            # word_series = pd.Series(
            #     [index_content, w[0], w[1], reviews.iloc[index_content - 1]["content_type"], index_word])
            # word_posneg = pd.concat([word_posneg, word_series], axis=0, ignore_index=True)
        word_posneg.loc[len(word_posneg)] = [index_content, w[0], w[1], reviews.iloc[index_content - 1]["content_type"], index_word]

# 读入正面、负面情感评价词
pos_comment = pd.read_csv("data/正面评价词语（中文）.txt", header = None, sep = "/n", encoding = 'utf-8', engine = 'python')
neg_comment = pd.read_csv("data/负面评价词语（中文）.txt", header = None, sep = "/n", encoding = 'utf-8', engine = 'python')
pos_emotion = pd.read_csv("data/正面情感词语（中文）.txt", header = None, sep = "/n", encoding = 'utf-8', engine = 'python')
neg_emotion = pd.read_csv("data/负面情感词语（中文）.txt", header = None, sep = "/n", encoding = 'utf-8', engine = 'python')

# 合并情感词与评价词
positive = set(pos_comment.iloc[:, 0]) | set(pos_emotion.iloc[:, 0])
negative = set(neg_comment.iloc[:, 0]) | set(neg_emotion.iloc[:, 0])

# 正负面情感词表中相同的词语
intersection = positive & negative

positive = list(positive - intersection)
negative = list(negative - intersection)

positive = pd.DataFrame({"word": positive,"weight": [1] * len(positive)})
negative = pd.DataFrame({"word": negative,"weight": [-1] * len(negative)})

posneg = pd.concat([positive,negative],axis=0)

# 将分词结果与正负面情感词表合并，定位情感词
data_posneg = posneg.merge(word_posneg, left_on='word', right_on='word', how='right')
# data_posneg = data_posneg.sort_values(by = ['index_content','index_word'])

# 查看原来该句评论问pos，但其中分词后词情感标注未负面的
data_posneg[(data_posneg["content_type"] == 'pos') & (data_posneg["weight"] < 1)]
data_posneg[(data_posneg["content_type"] == 'neg') & (data_posneg["weight"] == 1)]
data_posneg.to_csv("data/data_posneg.csv", index = None)

In [19]:
data_posneg = pd.read_csv("data/data_posneg.csv")
# 载入否定词表
notdict = pd.read_csv("data/not.csv")

# 构造新列，作为经过否定词修正后的情感值
data_posneg['amend_weight'] = data_posneg['weight']
data_posneg['id'] = np.arange(0, len(data_posneg))

# 只保留有情感值的词语
only_inclination = data_posneg.dropna().reset_index(drop=True)
index = only_inclination['id']

for i in np.arange(0, len(only_inclination)):
    # 提取第i个情感词所在的评论
    review = data_posneg[data_posneg['index_content'] == only_inclination['index_content'][i]]
    review.index = np.arange(0, len(review))
    # 第i个情感值在该文档的位置
    affective = only_inclination['index_word'][i]
    if affective == 1:
        ne = sum([i in notdict['term'] for i in review['word'][affective - 1]]) % 2
        if ne == 1:
            data_posneg['amend_weight'][index[i]] = -data_posneg['weight'][index[i]]          
    elif affective > 1:
        ne = sum([i in notdict['term'] for i in review['word'][[affective - 1, affective - 2]]]) % 2
        if ne == 1:
            data_posneg['amend_weight'][index[i]] = -data_posneg['weight'][index[i]]
# 更新只保留情感值的数据
only_inclination = only_inclination.dropna()
# 计算每条评论的情感值
emotional_value = only_inclination.groupby(['index_content'], as_index = False)['amend_weight'].sum()
# 去除情感值为0的评论
emotional_value = emotional_value[emotional_value['amend_weight'] != 0]

In [20]:
emotional_value

Unnamed: 0,index_content,amend_weight
0,1,5.0
1,2,2.0
2,3,2.0
3,4,7.0
4,5,8.0
...,...,...
1855,1966,2.0
1857,1968,-2.0
1858,1969,-1.0
1859,1970,-3.0


In [22]:
review

Unnamed: 0,word,weight,index_content,nature,content_type,index_word,amend_weight,id
0,东西,,1973,ns,neg,1,,63765
1,送,,1973,v,neg,2,,63766
2,的,,1973,uj,neg,3,,63767
3,挺快,1.0,1973,v,neg,4,1.0,63768
4,，,,1973,x,neg,5,,63769
5,后期,,1973,t,neg,6,,63770
6,报装,,1973,v,neg,7,,63771
7,天,,1973,q,neg,8,,63772
8,还,,1973,d,neg,9,,63773
9,没人,,1973,v,neg,10,,63774
