In [1]:
import os
import re
import jieba
from tqdm import tqdm
import numpy as np
from gensim import corpora
import matplotlib.pyplot as plt
from gensim.models.word2vec import Word2Vec

# 定义常用函数

In [2]:
# 读取文件,code编码方式
def read_coding(filename, code='utf-8'):
    with open(filename, 'r') as fp:
        word_list = fp.readlines()
        word_encode = [word.strip().decode(code, 'ignore') for word in word_list]
    return word_encode

# 打印列表
def print_list(lst):
    print ' '.join(lst)

# 全角转半角
def full2half(ustring):
    rstring = ''
    for uchar in ustring:
        inside_code = ord(uchar)
        if inside_code == 12288:  # 空格      
            inside_code = 32
        elif inside_code == 12290:  # 句号
            inside_code = 46
        elif (inside_code >= 65281 and inside_code <= 65374):
            inside_code -= 65248
        rstring += unichr(inside_code)
    return rstring

# 词典预处理

## 构建基础情感词典
- Hownet知网情感词典（积极：4566  消极：4370）：`dict/hownet/*`
- NTUSD情感词典（积极：2810  消极：8276）: `dict/`
- 中文褒贬义情感词典（积极：5567  消极：4469）: `dict/`

将三个词典进行合并去重,并人工筛选出明显错误，最终得到情感词典`dict/fin_neg`和`dict/fin_pos`


## 停用词处理
- 原始停用词表`dict/stop_words_ch.txt`中去除情感词，否定词，程度副词, 总结词， 关联词， 转折词

## 词典结构
- dict/fin_neg 负情感词
- dict/fin_pos 正情感词
- dict/inversedict.txt 否定词
- dict/degreeDict.txt 程度副词
- dict/stop_words_ch.txt 停用词

## 数据集
- 谭松波酒店评论语料（10000条），选出其中2000条作为测试集
- 李军酒店评论语料（20000条）

In [8]:
# 合并的情感词典
neg_list = read_coding('dict/fin_neg')  # 13533
pos_list = read_coding('dict/fin_pos')  # 10008

# 获取种子词典
seeds = read_coding('dict/seed_501')
seed_n = []
seed_p = []
for seed in seeds: 
    s = seed.split()
    if int(s[1]) == 1:
        seed_p.append(s[0])
    elif int(s[1]) == -1:
        seed_n.append(s[0])
    else:
        print s[0]

new_neg_list = neg_list + seed_n  # 13852
new_pos_list = pos_list + seed_p  # 10189

# 初始情感词典
sen_dict = {}
for w in neg_list:
    sen_dict[w] = -1
for w in pos_list:
    sen_dict[w] = 1

# 关键词
guanjian_neg = u'再也不来 下次不来 后悔入住 再也不住 再也不来住 决不住 不会入住 考虑换酒店 下次不会去 别去了'.split()
guanjian_pos = u'建议入住 推荐入住 强烈推荐 极力推荐 很值得'.split()
for w in guanjian_neg:
    sen_dict[w] = -10
for w in guanjian_pos:
    sen_dict[w] = 10

sen_list = sen_dict.keys()

# 否定词
not_list = read_coding('dict/inversedict.txt')
# 程度副词
degree_list = read_coding('dict/degreeDict.txt', 'gbk')
degree_dict = {}
for line in degree_list:
    lines = line.split(' ')
    degree_dict[lines[0]] = lines[1]
degree_list = degree_dict.keys()

zongjie = u'总体说来 总体来说 总体看 总体感觉 总的来看 总的说来 总之 总而言之 总结 整体感觉 整体来说 整体说来 整体看 一句话'.split()
zhuanzhe = u'但是 却 然而 可是 只是 不过 不料 竟然 偏偏 可惜 岂知 没想到'.split()
guanjian = guanjian_neg + guanjian_pos
# 停用词
stop_words = read_coding('dict/stop_words_ch.txt', 'gbk')

# 将总结词，关键词，转折词放入结巴分词词典中，避免被错分
words = zongjie + zhuanzhe + guanjian
for w in zongjie + zhuanzhe + guanjian:
    jieba.add_word(w)

In [4]:
# 分句：转化为半角符号，根据逗号，句号，分号，回车，问号分句
def text2sen(text):
    text = full2half(text)
    if u'\u5bbe\u9986\u53cd\u9988' in text:
        text = text.split(u'\u5bbe\u9986\u53cd\u9988')[0]  # 去除宾馆反馈的影响
    sents = re.split(',|\.|;|\n|\?| |', text)
    return_text = []
    for sent in sents:
        if '!' in sent:
            return_text.append(sent.split('!')[0]*3)  # 感叹句翻4倍计算
        return_text.append(sent)
    return return_text

# 分词、去除停用词
def sent2word(sentence, new_stop_words):
    segList = jieba.lcut(sentence)
    newSent = [w for w in segList if w not in new_stop_words]
    return newSent

# 获取语料集，先分句后分词，[[senlist], [senlist]]
def corpus2list(path, new_stop_words):
    corpus = []
    for parent, _, fileNames in os.walk(path):
        for fileName in tqdm(fileNames):
            currentPath = os.path.join(parent, fileName)
            with open(currentPath, 'r') as fp:
                text = fp.read().decode('gbk', 'ignore').strip()  # unicode
                sens = text2sen(text)  # 分句
                sen_list = []  # 每一句分词
                for sen in sens:
                    cut_sent = sent2word(sen, new_stop_words)  # 分词
                    sen_list.append(cut_sent)
                corpus.append(sen_list)
    return corpus

# 计算情感分

In [5]:
# 词性分类
def classifyWords(wordList, sen_dict):
    sen_list = sen_dict.keys()
    sen_word, not_word, degree_word = {}, {}, {}
    for i, word in enumerate(wordList):
        if word in sen_list and word not in not_list and word not in degree_list:
            sen_word[i] = sen_dict[word]  # {'loc':score}
        elif word in not_list and word not in degree_list:
            not_word[i] = -1  # {'loc':-1}
        elif word in degree_list:
            degree_word[i] = degree_dict[word]  # {'loc': score}
    return sen_word, not_word, degree_word

# 计算句子级别情感分,输入已分词的句子列表
def sent_score(wordList, sen_dict):
    senWord, notWord, degreeWord = classifyWords(wordList, sen_dict)
    senloc = 0 # 上个情感词位置
    scoreSum = 0
    # 存所有情感词的位置的列表
    senLoc = senWord.keys()
    notLoc = notWord.keys()
    degreeLoc = degreeWord.keys()

    for i in range(len(wordList)): # 遍历每一个词
        # 如果是情感词
        score = 0
        if i in senLoc:
            score += float(senWord[i])
            for j in range(senloc , i): # 在两个情感词之间查找否定词与程度副词
                if j in notLoc: # 存在否定词
                    score *= -1
                elif j in degreeLoc: # 存在程度副词
                    score *= float(degreeWord[j])
                elif j in zhuanzhe:
                    score *= 3
                elif j in zongjie:
                    score *= 2
            senloc = i
            scoreSum += score
    return scoreSum

# 篇章级别情感分
def doc_score(test, sen_dict):
    score_all = []
    for doc in tqdm(test):
        score = 0
        for sent in doc:
            score += sent_score(sent, sen_dict)
        score_all.append(score)
    return score_all

# 一个文本的情感得分
def a_doc_score(doc):
    score = 0
    for sent in doc:
        score += sent_score(sent)
    return score

# 输出情感分以便观察
def print_score(cor):
    for doc in cor:
        score = a_doc_score(doc)
        if score > 0:
            print 
            print u'总得分：', score
            print 
            for sen in doc:
                print sent_score(sen),
                print_list(sen)
    return 0

def get_result(neg_score, pos_score):
    np_neg = np.array(neg_score)
    np_pos = np.array(pos_score)
    np_all = np.append(np_neg, np_pos)
    accuracy = np.append((np_neg < 0), (np_pos > 0)).astype('int').mean()
    precision_neg = sum(np_neg < 0)*1.0/sum(np_all < 0)
    precision_pos = sum(np_pos > 0)*1.0/sum(np_all > 0)
    recall_neg = sum(np_neg < 0)*1.0/len(np_neg)
    recall_pos = sum(np_pos > 0)*1.0/len(np_pos)
    f1_neg = 2*precision_neg*recall_neg/(precision_neg+recall_neg + 1e-8) # f1值
    f1_pos = 2*precision_pos*recall_pos/(precision_pos+recall_pos + 1e-8)
    print 'accuracy: ', accuracy
    print 'precision_neg: ', precision_neg
    print 'precision_pos: ', precision_pos
    print 'recall_neg: ', recall_neg
    print 'recall_pos: ', recall_pos
    print 'f1_neg: ', f1_neg
    print 'f1_pos: ', f1_pos
    return accuracy, precision_neg, precision_pos, recall_neg, recall_pos

# print_score(test_neg)

In [6]:
def sen_main(neg_list, pos_list):
    sen_dict = {}
    for w in neg_list:
        sen_dict[w] = -1
    for w in pos_list:
        sen_dict[w] = 1
    
    # 关键词
    for w in guanjian_neg:
        sen_dict[w] = -10
    for w in guanjian_pos:
        sen_dict[w] = 10

    sen_list = sen_dict.keys()
    
    not_stop_words = sen_list + degree_list + not_list + zongjie + zhuanzhe + guanjian
    new_stop_words = [word for word in stop_words if word not in not_stop_words]
    
    test_neg = corpus2list('data/htl_corpus/test/neg', new_stop_words)
    test_pos = corpus2list('data/htl_corpus/test/pos',new_stop_words)
    
    neg_score = doc_score(test_neg, sen_dict)
    pos_score = doc_score(test_pos, sen_dict)
    
    return get_result(neg_score, pos_score)

In [7]:
result_init = sen_main(neg_list, pos_list)

100%|██████████| 1000/1000 [00:04<00:00, 203.81it/s]
100%|██████████| 1000/1000 [00:03<00:00, 283.52it/s]
100%|██████████| 1000/1000 [00:55<00:00, 18.05it/s]
100%|██████████| 1000/1000 [00:39<00:00, 19.67it/s]

accuracy:  0.724
precision_neg:  0.860563380282
precision_pos:  0.738095238095
recall_neg:  0.611
recall_pos:  0.837
f1_neg:  0.714619878185
f1_pos:  0.784442356782





In [9]:
result_seed = sen_main(new_neg_list, new_pos_list)

100%|██████████| 1000/1000 [00:05<00:00, 170.47it/s]
100%|██████████| 1000/1000 [00:03<00:00, 277.46it/s]
100%|██████████| 1000/1000 [00:57<00:00, 17.37it/s]
100%|██████████| 1000/1000 [00:41<00:00, 24.34it/s]

accuracy:  0.7745
precision_neg:  0.829347826087
precision_pos:  0.814507772021
recall_neg:  0.763
recall_pos:  0.786
f1_neg:  0.794791661675
f1_pos:  0.799999995002





# word2vector建立词向量

## 训练集
- 谭松波酒店评论语料（8000条）
- 李军酒店评论语料（20000条）

In [10]:
def get_text(path):
    text_list = []
    for parent, _, fileNames in os.walk(path):
        for fileName in tqdm(fileNames):
            currentPath = os.path.join(parent, fileName)
            with open(currentPath, 'r') as fp:
                text_list.append(fp.read().decode('gbk', 'ignore').strip())
    return text_list

with open('data/review_sentiment.txt') as fp:
    text = fp.read().decode('gbk', 'ignore').split('ljthunlp')[2:]
    pro_text = [sen.split(' train')[0] for sen in text]

text_neg_chi = get_text('data/htl_corpus/train/neg')
text_pos_chi = get_text('data/htl_corpus/train/pos')
text_merge = text_neg_chi + text_pos_chi + pro_text

100%|██████████| 2000/2000 [00:01<00:00, 1991.43it/s]
100%|██████████| 6000/6000 [00:03<00:00, 1860.13it/s]


## 生成w2v模型

In [11]:
# class MyText(object):
#     def __iter__(self):
#         for line in text_merge:
#             yield jieba.lcut(line)

# sentences = MyText()
# w2v_model = Word2Vec(sentences, size=200, min_count=5, workers=10)
# w2v_model.save('data/w2v_ours_200')
w2v_model = Word2Vec.load('data/w2v_ours_200')
# w2v_sougou_ours_200 全部10W语料
# w2v_ours_200 谭+李2W语料

# 卡方检验进行特征选择

In [12]:
# 分词不分句
def cut_sent(sent_list):
    return_sents = []
    for sent in tqdm(sent_list):
        cuts = jieba.lcut(sent)
        cut = [w for w in cuts if w not in stop_words]  # 去除停用词
        try:
            w1 = cut.index(u'\u5bbe\u9986')  # 去除宾馆反馈
            w2 = cut.index(u'\u53cd\u9988')
            if w2-w1 == 1:
                cut = cut[:w1]
        except:
            cut = cut
        return_sents.append(cut)
    return return_sents
# 分词后的语料
neg_cut = cut_sent(text_neg_chi)
pos_cut = cut_sent(text_pos_chi)

100%|██████████| 2000/2000 [00:09<00:00, 208.31it/s]
100%|██████████| 6000/6000 [00:18<00:00, 322.66it/s]


In [13]:
# 计算CHI,去除停用词，停用词对CHI值影响不大，可去可不去。使用stop_words
def compute_chi(neg_cut, pos_cut):
    neg_corpus_dict = corpora.Dictionary(neg_cut)
    pos_corpus_dict = corpora.Dictionary(pos_cut)
    n = 8000.0
    neg_num = 2000.0
    pos_num = 6000.0
    dfs_neg = neg_corpus_dict.dfs  # 词id在文档集中出现的文档数
    dfs_pos = pos_corpus_dict.dfs
    neg_chi = {}
    pos_chi = {}
    merge_chi = {}
    for i, word in neg_corpus_dict.items():
        a = dfs_neg[i]
        try:
            b = dfs_pos[pos_corpus_dict.token2id[word]]
        except:
            b = 0
        c = neg_num - a
        d = pos_num - b
        x = n*(a*d-b*c)*(a*d-b*c)/neg_num/pos_num/(a+b)/(c+d)  # 计算卡方检验
        neg_chi[word] = x

    for i, word in pos_corpus_dict.items():
        a = dfs_pos[i]
        try:
            b = dfs_neg[neg_corpus_dict.token2id[word]]
        except:
            b = 0
        c = pos_num - a
        d = neg_num - b
        x = n*(a*d-b*c)*(a*d-b*c)/neg_num/pos_num/(a+b)/(c+d)  # 计算卡方检验
        pos_chi[word] = x
    
    # 经测试，CHI值相同
    for i, word in neg_corpus_dict.items():
        neg_chi_score = neg_chi[word]
        neg_chi_num = dfs_neg[i]
        try:
            pos_chi_score = pos_chi[word]
            pos_chi_num = dfs_pos[i]
        except:
            pos_chi_score = 0
            pos_chi_num = 0
        merge_chi[word] = (neg_chi_score*neg_chi_num + pos_chi_score*pos_chi_num)/n
    
    for i, word in pos_corpus_dict.items():
        pos_chi_score = pos_chi[word]
        pos_chi_num = dfs_pos[i]
        try:
            neg_chi_score = neg_chi[word]
            neg_chi_num = dfs_neg[i]
        except:
            neg_chi_score = 0
            neg_chi_num = 0
        merge_chi[word] = (neg_chi_score*neg_chi_num + pos_chi_score*pos_chi_num)/n
    return merge_chi, neg_chi, pos_chi

In [14]:
merge_chi, neg_chi, pos_chi = compute_chi(neg_cut, pos_cut)

sort_chi = sorted(merge_chi.items(), key=lambda i:i[1], reverse=True)

fin_chi_list = [i[0] for i in sort_chi[:1000]]

for i in sort_chi[:10]:
    print i[0],i[1]

不错 467.129534039
不 72.9955600927
好 72.2940172515
很 61.1338529954
说 37.5261589613
房间 17.1679198633
前台 15.7065076259
携程 15.1791901801
一个 13.9857686953
住 12.754512599


## 计算种子词的情感分
- 找出种子词的10个近邻词
- 种子词在原始情感词典中，直接取出情感倾向
- 种子词不在原始情感词典中，则将种子词的情感分加和，大于零得分1，小于零得分-1

In [15]:
def seed_score(seed):
    w2v_list = w2v_model.wv.vocab.keys()
    seed_sen = {}
    for w in tqdm(seed):
        score  = 0
        if w in sen_list:
            seed_sen[w] = sen_dict[w]
        elif w in w2v_list:
            sim_list = w2v_model.wv.most_similar(w, topn=10)
            for e in sim_list:
                if e[0] in sen_list:
                    score += float(sen_dict[e[0]])
            seed_sen[w] = 1 if score>0 else -1
        else:
            seed_sen[w] = 0
    return seed_sen

seed_sen = seed_score(fin_chi_list)
for w,s in seed_sen.items()[:10]:
    print w,s

100%|██████████| 1000/1000 [00:07<00:00, 140.48it/s]

退 1
all -1
挂 -1
奉劝 -1
表扬 1
糟糕 -1
还要 -1
千万 -1
整体 1
凌晨 -1





## 计算近邻词情感分
- 近邻词在原始情感词典中，直接取出情感倾向
- 近邻词不在原始情感词典中，则寻找与近邻词最相近的种子词，近邻词的倾向与该种子词相同

In [16]:
def get_score_by_seed(seed_sen_dict, word):
    score = 0
    fin_sim = 0
    seed_sen_list = seed_sen_dict.keys()
    for seed in seed_sen_list:
        try:
            sim = w2v_model.wv.similarity(word, seed)
        except:
            sim = 0
        if sim > fin_sim:
            socre = seed_sen_dict[seed]
            fin_sim = sim
    return score

def neig_score(seed_sen_dict):
    neig_sen = {}
    seed_sen_list = seed_sen_dict.keys()
    w2v_list = w2v_model.wv.vocab.keys()
    for w in tqdm(seed_sen_list):
        if w not in w2v_list:  # 种子词不在v2w词表中
            continue
        sim_list = w2v_model.wv.most_similar(w, topn=10)  # 找到10个近邻词
        for e in sim_list:
            score = 0
            if e[0] in neig_sen.keys():  # 近邻词已保存
                continue
            elif e[0] in sen_list:  # 近邻词在情感词典中
                score = sen_dict[e[0]]
            elif e[0] in seed_sen_list:  # 近邻词在种子词典中
                score = seed_sen_dict[e[0]]
            else:  # 通过相似度计算近邻词得分
                score = get_score_by_seed(seed_sen_dict, e[0])
            neig_sen[e[0]] = score
    return neig_sen

neig_sen = neig_score(seed_sen)
for w,s in neig_sen.items()[:10]:
    print w,s

100%|██████████| 1000/1000 [00:52<00:00, 19.10it/s]

旅舍 0
斑驳 -1
挂 -1
西方 0
　0
何况 0
出来 0
第二 1
不问 0
明快 1





In [17]:
seed_and_neig = seed_sen.copy()
seed_and_neig.update(neig_sen)
# 输出保存，以便人工筛选
with open('dict/seed', 'w') as fp:
    for w,s in seed_and_neig.items():
        fp.write(w.encode('utf8') + '\t' + str(s) + '\n')