# 1.查看数据

In [7]:
# 查看前五条数据
import pandas as pd
df = pd.read_csv('./data/waimai_10k.csv',encoding = "utf8")
df.head()

Unnamed: 0,label,review
0,1,很快，好吃，味道足，量大
1,1,没有送水没有送水没有送水
2,1,非常快，态度好。
3,1,方便，快捷，味道可口，快递给力
4,1,菜味道很棒！送餐很及时！


In [8]:
# 查看有无缺失值
df.isnull().sum()

label     0
review    0
dtype: int64

In [9]:
# 查看负面评论和正面评论的数量
for name, group in df.groupby("label"):
    print(name,'\t', len(group))

0 	 7987
1 	 4000


# 2.数据清洗

In [10]:
# 提取数据
label = []
review = []
# 提取标签数据
for i in df["label"]:
    label.append(i)
# 提取评论数据
for i in df["review"]:
    review.append(i)

In [11]:
# 去除标点符号和数字等无关字符，仅匹配中文字符
import re
for i in range(len(review)):
    pattern = re.compile("[^\u4e00-\u9fa5]")
    sentence = re.sub(pattern,'',review[i])
    review[i] = sentence

# 3.分词

## 3.1 HMM分词

In [12]:
def train():
    # 初始化参数
    trans_prob = {}  # 转移概率
    emit_prob = {}  # 发射概率
    init_prob = {}  # 状态出现次数
    Count_dict={}
    state_list = ['B', 'M', 'E', 'S']
    for state in state_list:
        trans = {}
        for s in state_list:
            trans[s] = 0
        trans_prob[state] = trans
        emit_prob[state] = {}
        init_prob[state] = 0
        Count_dict[state] = 0
    count = -1
    # 读取并处理单词、计算概率矩阵
    path = './data/trainCorpus.txt'
    for line in open(path, 'r'):
        count += 1
        line = line.strip()
        if not line:
            continue

        # 读取每一行的单词
        word_list = []
        for i in line:
            if i != ' ':
                word_list.append(i)

        # 标注每个单词的位置标签
        word_label = []
        for word in line.split():
            label = []
            if len(word) == 1:
                label.append('S')
            else:
                label += ['B'] + ['M'] * (len(word) - 2) + ['E']
            word_label.extend(label)

        # 统计各个位置状态下的出现次数，用于计算概率
        for index, value in enumerate(word_label):
            Count_dict[value] += 1
            if index == 0:
                init_prob[value] += 1
            else:
                trans_prob[word_label[index - 1]][value] += 1
                emit_prob[word_label[index]][word_list[index]] = (
                        emit_prob[word_label[index]].get(
                            word_list[index], 0) + 1.0)
    # 初始概率
    for key, value in init_prob.items():
        init_prob[key] = value * 1 / count
        # 转移概率
    for key, value in trans_prob.items():
        for k, v in value.items():
            value[k] = v / Count_dict[key]
        trans_prob[key] = value
    # 发射概率，采用加1平滑
    for key, value in emit_prob.items():
        for k, v in value.items():
            value[k] = (v + 1) / Count_dict[key]
        emit_prob[key] = value
    # 将3个概率矩阵保存至json文件


    model = './tmp/hmm_model.json'
    f = open(model, 'w')
    f.write(json.dumps(trans_prob) + '\n' + json.dumps(emit_prob) +
            '\n' + json.dumps(init_prob))
    f.close()

In [13]:
def viterbi(text, state_list, init_prob, trans_prob, emit_prob):
    V = [{}]
    path = {}
    # 初始概率
    for state in state_list:
        #print('=========================')
        #print(state)
        #print('**********')
        #print(init_prob)
        V[0][state] = init_prob[state] * emit_prob[state].get(text[0], 0)
        path[state] = [state]

    # 当前语料中所有的字
    key_list = []
    for key, value in emit_prob.items():
        for k, v in value.items():
            key_list.append(k)

    # 计算待分词文本的状态概率值，得到最大概率序列
    for t in range(1, len(text)):
        V.append({})
        newpath = {}
        for state in state_list:
            if text[t] in key_list:
                emit_count = emit_prob[state].get(text[t], 0)
            else:
                emit_count = 1
            (prob, a) = max(
                [(V[t - 1][s] * trans_prob[s].get(state, 0)* emit_count, s)
                             for s in state_list if V[t - 1][s] > 0], default=(0.0, "S"))
            V[t][state] = prob
            newpath[state] = path[a] + [state]
        path = newpath
    # 根据末尾字的状态，判断最大概率状态序列
    if emit_prob['M'].get(text[-1], 0) > emit_prob['S'].get(text[-1], 0):
        (prob, a) = max([(V[len(text) - 1][s], s) for s in ('E', 'M')])
    else:
        (prob, a) = max([(V[len(text) - 1][s], s) for s in state_list])

    return (prob, path[a])

In [19]:
def cut(text):
    state_list = ['B', 'M', 'E', 'S']
    model = './tmp/hmm_model.json'
    # 先检查当前路径下是否有json文件，如果有json文件，需要删除

    if os.path.exists(model):
        f = open(model, 'rb')
        trans_prob = json.loads(f.readline())
        emit_prob = json.loads(f.readline())
        init_prob = json.loads(f.readline())
        f.close()
    else:
        trans_prob = {}
        emit_prob = {}
        init_prob = {}
    # 利用维特比算法，求解最大概率状态序列
    prob, pos_list = viterbi(text, state_list, init_prob, trans_prob, emit_prob)
    # 判断待分词文本每个字的状态，输出结果
    begin, follow = 0, 0
    for index, char in enumerate(text):
        state = pos_list[index]
        if state == 'B':
            begin = index
        elif state == 'E':
            yield text[begin: index+1]
            follow = index + 1
        elif state == 'S':
            yield char
            follow = index + 1
    if follow < len(text):
        yield text[follow:]

In [20]:
# 训练
import os
import json
import datetime
starttime = datetime.datetime.now()
train()
endtime = datetime.datetime.now()
print('training times is '+str((endtime-starttime).seconds)+' seconds')

training times is 0 seconds


In [23]:
# HMM分词
from tqdm import tqdm
HMM_words = []
for sentence in tqdm(review):
    if sentence != "":
        HMM_words.append(list(cut(sentence)))
HMM_words

100%|███████████████████████████████████████████████████████████████████████████| 11987/11987 [01:07<00:00, 176.74it/s]


[['很', '快好', '吃味道足', '量大'],
 ['没有', '送水', '没有', '送水', '没有', '送水'],
 ['非常', '快态度', '好'],
 ['方便', '快捷', '味道', '可口', '快递', '给力'],
 ['菜味道', '很棒', '送餐', '很及', '时'],
 ['今天', '师傅', '是', '不是', '手抖', '了', '微', '辣格外', '辣'],
 ['送', '餐快', '态度', '也', '特别', '好辛', '苦', '啦', '谢谢'],
 ['超级',
  '快',
  '就',
  '送到',
  '了',
  '这么',
  '冷',
  '的',
  '天气',
  '骑士',
  '们',
  '辛',
  '苦',
  '了',
  '谢',
  '谢',
  '你',
  '们',
  '麻',
  '辣',
  '香',
  '锅',
  '依',
  '然',
  '很',
  '好',
  '吃'],
 ['经过', '上次', '晚了', '小时', '这次', '超级', '快', '分钟', '就', '送到', '了'],
 ['最后', '五分', '钟订', '的', '卖', '家', '特别', '好', '接单', '了', '谢谢'],
 ['量', '大好', '吃', '每次', '点', '的', '都够', '吃', '两次'],
 ['挺', '辣的', '吃', '着', '还', '可以', '吧'],
 ['味道', '好', '送餐', '快分', '量足'],
 ['量', '足好', '吃', '送餐', '也', '快'],
 ['特别', '好', '吃量', '特大', '而且', '送餐', '特别', '快', '特别', '特别棒'],
 ['口', '感好', '的', '很速', '度快'],
 ['相当', '好', '吃', '的', '香锅', '分', '量够', '足味道', '也', '没', '的', '说'],
 ['好', '吃', '速度', '包装', '也', '有品质', '不出', '家门', '就能', '吃到', '餐厅', '的', '味道'],
 ['味道', '好

In [28]:
HMM_data = []
for i in HMM_words:
    t = (" ").join([j for j in i])
    HMM_data.append(t)

## 3.2 jieba分词

In [24]:
import jieba
jieba_words = []
for sentence in review:
    if sentence != "":
        jieba_words.append(jieba.lcut(sentence))
jieba_words

Building prefix dict from the default dictionary ...
Loading model from cache C:\Users\lst\AppData\Local\Temp\jieba.cache
Loading model cost 0.561 seconds.
Prefix dict has been built successfully.


[['很快', '好吃', '味道', '足量', '大'],
 ['没有', '送水', '没有', '送水', '没有', '送水'],
 ['非常', '快', '态度', '好'],
 ['方便快捷', '味道', '可口', '快', '递给', '力'],
 ['菜', '味道', '很棒', '送餐', '很', '及时'],
 ['今天', '师傅', '是不是', '手抖', '了', '微辣', '格外', '辣'],
 ['送餐', '快', '态度', '也', '特别', '好', '辛苦', '啦', '谢谢'],
 ['超级',
  '快',
  '就',
  '送到',
  '了',
  '这么',
  '冷',
  '的',
  '天气',
  '骑士',
  '们',
  '辛苦',
  '了',
  '谢谢你们',
  '麻辣',
  '香锅',
  '依然',
  '很',
  '好吃'],
 ['经过', '上次', '晚', '了', '小时', '这次', '超级', '快', '分钟', '就', '送到', '了'],
 ['最后', '五分钟', '订', '的', '卖家', '特别', '好', '接单', '了', '谢谢'],
 ['量', '大', '好吃', '每次', '点', '的', '都', '够吃', '两次'],
 ['挺辣', '的', '吃', '着', '还', '可以', '吧'],
 ['味道', '好', '送', '餐快', '分量', '足'],
 ['量足', '好吃', '送餐', '也', '快'],
 ['特别', '好吃', '量', '特大', '而且', '送餐', '特别', '快', '特别', '特别', '棒'],
 ['口感', '好', '的', '很', '速度', '快'],
 ['相当', '好吃', '的', '香锅', '分量', '够', '足', '味道', '也', '没', '的', '说'],
 ['好吃',
  '速度',
  '包装',
  '也',
  '有',
  '品质',
  '不',
  '出',
  '家门',
  '就',
  '能',
  '吃',
  '到',
  '餐厅',
  '的',
  '味道'],


In [29]:
jieba_data = []
for i in jieba_words:
    t = (" ").join([j for j in i])
    jieba_data.append(t)

# 4.提取特征

In [26]:
# 加载停止词
stopword_list = [k.strip() for k in open('./data/stopwords.txt', encoding='utf-8') if k.strip() != '']

## 4.1 TF-IDF

In [27]:
# 创建TF-IDF模型
from sklearn.feature_extraction.text import TfidfVectorizer
tf_idf = TfidfVectorizer(stop_words = stopword_list, ngram_range=(1, 3))

In [31]:
# HMM
HMM_tfidf = tf_idf.fit_transform(HMM_data)
# jieba
jieba_tfidf = tf_idf.fit_transform(jieba_data)

## 4.2 CountVectorizer

In [32]:
# 使用CountVectorizer模型
from sklearn.feature_extraction.text import CountVectorizer
countVectorizer = CountVectorizer(min_df = 2, stop_words = stopword_list)

In [33]:
# HMM
HMM_count = countVectorizer.fit_transform(HMM_data)
# jieba
jieba_count = countVectorizer.fit_transform(jieba_data)

# 5.模型选择

## 5.1 SVM + TF-IDF + HMM

In [34]:
# 按8:2划分训练集和测试集
from sklearn.model_selection import train_test_split
train_data, test_data, train_label, test_label = train_test_split(HMM_tfidf, label, test_size = 0.2, shuffle=True)

In [35]:
from sklearn.svm import SVC
svm_model = SVC(kernel = 'linear')
svm_model.fit(train_data, train_label)

SVC(kernel='linear')

In [36]:
# 输出准确率
score = svm_model.score(test_data, test_label)
print("准确率:%.2f" % score)

准确率:0.83


In [38]:
# 各个分类的精确率、召回率、F1值
from sklearn.metrics import classification_report
pred = svm_model.predict(test_data)
print(classification_report(test_label, pred, target_names = ["负面评论", "正面评论"]))

              precision    recall  f1-score   support

           0       0.82      0.96      0.88      1583
           1       0.88      0.59      0.70       815

    accuracy                           0.83      2398
   macro avg       0.85      0.77      0.79      2398
weighted avg       0.84      0.83      0.82      2398

