<h1>Table of Contents<span class="tocSkip"></span></h1>
<div class="toc" style="margin-top: 1em;"><ul class="toc-item"></ul></div>

In [1]:
import os
import pandas as pd

# 1. 导入数据
path = r"../data/"
origin_file = "comment_nm.xlsx"
comments_df = pd.read_excel(os.path.join(path, origin_file), encoding="utf8")
comments_df.shape
print(comments_df.shape, comments_df.head())
# 留个备份, 这样之后覆盖写错了就不用重新覆盖
comments_df2 = comments_df.copy()


(77273, 8)      团购活动ID       用户ID     用户名                 评价时间                      评价内容  \
0  10051080  437407990  嗯****～  2018-04-04 08:35:30          这家店竟然没有玉米饼和杠子馍。。   
1  10051080  434997839  露****4  2017-10-25 08:18:22           味道很不错，今天还要再去，很棒   
2  10051080  424757158  礼****来  2017-01-19 02:38:39      很划算，适合5、6个人吃不浪费，很喜欢！   
3  10051080  424646918  请****7  2017-01-17 09:15:15           味道不错，喜欢吃肉的朋友可以去   
4  10051080  423190677  1****0  2016-12-26 01:43:18  环境优雅，爱吃肉的朋友可以去体验一下，味道不错！   

   评分         消费门店   用户排名  
0   4  姥家大锅台(交通路店)  10161  
1   5  姥家大锅台(交通路店)   9681  
2   5  姥家大锅台(交通路店)   8847  
3   5  姥家大锅台(交通路店)   8835  
4   5  姥家大锅台(交通路店)   8772  


In [2]:

# 2. 清洗数据, 删除空的数据
def clean_sents(txt):
    txt = str(txt) if txt is not None else ""
    if len(txt) == 0:
        return None
    else:
        return txt

comments_df2["评价内容"] = comments_df2["评价内容"].apply(clean_sents)
comments_df2 = comments_df2[comments_df2["评价内容"] != "nan"]
len(comments_df2)

58117

In [3]:

# 2. 引入停用词文本
import jieba

stopwords_file = "stopwords.txt"
with open(os.path.join(path, stopwords_file), "r", encoding="utf8") as f:
    stopwords_list = [word.strip() for word in f.read()]


In [4]:

def filter_stopwords(txt):
    """过滤停用词"""
    sent = jieba.lcut(txt)
    words = []
    for word in sent:
        word = word.strip()
        if(word in stopwords_list):
            continue
        else:
            words.append(word)
    return words

comments_df2["评价内容"] = comments_df2["评价内容"].apply(filter_stopwords)
comments_df2.head()


Building prefix dict from the default dictionary ...
Loading model from cache C:\Users\Public\Documents\Wondershare\CreatorTemp\jieba.cache
Loading model cost 1.077 seconds.
Prefix dict has been built succesfully.


Unnamed: 0,团购活动ID,用户ID,用户名,评价时间,评价内容,评分,消费门店,用户排名
0,10051080,437407990,嗯****～,2018-04-04 08:35:30,"[这家, 店, 竟然, 没有, 玉米饼, 杠子, 馍]",4,姥家大锅台(交通路店),10161
1,10051080,434997839,露****4,2017-10-25 08:18:22,"[味道, 很, 不错, 今天, 还要, 去, 很棒]",5,姥家大锅台(交通路店),9681
2,10051080,424757158,礼****来,2017-01-19 02:38:39,"[很, 划算, 适合, 个人, 吃, 浪费, 很, 喜欢]",5,姥家大锅台(交通路店),8847
3,10051080,424646918,请****7,2017-01-17 09:15:15,"[味道, 不错, 喜欢, 吃, 肉, 朋友, 可以, 去]",5,姥家大锅台(交通路店),8835
4,10051080,423190677,1****0,2016-12-26 01:43:18,"[环境, 优雅, 爱, 吃, 肉, 朋友, 可以, 去, 体验, 一下, 味道, 不错]",5,姥家大锅台(交通路店),8772


In [5]:

# 3. 切分训练集和验证集和测试集
from sklearn.model_selection import train_test_split

train_X, val_X, train_y, val_y = train_test_split(comments_df2["评价内容"], comments_df2["评分"], test_size=0.3)
val_X, test_X, val_y, test_y = train_test_split(val_X, val_y, test_size=0.5)


In [6]:
# 4. 统计词频
from nltk import FreqDist

all_words = []
for comment in comments_df2["评价内容"]:
    all_words.extend(comment)

len(all_words)

fdisk = FreqDist(all_words)
TOP_COMMON_WORDS = 1000
most_common_words = fdisk.most_common(TOP_COMMON_WORDS)
most_common_words[:10]


[('很', 23350),
 ('不错', 21834),
 ('味道', 17218),
 ('吃', 15025),
 ('好吃', 11842),
 ('环境', 9027),
 ('服务', 9025),
 ('去', 9000),
 ('可以', 7723),
 ('都', 7231)]

In [7]:
# 5. 生成前N个高频词的词云
from wordcloud import WordCloud
import matplotlib.pyplot as plt
from PIL import Image
import numpy as np

mask = np.array(Image.open(os.path.join(path, "火锅图片.png")))
wc = WordCloud(font_path=os.path.join(path, "simkai.ttf"),
               background_color="white",
               contour_width=3,
               contour_color='steelblue',
               mask=mask,
               width=1000,
               height=1000)

wc.generate_from_frequencies(dict(most_common_words))
fig = plt.figure(figsize=(10, 10))
plt.imshow(wc)
plt.axis("off")
plt.savefig(os.path.join(path, "火锅词云.png"), dpi=1000)
plt.show()



<matplotlib.figure.Figure at 0x19247589d30>

In [10]:
# 生成TF-IDF和词袋模型
# TODO 用nltk调用tf-idf
from nltk.text import TextCollection

tfidf_generator = TextCollection(comments_df2["评价内容"].values.tolist())

def extract_tfidf(texts, targets, text_collection, common_words):
    """
    提取文本的tf-idf.
        texts: 输入的文本.
        targets: 对应的评价.
        text_collection: 预先初始化的TextCollection.
        common_words: 输入的前N个词作为特征进行计算.
    """
    # 得到行向量的维度
    n_sample = len(texts)
    # 得到列向量的维度
    n_feat = len(common_words)

    # 初始化X矩阵, X为最后要输出的TF-IDF矩阵
    X = np.zeros([n_sample, n_feat])
    y = np.zeros(n_sample)
    for i, text in enumerate(texts):
        if i % 5000 == 0:
            print("已经完成{}个样本的特征提取.".format(i))

        # 每一行对应一个文档, 计算这个文档中的词的tf-idf, 没出现的词则为0
        feature_vector = []
        for word in common_words:
            if word in text:
                tf_idf = text_collection.tf_idf(word, text)
            else:
                tf_idf = 0.0

            feature_vector.append(tf_idf)

        X[i, :] = np.array(feature_vector)
        y[i] = targets.iloc[i]

    return X, y


cleaned_train_X, cleaned_train_y = extract_tfidf(train_X, train_y, tfidf_generator, dict(most_common_words).keys())
cleaned_val_X, cleaned_val_y = extract_tfidf(val_X, val_y, tfidf_generator, dict(most_common_words).keys())


已经完成0个样本的特征提取.
已经完成5000个样本的特征提取.
已经完成10000个样本的特征提取.
已经完成15000个样本的特征提取.
已经完成20000个样本的特征提取.
已经完成25000个样本的特征提取.
已经完成30000个样本的特征提取.
已经完成35000个样本的特征提取.
已经完成40000个样本的特征提取.
已经完成0个样本的特征提取.
已经完成5000个样本的特征提取.
