In [1]:
import pandas as pd

In [9]:
# 导入数据
data=pd.read_csv('ratings.csv')
data.head()

Unnamed: 0,userId,restId,rating,rating_env,rating_flavor,rating_service,timestamp,comment
0,0,0,,3.0,3.0,2.0,1250584020000,经常去的，不过我自己的卡很久不用，被冻了，只能用爸爸的。吉利莲的巧克力以前选择多些，最近一次...
1,1,0,,4.0,4.0,4.0,1145639040000,"喜欢在这里购物的感觉~~不在市中心,又是凭会员卡购物,因此不会有像家乐福\沃尔马那种人多很挤..."
2,2,0,5.0,3.0,3.0,3.0,1299115500000,
3,3,0,,3.0,4.0,4.0,1162821060000,很适合有车一族来采购，因为没有袋子给的，只有纸箱子！感觉有点像仓库，可是东西好啊，买得多的话...
4,4,0,,3.0,4.0,3.0,1201107000000,里面有一些进口食品还是不错的，但个人感觉商品种类比较少，而且管理不是很灵活，退货比较麻烦。价...


In [16]:
# 缺失值处理
data = data[['comment','rating_env','rating_service','rating_flavor']]
total = data.isnull().sum().sort_values(ascending=False)
percent = ((data.isnull().sum() / data.isnull().count()) * 100).sort_values(ascending=False)
missing_data = pd.concat([total, percent], axis=1, keys=['Total', 'Percent'])
missing_data = missing_data.reset_index()
missing_data.columns = ['Name', 'Total', 'Percent']
missing_data

Unnamed: 0,Name,Total,Percent
0,rating_env,346253,7.829398
1,rating_service,346253,7.829398
2,rating_flavor,328654,7.431453
3,comment,315064,7.124159


In [44]:
def get_data(file_path,sample=15000):
    data = pd.read_csv(file_path)
    data.dropna(inplace=True)
    sampledata =  data.sample(n=sample,random_state=1024)
    sample_data = sampledata.reset_index().drop(['index'],axis=1)
    return sample_data

In [56]:
# 根据环境、口味、服务评分生成新的标签
def get_labels(data):
    data['label'] = ((data['rating_env']+data['rating_flavor']+data['rating_service'])//3)
    return data.drop(['rating_env','rating_flavor','rating_service'],axis=1)
dataset = get_labels(sample_data)
dataset.head()

Unnamed: 0,comment,label
0,"海鲜石锅拌饭可以,要很多锅巴就更不摆了,炸鱿鱼须不不错,关键是辣百菜狂爱!!!!!\n",2.0
1,这里的生意真是好，晚到一些可能就要排队了……虽然点名叫做油爆虾，但是其他的小菜其实也不错，都...,3.0
2,非常普通的一家店，纯粹是被忽悠去的。本来同事请客在正大，后来嫌交通不方便该在张江。查了点评觉...,2.0
3,说实话这是我去过最让我不想再去的地方了。服务质量那叫个差劲啊！菜得味道很普通，量也一般，装修...,1.0
4,同事推荐的这家店，淘宝上的实体店，我是到店铺自取手机的，因此事先打了N个电话，接线的服务人员...,3.0


In [59]:
# 替换文本中的特殊符号
import re
def clear_str(string):
    string = re.sub(r'\W', ' ',string)
    string = re.sub(r'_', ' ',string)
    return string.strip()
dataset['comment'] = dataset['comment'].apply(clear_str)
dataset.head()

Unnamed: 0,comment,label
0,海鲜石锅拌饭可以 要很多锅巴就更不摆了 炸鱿鱼须不不错 关键是辣百菜狂爱,2.0
1,这里的生意真是好 晚到一些可能就要排队了 虽然点名叫做油爆虾 但是其他的小菜其实也不错 都...,3.0
2,非常普通的一家店 纯粹是被忽悠去的 本来同事请客在正大 后来嫌交通不方便该在张江 查了点评觉...,2.0
3,说实话这是我去过最让我不想再去的地方了 服务质量那叫个差劲啊 菜得味道很普通 量也一般 装修...,1.0
4,同事推荐的这家店 淘宝上的实体店 我是到店铺自取手机的 因此事先打了N个电话 接线的服务人员...,3.0


In [66]:
# 数据集的分割
from sklearn.model_selection import train_test_split

def split_data(dataset):
    X=dataset['comment']
    y=dataset['label'].astype(str)
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.1, random_state=1024)
    X_train, X_dev, y_train, y_dev = train_test_split(
        X_train, y_train, test_size=0.1, random_state=1024)
    traindf=pd.DataFrame({'comment':X_train,'label':y_train}).reset_index(drop=True)
    devdf=pd.DataFrame({'comment':X_dev,'label':y_dev}).reset_index(drop=True)
    testdf=pd.DataFrame({'comment':X_test}).reset_index(drop=True)
    test_label = pd.DataFrame({'label':y_test}).reset_index(drop=True)
    return traindf,devdf,testdf,test_label

traindf,devdf,testdf,test_label = split_data(dataset=dataset)
# 保存训练集、验证集、测试集样本,为了节省时间取部分数据进行训练
traindf.to_csv('../data/sa/train.txt',sep='\t',index=False,header=None)
devdf.to_csv('../data/sa/dev.txt',sep='\t',index=False,header=None)
testdf.to_csv('../data/sa/test.txt',sep='\t',index=False,header=None)
test_label.to_csv('../data/sa/label.txt',index=False,header=None)

In [72]:
def get_stop_words(stop_words_path):
    stop_words = []
    with open(stop_words_path, 'r', encoding='utf-8') as f:
        for line in f:
            stop_words.append(line.strip())
    return stop_words

stop_words_path =  '../data/sa/baidu_stopwords.txt'
stop_words = get_stop_words(stop_words_path)
stop_words[-5:]

['首先', '高兴', '是不是', '说说', '']

In [86]:
x = devdf.comment.apply(len)

In [76]:
def cut_words(sentence, stop_words):
    import jieba
    cut_word = jieba.cut(str(sentence).strip())
    if stop_words:
        words = [word for word in cut_word if word not in stop_words]
    else:
        words = list(cut_word)
    return words

In [80]:
traindf['comment'] = traindf['comment'].apply(cut_words, stop_words=stop_words)

Building prefix dict from the default dictionary ...
Dumping model to file cache C:\Users\wang\AppData\Local\Temp\jieba.cache
Loading model cost 1.028 seconds.
Prefix dict has been built successfully.


In [91]:
from gensim.models.word2vec import Word2Vec
import logging
import multiprocessing

def train_w2v(sentences,embbedding_size=216):
    logging.basicConfig(filename='word2vec.log',level=logging.INFO)
    logging.info('Training word2vec embedding...')
    w2v_model = Word2Vec(vector_size=embbedding_size, workers=multiprocessing.cpu_count(), min_count=10, sg=0)
    w2v_model.build_vocab(sentences)
    w2v_model.train(sentences, total_examples=w2v_model.corpus_count, epochs=100)
    w2v_model.save('../model/w2vmodel')
sentences = traindf['comment'].to_list()
train_w2v(sentences)

In [96]:
from gensim.models import KeyedVectors

w2v_model = Word2Vec.load('../model/w2vmodel')
word_vectors = w2v_model.wv
word_vectors.save("../model/word2vec.wordvectors")

In [98]:
wv = KeyedVectors.load("../model/word2vec.wordvectors", mmap='r')

In [123]:
# 词向量空间
word_vectors = wv.vectors
# 词表
word2index =  wv.key_to_index

In [125]:
vocab_path = '../model/vocab.txt'
with open(vocab_path, 'w', encoding='utf-8') as f:
    for word, index in word2index.items():
        f.write(word + '\t' + str(index) + '\n')