In [2]:
# coding=utf-8
import numpy as np
import tensorflow as tf
import random
from collections import Counter
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer

In [19]:
path = r"E:\PycharmProjects\untitled\src\main\resource"
pos_file = path + r'\pos.txt'
neg_file = path + r'\neg.txt'

In [31]:
def create_lexicon(pos_file, neg_file):
    lex = []
    english_punctuations = [',', '.', ':', ';', '?', '(', ')', '[', ']', '&', '!', '*', '@', '#', '$', '%','``']
    stops = set(stopwords.words("english"))
    lemmatizer = WordNetLemmatizer()
    # 读取文件
    def process_file(f):
        with open(pos_file, 'r') as f:
            lex = []
            lines = f.readlines()
            # print(lines)
            for line in lines:
                # word = re.sub("[^a-zA-Z]", " ", line.lower()).split()    按空格分词，且只保留字母，大写转小写，数字符号都不要
                text_list = word_tokenize(line.lower())  #分词，大写转小写
                # 去掉标点符号
                text_list = [word for word in text_list if word not in english_punctuations]
                # 去掉停用词
                text_list = [word for word in text_list if word not in stops]
                # 词性还原
                text_list = [lemmatizer.lemmatize(word) for word in text_list]
                lex += text_list
            return lex

    lex += process_file(pos_file)
    lex += process_file(neg_file)
    word_count = Counter(lex)
    # print(word_count)
    # {'.': 13944, ',': 10536, 'the': 10120, 'a': 9444, 'and': 7108, 'of': 6624, 'it': 4748, 'to': 3940......}
    # 去掉一些常用词,像the,a and等等，和一些不常用词; 这些词对判断一个评论是正面还是负面没有做任何贡献
    #过滤低词频
    lex = []
    for word in word_count:
        if word_count[word] < 2000 and word_count[word] > 20:  # 这写死了，好像能用百分比
            lex.append(word)  # 齐普夫定律-使用Python验证文本的Zipf分布 http://blog.topspeedsnail.com/archives/9546
    return lex
lex = create_lexicon(pos_file, neg_file)

In [33]:
def normalize_dataset(lex):
    dataset = []

    # lex:词汇表；review:评论；clf:评论对应的分类，[0,1]代表负面评论 [1,0]代表正面评论
    def string_to_vector(lex, review, clf):
        words = word_tokenize(review.lower())
        lemmatizer = WordNetLemmatizer()
        words = [lemmatizer.lemmatize(word) for word in words]

        features = np.zeros(len(lex))
        for word in words:
            if word in lex:
                features[lex.index(word)] = 1  # 一个句子中某个词可能出现两次,可以用+=1，其实区别不大
        return [features, clf]

    with open(pos_file, 'r') as f:
        lines = f.readlines()
        for line in lines:
            one_sample = string_to_vector(lex, line, [1, 0])  # [array([ 0.,  1.,  0., ...,  0.,  0.,  0.]), [1,0]]
            dataset.append(one_sample)
    with open(neg_file, 'r') as f:
        lines = f.readlines()
        for line in lines:
            one_sample = string_to_vector(lex, line, [0, 1])  # [array([ 0.,  0.,  0., ...,  0.,  0.,  0.]), [0,1]]]
            dataset.append(one_sample)

    # print(len(dataset))
    return dataset


dataset = normalize_dataset(lex)

In [41]:
#把整理好的数据保存到文件，方便使用。到此完成了数据的整理工作
import pickle
with open('save.pickle', 'wb') as f:
    pickle.dump(dataset, f)

In [42]:
random.shuffle(dataset)

# 取样本中的10%做为测试数据
test_size = int(len(dataset) * 0.1)
dataset = np.array(dataset)
train_dataset = dataset[:-test_size]
test_dataset = dataset[-test_size:]

In [43]:
n_input_layer = len(lex)  #input
n_layer_1 = 1000  #hidden layer
n_layer_2 = 1000
n_output_layer = 2   #output

In [54]:
random.shuffle(train_dataset)  # 数据集随机
train_x = train_dataset[:, 0]  # train_x   (10662,)
train_y = train_dataset[:, 1]  # 标签

In [56]:

# 定义待训练的神经网络
def neural_network(data):
    # 定义第一层"神经元"的权重和biases
    layer_1_w_b = {'w_': tf.Variable(tf.random_normal([n_input_layer, n_layer_1])),
                   'b_': tf.Variable(tf.random_normal([n_layer_1]))}
    # 定义第二层"神经元"的权重和biases
    layer_2_w_b = {'w_': tf.Variable(tf.random_normal([n_layer_1, n_layer_2])),
                   'b_': tf.Variable(tf.random_normal([n_layer_2]))}
    # 定义输出层"神经元"的权重和biases
    layer_output_w_b = {'w_': tf.Variable(tf.random_normal([n_layer_2, n_output_layer])),
                        'b_': tf.Variable(tf.random_normal([n_output_layer]))}

    # w·x+b
    layer_1 = tf.add(tf.matmul(data, layer_1_w_b['w_']), layer_1_w_b['b_'])
    layer_1 = tf.nn.relu(layer_1)  # 激活函数
    layer_2 = tf.add(tf.matmul(layer_1, layer_2_w_b['w_']), layer_2_w_b['b_'])
    layer_2 = tf.nn.relu(layer_2)  # 激活函数
    layer_output = tf.add(tf.matmul(layer_2, layer_output_w_b['w_']), layer_output_w_b['b_'])

    return layer_output


X = tf.placeholder('float', [None, len(train_dataset[0][0])])
# [None, len(train_x)]代表数据数据的高和宽（矩阵），好处是如果数据不符合宽高，tensorflow会报错，不指定也可以。
Y = tf.placeholder('float')



In [59]:
predict = neural_network(X)  # 得到预测结果（通过神经网络）
cost_func = tf.reduce_mean(tf.nn.softmax_cross_entropy_with_logits(logits = predict, labels = Y))  # 定义损失函数
optimizer = tf.train.AdamOptimizer().minimize(cost_func)
epochs = 13  # 13次整体迭代
batch_size = 50  

Instructions for updating:

Future major versions of TensorFlow will allow gradients to flow
into the labels input on backprop by default.

See @{tf.nn.softmax_cross_entropy_with_logits_v2}.



In [63]:
init=tf.global_variables_initializer()
sess=tf.Session()
sess.run(init)


In [64]:
epoch_loss = 0

i = 0
random.shuffle(train_dataset)  # 数据集随机
train_x = train_dataset[:, 0]  # train_x   (10662,)
train_y = train_dataset[:, 1]  # 标签