In [1]:
from sklearn.model_selection import train_test_split
import numpy as np
import pandas as pd
import os
import collections
import re
import random

In [2]:
stopword_ls = []
def getStopWord():
    with open('lib/stopwords_utf8.txt', 'r',encoding='UTF-8') as file:
        for line in file:
            stopword_ls.append(line.split('\n')[0])
getStopWord()

def isStopWord(word):
    for i in range(len(stopword_ls)):
        if word == stopword_ls[i]:
            return True
    return False

In [3]:
import jieba
def load_data():   #获取数据集
    data = []
    getStopWord()
    positive_line = 0
    negative_line = 0
    total_positive_line = 7000
    total_negative_line = 3000

    with open('data/total/pos.txt', 'r', encoding='utf-8') as f:
        print('File Directory: data/total/pos.txt')
        sentences = f.readlines()
        for sentence in sentences[:total_positive_line]:
            positive_line += 1
            if positive_line == total_positive_line:
                end_val = '\n'
            else:
                end_val = '\r'
            print('Getting positive sentence {}/{}'.format(positive_line,total_positive_line),end=end_val)
            word_ls = []
            words = sentence.replace('\n','').split('    ')   #get chinese sentence
            tmp_ls = list(jieba.cut(words[1], cut_all=True))   #segmentation
            for i in range(len(tmp_ls)):
                if not isStopWord(tmp_ls[i]):
                       word_ls.append(tmp_ls[i]) 
            data.append([word_ls, 1])

    with open('data/total/neg.txt', 'r', encoding='utf-8') as f:
        print('File Directory: data/total/neg.txt')
        sentences = f.readlines()
        for sentence in sentences[:total_negative_line]:
            negative_line += 1
            if negative_line == total_negative_line:
                end_val = '\n'
            else:
                end_val = '\r'
            print('Getting negative sentence {}/{}'.format(negative_line,total_negative_line),end=end_val)
            word_ls = []
            words = sentence.replace('\n','').split('    ')   #get chinese sentence
            tmp_ls = list(jieba.cut(words[1], cut_all=True))   #segmentation
            for i in range(len(tmp_ls)):
                if not isStopWord(tmp_ls[i]):
                       word_ls.append(tmp_ls[i]) 
            data.append([word_ls, 0])

    print('Positive Line: {} | Negative Line: {}'.format(positive_line,negative_line))
    random.shuffle(data)
    return data


In [4]:
data_train, data_test = train_test_split(load_data(), test_size=0.10)

Building prefix dict from the default dictionary ...
Loading model from cache C:\Users\User\AppData\Local\Temp\jieba.cache


File Directory: data/total/pos.txt
Getting positive sentence 1/7000

Loading model cost 0.762 seconds.
Prefix dict has been built successfully.


Getting positive sentence 7000/7000
File Directory: data/total/neg.txt
Getting negative sentence 3000/3000
Positive Line: 7000 | Negative Line: 3000


In [5]:
all_training_words = [word for tokens,_ in data_train for word in tokens]
training_sentence_lengths = [len(tokens) for tokens,_ in data_train]
TRAINING_VOCAB = sorted(list(set(all_training_words)))
print("TRAIN DATASET | Total Words:{} | Total Vocabulary:{} | Max Sentence Length:{}".format(len(all_training_words),len(TRAINING_VOCAB),max(training_sentence_lengths)))

all_test_words = [word for tokens,_ in data_test for word in tokens]
test_sentence_lengths = [len(tokens) for tokens,_ in data_test]
TEST_VOCAB = sorted(list(set(all_test_words)))
print("TEST  DATASET | Total Words:{}  | Total Vocabulary:{} | Max Sentence Length:{}" .format(len(all_test_words),len(TEST_VOCAB),max(test_sentence_lengths)))

TRAIN DATASET | Total Words:397486 | Total Vocabulary:27639 | Max Sentence Length:1005
TEST  DATASET | Total Words:42659  | Total Vocabulary:9486 | Max Sentence Length:366


In [6]:
def list_to_dict(cleaned_tokens):
    return dict([token, True] for token in cleaned_tokens)

In [7]:
final_train = [(list_to_dict(tokens),label) for tokens,label in data_train ]
final_test = [(list_to_dict(tokens),label) for tokens,label in data_test ]
final_test[0]

({'字': True,
  '差': True,
  '入住': True,
  '时': True,
  '开': True,
  '房卡': True,
  '刷': True,
  '不开': True,
  '开门': True,
  '管理': True,
  '从前': True,
  '前台': True,
  '房间': True,
  '换乘': True,
  '电梯': True,
  '地铁': True,
  '硬件': True,
  '没中': True,
  '中央': True,
  '中央空调': True,
  '空调': True,
  '4XX': True,
  '元': True,
  '性价比': True,
  '一分': True},
 0)

In [8]:
from time import time
start_time = time()
from nltk import classify
from nltk import NaiveBayesClassifier
classifier = NaiveBayesClassifier.train(final_train)

In [9]:
# Output the model accuracy on the train and test data
print('Accuracy on train data: {:.4f}'.format(classify.accuracy(classifier, final_train)))
print('Accuracy on test  data: {:.4f}'.format(classify.accuracy(classifier, final_test)))

# Output Top 20 Sentiment Word
print(classifier.show_most_informative_features(20))
print('CPU Time:', time() - start_time)

Accuracy on train data: 0.7836
Accuracy on test  data: 0.7240
Most Informative Features
                      恶劣 = True                0 : 1      =     60.0 : 1.0
                    再也不会 = True                0 : 1      =     38.6 : 1.0
                      黑店 = True                0 : 1      =     38.0 : 1.0
                      最差 = True                0 : 1      =     29.4 : 1.0
                      欺骗 = True                0 : 1      =     29.3 : 1.0
                    强烈要求 = True                0 : 1      =     27.1 : 1.0
                       烂 = True                0 : 1      =     26.0 : 1.0
                      不堪 = True                0 : 1      =     25.6 : 1.0
                      最糟 = True                0 : 1      =     25.6 : 1.0
                      责任 = True                0 : 1      =     24.0 : 1.0
                      质问 = True                0 : 1      =     24.0 : 1.0
                      高价 = True                0 : 1      =     24.0 : 1.0
            

In [10]:
def jieba_segment(text):
    word_ls = []
    tmp_ls = list(jieba.cut(text, cut_all=False))   #segmentation
    for i in range(len(tmp_ls)):
        if not isStopWord(tmp_ls[i]):
            word_ls.append(tmp_ls[i]) 
    return word_ls

In [11]:
#TEST DATASET
print('---------Naive Bayes Test Data----------')
dataset_size = 1000
correct_count = 0
type = ['pos','neg']
for j in range(len(type)):
      positive_count,negative_count = 0,0
      if type[j] == 'pos':
            st = 'positive'
      else:
            st = 'negative'
      for i in range(dataset_size):
            with open('data/'+st+'/'+type[j]+'.'+str(i)+'.txt','r',encoding='UTF-8') as file:
                  text = file.read().replace('\n', '')
                  if i == dataset_size-1:
                        end_val = '\n'
                  else:
                        end_val = '\r'
                  
                  print('Getting '+st+' data: {}/{}'.format(i+1,dataset_size),end=end_val)

            custom_tokens = jieba_segment(text)

            sentiment = classifier.classify(dict([token, True] for token in custom_tokens))
            if sentiment == 0:
                  #print("Positive Sentiment")
                  positive_count += 1
            else:
                  #print("Negative Sentiment")
                  negative_count += 1

      print('Positive Count:',positive_count,end = ' | ')
      print('Negative Count:',negative_count)
      if type[j] == 'pos':
            correct_count += positive_count
      else:
            correct_count += negative_count
print('Accuracy: {}'.format(correct_count/(2*dataset_size)))

      

---------Naive Bayes Test Data----------
Getting positive data: 1000/1000
Positive Count: 278 | Negative Count: 722
Getting negative data: 1000/1000
Positive Count: 971 | Negative Count: 29
Accuracy: 0.1535


In [12]:
text = "房间设施难以够得上五星级，服务还不错，有送水果。"
custom_tokens = jieba_segment(text)
sentiment = classifier.classify(dict([token, True] for token in custom_tokens))
if sentiment == 0:
      print("Positive Sentiment 正面情绪")
else:
      print("Negative Sentiment 负面情绪")

Negative Sentiment 负面情绪


In [13]:
text = "前台服务较差，不为客户着想。房间有朋友来需要打扫，呼叫了两个小时也未打扫。房间下水道臭气熏天，卫生间漏水堵水。"
custom_tokens = jieba_segment(text)
sentiment = classifier.classify(dict([token, True] for token in custom_tokens))
if sentiment == 0:
      print("Positive Sentiment 正面情绪")
else:
      print("Negative Sentiment 负面情绪")

Positive Sentiment 正面情绪
