本文读取电影评论数据然后进行情感分类，主要利用词向量的概念。其主要思路和代码参考了CSDN博客[使用word2vec对新浪微博进行情感分析和分类](http://blog.csdn.net/liugallup/article/details/51164962)。

In [1]:
# Read the data
import os

# Root of the training corpus and its three sub-directories
# (positive, negative and unlabelled reviews).
corpus_root = 'imdb/train'
pos_dir, neg_dir, unsup_dir = (corpus_root + sub for sub in ('/pos', '/neg', '/unsup'))
def readDocs(directory):
    """Read every file in ``directory`` and return the contents as a list.

    Entries are visited in sorted file-name order so that repeated runs
    return the documents in the same order (``os.listdir`` itself makes no
    ordering promise). Each file is decoded as UTF-8.

    Args:
        directory: path of the folder whose entries are read.

    Returns:
        A list of strings, one per entry of ``directory``.
    """
    reviews = []
    for fileid in sorted(os.listdir(directory)):
        path = os.path.join(directory, fileid)
        with open(path, encoding='utf8') as fi:
            reviews.append(fi.read())
    return reviews
# Load the three review collections: positive, negative and unlabelled.
pos_reviews, neg_reviews, unsup_reviews = (
    readDocs(folder) for folder in (pos_dir, neg_dir, unsup_dir)
)

In [2]:
# Display the first positive review as a quick look at the loaded text.
pos_reviews[0]

'Bromwell High is a cartoon comedy. It ran at the same time as some other programs about school life, such as "Teachers". My 35 years in the teaching profession lead me to believe that Bromwell High\'s satire is much closer to reality than is "Teachers". The scramble to survive financially, the insightful students who can see right through their pathetic teachers\' pomp, the pettiness of the whole situation, all remind me of the schools I knew and their students. When I saw the episode in which a student repeatedly tried to burn down the school, I immediately recalled ......... at .......... High. A classic line: INSPECTOR: I\'m here to sack one of your teachers. STUDENT: Welcome to Bromwell High. I expect that many adults of my age think that Bromwell High is far fetched. What a pity that it isn\'t!'

文本预处理，给每段文字加上标签。

In [3]:
import gensim

# Alias used by the labelling helper further down.
# NOTE(review): newer gensim releases name this class TaggedDocument —
# confirm against the installed version.
LabeledSentence = gensim.models.doc2vec.LabeledSentence

try:
    from sklearn.model_selection import train_test_split
except ImportError:
    # Older scikit-learn releases only provide the splitter here.
    from sklearn.cross_validation import train_test_split
import numpy as np

# Use 1 for positive sentiment, 0 for negative.
y = np.concatenate((np.ones(len(pos_reviews)), np.zeros(len(neg_reviews))))

# Join the reviews as plain Python lists. Converting them to a NumPy string
# array would pad every review to the width of the longest one, which wastes
# a lot of memory; the splitter accepts lists directly.
x_train, x_test, y_train, y_test = train_test_split(pos_reviews + neg_reviews, y, test_size=0.2)

#Do some very minor text preprocessing
def cleanText(corpus):
    """Lower-case and tokenize every document in ``corpus``.

    Newlines and ``<br />`` tags are treated as word separators, and each
    character of ``.,?!:;(){}[]`` is split off as a token of its own.

    Args:
        corpus: iterable of strings, one per document.

    Returns:
        A list holding one list of string tokens per document, in input order.
    """
    punctuation = """.,?!:;(){}[]"""
    # One translation table pads every punctuation mark with spaces in a
    # single pass, instead of rescanning the whole corpus once per character.
    pad_punctuation = str.maketrans({c: ' %s ' % c for c in punctuation})

    tokenized = []
    for doc in corpus:
        # Newlines become a space (not ''), otherwise the last word of a line
        # is glued to the first word of the next line.
        text = doc.lower().replace('\n', ' ').replace('<br />', ' ')
        # Treat punctuation as individual words.
        tokenized.append(text.translate(pad_punctuation).split())
    return tokenized

# Tokenize the training, test and unlabelled reviews; each name is rebound
# from raw strings to lists of tokens.
x_train = cleanText(x_train)
x_test = cleanText(x_test)
unsup_reviews = cleanText(unsup_reviews)

#Gensim's Doc2Vec implementation requires each document/paragraph to have a label associated with it.
#We do this by wrapping each review in a LabeledSentence object. The label has the form "TRAIN_i", "TEST_i"
#or "UNSUP_i", where "i" is a dummy index of the review.
def labelizeReviews(reviews, label_type):
    """Wrap each review in a ``LabeledSentence`` carrying a unique label.

    The i-th review receives the label ``'<label_type>_<i>'`` (for example
    ``TRAIN_0``), passed to ``LabeledSentence`` as a one-element list.

    Returns the wrapped reviews as a list, in input order.
    """
    return [
        LabeledSentence(words, ['%s_%s' % (label_type, idx)])
        for idx, words in enumerate(reviews)
    ]

# Label every review with its split name plus a running index
# (TRAIN_i, TEST_i, UNSUP_i).
x_train, x_test, unsup_reviews = (
    labelizeReviews(docs, prefix)
    for docs, prefix in ((x_train, 'TRAIN'), (x_test, 'TEST'), (unsup_reviews, 'UNSUP'))
)



In [5]:
# Combined corpus as one new list: training, then test, then unlabelled reviews.
all_corpus = x_train + x_test + unsup_reviews

In [27]:
# Print the size of each split and of the combined corpus, one number per line.
print(*map(len, (x_train, x_test, unsup_reviews, all_corpus)), sep='\n')

19264
4817
50000
74081


建立训练模型。

In [6]:
import random

size = 400

# Instantiate the DM and DBOW models. Both receive the same settings; the
# DBOW model additionally passes dm=0.
model_dm, model_dbow = (
    gensim.models.Doc2Vec(min_count=1, window=10, size=size, sample=1e-3,
                          negative=5, workers=3, **extra)
    for extra in ({}, {'dm': 0})
)

# Build each model's vocabulary over all reviews.
model_dm.build_vocab(all_corpus)
model_dbow.build_vocab(all_corpus)

# Reviews used for the main training passes: the labelled training reviews
# followed by the unlabelled ones, collected in a new list.
all_train_reviews = x_train + unsup_reviews

In [28]:
# Number of reviews in the list used for the main training passes.
print(len(all_train_reviews))
# NOTE(review): `a` and `b` are not defined in any cell shown here; this looks
# like a leftover scratch check of np.hstack — confirm before re-running.
np.hstack((a,b))

69264


array([1, 2, 3, 4, 5, 6])

训练

In [15]:
# Ten epochs, reshuffling the training reviews before each pass.
for epoch in range(10):
    # Only this combined list is shuffled in place; x_train is a separate
    # list and keeps its order.
    random.shuffle(all_train_reviews)
    model_dm.train(all_train_reviews)
    model_dbow.train(all_train_reviews)

#Fetch doc vector for each article
def getVecs(model, tagged_corpus, size):
    """Stack the document vectors of ``tagged_corpus`` into one 2-D array.

    For every document, the vector stored in ``model.docvecs`` under the
    document's first tag is reshaped to a ``(1, size)`` row; the rows are
    concatenated in corpus order.
    """
    rows = []
    for doc in tagged_corpus:
        first_tag = doc.tags[0]
        rows.append(model.docvecs[first_tag].reshape((1, size)))
    return np.concatenate(rows)

# Training-review vectors from each model, then joined column-wise so every
# row holds the DM vector followed by the DBOW vector.
train_vecs_dm, train_vecs_dbow = (
    getVecs(doc_model, x_train, size) for doc_model in (model_dm, model_dbow)
)

train_vecs = np.concatenate((train_vecs_dm, train_vecs_dbow), axis=1)

# Ten additional passes over the test reviews with both models, before the
# test document vectors are read out below.
# (An earlier variant permuted x_test on every pass; that step is disabled.)
for epoch in range(10):
    for doc_model in (model_dm, model_dbow):
        doc_model.train(x_test)

# Construct vectors for the test reviews: one matrix per model, then joined
# column-wise (DM columns first, DBOW columns second).
test_vecs_dm, test_vecs_dbow = (
    getVecs(doc_model, x_test, size) for doc_model in (model_dm, model_dbow)
)

test_vecs = np.concatenate((test_vecs_dm, test_vecs_dbow), axis=1)

In [29]:
# Re-read the training-review vectors from both models and rebuild the
# combined feature matrix (DM columns followed by DBOW columns).
train_vecs_dm, train_vecs_dbow = [
    getVecs(doc_model, x_train, size) for doc_model in (model_dm, model_dbow)
]
train_vecs = np.concatenate([train_vecs_dm, train_vecs_dbow], axis=1)

In [30]:
# Shape check: one row per training review, `size` columns.
train_vecs_dbow.shape

(19264, 400)

In [19]:
# Shape check of the training labels, for comparison with the feature matrix.
y_train.shape

(19264,)

In [31]:
# Model prediction: fit an SGD-trained linear classifier (loss='log', L1
# penalty) on the training vectors and report its accuracy on the test vectors.
from sklearn.linear_model import SGDClassifier

# NOTE(review): recent scikit-learn releases spell this loss 'log_loss' —
# confirm against the installed version.
lr = SGDClassifier(loss='log', penalty='l1').fit(train_vecs, y_train)

print('Test Accuracy: %.2f' % lr.score(test_vecs, y_test))

Test Accuracy: 0.84
