### https://radimrehurek.com/gensim/models/doc2vec.html


In [1]:
from gensim import utils
from gensim.models.doc2vec import TaggedDocument 
from gensim.models import Doc2Vec
import numpy
import random
import os
import nltk.stem as stem


In [2]:
class LabeledLineSentence(object):
    """Corpus of tagged documents read line by line from several text files.

    Parameters
    ----------
    sources : dict
        Maps a file path to the tag prefix used for that file's lines.
        Line ``n`` (zero-based) of a file with prefix ``P`` is tagged ``P_n``.

    Raises
    ------
    Exception
        If two files are mapped to the same prefix, because their tags
        would then collide.
    """

    def __init__(self, sources):
        self.sources = sources
        # Populated by to_array(); None means "not loaded yet".
        self.sentences = None

        # Make sure that prefixes are unique across all sources.
        seen_prefixes = set()
        for prefix in sources.values():
            if prefix in seen_prefixes:
                raise Exception('Non-unique prefix encountered')
            seen_prefixes.add(prefix)

    def _iter_documents(self):
        """Yield one TaggedDocument per line of every source file."""
        for source, prefix in self.sources.items():
            with utils.smart_open(source) as fin:
                for item_no, line in enumerate(fin):
                    yield TaggedDocument(utils.to_unicode(line).split(), [prefix + '_%s' % item_no])

    def __iter__(self):
        """Stream the documents from disk without keeping them in memory."""
        # The original yielded `LabeledSentence`, a name that is never
        # imported here; TaggedDocument is the class the notebook imports.
        return self._iter_documents()

    def to_array(self):
        """Read every document into ``self.sentences`` and return that list."""
        self.sentences = list(self._iter_documents())
        return self.sentences

    def sentences_perm(self):
        """Return a shuffled copy of the documents; ``self.sentences`` is left untouched."""
        # Load lazily so this also works when to_array() was not called first.
        if self.sentences is None:
            self.to_array()
        shuffled = list(self.sentences)
        random.shuffle(shuffled)
        return shuffled

In [3]:
# Earlier experiment, kept for reference:
# sources = {'test-neg.txt':'TEST_NEG', 'test-pos.txt':'TEST_POS', 'train-neg.txt':'TRAIN_NEG', 'train-pos.txt':'TRAIN_POS', 'train-unsup.txt':'TRAIN_UNS'}
rootPath = 'data/dataset/'

# Map each category file to its tag prefix. The position of a category in
# the tuple (1-based) becomes the number in 'TRAIN_<n>'.
sources = {
    rootPath + category + '_new.txt': 'TRAIN_%d' % number
    for number, category in enumerate((
        'Animals & Pet Supplies',
        'Apparel & Accessories',
        'Arts & Entertainment',
        'Baby & Toddler',
        'Business & Industrial',
        'Cameras & Optics',
        'Electronics',
        'Food, Beverages & Tobacco',
        'Furniture',
        'Hardware',
        'Health & Beauty',
        'Home & Garden',
        'Luggage & Bags',
        'Mature',
        'Media',
        'Office Supplies',
        'Religious & Ceremonial',
        'Software',
        'Sporting Goods',
        'Toys & Games',
        'Vehicles & Parts',
    ), start=1)
}
#sources

In [4]:
# Wrap the category files so their lines can be loaded as tagged documents.
sentences = LabeledLineSentence(sources)

In [18]:
# Dimensionality of every learned document vector.
feature_size = 200

# Negative-sampling configuration; the hierarchical-softmax variant that was
# also tried is kept on the commented line below.
model = Doc2Vec(
    min_count=1,
    window=10,
    size=feature_size,
    sample=1e-4,
    negative=5,
    workers=8,
)
# model = Doc2Vec(min_count=1, window=10, size=feature_size, sample=1e-4, hs=1, workers=8)

# Read all tagged documents into memory and build the vocabulary from them.
model.build_vocab(sentences.to_array())

In [19]:
# Train on a shuffled copy of the loaded documents for model.iter + 30 epochs.
model.train(sentences.sentences_perm(), total_examples=model.corpus_count, epochs=model.iter+30)

7394961

In [7]:
# Total number of samples, then the model's iteration count.
# NOTE: a cell only displays its last expression, so the value of
# model.corpus_count is evaluated but not shown here.
model.corpus_count
model.iter

5

In [20]:
# Learned feature vector of the document tagged 'TRAIN_20_503'
# (line index 503 of the source file mapped to prefix TRAIN_20).
model.docvecs['TRAIN_20_503']

array([ 0.01168581, -0.4367393 ,  0.16919021,  0.22594388, -0.30299303,
        0.11121316,  0.21889901,  0.13416478, -0.37329695,  0.31481901,
       -0.34598714,  0.12764886,  0.12630624, -0.15212995, -0.09703127,
        0.49712393,  0.12650813,  0.06289864, -0.07523122,  0.42763376,
       -0.15300679, -0.38832864, -0.32271993, -0.52706462, -0.20038989,
        0.20220351, -0.21860264,  0.32495293, -0.03611407,  0.09168439,
        0.49789193, -0.53510505, -0.27024904, -0.34584367,  0.4115319 ,
        0.38744739,  0.21230787, -0.29203212, -0.36506402,  0.02270904,
       -0.46085957, -0.38591906,  0.42643157, -0.12298442, -0.07797596,
        0.37000018,  0.06252047,  0.02975133,  0.00896106,  0.36006603,
        0.13792478,  0.45265916, -0.29365671,  0.06642924,  0.0770029 ,
       -0.00942502,  0.28842881,  0.11663827, -0.29112527, -0.2240781 ,
        0.59489971, -0.0053976 , -0.01053664, -0.27593428,  0.27380082,
       -0.2012037 ,  0.22448991, -0.2355428 ,  0.3568624 , -0.02

## Classifying Product Categories

### Training Vectors

Now let's use these vectors to train a classifier. First, we must extract the training vectors. Remember that we have a total of 37666 training documents, spread unevenly across 21 product categories (`TRAIN_1` to `TRAIN_21`).

Hence, we create `numpy` arrays (since the classifiers we use only take numpy arrays). There are two parallel arrays, one containing the vectors (`train_arrays`) and the other containing the labels (`train_labels`).

We simply lay the categories out one after another: the vectors of `TRAIN_1` come first, then those of `TRAIN_2`, and so on, with integer labels 0 through 20.

In [21]:
# One row per document. 37666 equals the final end index printed by the
# loop that fills these arrays.
n_documents = 37666
train_arrays = numpy.zeros(shape=(n_documents, feature_size))
train_labels = numpy.zeros(shape=n_documents)


In [22]:
# Inverse lookup: tag prefix -> source file path.
new_sources = dict(zip(sources.values(), sources.keys()))
#new_sources


In [23]:
# Copy every document vector into train_arrays and record its class label.
# Rows are laid out category by category, TRAIN_1 first.
index = 0        # first row belonging to the current category
label_index = 0  # zero-based class label of the current category
for i in range(1, len(new_sources) + 1):
    prefix = 'TRAIN_' + str(i)
    # Count the lines while streaming, and close the file afterwards. The
    # previous open(...).readlines() leaked the handle and loaded every
    # file into memory just to count it.
    with open(new_sources[prefix]) as source_file:
        count = sum(1 for _ in source_file)
    print(index, index + count)
    for offset in range(count):
        # Document tags restart at 0 within each category.
        train_arrays[index + offset] = model.docvecs[prefix + '_' + str(offset)]
        train_labels[index + offset] = label_index
    index = index + count
    label_index = label_index + 1
    

0 429
429 6241
6241 16014
16014 16155
16155 19701
19701 20313
20313 24755
24755 24935
24935 25023
25023 25343
25343 27018
27018 30303
30303 30354
30354 30388
30388 30852
30852 31175
31175 31206
31206 31556
31556 35179
35179 37472
37472 37666


In [24]:
#label to one-hot
# from keras.utils import np_utils
# train_labels = np_utils.to_categorical(train_labels)

In [25]:
# index = 6240
# print(train_arrays[index])
# print(train_labels[index])

In [26]:
# Shuffle vectors and labels with the same permutation; random_state=1 fixes it.
from sklearn.utils import shuffle
X,Y = shuffle(train_arrays,train_labels,random_state=1)

In [27]:
from sklearn.model_selection import train_test_split
# Hold out 25% of the rows for evaluation. The fixed seed makes the split,
# and therefore every score computed from it, reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=0.25, random_state=1)

### Classification

Now we train several classifiers (gradient boosting, a random forest and a small neural network) using the training data.

In [65]:
# Gradient-boosting classifier: 20 estimators, subsample=0.7, verbose progress output.
import sklearn.ensemble as ensemble
gbdt = ensemble.GradientBoostingClassifier(n_estimators=20,subsample=0.7,verbose =1)
# NOTE(review): the fit call is commented out, so this cell leaves gbdt unfitted.
#gbdt.fit(X_train,y_train)

In [None]:
# Score the gradient-boosting model on the held-out split.
# NOTE(review): this needs a fitted gbdt, but the only visible fit call is commented out.
gbdt.score(X_test,y_test)

In [28]:
import sklearn
import sklearn.ensemble as ensemble

# Random forest with 100 trees, trained on 8 parallel jobs with progress output.
# Only the non-default settings are passed. The arguments that were spelled
# out before all matched the defaults shown in the estimator repr, and one
# of them (min_impurity_split) is rejected by newer scikit-learn releases.
rf_classifer = sklearn.ensemble.RandomForestClassifier(
    n_estimators=100,
    n_jobs=8,
    verbose=1,
)
rf_classifer.fit(X_train, y_train)

[Parallel(n_jobs=8)]: Done  34 tasks      | elapsed:    2.9s
[Parallel(n_jobs=8)]: Done 100 out of 100 | elapsed:    7.7s finished


RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=100, n_jobs=8,
            oob_score=False, random_state=None, verbose=1,
            warm_start=False)

In [29]:
# Score the random forest on the held-out split.
rf_classifer.score(X_test,y_test)

[Parallel(n_jobs=8)]: Done  34 tasks      | elapsed:    0.1s
[Parallel(n_jobs=8)]: Done 100 out of 100 | elapsed:    0.2s finished


0.38791547201868959

In [23]:
#单层 神经网络  73%
from keras.models import Sequential
from keras.layers import Dense, Activation

model = Sequential()

model.add(Dense(units=150, input_dim=feature_size))
model.add(Activation("relu"))
model.add(Dense(units=21))
model.add(Activation("softmax"))


from keras.optimizers import SGD
model.compile(loss='categorical_crossentropy', optimizer=SGD(lr=0.01, momentum=0.9, nesterov=True),metrics=[ 'acc'])
model.fit(X_train, y_train, epochs=100, batch_size=128,verbose=1,validation_data=(X_test,y_test))


Using TensorFlow backend.
  return f(*args, **kwds)


ValueError: Error when checking target: expected activation_2 to have shape (None, 21) but got array with shape (28249, 1)

In [58]:
# NOTE(review): this handle is opened for reading but never closed in the
# visible cells; the only loop that would consume it is commented out.
f = open('data/dataset/Mature.txt','r')
#f_new = open('data/dataset/Mature.txt','w')


In [62]:
# for line in f:
#     l = line.split(" ")
#     j_new = str(l).replace("," ,"").replace("'","").replace("]","").replace("[","").replace("\\n","\n")
#     print(j_new)


In [25]:
# English Snowball stemmer; in the visible cells only commented-out code refers to it.
s=stem.SnowballStemmer('english')

In [18]:
# Normalise every raw category file into a sibling "<name>_new.txt" file:
# lower-case the text, turn commas and apostrophes into spaces, and
# collapse short runs of spaces.
dataset_dir = './data/dataset'
for file_name in sorted(os.listdir(dataset_dir)):
    # Select inputs by name instead of dropping "the first" directory entry:
    # os.listdir returns entries in arbitrary order, and outputs of an
    # earlier run (*_new.txt) must not be processed again.
    if not file_name.endswith('.txt') or file_name.endswith('_new.txt'):
        continue
    path = os.path.join(dataset_dir, file_name)
    new_path = os.path.splitext(path)[0] + "_new.txt"
    # The context managers close both files even if a line fails to process.
    with open(path, 'r') as f, open(new_path, 'w') as f_new:
        for line in f:
            line = line.lower().replace(",", " ").replace("'", " ").replace("    ", " ").replace("   ", " ").replace("  ", " ")
            # Stemming each token with the Snowball stemmer was tried at
            # this point and left disabled.
            f_new.write(line)