### https://radimrehurek.com/gensim/models/doc2vec.html


In [22]:
from gensim import utils
from gensim.models.doc2vec import TaggedDocument 
from gensim.models import Doc2Vec
import numpy
import random
import os
import nltk.stem as stem



In [23]:
class LabeledLineSentence(object):
    def __init__(self, sources):
        self.sources = sources
        
        flipped = {}
        
        # make sure that keys are unique
        for key, value in sources.items():
            if value not in flipped:
                flipped[value] = [key]
            else:
                raise Exception('Non-unique prefix encountered')
    
    def __iter__(self):
        for source, prefix in self.sources.items():
            with utils.smart_open(source) as fin:
                for item_no, line in enumerate(fin):
                    yield LabeledSentence(utils.to_unicode(line).split(), [prefix + '_%s' % item_no])
    
    def to_array(self):
        self.sentences = []
        for source, prefix in self.sources.items():
            with utils.smart_open(source) as fin:
                for item_no, line in enumerate(fin):
                    self.sentences.append(TaggedDocument (utils.to_unicode(line).split(), [prefix + '_%s' % item_no]))
        return self.sentences
    
    def sentences_perm(self):
        shuffled = list(self.sentences)
        random.shuffle(shuffled)
        return shuffled

In [25]:
# sources = {'test-neg.txt':'TEST_NEG', 'test-pos.txt':'TEST_POS', 'train-neg.txt':'TRAIN_NEG', 'train-pos.txt':'TRAIN_POS', 'train-unsup.txt':'TRAIN_UNS'}
rootPath='../data/dataset/'
# sources = {
#             rootPath + 'Animals & Pet Supplies_new.txt':'TRAIN_1', 
#             rootPath + 'Apparel & Accessories_new.txt':'TRAIN_2',
#             rootPath + 'Arts & Entertainment_new.txt':'TRAIN_3',
#             rootPath + 'Baby & Toddler_new.txt':'TRAIN_4',
#             rootPath + 'Business & Industrial_new.txt':'TRAIN_5',
#             rootPath + 'Cameras & Optics_new.txt':'TRAIN_6',
#             rootPath + 'Electronics_new.txt':'TRAIN_7',
#             rootPath + 'Food, Beverages & Tobacco_new.txt':'TRAIN_8',
#             rootPath + 'Furniture_new.txt':'TRAIN_9',
#             rootPath + 'Hardware_new.txt':'TRAIN_10',
#             rootPath + 'Health & Beauty_new.txt':'TRAIN_11',
#             rootPath + 'Home & Garden_new.txt':'TRAIN_12',
#             rootPath + 'Luggage & Bags_new.txt':'TRAIN_13',
#             rootPath + 'Mature_new.txt':'TRAIN_14',
#             rootPath + 'Media_new.txt':'TRAIN_15',
#             rootPath + 'Office Supplies_new.txt':'TRAIN_16',
#             rootPath + 'Religious & Ceremonial_new.txt':'TRAIN_17',
#             rootPath + 'Software_new.txt':'TRAIN_18',
#             rootPath + 'Sporting Goods_new.txt':'TRAIN_19',
#             rootPath + 'Toys & Games_new.txt':'TRAIN_20',
#             rootPath + 'Vehicles & Parts_new.txt':'TRAIN_21'
#           }
sources = {
            rootPath + 'Animals & Pet Supplies_new.txt':'TRAIN_1', 
            rootPath + 'Apparel & Accessories_new.txt':'TRAIN_2',
            rootPath + 'Arts & Entertainment_new.txt':'TRAIN_3',
            rootPath + 'Business & Industrial_new.txt':'TRAIN_4',
            rootPath + 'Cameras & Optics_new.txt':'TRAIN_5',
            rootPath + 'Electronics_new.txt':'TRAIN_6',
            rootPath + 'Food, Beverages & Tobacco_new.txt':'TRAIN_7',
            rootPath + 'Health & Beauty_new.txt':'TRAIN_8',
            rootPath + 'Home & Garden_new.txt':'TRAIN_9',
            rootPath + 'Mature_new.txt':'TRAIN_10',
            rootPath + 'Media_new.txt':'TRAIN_11',
            rootPath + 'Office Supplies_new.txt':'TRAIN_12',
            rootPath + 'Religious & Ceremonial_new.txt':'TRAIN_13',
            rootPath + 'Software_new.txt':'TRAIN_14',
            rootPath + 'Sporting Goods_new.txt':'TRAIN_15',
            rootPath + 'Toys & Games_new.txt':'TRAIN_16',
          }
#sources

In [26]:
sentences = LabeledLineSentence(sources)

In [27]:
feature_size=200
model = Doc2Vec(min_count=1, window=10, size=feature_size, sample=1e-4, negative=5, workers=8)
# model = Doc2Vec(min_count=1, window=10, size=feature_size, sample=1e-4, hs=1, workers=8)
model.build_vocab(sentences.to_array())

In [28]:
model.train(sentences.sentences_perm(), total_examples=model.corpus_count, epochs=60)

12388471

In [29]:
#总样本量
model.corpus_count

36872

In [71]:
#TRAIN_1的第一条记录 的 特征
model.docvecs['TRAIN_20_503']

array([-0.42974174, -0.84499753,  0.38913134, -0.07190589,  0.19583772,
       -0.21709339,  0.36377141,  0.95978892, -0.7192595 , -0.37560827,
       -0.176626  ,  0.26879093,  0.31063902, -0.6518997 , -0.26445186,
        0.75989532,  0.56949019, -0.27620557,  0.93298906, -0.31902829], dtype=float32)

## Classifying Sentiments

### Training Vectors

Now let's use these vectors to train a classifier. First, we must extract the training vectors. Remember that we have a total of 25000 training reviews, with equal numbers of positive and negative ones (12500 positive, 12500 negative).

Hence, we create a `numpy` array (since the classifier we use only takes numpy arrays. There are two parallel arrays, one containing the vectors (`train_arrays`) and the other containing the labels (`train_labels`).

We simply put the positive ones at the first half of the array, and the negative ones at the second half.

In [30]:
train_arrays = numpy.zeros((model.corpus_count, feature_size))
train_labels = numpy.zeros(model.corpus_count)


In [31]:
new_sources = {v:k for k,v in sources.items()} 
#new_sources


In [32]:
index = 0
label_index = 0
for i in range(1,len(sources)+1,1):
    prefix_train_pos = 'TRAIN_' + str(i) + '_'
    count = len(open(new_sources['TRAIN_'+str(i)]).readlines())
    print(index,index+count)
    for j in range(index,index+count,1):
        #print(prefix_train_pos+str(j))
        train_arrays[j] = model.docvecs[prefix_train_pos+str(j-index)]
        train_labels[j] = label_index
    index = index+count
    label_index =label_index+1
    

0 429
429 6241
6241 16014
16014 19560
19560 20172
20172 24614
24614 24794
24794 26469
26469 29754
29754 29788
29788 30252
30252 30575
30575 30606
30606 30956
30956 34579
34579 36872


In [33]:
# from collections import Counter
# import pprint
# pprint.pprint(sorted(Counter(Y).items()))


In [37]:
print(model.docvecs['TRAIN_10_3'][:10])
print(train_arrays[29757][:10])

[-0.0623354  -0.04678584 -0.00510533 -0.01090491 -0.0113547   0.01019038
 -0.09172204 -0.09480663  0.05127002 -0.08035768]
[-0.0623354  -0.04678584 -0.00510533 -0.01090491 -0.0113547   0.01019038
 -0.09172204 -0.09480663  0.05127002 -0.08035768]


In [134]:
#label to one-hot
# from keras.utils import np_utils
# train_labels = np_utils.to_categorical(train_labels)

In [135]:
# index = 6240
# print(train_arrays[index])
# print(train_labels[index])

In [38]:
from sklearn.utils import shuffle
from sklearn.model_selection import train_test_split

X,Y = shuffle(train_arrays,train_labels,random_state=1)
X_train , X_test , y_train,y_test = train_test_split(X,Y,test_size=0.3,stratify =Y) 




In [39]:
# from imblearn.combine import SMOTEENN
# smote_enn = SMOTEENN(random_state=0)
# X_resampled, y_resampled = smote_enn.fit_sample(X, Y)
# print(sorted(Counter(y_resampled).items()))

from collections import Counter
from imblearn.combine import SMOTETomek
smote_tomek = SMOTETomek(random_state=0)
X_resampled, y_resampled = smote_tomek.fit_sample(X_train, y_train)
print(sorted(Counter(y_resampled).items()))

[(0.0, 6841), (1.0, 6825), (2.0, 6795), (3.0, 6833), (4.0, 6841), (5.0, 6824), (6.0, 6841), (7.0, 6841), (8.0, 6833), (9.0, 6841), (10.0, 6840), (11.0, 6841), (12.0, 6841), (13.0, 6841), (14.0, 6836), (15.0, 6838)]


### Classification

Now we train a logistic regression classifier using the training data.

In [65]:
import sklearn.ensemble as ensemble
gbdt = ensemble.GradientBoostingClassifier(n_estimators=20,subsample=0.7,verbose =1)
#gbdt.fit(X_train,y_train)

In [None]:
gbdt.score(X_test,y_test)

In [40]:
import sklearn
import sklearn.ensemble as ensemble

rf_classifer = sklearn.ensemble.RandomForestClassifier(n_estimators=300, 
                                                       max_depth=None,
                                                       min_samples_split=2,
                                                       min_samples_leaf=1,
                                                       min_weight_fraction_leaf=0.0,
                                                       max_leaf_nodes=None, 
                                                       min_impurity_decrease=0.0, 
                                                       min_impurity_split=None,
                                                       bootstrap=True,
                                                       oob_score=False,
                                                       n_jobs=8,
                                                       random_state=None, 
                                                       verbose=1,
                                                       warm_start=False,
                                                       class_weight=None)
rf_classifer.fit(X_resampled,y_resampled)

[Parallel(n_jobs=8)]: Done  34 tasks      | elapsed:   13.7s
[Parallel(n_jobs=8)]: Done 184 tasks      | elapsed:  1.1min
[Parallel(n_jobs=8)]: Done 300 out of 300 | elapsed:  1.8min finished


RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=300, n_jobs=8,
            oob_score=False, random_state=None, verbose=1,
            warm_start=False)

In [41]:
rf_classifer.score(X_test,y_test)

[Parallel(n_jobs=8)]: Done  34 tasks      | elapsed:    0.1s
[Parallel(n_jobs=8)]: Done 184 tasks      | elapsed:    0.4s
[Parallel(n_jobs=8)]: Done 300 out of 300 | elapsed:    0.6s finished


0.77472428132344962

In [42]:
from imblearn.metrics import classification_report_imbalanced

y_pred = rf_classifer.predict(X_test)
print(classification_report_imbalanced(y_test, y_pred))

[Parallel(n_jobs=8)]: Done  34 tasks      | elapsed:    0.1s


                   pre       rec       spe        f1       geo       iba       sup

        0.0       0.71      0.82      1.00      0.76      0.84      0.68       129
        1.0       0.81      0.85      0.96      0.83      0.89      0.77      1744
        2.0       0.79      0.78      0.92      0.79      0.85      0.72      2932
        3.0       0.72      0.65      0.97      0.68      0.83      0.67      1064
        4.0       0.74      0.83      1.00      0.78      0.86      0.72       184
        5.0       0.80      0.82      0.97      0.81      0.88      0.77      1333
        6.0       0.58      0.48      1.00      0.53      0.76      0.55        54
        7.0       0.89      0.82      1.00      0.86      0.94      0.87       502
        8.0       0.71      0.72      0.97      0.72      0.83      0.68       985
        9.0       0.67      0.40      1.00      0.50      0.82      0.64        10
       10.0       0.67      0.73      1.00      0.70      0.82      0.64       139
   

[Parallel(n_jobs=8)]: Done 184 tasks      | elapsed:    0.3s
[Parallel(n_jobs=8)]: Done 300 out of 300 | elapsed:    0.5s finished


In [21]:
help(classification_report_imbalanced)

Help on function classification_report_imbalanced in module imblearn.metrics.classification:

classification_report_imbalanced(y_true, y_pred, labels=None, target_names=None, sample_weight=None, digits=2, alpha=0.1)
    Build a classification report based on metrics used with imbalanced
    dataset
    
    Specific metrics have been proposed to evaluate the classification
    performed on imbalanced dataset. This report compiles the
    state-of-the-art metrics: precision/recall/specificity, geometric
    mean, and index balanced accuracy of the
    geometric mean.
    
    Parameters
    ----------
    y_true : ndarray, shape (n_samples, )
        Ground truth (correct) target values.
    
    y_pred : ndarray, shape (n_samples, )
        Estimated targets as returned by a classifier.
    
    labels : list, optional
        The set of labels to include when ``average != 'binary'``, and their
        order if ``average is None``. Labels present in the data can be
        excluded, fo

In [23]:
#单层 神经网络  73%
from keras.models import Sequential
from keras.layers import Dense, Activation

model = Sequential()

model.add(Dense(units=150, input_dim=feature_size))
model.add(Activation("relu"))
model.add(Dense(units=21))
model.add(Activation("softmax"))


from keras.optimizers import SGD
model.compile(loss='categorical_crossentropy', optimizer=SGD(lr=0.01, momentum=0.9, nesterov=True),metrics=[ 'acc'])
model.fit(X_train, y_train, epochs=100, batch_size=128,verbose=1,validation_data=(X_test,y_test))


Using TensorFlow backend.
  return f(*args, **kwds)


ValueError: Error when checking target: expected activation_2 to have shape (None, 21) but got array with shape (28249, 1)

In [58]:
f = open('data/dataset/Mature.txt','r')
#f_new = open('data/dataset/Mature.txt','w')


In [62]:
# for line in f:
#     l = line.split(" ")
#     j_new = str(l).replace("," ,"").replace("'","").replace("]","").replace("[","").replace("\\n","\n")
#     print(j_new)


In [25]:
s=stem.SnowballStemmer('english')

In [18]:
listt = os.listdir('./data/dataset')[1:]
for i in listt:
    path = os.path.join("./data/dataset",i)
    f = open(path,'r')
    f_new = open(path.replace(".txt","")+"_new.txt",'w')
    for j in f:
        j=j.lower().replace(","," ").replace("'"," ").replace("    "," ").replace("   "," ").replace("  "," ")
        #提取词干
#         js = j.split(" ")
#         js_new = [s.stem(i) for i in js]
#         j_new = str(js_new).replace("," ,"").replace("'","").replace("]","").replace("[","").replace("\\n","\n")
        #j_new=j_new.lower().replace(","," ").replace("'"," ").replace("    "," ").replace("   "," ").replace("  "," ")
        f_new.write(j)
    f.close()
    f_new.close()