In [118]:
# using Barcelona hotel review data and as to (5-folder sentiment analysis classifier) to train lstm classifier
import tensorflow as tf
from pymongo import MongoClient
import pandas as pd
import nltk
from nltk.corpus import stopwords
from keras.layers import LSTM
from keras import optimizers
from keras.models import Sequential, Model
import gensim
from gensim.models.doc2vec import TaggedDocument

In [70]:
def getCollection(collName = ""):
    '''
    return pandas dataframe.
    '''
    cursor = db[collName].find({})
    df = pd.DataFrame(list(cursor))
    return df

In [71]:
# connect to mongoclient and fetch Barcelona tripadvisor
client = MongoClient()
db = client.sentimentAnalysis
BarcelonaTripDF = getCollection("barcelonaTripadvisor")

# using hotel reviews in amsterdam as test reviews
amsterdamTrip = getCollection("amsterdamTripadvisor")

In [119]:
negReviews = list()
posReviews = list()

for key, group in BarcelonaTripDF.groupby("score"):
    if key in [0, 1, 2]:
        print("negtive: ", key)
        print("group len: ", len(group))
        negReviews += [i.split() for i in group["review"]]
    elif key in [4, 5]:
        print("postive: ", key)
        print("group len: ", len(group))
        posReviews += [i.split() for i in group["review"]]
    else:
        pass

negtive:  1.0
group len:  113
negtive:  2.0
group len:  137
postive:  4.0
group len:  809
postive:  5.0
group len:  1674


In [120]:
testNegReviews = list()
testPosReviews = list()

for key, group in amsterdamTrip.groupby("score"):
    if key in [0, 1, 2]:
        print("negtive: ", key)
        print("group len: ", len(group))
        testNegReviews += [i.split() for i in group["review"]]
    elif key in [4, 5]:
        print("postive: ", key)
        print("group len: ", len(group))
        testPosReviews += [i.split() for i in group["review"]]
    else:
        pass

negtive:  1.0
group len:  87
negtive:  2.0
group len:  85
postive:  4.0
group len:  728
postive:  5.0
group len:  1340


In [121]:
len(posReviews)

2483

In [122]:
len(negReviews)

250

In [123]:
len(testPosReviews)

2068

In [124]:
len(testNegReviews)

172

In [102]:
# LabeledSentence
pos_docs = list()
neg_docs = list()


for i in range(len(posReviews)):
    pos_docs.append(TaggedDocument(words=posReviews[i], tags=['TRAIN_POS_'+str(i)]))
for i in range(len(negReviews)):
    neg_docs.append(TaggedDocument(words=negReviews[i], tags=['TRAIN_NEG_'+str(i)]))    

for i in range(len(testPosReviews)):
    pos_docs.append(TaggedDocument(words=posReviews[i], tags=['TEST_POS_'+str(i)]))
for i in range(len(testNegReviews)):
    neg_docs.append(TaggedDocument(words=testNegReviews[i], tags=['TEST_NEG_'+str(i)]))

In [103]:
# train doc2vec classifier
model = gensim.models.Doc2Vec(neg_docs+pos_docs, min_count=1, window=10, size=100,
                              sample=1e-4, negative=5, workers=7)



In [105]:
model.save("./Reviews.d2v")

In [125]:
model = gensim.models.Doc2Vec.load('./Reviews.d2v')

In [126]:
for epoch in range(5):
    model.train(neg_docs+pos_docs, total_examples=model.corpus_count, epochs=model.iter)

  


In [127]:
import numpy

In [128]:
TRAIN_SIZE = 2733

train_arrays = numpy.zeros((2733, 100))
train_labels = numpy.zeros(2733)

for i in range(2483):
    prefix_train_pos = "TRAIN_POS_" + str(i)
    train_arrays[i] = model[prefix_train_pos]
    train_labels[i] = 1
for i in range(250):
    prefix_train_neg = 'TRAIN_NEG_' + str(i)
    train_arrays[2483 + i] = model[prefix_train_neg]
    train_labels[2483 + i] = 0

In [130]:
# for test hotel reviews dataset
TEST_SIZE = 2240

test_arrays = numpy.zeros((TEST_SIZE, 100))
test_labels = numpy.zeros(TEST_SIZE)

for i in range(2068):
    prefix_train_pos = "TEST_POS_" + str(i)
    test_arrays[i] = model[prefix_train_pos]
    test_labels[i] = 1
for i in range(172):
    prefix_train_neg = 'TEST_NEG_' + str(i)
    test_arrays[2068 + i] = model[prefix_train_neg]
    test_labels[2068 + i] = 0

In [131]:
from sklearn.linear_model import LogisticRegression
# using logistic regression as classifier 
classifier = LogisticRegression()
classifier.fit(train_arrays, train_labels)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)

In [132]:
classifier.score(test_arrays, test_labels)

0.9267857142857143

In [133]:
# using svm as classifier
from sklearn import svm
svmClf = svm.SVC()
svmClf.fit(train_arrays, train_labels)

SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape=None, degree=3, gamma='auto', kernel='rbf',
  max_iter=-1, probability=False, random_state=None, shrinking=True,
  tol=0.001, verbose=False)

In [134]:
svmClf.score(test_arrays, test_labels)

0.934375

In [140]:
# using keras to implement lstm sentiment analysis
from keras.preprocessing import sequence
from keras.models import Sequential
from keras.layers.embeddings import Embedding
from keras.layers.recurrent import LSTM
from keras.layers.core import Dense, Dropout,Activation
from keras.models import model_from_yaml
import multiprocessing

# set parameters:
vocab_dim = 100
maxlen = 100
n_iterations = 1  # ideally more..
n_exposures = 10
window_size = 7
batch_size = 32
n_epoch = 4
input_length = 100
cpu_count = multiprocessing.cpu_count()


# below start apply lstm(long short term memory neural network)
##定义网络结构
def train_lstm(n_symbols,embedding_weights,x_train,y_train,x_test,y_test):
    print('Defining a Simple Keras Model...')
    model = Sequential()  # or Graph or whatever
    model.add(Embedding(output_dim=vocab_dim,
                        input_dim=n_symbols,
                        mask_zero=True,
                        weights=[embedding_weights],
                        input_length=input_length))  # Adding Input Length
    model.add(LSTM(output_dim=50, activation='sigmoid', inner_activation='hard_sigmoid'))
    model.add(Dropout(0.5))
    model.add(Dense(1))
    model.add(Activation('sigmoid'))

    print('Compiling the Model...')
    model.compile(loss='binary_crossentropy',
                  optimizer='adam',metrics=['accuracy'])

    print("Train...")
    model.fit(x_train, y_train, batch_size=batch_size, 
              nb_epoch=n_epoch,verbose=1, validation_data=(x_test, y_test),show_accuracy=True)

    print("Evaluate...")
    score = model.evaluate(x_test, y_test,
                                batch_size=batch_size)

    yaml_string = model.to_yaml()
    with open('lstm_data/lstm.yml', 'w') as outfile:
        outfile.write( yaml.dump(yaml_string, default_flow_style=True) )
    model.save_weights('lstm_data/lstm.h5')
    print('Test score:', score)
    
    
    
def lstm_predict(string):
    print('loading model......')
    with open('lstm_data/lstm.yml', 'r') as f:
        yaml_string = yaml.load(f)
    model = model_from_yaml(yaml_string)

    print('loading weights......')
    model.load_weights('lstm_data/lstm.h5')
    model.compile(loss='binary_crossentropy',
                  optimizer='adam',metrics=['accuracy'])
    data=input_transform(string)
    data.reshape(1,-1)
    #print data
    result=model.predict_classes(data)
    if result[0][0]==1:
        print(string,' positive')
    else:
        print(string,' negative')

In [141]:
# using tensorflow to implement LSTM sentiment analysis


In [142]:
cpu_count

8

In [62]:
# if score <= 2 then, the review is considered as negative, if socre >= 4, then we say these reviews are positive
model = gensim.models.Word2Vec(words)

In [63]:
model.save('./BarcelonaGensimWord2Vec')

In [64]:
modelBarcelona = gensim.models.Word2Vec.load("./BarcelonaGensimWord2Vec")

In [66]:
type(modelBarcelona.wv.vocab)

dict

In [68]:
print(modelBarcelona.wv.vocab.keys)

<built-in method keys of dict object at 0x7f865996e048>


In [50]:
model.vector_size

100

In [49]:
len(model.wv.vocab)

38

In [52]:
model.wv.vocab

{'Camper': <gensim.models.keyedvectors.Vocab at 0x7f8663baa0b8>,
 'Casa': <gensim.models.keyedvectors.Vocab at 0x7f8663ba6e48>,
 'I': <gensim.models.keyedvectors.Vocab at 0x7f8663ba6be0>,
 'The': <gensim.models.keyedvectors.Vocab at 0x7f8663ba6b38>,
 'We': <gensim.models.keyedvectors.Vocab at 0x7f8663ba69e8>,
 'a': <gensim.models.keyedvectors.Vocab at 0x7f8663ba6ac8>,
 'all': <gensim.models.keyedvectors.Vocab at 0x7f8663ba6cf8>,
 'and': <gensim.models.keyedvectors.Vocab at 0x7f8663ba6d30>,
 'are': <gensim.models.keyedvectors.Vocab at 0x7f8663ba6978>,
 'as': <gensim.models.keyedvectors.Vocab at 0x7f8663baa048>,
 'but': <gensim.models.keyedvectors.Vocab at 0x7f8663ba6ef0>,
 'for': <gensim.models.keyedvectors.Vocab at 0x7f8663ba6b00>,
 'had': <gensim.models.keyedvectors.Vocab at 0x7f8663ba6f60>,
 'have': <gensim.models.keyedvectors.Vocab at 0x7f8663ba6dd8>,
 'hotel': <gensim.models.keyedvectors.Vocab at 0x7f8663baa278>,
 'in': <gensim.models.keyedvectors.Vocab at 0x7f8663baa128>,
 'is': <

In [55]:
model["you"]

  """Entry point for launching an IPython kernel.


array([-2.7176237e-03, -4.2124532e-04, -2.9958170e-03,  1.7631396e-03,
        1.6334026e-04, -2.3682714e-03,  5.2180332e-03, -6.6152535e-04,
       -3.6912810e-03,  1.7486790e-03,  2.4528317e-03, -4.9182242e-03,
        4.6032267e-03, -2.8638407e-03,  1.1333899e-03,  1.3599114e-03,
       -3.5398798e-03,  3.2488762e-03,  5.4170075e-03,  3.7243727e-03,
       -4.4740876e-03,  2.2812365e-03,  3.8590247e-03,  3.4430004e-03,
       -5.7903898e-04, -2.9486124e-03, -2.0798463e-03, -2.6243101e-03,
        9.5090445e-04,  7.0516166e-04, -3.9693285e-03,  4.6384539e-03,
       -8.3733542e-04,  1.0782755e-04,  3.6009306e-03, -2.3609167e-03,
       -1.0568458e-03,  4.7210283e-03,  2.3235232e-03, -8.5816864e-04,
       -2.6506307e-03, -4.4114804e-03,  2.9152636e-03, -2.0264154e-03,
       -7.8529329e-04, -5.0564236e-03,  2.0137927e-03, -4.8121922e-03,
       -4.5178170e-04, -2.4561922e-03, -4.4617970e-03, -5.3392071e-03,
        4.2272461e-04, -2.7396725e-03, -9.6188608e-04,  9.8271517e-04,
      

In [42]:
model.wv.most_similar(positive=["", "king"], negative="man")

KeyError: "word 'woman' not in vocabulary"

In [36]:
model.wv.vocab

{'!': <gensim.models.keyedvectors.Vocab at 0x7f8663bf5d68>,
 "'": <gensim.models.keyedvectors.Vocab at 0x7f8663c002e8>,
 ',': <gensim.models.keyedvectors.Vocab at 0x7f8663bfb198>,
 '-': <gensim.models.keyedvectors.Vocab at 0x7f8663c04080>,
 '.': <gensim.models.keyedvectors.Vocab at 0x7f8663c00518>,
 'A': <gensim.models.keyedvectors.Vocab at 0x7f8663c006a0>,
 'B': <gensim.models.keyedvectors.Vocab at 0x7f8663c04f60>,
 'C': <gensim.models.keyedvectors.Vocab at 0x7f8663bfb278>,
 'E': <gensim.models.keyedvectors.Vocab at 0x7f8663bfb780>,
 'F': <gensim.models.keyedvectors.Vocab at 0x7f8663c00550>,
 'G': <gensim.models.keyedvectors.Vocab at 0x7f8663bfb3c8>,
 'I': <gensim.models.keyedvectors.Vocab at 0x7f8663c00f60>,
 'L': <gensim.models.keyedvectors.Vocab at 0x7f8663c040f0>,
 'N': <gensim.models.keyedvectors.Vocab at 0x7f8663c00eb8>,
 'O': <gensim.models.keyedvectors.Vocab at 0x7f8663c00c50>,
 'R': <gensim.models.keyedvectors.Vocab at 0x7f8663c04fd0>,
 'S': <gensim.models.keyedvectors.Vocab 

In [17]:
model["X"]

  """Entry point for launching an IPython kernel.


array([-0.0652876 , -0.02379496,  0.28427863,  0.22806762, -0.0402721 ,
        0.22190231, -0.28464034,  0.1134514 ,  0.01343165, -0.35749656,
        0.0014141 , -0.24401037,  0.14565443,  0.22039108, -0.06951722,
        0.00307713,  0.12986048,  0.12285537, -0.02502411,  0.1432696 ,
        0.09000761,  0.07506921, -0.10763656, -0.08317614,  0.03427026,
        0.16992094, -0.08538722,  0.12460461, -0.02682418, -0.00146557,
       -0.0702408 , -0.07357073, -0.29178527,  0.13437575, -0.0272955 ,
       -0.01592053,  0.02521549,  0.2799897 ,  0.0790484 , -0.00777822,
       -0.01319677, -0.1299203 , -0.2593838 , -0.16250567, -0.08295222,
       -0.13591553,  0.16659208,  0.17449483,  0.19507363, -0.0390104 ,
       -0.01474262, -0.05659554,  0.00456479,  0.01641093,  0.07135551,
       -0.0871382 , -0.05405086, -0.16807927,  0.00228584,  0.01154468,
        0.09966996,  0.08097436,  0.17029196,  0.01451492,  0.0965705 ,
       -0.03664728, -0.03110873, -0.1515383 ,  0.09444817,  0.01

In [22]:
model.wv.vocab

{' ': <gensim.models.keyedvectors.Vocab at 0x7f8663c759b0>,
 '!': <gensim.models.keyedvectors.Vocab at 0x7f8663c83a58>,
 '"': <gensim.models.keyedvectors.Vocab at 0x7f8663c72a58>,
 '#': <gensim.models.keyedvectors.Vocab at 0x7f8663c83f98>,
 '$': <gensim.models.keyedvectors.Vocab at 0x7f8663c7c3c8>,
 '%': <gensim.models.keyedvectors.Vocab at 0x7f8663c83550>,
 '&': <gensim.models.keyedvectors.Vocab at 0x7f8663c79390>,
 "'": <gensim.models.keyedvectors.Vocab at 0x7f8663c79128>,
 '(': <gensim.models.keyedvectors.Vocab at 0x7f8663c7cda0>,
 ')': <gensim.models.keyedvectors.Vocab at 0x7f8663c72080>,
 '*': <gensim.models.keyedvectors.Vocab at 0x7f8663c7c320>,
 '+': <gensim.models.keyedvectors.Vocab at 0x7f8663c83cf8>,
 ',': <gensim.models.keyedvectors.Vocab at 0x7f8663c75d30>,
 '-': <gensim.models.keyedvectors.Vocab at 0x7f8663c72b00>,
 '.': <gensim.models.keyedvectors.Vocab at 0x7f8663c7c860>,
 '/': <gensim.models.keyedvectors.Vocab at 0x7f8663c79898>,
 '0': <gensim.models.keyedvectors.Vocab 

In [23]:
model.accuracy

<bound method Word2Vec.accuracy of <gensim.models.word2vec.Word2Vec object at 0x7f8663c75f28>>

In [24]:
model.estimate_memory

<bound method BaseWordEmbeddingsModel.estimate_memory of <gensim.models.word2vec.Word2Vec object at 0x7f8663c75f28>>

In [10]:
model.wv["location"

KeyError: "word 'location' not in vocabulary"

In [18]:
model.vocabulary.sample

0.001

In [None]:
model

In [16]:
say_vector = model['say']

  """Entry point for launching an IPython kernel.


KeyError: "word 'say' not in vocabulary"

In [None]:
stop = stopwords.words('english')

In [16]:
#对每个句子的所有词向量取均值
def buildWordVector(text, size,imdb_w2v):
    vec = np.zeros(size).reshape((1, size))
    count = 0.
    for word in text:
        try:
            vec += imdb_w2v[word].reshape((1, size))
            count += 1.
        except KeyError:
            continue
    if count != 0:
        vec /= count
    return vec


In [18]:
#计算词向量
def get_train_vecs(x_train,x_test):
    n_dim = 300
    #Initialize model and build vocab
    imdb_w2v = Word2Vec(size=n_dim, min_count=10)
    imdb_w2v.build_vocab(x_train)

    #Train the model over train_reviews (this may take several minutes)
    imdb_w2v.train(x_train)

    train_vecs = np.concatenate([buildWordVector(z, n_dim,imdb_w2v) for z in x_train])
    #train_vecs = scale(train_vecs)

    np.save('svm_data/train_vecs.npy',train_vecs)
    print train_vecs.shape
    #Train word2vec on test tweets
    imdb_w2v.train(x_test)
    imdb_w2v.save('svm_data/w2v_model/w2v_model.pkl')
    #Build test tweet vectors then scale
    test_vecs = np.concatenate([buildWordVector(z, n_dim,imdb_w2v) for z in x_test])
    #test_vecs = scale(test_vecs)
    np.save('svm_data/test_vecs.npy',test_vecs)
    print(test_vecs.shape)

SyntaxError: Missing parentheses in call to 'print' (<ipython-input-18-410d0a97d5fc>, line 15)

In [14]:
# Model Hyperparameters

sequence_length = 50
embedding_dim = 300        
filter_sizes = (3, 4)
num_filters = 50
dropout_prob = (0.25, 0.5)
hidden_dims = 50

In [15]:
model = Sequential()
model.add(Dropout(dropout_prob[0], input_shape=(sequence_length, embedding_dim)

SyntaxError: unexpected EOF while parsing (<ipython-input-15-92f31af118d9>, line 2)

In [None]:
#  Recurrent Neural Network (RNN) using the Long Short Term Memory (LSTM) to calculate sentiment score on Barcelona
# hotel reviews
def lstm_sent(data):
    