This script uses hotel reviews of Helsinki(Norway) in Tripadvisor website to demonstrate sentiment analysis.
### 1st section
The statistics data of Helsinki hotel reviews after preprocessing. The preprocessing includes "duplication elimination", "review record tuple with missing review".
### 2nd section
In 2nd section, applying machine learning and natural language processing technologies to these processed data.


In [1]:
from pyspark.sql import SQLContext, SparkSession
import json

In [2]:
spark = SparkSession.builder.appName("sentiment analysis demo").config(
            "spark.some.config.option", "sentAnalysis").getOrCreate()

In [3]:
source = spark.read.json("../reviewData/Tripadvisor/005_Helsinki_Tripadvisor.json")

### display the first review item, the schema of dataframe shows below:

"hotelStars" attribute means the hotel is on which level, one-star, two-star, three-star.....
"score"

In [4]:
source.first().asDict()

{'_id': Row($oid='598c7e7e9b1f26716b0b33c2'),
 'date': None,
 'hotelLocation': 'Pieni Roobertinkatu 1-3, Helsinki 00130, Finland',
 'hotelName': ' Hotel Lilla Roberts ',
 'hotelStars': '4.5',
 'hotelUrl': 'https://www.tripadvisor.com/Hotel_Review-g189934-d7940665-Reviews-Hotel_Lilla_Roberts-Helsinki_Uusimaa.html',
 'review': 'Lilla Roberts is a brand new Art Deco creation. Materials and craftsmanship of very high quality, built to last and age well. A very welcome and needed addition to the not-so-crowded Helsinki luxury hotel scene. Goes to the very top.Thoughtful details in the room and very comfortable beds, probably the best mattress in Helsinki.Gym yet to open in August 15, though you can use one in the sister hotel.',
 'score': 5.0,
 'title': 'Breathes quality and style',
 'url': 'https://www.tripadvisor.com/ShowUserReviews-g189934-d7940665-r298267980-Hotel_Lilla_Roberts-Helsinki_Uusimaa.html',
 'userId': 'Janos R'}

### The number of reviews

In [5]:
source.count()

23410

### Reduce duplicate reviews items, and eliminate record items with missing info, including "hotelUrl", "review", "usererId"

In [9]:
data = source.drop_duplicates()
data = data.filter("hotelLocation is not Null")
data = data.filter("hotelName is not Null")
data = data.filter("hotelStars is not Null")
data = data.filter("review is not Null")
data = data.filter("score is not Null")
data = data.filter("url is not Null")
data.count()

23231

get Helsinki Hotel basic statistics, the distribution for different hotel stars
[0,1], (1,2], (2,3), [3, 4), [4, 5]

In [5]:
data = source.select("review", "score")

In [6]:
data.head()

Row(review='Lilla Roberts is a brand new Art Deco creation. Materials and craftsmanship of very high quality, built to last and age well. A very welcome and needed addition to the not-so-crowded Helsinki luxury hotel scene. Goes to the very top.Thoughtful details in the room and very comfortable beds, probably the best mattress in Helsinki.Gym yet to open in August 15, though you can use one in the sister hotel.', score=5.0)

In [7]:
data = data.filter("review is not Null")

In [8]:
data = data.filter("score is not Null")

In [9]:
datasplits = data.randomSplit([0.8, 0.2], 24)
train, test = datasplits[0], datasplits[1]

In [10]:
type(train)

pyspark.sql.dataframe.DataFrame

In [11]:
train.count()

18643

In [12]:
data.select("score").rdd.flatMap(lambda x: x).collect()[:10]

[5.0, 4.0, 5.0, 4.0, 4.0, 5.0, 5.0, 3.0, 5.0, 5.0]

In [13]:
from pyspark.mllib.feature import IDF

In [14]:
import re
import string

In [15]:
remove_spl_char_regex = re.compile('[%s]' % re.escape(string.punctuation))  # regex to remove special characters

In [16]:
string.punctuation

'!"#$%&\'()*+,-./:;<=>?@[\\]^_`{|}~'

In [17]:
stopwords = [u'rt', u're', u'i', u'me', u'my', u'myself', u'we', u'our', u'ours', u'ourselves', u'you', u'your',
             u'yours', u'yourself', u'yourselves', u'he', u'him', u'his', u'himself', u'she', u'her', u'hers',
             u'herself', u'it', u'its', u'itself', u'they', u'them', u'their', u'theirs', u'themselves', u'what',
             u'which', u'who', u'whom', u'this', u'that', u'these', u'those', u'am', u'is', u'are', u'was', u'were',
             u'be', u'been', u'being', u'have', u'has', u'had', u'having', u'do', u'does', u'did', u'doing', u'a',
             u'an', u'the', u'and', u'but', u'if', u'or', u'because', u'as', u'until', u'while', u'of', u'at', u'by',
             u'for', u'with', u'about', u'against', u'between', u'into', u'through', u'during', u'before', u'after',
             u'above', u'below', u'to', u'from', u'up', u'down', u'in', u'out', u'on', u'off', u'over', u'under',
             u'again', u'further', u'then', u'once', u'here', u'there', u'when', u'where', u'why', u'how', u'all',
             u'any', u'both', u'each', u'few', u'more', u'most', u'other', u'some', u'such', u'no', u'nor', u'not',
             u'only', u'own', u'same', u'so', u'than', u'too', u'very', u's', u't', u'can', u'will', u'just', u'don',
             u'should', u'now']

In [18]:
# tokenize函数对tweets内容进行分词
def tokenize(text):
    tokens = []
#     print(text)
#     text = text.encode('ascii', 'ignore')  # to decode
#     print("-------")
#     print(text)
    text = remove_spl_char_regex.sub(" ", text)  # Remove special characters
    text = text.lower()

    for word in text.split():
        if word not in stopwords \
                and word not in string.punctuation \
                and len(word) > 1 \
                and word != '``':
            tokens.append(word)
    return tokens

In [19]:
def doc2vec(document):
    # 100维的向量
    doc_vec = np.zeros(100)
    tot_words = 0

    for word in document:
        try:
        # 查找该词在预训练的word2vec模型中的特征值
            vec = np.array(lookup_bd.value.get(word)) + 1
            # print(vec)
            # 若该特征词在预先训练好的模型中，则添加到向量中
            if vec != None:
                doc_vec += vec
                tot_words += 1
        except:
            continue

    vec = doc_vec / float(tot_words)
    return vec

In [20]:
import numpy as np

In [21]:
lookup = spark.read.parquet("/home/yi/Music/Sentiment-Analysis/word2vecM_simple/data").alias("lookup")

In [22]:
lookup.printSchema()

root
 |-- word: string (nullable = true)
 |-- vector: array (nullable = true)
 |    |-- element: float (containsNull = true)



In [23]:
from pyspark import SparkContext

In [24]:
sc = spark.sparkContext

In [25]:
lookup_bd = sc.broadcast(lookup.rdd.collectAsMap())

In [26]:
trn_data = []

In [27]:
from pyspark.mllib.regression import LabeledPoint

In [28]:
for row in train.collect():
#     print("review: ", row["review"])
    token_text = tokenize(row["review"])# 规范化评论文本，进行分词
#     print("segmentation")
#     print(token_text)
    review_text = doc2vec(token_text)# 将文本转换为向量
#     print(review_text)
    trn_data.append(LabeledPoint(row["score"], review_text))
    



In [29]:
trn_data[1]

LabeledPoint(1.0, [nan,nan,nan,nan,nan,nan,nan,nan,nan,nan,nan,nan,nan,nan,nan,nan,nan,nan,nan,nan,nan,nan,nan,nan,nan,nan,nan,nan,nan,nan,nan,nan,nan,nan,nan,nan,nan,nan,nan,nan,nan,nan,nan,nan,nan,nan,nan,nan,nan,nan,nan,nan,nan,nan,nan,nan,nan,nan,nan,nan,nan,nan,nan,nan,nan,nan,nan,nan,nan,nan,nan,nan,nan,nan,nan,nan,nan,nan,nan,nan,nan,nan,nan,nan,nan,nan,nan,nan,nan,nan,nan,nan,nan,nan,nan,nan,nan,nan,nan,nan])

In [30]:
type(lookup)

pyspark.sql.dataframe.DataFrame

In [31]:
trnData = sc.parallelize(trn_data)

In [32]:
tst_data = []

In [33]:
for row in test.collect():
#     print("review: ", row["review"])
    token_text = tokenize(row["review"])# 规范化评论文本，进行分词
#     print("segmentation")
#     print(token_text)
    review_text = doc2vec(token_text)# 将文本转换为向量
#     print(review_text)
    tst_data.append(LabeledPoint(row["score"], review_text))



In [34]:
tst_data[:1]

[LabeledPoint(4.0, [nan,nan,nan,nan,nan,nan,nan,nan,nan,nan,nan,nan,nan,nan,nan,nan,nan,nan,nan,nan,nan,nan,nan,nan,nan,nan,nan,nan,nan,nan,nan,nan,nan,nan,nan,nan,nan,nan,nan,nan,nan,nan,nan,nan,nan,nan,nan,nan,nan,nan,nan,nan,nan,nan,nan,nan,nan,nan,nan,nan,nan,nan,nan,nan,nan,nan,nan,nan,nan,nan,nan,nan,nan,nan,nan,nan,nan,nan,nan,nan,nan,nan,nan,nan,nan,nan,nan,nan,nan,nan,nan,nan,nan,nan,nan,nan,nan,nan,nan,nan])]

In [35]:
type(tst_data)

list

In [36]:
tstData = sc.parallelize(tst_data)

In [37]:
from pyspark.mllib.tree import RandomForest

In [38]:
# train random forest classifier model
model = RandomForest.trainClassifier(trnData, numClasses=6, categoricalFeaturesInfo={},
                                     numTrees=3, featureSubsetStrategy="auto",
                                     impurity='gini', maxDepth=4, maxBins=32)

In [39]:
# predict the rest 20% test data
predictions = model.predict(tstData.map(lambda x: x.features))
labelsAndPredictions = tstData.map(lambda lp: lp.label).zip(predictions)

In [41]:
testErr = labelsAndPredictions.filter(lambda v: v[0] != v[1]).count() / float(tstData.count())

In [42]:
print('Test Error = ' + str(testErr))
print('Learned classification tree model:')
# print the random forest classifier model
print(model.toDebugString())

Test Error = 0.5892394122731202
Learned classification tree model:
TreeEnsembleModel classifier with 3 trees

  Tree 0:
    Predict: 4.0
  Tree 1:
    Predict: 4.0
  Tree 2:
    Predict: 4.0



In [43]:
# using SVM methods
from pyspark.mllib.classification import SVMWithSGD


In [44]:
# has error,
# svm = SVMWithSGD.train(sc.parallelize(tst_data))

In [45]:
sc.parallelize(tst_data)

ParallelCollectionRDD[56] at parallelize at PythonRDD.scala:475

In [46]:
# using bag of words method
from sklearn.feature_extraction.text import CountVectorizer
vectorizer = CountVectorizer(ngram_range=(1, 2), max_features=100000)


In [47]:
vectorizer.fit(trnData[)

SyntaxError: invalid syntax (<ipython-input-47-ca15ba20a31b>, line 1)

In [None]:
trnData.map(lambda x : x[0]).toList()

In [56]:
trainScores = [i.score for i in train.select("score").collect()]

In [59]:
trainScores[2]

3.0

In [51]:
trainReviews = [i.review for i in train.select("review").collect()]

In [58]:
trainReviews[2]

'+ Good location. Delicious breakfast. Convenient parking. This hotel is good for travelers and business meetings.- Sanitary equipment was not working. Standing water was In the sink. The minibar was empty.'

In [65]:
vec = vectorizer.fit_transform(trainReviews).toarray()

In [69]:
vec[100][:100]

array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0], dtype=int64)

In [61]:
testReviews = [i.review for i in test.select("review").collect()]

In [63]:
testScore = [i.score for i in test.select("score").collect()]

CountVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=100000, min_df=1,
        ngram_range=(1, 2), preprocessor=None, stop_words=None,
        strip_accents=None, token_pattern='(?u)\\b\\w\\w+\\b',
        tokenizer=None, vocabulary=None)

In [70]:
example1 = trainReviews[1]

In [71]:
example1

'(See in English below)Yöpyminen oli hyvitys täysin pilalle menneestä matkasta, koska hotelli halusi näyttää, että pystyvät parempaan. Hyvitys hyvitettiin hyvityksellä, joka hyvitettiin hyvityksellä ja jossa epäonnistuttiin. Yhdessäkään asiassa ei oltu otettu opiksi ensimmäisestä matkasta. Nyt on siis kaksi kertaa kokeiltu ja voin sanoa, etten suosittele mistään hinnasta. Palvelut eivät keskustele keskenään ja jäävät irrallisiksi kikkareiksi. Siitä syystä palveluiden välille tulee katkoksia, joista asiakas kärsii. Työntekijät hoitavat oman "tonttinsa", mutta eivät mieti, miten asia näyttäytyy asiakkaalle. Hotellissa kukaan ei ota viimeistä vastuuta ja huolehdi, että lopulta asiat onnistuvat. Puitteet olisivat huikeat ja mahdollisuudet olisivat rajattomat, mutta koska hotellia ei osata johtaa, ei noita mahdollisuuksia saada käyttöön. Prosessit ontuvat ja pahasti. The hotel itself is fine and new but everything else doesn\'t work. Don\'t bother. This trip was a compensation of the other 

In [73]:
letters_only = re.sub("[^a-zA-Z]", " ", example1)
print(letters_only)

 See in English below Y pyminen oli hyvitys t ysin pilalle menneest  matkasta  koska hotelli halusi n ytt    ett  pystyv t parempaan  Hyvitys hyvitettiin hyvityksell   joka hyvitettiin hyvityksell  ja jossa ep onnistuttiin  Yhdess k  n asiassa ei oltu otettu opiksi ensimm isest  matkasta  Nyt on siis kaksi kertaa kokeiltu ja voin sanoa  etten suosittele mist  n hinnasta  Palvelut eiv t keskustele kesken  n ja j  v t irrallisiksi kikkareiksi  Siit  syyst  palveluiden v lille tulee katkoksia  joista asiakas k rsii  Ty ntekij t hoitavat oman  tonttinsa   mutta eiv t mieti  miten asia n ytt ytyy asiakkaalle  Hotellissa kukaan ei ota viimeist  vastuuta ja huolehdi  ett  lopulta asiat onnistuvat  Puitteet olisivat huikeat ja mahdollisuudet olisivat rajattomat  mutta koska hotellia ei osata johtaa  ei noita mahdollisuuksia saada k ytt  n  Prosessit ontuvat ja pahasti  The hotel itself is fine and new but everything else doesn t work  Don t bother  This trip was a compensation of the other tri

In [74]:
lower_case = letters_only.lower()
words = lower_case.split()

In [76]:
import nltk

In [95]:
from nltk.corpus import stopwords

print(len(stopwords.words("english")))

153


In [98]:
words = [i for i in words if not i in stopwords.words("english")]

In [82]:
print(words)

['see', 'english', 'pyminen', 'oli', 'hyvitys', 'ysin', 'pilalle', 'menneest', 'matkasta', 'koska', 'hotelli', 'halusi', 'n', 'ytt', 'ett', 'pystyv', 'parempaan', 'hyvitys', 'hyvitettiin', 'hyvityksell', 'joka', 'hyvitettiin', 'hyvityksell', 'ja', 'jossa', 'ep', 'onnistuttiin', 'yhdess', 'k', 'n', 'asiassa', 'ei', 'oltu', 'otettu', 'opiksi', 'ensimm', 'isest', 'matkasta', 'nyt', 'siis', 'kaksi', 'kertaa', 'kokeiltu', 'ja', 'voin', 'sanoa', 'etten', 'suosittele', 'mist', 'n', 'hinnasta', 'palvelut', 'eiv', 'keskustele', 'kesken', 'n', 'ja', 'j', 'v', 'irrallisiksi', 'kikkareiksi', 'siit', 'syyst', 'palveluiden', 'v', 'lille', 'tulee', 'katkoksia', 'joista', 'asiakas', 'k', 'rsii', 'ty', 'ntekij', 'hoitavat', 'oman', 'tonttinsa', 'mutta', 'eiv', 'mieti', 'miten', 'asia', 'n', 'ytt', 'ytyy', 'asiakkaalle', 'hotellissa', 'kukaan', 'ei', 'ota', 'viimeist', 'vastuuta', 'ja', 'huolehdi', 'ett', 'lopulta', 'asiat', 'onnistuvat', 'puitteet', 'olisivat', 'huikeat', 'ja', 'mahdollisuudet', 'olisi

In [101]:
# preprocess the review data
def review_to_words(raw_review):
#     print("raw review:  ", raw_review)
#     print(type(raw_review))
# 1. Remove non-letters
    letters_only = re.sub("[^a-zA-Z]", " ", raw_review)
# 2. convert to lower case, split into individual words
    letters_only = letters_only.lower().split()
# 3. in python, searching a set is much faster than searching a list, so convert the stopwords to a set.
    words = set(stopwords.words("english"))
# 4. remove stop words
    words = [i for i in letters_only if not i in words]
# 5. join the words back into one string separated by space, and return the result.
    result = " ".join(words)
    return result

In [102]:
clean_trainReviews = [review_to_words(i) for i in trainReviews]

In [103]:
# Creating Features from a Bag of Words (Using scikit-learn)
print("creating the bag of words....")
from sklearn.feature_extraction.text import CountVectorizer
# countVectorizer is scikit-learn's bag of words tool
vectorizer2 = CountVectorizer(analyzer="word", tokenizer= None, 
                              preprocessor= None, stop_words= None, max_features= 5000)
clean_trainReviews_features = vectorizer2.fit_transform(clean_trainReviews)
clean_trainReviews_features = clean_trainReviews_features.toarray()

creating the bag of words....


In [104]:
clean_trainReviews_features.shape

(18643, 5000)

In [107]:
clean_trainReviews_features[1][:500]

array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1,
       1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 4, 3, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 3, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 2, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0,

In [108]:
vocab = vectorizer2.get_feature_names()

In [110]:
print(len(vocab))

5000


In [111]:
print(vocab[:100])

['aalto', 'ability', 'abit', 'able', 'abroad', 'absence', 'absent', 'absolute', 'absolutely', 'abundance', 'abundant', 'ac', 'accept', 'acceptable', 'accepted', 'access', 'accessed', 'accessibility', 'accessible', 'accidentally', 'accommodate', 'accommodated', 'accommodating', 'accommodation', 'accommodations', 'accomodate', 'accomodating', 'accomodation', 'accompanied', 'according', 'accordingly', 'account', 'accross', 'accurate', 'accustomed', 'across', 'act', 'action', 'active', 'activities', 'activity', 'actual', 'actually', 'adapter', 'adaptor', 'add', 'added', 'adding', 'addition', 'additional', 'additionally', 'address', 'addressed', 'adds', 'adequate', 'adequately', 'adjacent', 'adjoining', 'adjust', 'adjustable', 'adjusted', 'admit', 'admittedly', 'adult', 'adults', 'advance', 'advantage', 'advantages', 'adventure', 'advertise', 'advertised', 'advertises', 'advertising', 'advice', 'advisable', 'advise', 'advised', 'advising', 'advisor', 'affair', 'affect', 'afford', 'affordabl

In [114]:
import numpy as np

# Sum up the counts of each vocabulary word
dist = np.sum(clean_trainReviews_features, axis=0)

# For each, print the vocabulary word and the number of times it 
# appears in the training set
i = 0
for tag, count in zip(vocab, dist):
    if i == 30:
        break
    else:
        i += 1
        print(count, tag)

26 aalto
26 ability
11 abit
695 able
22 abroad
20 absence
12 absent
55 absolute
508 absolutely
23 abundance
43 abundant
129 ac
35 accept
152 acceptable
39 accepted
1027 access
34 accessed
15 accessibility
180 accessible
11 accidentally
50 accommodate
43 accommodated
198 accommodating
155 accommodation
45 accommodations
14 accomodate
21 accomodating
37 accomodation
16 accompanied
64 according


In [116]:
len(trainScores)

18643

In [117]:
from sklearn.ensemble import RandomForestClassifier
forest = RandomForestClassifier(n_estimators = 100)


In [118]:
forest = forest.fit(clean_trainReviews_features, trainScores)

In [121]:
# clean test reviews
clean_test_reviews = [review_to_words(i) for i in testReviews]

# vectorize the reviews using trained vectorizer2
clean_test_reviews_feature = vectorizer2.transform(clean_test_reviews)
clean_test_reviews_feature = clean_test_reviews_feature.toarray()

In [122]:
test_predict = forest.predict(clean_test_reviews_feature)

In [123]:
print(test_predict[:100])

[ 5.  5.  4.  5.  4.  4.  4.  5.  5.  4.  4.  4.  4.  4.  4.  4.  4.  4.
  4.  4.  5.  4.  4.  4.  4.  4.  5.  5.  4.  3.  4.  5.  4.  3.  4.  4.
  4.  4.  5.  4.  5.  4.  4.  5.  5.  4.  4.  5.  4.  4.  4.  5.  4.  4.
  4.  4.  4.  4.  4.  5.  4.  4.  4.  4.  4.  5.  3.  5.  5.  4.  5.  4.
  5.  5.  5.  4.  3.  5.  5.  4.  4.  4.  5.  5.  5.  5.  5.  4.  4.  4.
  4.  4.  4.  5.  4.  4.  4.  4.  5.  5.]


In [124]:
from sklearn.metrics import accuracy_score
accuracy = accuracy_score(testScore, test_predict)
print("accuracy:   ", accuracy)

accuracy:    0.589671564391
