In [2]:
!pip install nltk

Looking in indexes: https://pypi.tuna.tsinghua.edu.cn/simple
Collecting nltk
  Downloading https://pypi.tuna.tsinghua.edu.cn/packages/a6/0a/0d20d2c0f16be91b9fa32a77b76c60f9baf6eba419e5ef5deca17af9c582/nltk-3.8.1-py3-none-any.whl (1.5 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.5/1.5 MB[0m [31m8.3 MB/s[0m eta [36m0:00:00[0m:00:01[0m00:01[0m
Installing collected packages: nltk
Successfully installed nltk-3.8.1


In [11]:
import time
import pandas as pd
import nltk as nltk
from nltk.corpus import stopwords
from nltk.sentiment.vader import SentimentIntensityAnalyzer
# Fetch the NLTK resources this notebook needs; packages that are already
# present are reported as up-to-date rather than downloaded again.
# - vader_lexicon: lexicon used by SentimentIntensityAnalyzer
# - stopwords: word lists read via nltk.corpus.stopwords
nltk.download('vader_lexicon')
nltk.download('stopwords')


[nltk_data] Downloading package vader_lexicon to
[nltk_data]     /Users/zhaoweiguo/nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/zhaoweiguo/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [5]:
# Read the filtered hotel reviews CSV into the working DataFrame
reviews_csv = "../data/Hotel_Reviews_Filtered.csv"
df = pd.read_csv(reviews_csv)


## 删除停用词

In [13]:
# Remove stop words, i.e. common English words that do not change the sentiment of a sentence.
# Removing them should make the sentiment analysis run faster without making it less accurate
# (stop words do not affect sentiment, but they do slow the analysis down).
# Remove stop words - can be slow for a lot of text!
# Ryan Han (ryanxjhan on Kaggle) has a great post measuring performance of different stop words removal approaches
# https://www.kaggle.com/ryanxjhan/fast-stop-words-removal # using the approach that Ryan recommends
# Build the set once so every membership test in remove_stopwords is a set lookup.
cache = set(stopwords.words("english"))


In [14]:
def remove_stopwords(review, stop_words=None):
    """Return ``review`` with stop words removed.

    Parameters
    ----------
    review : str
        Text to filter. It is split on whitespace, so runs of whitespace
        collapse to single spaces in the result.
    stop_words : collection of str, optional
        Words to drop. When omitted, the module-level ``cache`` set is used,
        which keeps ``Series.apply(remove_stopwords)`` working unchanged.

    Returns
    -------
    str
        The remaining words joined by single spaces.

    Notes
    -----
    Matching is exact and case-sensitive, so a capitalised token is kept even
    if its lowercase form is a stop word. Do not lowercase tokens before the
    lookup without also revisiting ``calc_sentiment``, which compares reviews
    against the literal placeholders "No Negative" and "No Positive".
    """
    if stop_words is None:
        # Fall back to the notebook-wide set built from NLTK's English list.
        stop_words = cache
    return " ".join(word for word in review.split() if word not in stop_words)

In [15]:
# Strip stop words from the negative and positive review text, timing the pass
started = time.time()
for column in ("Negative_Review", "Positive_Review"):
    df[column] = df[column].apply(remove_stopwords)
elapsed = time.time() - started
print(f"Removing stop words took {round(elapsed, 2)} seconds")


Removing stop words took 2.99 seconds


## 执行情绪分析

In [16]:
def calc_sentiment(review):
    """Return a sentiment score for a single review string.

    The literal placeholders "No Negative" and "No Positive" are scored as 0
    without consulting the analyser. Any other text is passed to the
    module-level ``vader_sentiment`` analyser and its "compound" score is
    returned, so ``vader_sentiment`` must exist before the first real review
    is scored.
    """
    if review in ("No Negative", "No Positive"):
        return 0
    scores = vader_sentiment.polarity_scores(review)
    return scores["compound"]

In [18]:
# Create the vader sentiment analyser (there are others in NLTK you can try too)
# calc_sentiment reads this module-level name, so this cell has to run before
# calc_sentiment is applied to any review.
vader_sentiment = SentimentIntensityAnalyzer()

In [19]:
# Score both review columns and store the results as new sentiment columns
print("Calculating sentiment columns for both positive and negative reviews")
started = time.time()
column_pairs = (
    ("Negative_Review", "Negative_Sentiment"),
    ("Positive_Review", "Positive_Sentiment"),
)
for review_column, sentiment_column in column_pairs:
    df[sentiment_column] = df[review_column].apply(calc_sentiment)
elapsed = time.time() - started
print(f"Calculating sentiment took {round(elapsed, 2)} seconds")


Calculating sentiment columns for both positive and negative reviews
Calculating sentiment took 124.46 seconds


In [21]:
# Order rows from the lowest to the highest negative-review score, then show
# the review text next to its score
df = df.sort_values(by="Negative_Sentiment")
print(df.loc[:, ["Negative_Review", "Negative_Sentiment"]])

                                          Negative_Review  Negative_Sentiment
186584  So bad experience memories I hotel The first n...             -0.9920
129503  First charged twice room booked booking second...             -0.9896
307286  The staff Had bad experience even booking Janu...             -0.9889
201953  Everything DO NOT STAY AT THIS HOTEL I never i...             -0.9886
452092  No WLAN room Incredibly rude restaurant staff ...             -0.9884
...                                                   ...                 ...
79215   I find anything hotel first I walked past hote...              0.9938
138365  Wifi terribly slow I speed test network upload...              0.9938
278506  The property great location There bakery next ...              0.9945
339189  Guys I like hotel I wish return next year Howe...              0.9948
480509  I travel lot far visited countless number hote...              0.9957

[515738 rows x 2 columns]


In [22]:
# Order rows from the highest to the lowest positive-review score, then show
# the review text next to its score
df = df.sort_values(by="Positive_Sentiment", ascending=False)
print(df.loc[:, ["Positive_Review", "Positive_Sentiment"]])

                                          Positive_Review  Positive_Sentiment
179007  We went Andaz 40th birthday celebration This a...              0.9991
287419  When first arrived hotel staff incredibly frie...              0.9987
132492  We arrived super cute boutique hotel area expl...              0.9987
322920  From moment stepped doors Guesthouse Hotel sta...              0.9985
293710  This place surprise expected good actually gre...              0.9985
...                                                   ...                 ...
489137  Very rude manager abusive staff reception Dirt...             -0.9703
124178  I didnt like anythig Room small Asked upgrade ...             -0.9721
64158   get everything extra internet parking breakfas...             -0.9751
5839    I completely disappointed mad since reception ...             -0.9780
137893  Bathroom Shower We going stay twice hotel 2 ni...             -0.9820

[515738 rows x 2 columns]


In [23]:
# Reorder the columns (cosmetic, but it makes the data easier to explore later).
# The frame ends up with exactly these columns, in this order.
ordered_columns = [
    "Hotel_Name",
    "Hotel_Address",
    "Total_Number_of_Reviews",
    "Average_Score",
    "Reviewer_Score",
    "Negative_Sentiment",
    "Positive_Sentiment",
    "Reviewer_Nationality",
    "Leisure_trip",
    "Couple",
    "Solo_traveler",
    "Business_trip",
    "Group",
    "Family_with_young_children",
    "Family_with_older_children",
    "With_a_pet",
    "Negative_Review",
    "Positive_Review",
]
df = df.reindex(columns=ordered_columns)


In [24]:
# Write the scored frame to disk, leaving the DataFrame index out of the file
output_csv = r"../data/Hotel_Reviews_NLP.csv"
print("Saving results to Hotel_Reviews_NLP.csv")
df.to_csv(output_csv, index=False)


Saving results to Hotel_Reviews_NLP.csv
