## Through topic modelling, we want to find out the common topics derived from people's negative sentiments regarding public spaces in Paya Lebar

In [1]:
# Standard dataframe packages
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

# Text analytics packages
import gensim
import nltk
from sklearn.feature_extraction.text import CountVectorizer

In [2]:
#Note: Try either of the following encodings: encoding='utf-8' or encoding='ISO-8859-1'
#import data
data=pd.read_excel('Sentiment.xlsx',sheet_name="Main",skipfooter=1)

In [3]:
#quick peek at the data
data.head()

Unnamed: 0,Social Offerings,Openness,Amenities,Cultural Heritage,Full text
0,We are staying in Geylang near the river. We g...,Very nice. So far I haven't encountered bad pe...,I think it’s already nice just the mosquitoes....,Usually we will go to the mall. I don’t get to...,We are staying in Geylang near the river. We g...
1,Not so really bonded to community. Paya Lebar ...,Last time my impression was this place was for...,The most modern building will be Paya Lebar Qu...,Yeah then the URA got a certain development pl...,Not so really bonded to community. Paya Lebar ...
2,OK actually we go to the shopping mall. This i...,Oh yeah absolutely absolutely welcome people i...,Let's say I was very surprised by PLQ. I think...,"Absolutely important, yes that's why I think P...",OK actually we go to the shopping mall. This i...
3,I started liking this place after 4 months sta...,"Around the condo. Just like family, we know ea...","Playground one, two things very boring. I thin...",Don't really know any cultural heritage here. ...,I started liking this place after 4 months sta...
4,My house is near to church. I’m a christian so...,I feel connected because mostly food. I usuall...,The river reminds me of my hometown.,No I don’t go there.,My house is near to church. I’m a christian so...


In [4]:
data2=data.loc[:,"Full text"].astype("str")

In [5]:
#import external packages for sentiment analysis
from nltk.sentiment.vader import SentimentIntensityAnalyzer

In [6]:
# nltk.download('vader_lexicon')

In [7]:
analyzer = SentimentIntensityAnalyzer()
#quick peek at data generated from vader
# for i in range(len(data2)):
#     scores = analyzer.polarity_scores(data2.iloc[i])
#     print(scores)

In [8]:
# VADER produces four sentiment metrics from these word ratings The first three, positive, neutral and negative,
# represent the proportion of the text that falls into those categories. 
# The polarity_scores gives us numerical values for use of negative, neutral, and positive word choice. 
# The compound value reflects the overall sentiment normalized to range of -1 being very negative to +1 being very positive.
#Printing the sentiment nicely in a table format. 
my_vader_score_compound = [ ] 
my_vader_score_positive = [ ] 
my_vader_score_negative = [ ] 
my_vader_score_neutral = [ ] 

for i in range(len(data2)):
    my_analyzer = analyzer.polarity_scores(data2.iloc[i])
    my_vader_score_compound.append(my_analyzer['compound'])
    my_vader_score_positive.append(my_analyzer['pos'])
    my_vader_score_negative.append(my_analyzer['neg']) 
    my_vader_score_neutral.append(my_analyzer['neu']) 


#converting sentiment values to numpy for easier usage
my_vader_score_compound = np.array(my_vader_score_compound)
my_vader_score_positive = np.array(my_vader_score_positive)
my_vader_score_negative = np.array(my_vader_score_negative)
my_vader_score_neutral = np.array(my_vader_score_neutral)

In [9]:
dataVader=data2.copy()
dataVader=pd.DataFrame(dataVader)

In [10]:
dataVader['Score'] = my_vader_score_compound
dataVader['postve'] = my_vader_score_positive
dataVader['neg'] = my_vader_score_negative
dataVader['neu'] = my_vader_score_neutral
dataVader.head()

Unnamed: 0,Full text,Score,postve,neg,neu
0,We are staying in Geylang near the river. We g...,0.9668,0.184,0.0,0.816
1,Not so really bonded to community. Paya Lebar ...,0.9886,0.11,0.036,0.855
2,OK actually we go to the shopping mall. This i...,0.9996,0.24,0.012,0.748
3,I started liking this place after 4 months sta...,0.9914,0.205,0.022,0.773
4,My house is near to church. I’m a christian so...,0.942,0.19,0.037,0.773


In [11]:
#Setting threshold for what comment is considered negative sentiment
neg_data=dataVader[dataVader["Score"]<=0.40]
neg_text=neg_data["Full text"]

In [12]:
#create stop words list
stop_words = nltk.corpus.stopwords.words('english')
stop_words.extend(['quite', 'paya', 'lebar', 'PLQ', 'would','go','come','already'])
# stop_words.extend(['quite', 'paya', 'lebar', 'PLQ', 'would','go','come',
#                    'already','feel','like','place','around','area','this',
#                    'know','also','really','nice','enough','good','yeah','important','love','absolutely','still','okay','even','though','cuz'
#                   ,'make'])

#create tokenizer
#\w matches [a-zA-Z0-9_]. This tokenizer splits the string using regular expressions. E.g - '[A-Z]\w+' will select just the capital words
wtk = nltk.tokenize.RegexpTokenizer(r'\w+')

#create lemmatizer
wnl = nltk.stem.wordnet.WordNetLemmatizer()

In [13]:
#create bigrams
from nltk.util import bigrams

def process_bigrams(documents):
    process_docs = []
    doc_tokens = []
    for doc in documents:
        doc = doc.lower()
        doc_tokens = wtk.tokenize(doc)
        doc_tokens = [token for token in doc_tokens if token not in stop_words]
        doc_tokens = [wnl.lemmatize(token) for token in doc_tokens] #Can try spacy or see how to use the (nltk.pos_tag()) with lemmatizer
        doc_tokens= ["_".join(t) for t in bigrams(doc_tokens)]
        print(doc_tokens)
        doc_tokens = [token for token in doc_tokens if len(token) > 2]
        process_docs.append(doc_tokens)
    return process_docs
process_neg_bigrams=process_bigrams(neg_text)

['many_shopping', 'shopping_mall', 'mall_big', 'big_change', 'change_last', 'last_4', '4_year', 'year_never', 'never_visit', 'visit_realised', 'realised_completely', 'completely_changed', 'changed_cultural', 'cultural_heritage', 'heritage_see']
['open_space', 'space_kid', 'kid_move', 'move_play', 'play_around', 'around_compared', 'compared_river', 'river_valley', 'valley_convenience', 'convenience_ecp', 'ecp_know', 'know_cultural', 'cultural_heritage']
['face_slight', 'slight_discrimination', 'discrimination_enter', 'enter_shopping', 'shopping_mall', 'mall_ask', 'ask_check', 'check_id', 'id_go', 'go_gym', 'gym_people', 'people_talk', 'talk_behind', 'behind_back', 'back_ask', 'ask_away', 'away_much', 'much_amenity', 'amenity_sport', 'sport_proper', 'proper_running', 'running_track', 'track_apart', 'apart_nearby', 'nearby_indoor', 'indoor_gym', 'gym_know', 'know_cultural', 'cultural_heritage']
['difficult_take', 'take_rest', 'rest_outside', 'outside_kopitiam', 'kopitiam_settle', 'settle_

In [14]:
# Create a dictionary representation of the documents. Each word gets an id
dictionary = gensim.corpora.Dictionary(process_neg_bigrams)

In [15]:
#filtering out bigrams based on their occurrence in the whole sentiments dataset
dictionary.filter_extremes(no_below=1, no_above=0.6)

In [16]:
# Transforming corpus into bag of words vectors
corpus_vect = [dictionary.doc2bow(text) for text in process_neg_bigrams]

In [17]:
#Train lda model
num_topics =3
ldamodel = gensim.models.ldamodel.LdaModel(corpus_vect, num_topics = num_topics, id2word=dictionary, passes=25, random_state= 0)


In [18]:
#formatting the topics in print
for num ,topic in ldamodel.show_topics(formatted=True, num_topics=num_topics, num_words=17):
    print(str(num)+": "+ topic)

0: 0.016*"know_cultural" + 0.009*"settle_public" + 0.009*"strict_cannot" + 0.009*"cannot_talk" + 0.009*"regulation_strict" + 0.009*"cannot_cross" + 0.009*"eat_cannot" + 0.009*"eat_know" + 0.009*"cannot_bring" + 0.009*"kopitiam_settle" + 0.009*"leg_cannot" + 0.009*"heavy_food" + 0.009*"talk_loudly" + 0.009*"bring_heavy" + 0.009*"area_sometimes" + 0.009*"cross_leg" + 0.009*"difficult_take"
1: 0.011*"shopping_mall" + 0.011*"know_cultural" + 0.011*"gym_people" + 0.011*"ask_away" + 0.011*"ask_check" + 0.011*"running_track" + 0.011*"back_ask" + 0.011*"nearby_indoor" + 0.011*"id_go" + 0.011*"much_amenity" + 0.011*"talk_behind" + 0.011*"go_gym" + 0.011*"indoor_gym" + 0.011*"behind_back" + 0.011*"apart_nearby" + 0.011*"people_talk" + 0.011*"enter_shopping"
2: 0.013*"last_time" + 0.010*"hari_raya" + 0.010*"geylang_serai" + 0.007*"complex_uncle" + 0.007*"family_day" + 0.007*"chiat_complex" + 0.007*"malay_culture" + 0.007*"still_see" + 0.007*"passed_away" + 0.007*"joo_chiat" + 0.007*"never_see" + 

In [19]:
#Evaluate the performance of trained lda model 
from gensim.models import CoherenceModel
#perplexity score the lower the better
log_perplexity = ldamodel.log_perplexity(corpus_vect)
perplexity = 2**(-log_perplexity)
print('Perplexity: ',perplexity)

# Compute Coherence Score
#coherence score the higher the better
coherence_model_lda = CoherenceModel(model=ldamodel, texts=process_neg_bigrams, dictionary=dictionary, coherence='c_v')
coherence_lda = coherence_model_lda.get_coherence()
print('\nCoherence Score: ', coherence_lda)

Perplexity:  75.63623549360358

Coherence Score:  0.7930765307846085
