### Reviews about Grab

In this notebook, we are going to perform topic modelling based on sentiments from various review sites. 

In [1]:
# Base
import os, re, string
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
# from wordcloud import WordCloud

# NLTK
import nltk
nltk.download('stopwords')
from nltk.tokenize import sent_tokenize, word_tokenize, RegexpTokenizer
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from nltk.metrics.distance import edit_distance
from spellchecker import SpellChecker
from nltk.corpus import stopwords
from nltk.sentiment.vader import SentimentIntensityAnalyzer
from nltk.stem.wordnet import WordNetLemmatizer

# Topic Modelling
import pyLDAvis
import pyLDAvis.gensim
import gensim
from gensim import corpora
pyLDAvis.enable_notebook()

[nltk_data] Downloading package stopwords to D:\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


scipy.sparse.sparsetools is a private module for scipy.sparse, and should not be used.
  _deprecated()


### Data Preparation

We will first get the relevant files we need and thereafter extract them out into a list of reviews.

In [2]:
# Merge into 

path='.'

filename_sentiment_list=[]

for filename in os.listdir(path):
    if filename.endswith(".csv"):
        # Check what are the files in the folder
        print(os.path.join(path, filename))
        
        #Read the files and put to df
        df = pd.read_csv(filename,encoding = "ISO-8859-1")
        
        #Add it into a list
        try:
            filename_sentiment_list.extend(df['comment'].tolist())
            filename_sentiment_list.extend(df['tweet'].tolist())
            filename_sentiment_list.extend(df['title'].tolist())
        except:
            pass

print(filename_sentiment_list)

.\fb_groups_grab_en.csv
.\gplay_grab_en.csv
.\reddit_grabcar.csv
.\reddit_grabcar_extracted.csv
.\reddit_GrabSG.csv
.\reddit_GrabSG_extracted.csv
.\reddit_grabtaxi.csv
.\reddit_grabtaxi_extracted.csv
.\reddit_grab_driver.csv
.\reddit_grab_driver_extracted.csv
.\reddit_grab_taxi_ride.csv
.\reddit_grab_taxi_ride_extracted.csv
.\reddit_justgrab.csv
.\reddit_justgrab_extracted.csv
.\twitter_GrabSG.csv
.\twitter_grabshare.csv
.\twitter_grabtaxi.csv
.\twitter_grab_driver.csv
.\twitter_grab_taxi_ride.csv
.\twitter_justgrab.csv
.\twitter_str_grabcar_en.csv
.\twitter_str_grabhitch_en.csv


In [3]:
filename_sentiment_list[5:7]

['Maybe army regular, talk as though he is in army like that.',
 'I think he is trying to be funny with you.']

### Cleaning

We will also perform cleaning

In [4]:
def clean_list_tokenise(reviews_list):
    # Tokenise the words
    tokenizer = RegexpTokenizer(r'\w+')
    return [tokenizer.tokenize(sentence) for sentence in reviews_list]

def clean_list_lemma(reviews_list):
    # Lemmatization
    lemma = WordNetLemmatizer()
    return [[lemma.lemmatize(word) for word in sentence] for sentence in reviews_list]

def clean_list_stopwords(reviews_list, stop_other=[]):
    def stopword_condition(word):
        word = word.lower()
        
        return word not in stopwords.words('english') \
            and word not in stop_other \
            and word[:5] != 'http' \
            and word[:5] != 'https' \
            and word[:2] != 'RT' \
            and word[0] != '@'
    
    return [[w for w in s if stopword_condition(w)] for s in reviews_list]

In [5]:
sentiment_filtered = clean_list_stopwords(clean_list_tokenise(filename_sentiment_list))
len(sentiment_filtered)

49720

In [6]:
sentiment_filtered_sentences = [' '.join(s) for s in sentiment_filtered]
sentiment_filtered_sentences[:5]

['sounds little dodgy',
 'dodgy Report see ryde says get bf inform coming despite responses gave weird IMHO',
 'ur bf use acc book',
 'sorry dun find anything wrong msg complain',
 'expedite request pls indicate notes booking behalf']

### Segregating by Sentiment

We will segregate the reviews by sentiment. This is achieved by the `analyse_sentiment_vader` function, which uses vader to help us analyse the degree to which the particular review or comment is positive, negative or neutral. It also computes a compound score ranging from -1 to 1 which takes into account all the scores. More reading for the sentiment analysis are available at:

- https://opensourceforu.com/2016/12/analysing-sentiments-nltk/
- http://t-redactyl.io/blog/2017/04/using-vader-to-handle-sentiment-analysis-with-social-media-text.html

In [7]:
def analyse_sentiment_vader(df, col_name):
    sid = SentimentIntensityAnalyzer()
    vader = lambda text: sid.polarity_scores(text)
    
    df['vader'] = df[col_name].apply(vader)
    df = pd.merge(df, df['vader'].apply(pd.Series), left_index=True, right_index=True)
    return df.drop(['vader'], axis=1)

In [8]:
df_sentiment_filtered = pd.DataFrame(sentiment_filtered_sentences, columns=['comment'])
df_sentiment_filtered = analyse_sentiment_vader(df_sentiment_filtered, 'comment')
df_sentiment_filtered.head(10)

Unnamed: 0,comment,neg,neu,pos,compound
0,sounds little dodgy,0.446,0.554,0.0,-0.1548
1,dodgy Report see ryde says get bf inform comin...,0.123,0.778,0.098,-0.0982
2,ur bf use acc book,0.0,1.0,0.0,0.0
3,sorry dun find anything wrong msg complain,0.633,0.367,0.0,-0.7096
4,expedite request pls indicate notes booking be...,0.0,0.822,0.178,0.0772
5,Maybe army regular talk though army like,0.0,0.706,0.294,0.3612
6,think trying funny,0.0,0.408,0.592,0.4404
7,Born cheap much sex,0.0,1.0,0.0,0.0
8,mother must whore,0.683,0.317,0.0,-0.6486
9,0330am booking u slept U must really2 tired fu...,0.22,0.53,0.25,0.1027


In [9]:
df_sentiment_filtered.tail(10)

Unnamed: 0,comment,neg,neu,pos,compound
49710,need worry grab n uber around taxi less picky ...,0.102,0.81,0.088,-0.1027
49711,ComfortDelgro one fav cab service im rush call...,0.0,0.765,0.235,0.5106
49712,Comfort Delgro also got phone app works simila...,0.0,0.762,0.238,0.7579
49713,im sure option pick,0.0,0.566,0.434,0.3182
49714,deleted 0 6431 pastebin com FcrFs94k 97666,0.0,1.0,0.0,0.0
49715,support sg grab com go Grab app help centre re...,0.0,0.671,0.329,0.6597
49716,complained shit Grab,0.863,0.137,0.0,-0.743
49717,show booking id,0.0,1.0,0.0,0.0
49718,kindness monetary incentive see driver deliver...,0.111,0.536,0.353,0.9643
49719,grab hitch driver spotted,0.0,1.0,0.0,0.0


In [10]:
pos_list = list(df_sentiment_filtered[df_sentiment_filtered['compound'] >= 0]['comment'])
neg_list = list(df_sentiment_filtered[df_sentiment_filtered['compound'] < 0]['comment'])

len(pos_list), len(neg_list)

(38954, 10766)

In [11]:
pos_list_clean = clean_list_lemma(clean_list_tokenise(pos_list))
neg_list_clean = clean_list_lemma(clean_list_tokenise(neg_list))

In [12]:
len(pos_list_clean), len(neg_list_clean)

(38954, 10766)

Ignore code below

### Topic Modelling (Negative)

In [13]:
neg_dict = corpora.Dictionary(neg_list_clean)
doc_term_matrix = [neg_dict.doc2bow(doc) for doc in neg_list_clean]

print(doc_term_matrix[0:10])

[[(0, 1), (1, 1), (2, 1)], [(0, 1), (3, 1), (4, 1), (5, 1), (6, 1), (7, 1), (8, 1), (9, 1), (10, 1), (11, 1), (12, 1), (13, 1), (14, 1), (15, 1)], [(16, 1), (17, 1), (18, 1), (19, 1), (20, 1), (21, 1), (22, 1)], [(23, 1), (24, 1), (25, 1)], [(26, 1), (27, 1)], [(27, 1), (28, 1), (29, 1), (30, 1), (31, 1), (32, 2), (33, 1), (34, 1), (35, 1), (36, 1), (37, 1)], [(30, 1), (32, 1), (34, 1), (38, 1), (39, 1), (40, 1)], [(14, 1), (16, 1), (22, 1), (41, 1), (42, 1), (43, 1), (44, 1), (45, 1), (46, 1), (47, 1), (48, 1), (49, 1), (50, 1), (51, 1), (52, 1), (53, 1), (54, 1), (55, 1), (56, 1), (57, 1)], [(9, 1), (12, 1), (13, 1), (27, 2), (32, 1), (52, 1), (58, 1), (59, 1), (60, 1), (61, 2), (62, 1), (63, 1), (64, 1), (65, 1), (66, 1), (67, 1), (68, 1), (69, 1), (70, 2), (71, 1), (72, 2), (73, 1), (74, 1), (75, 2), (76, 1), (77, 1), (78, 1), (79, 1), (80, 1), (81, 2), (82, 1), (83, 1), (84, 1), (85, 2), (86, 1), (87, 2), (88, 1), (89, 1), (90, 1), (91, 1), (92, 1), (93, 2)], [(17, 2), (51, 1), (6

In [14]:
# Creating the object for LDA model using gensim library
Lda = gensim.models.ldamodel.LdaModel

# Running and Trainign LDA model on the document term matrix.
#doc_term_matrix = frequency of terms of all documents
#dictionary = all unique terms
ldamodel = Lda(doc_term_matrix, num_topics=5, id2word=neg_dict, passes=50)

In [15]:
print(ldamodel.print_topics(num_topics=5, num_words=5))

[(0, '0.017*"driver" + 0.012*"grab" + 0.012*"Grab" + 0.012*"taxi" + 0.010*"uber"'), (1, '0.029*"Grab" + 0.023*"Uber" + 0.012*"service" + 0.009*"grab" + 0.008*"company"'), (2, '0.043*"app" + 0.017*"grab" + 0.016*"use" + 0.013*"time" + 0.013*"update"'), (3, '0.019*"ð" + 0.008*"open" + 0.007*"terrible" + 0.007*"version" + 0.006*"com"'), (4, '0.046*"driver" + 0.020*"time" + 0.014*"ride" + 0.014*"cancel" + 0.012*"get"')]


In [16]:
pyLDAvis.gensim.prepare(ldamodel, doc_term_matrix, neg_dict)

of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=True'.


  return pd.concat([default_term_info] + list(topic_dfs))


# Topic Modelling (Positive)

In [17]:
pos_dict = corpora.Dictionary(pos_list_clean)
doc_term_matrix = [neg_dict.doc2bow(doc) for doc in pos_list_clean]

print(doc_term_matrix[0:10])

[[(5, 1), (139, 1), (190, 1), (554, 1), (3277, 1)], [(364, 1), (451, 1), (725, 1), (736, 1), (949, 1), (2745, 1)], [(304, 1), (392, 1), (600, 1), (611, 2), (711, 1), (1851, 1)], [(405, 1), (664, 1), (5458, 1)], [(64, 1), (202, 1), (10478, 1)], [(24, 1), (88, 1), (360, 1), (451, 1), (554, 1), (1598, 1), (8716, 1), (9094, 1), (11536, 1)], [(7399, 1), (8707, 1)], [(32, 1), (34, 1), (48, 1), (88, 1), (360, 1), (680, 1), (1602, 1)], [(30, 1), (31, 1), (32, 1), (75, 1), (2656, 1), (6563, 1)], [(31, 1), (48, 1), (139, 1), (240, 2), (1115, 1), (5543, 1)]]


In [18]:
# Creating the object for LDA model using gensim library
Lda = gensim.models.ldamodel.LdaModel

# Running and Trainign LDA model on the document term matrix.
#doc_term_matrix = frequency of terms of all documents
#dictionary = all unique terms
ldamodel = Lda(doc_term_matrix, num_topics=5, id2word=pos_dict, passes=50)

In [19]:
pyLDAvis.gensim.prepare(ldamodel, doc_term_matrix, pos_dict)

of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=True'.


  return pd.concat([default_term_info] + list(topic_dfs))
