# Setup
- This section covers most of the data preparation: importing the packages, combining the two datasets, and imputing version numbers for entries that are missing them.

In [1]:
import pandas as pd
import numpy as np
import re
import plotly.express as px

## Forming the Dataset: Load/Combine

In [2]:
# Read both review exports, discard their 'Unnamed: 0' column (presumably a
# saved row index - confirm against the CSVs), and flag each row's origin:
# isgoogle = 1 for Google Play, 0 for the Apple App Store.
playstore = (
    pd.read_csv('Data/gcash_playstore_reviews.csv')
    .drop(columns='Unnamed: 0')
    .assign(isgoogle=1)
)
appstore = (
    pd.read_csv('Data/gcash_appstore_reviews.csv')
    .drop(columns='Unnamed: 0')
    .assign(isgoogle=0)
)

In [3]:
# List each store's column names so that matching fields can be lined up.
for frame in (playstore, appstore):
    print(frame.columns)

Index(['reviewId', 'userName', 'userImage', 'content', 'score',
       'thumbsUpCount', 'reviewCreatedVersion', 'at', 'replyContent',
       'repliedAt', 'isgoogle'],
      dtype='object')
Index(['review', 'isEdited', 'date', 'title', 'userName', 'rating',
       'developerResponse', 'isgoogle'],
      dtype='object')


 Similar:
 - userName
 - content (g) : review (a)
 - score (g) : rating (a)
 - at (g) : date (a)
 - thumbsUpCount (g) : null
 - reviewCreatedVersion (g) check correlation w at then fill
 - replyContent (g) : developerResponse (a)

others:
 - title (a) can be useful for topic modelling




In [4]:
# Give the App Store columns the names of their Play Store counterparts so
# that the two frames stack into shared columns.
apple = appstore.rename(columns={
    'review': 'content',
    'rating': 'score',
    'date': 'at',
    'developerResponse': 'replyContent',
})
all_data = pd.concat([playstore, apple])
# Lower-case every review; str() also turns a missing review into the text 'nan'.
all_data['content'] = all_data['content'].map(lambda text: str(text).lower())
all_data['at'] = pd.to_datetime(all_data['at'])

## Imputing versions

In [5]:
# Preview how sparse the version column is compared with the review timestamps.
all_data.loc[:, ['reviewCreatedVersion', 'at']]

Unnamed: 0,reviewCreatedVersion,at
0,5.39.1,2021-04-17 11:06:19
1,,2021-04-17 11:05:15
2,,2021-04-17 11:05:13
3,,2021-04-17 11:04:56
4,5.37.0,2021-04-17 10:55:39
...,...,...
1995,,2017-10-17 23:31:53
1996,,2017-09-24 21:10:44
1997,,2017-09-21 16:51:35
1998,,2017-09-21 03:26:07


In [6]:
# Impute the app version of reviews that lack one: walk the reviews in
# chronological order and carry the most recent known version forward.
# The index is reset first because the concatenated frame repeats index labels;
# a unique index lets the filled values be aligned back row-for-row.
# (Merging the filled values back on 'at' would multiply every group of rows
# that share a timestamp, inflating the dataset.)
all_data = all_data.reset_index(drop=True)
all_data['imputedVersions'] = (
    all_data.sort_values('at', kind='stable')['reviewCreatedVersion'].ffill()
)

# EDA
- Observations on the dataset. If the intended output is a data analysis report, most of the effort should go into this section.

In [7]:
# Approximate word count of a review: the number of spaces it contains, plus one.
all_data['rev_wcount'] = all_data['content'].map(lambda text: 1 + str(text).count(' '))

In [8]:
# Spot-check the word counts against the review text.
all_data.loc[:, ['rev_wcount', 'content']]

Unnamed: 0,rev_wcount,content
0,4,it's so nice app...
1,2,good app
2,20,i always use cash in and pay bills with gcash ...
3,1,ok
4,17,the app always needs to update for the new ver...
...,...,...
190089,18,"if we could transact from paypal to gcash, i h..."
190090,50,not here to make comment but a suggestion. may...
190091,22,i can't activate/register because it always sa...
190092,44,bring it back. convert prepaid load to gcash. ...


In [9]:
# Distribution of review lengths. The count axis is log-scaled (log_y) so the
# sparse long-review bins remain visible next to the short-review bins.
px.histogram(x=all_data['rev_wcount'], log_y=True,
             title='Distribution of review length (words per review)')

In [10]:
# Number of reviews given for each star rating.
px.histogram(x=all_data['score'],
             title='Number of reviews per star rating')

In [None]:
# Share of reviews, in percent, for each star rating from 1 to 5

all_data['score'].value_counts(normalize=True).mul(100)

5    58.747251
1    21.648763
4     8.175955
3     6.671436
2     4.756594
Name: score, dtype: float64

# NLTK
- The language-processing part of this notebook
- Kept separate from the EDA because it is fairly large

## Prep (remove stopwords/tokenize/lemmatize)

In [11]:
import nltk
# Fetch the tokenizer model and the stop-word corpus used by the cells below.
for resource in ('punkt', 'stopwords'):
  nltk.download(resource)
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords

[nltk_data] Downloading package punkt to /Users/Jasper/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/Jasper/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


In [18]:
import json
# Filipino stop words come from the stopwords-tl package's JSON file.
with open('node_modules/stopwords-tl/stopwords-tl.json') as f:
  filipino_stopwords = json.load(f)
# Extra words that were noticed in the reviews and added by hand.
fil = ['yung','nyo','di','wala','naman','nag','pera','sana']
# Final stop-word set: NLTK English + Filipino list + the manual additions.
stopwords_list = set(stopwords.words('english')) | set(filipino_stopwords) | set(fil)

In [25]:
# Replace every run of non-word characters with a space, tokenize the result,
# and drop stop words. The pattern is a raw string so that \W is passed to the
# regex engine as written instead of being treated as a (invalid) string escape.
all_data['tokens'] = all_data['content'].apply(
    lambda x: [w for w in word_tokenize(re.sub(r'\W+', " ", x)) if w not in stopwords_list]
)

In [28]:
# WordNet data and the perceptron POS-tagger model.
for resource in ('wordnet', 'averaged_perceptron_tagger'):
  nltk.download(resource)
from nltk.stem import WordNetLemmatizer, PorterStemmer
lemmatizer, stemmer = WordNetLemmatizer(), PorterStemmer()
# Kept as the cell's last expression, so its return value is displayed.
nltk.download('omw-1.4')

[nltk_data] Downloading package wordnet to /Users/Jasper/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /Users/Jasper/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package omw-1.4 to /Users/Jasper/nltk_data...
[nltk_data]   Unzipping corpora/omw-1.4.zip.


True

In [29]:
# Lemmatize every token of every review with the WordNet lemmatizer.
all_data['lemmas'] = all_data['tokens'].apply(
    lambda words: list(map(lemmatizer.lemmatize, words))
)

## Word Frequency (nltk freqdist) - moved to Gcash_dash

In [30]:
from nltk import FreqDist

Looking into each rating group

In [32]:
# Pool all review text per star rating into one string per rating.
score_content = all_data.groupby('score')['content'].apply(lambda x: ' '.join(x)).reset_index()
# Tokenize the pooled text and drop stop words. The pattern is a raw string so
# that \W reaches the regex engine as written rather than as a string escape.
score_content['tokens'] = score_content['content'].apply(
    lambda x: [w for w in word_tokenize(re.sub(r'(\W+)', " ", x.lower())) if w not in stopwords_list]
)
# Word -> count for each rating, ordered from most to least frequent.
score_content['freqDist'] = score_content['tokens'].apply(
    lambda x: dict(FreqDist(x).most_common())
)

## Features Freq (nltk POS tagging)

In [144]:
from nltk import pos_tag
# pos_tags = score_content[['tokens','score']].copy()
# pos_tags['tokens'] = pos_tags['tokens'].apply(lambda x: pos_tag(x))

In [145]:
# pos_tags['tokens'] = pos_tags['tokens'].apply(lambda x: [word[0] for word in x if word[1] in ['NN','NNP','NNPS','NNS','VB','VBG','VBD','VBN','VBP','VBZ','JJ','JJR','JJS']])

In [146]:
# pos_tags['freqDist'] = pos_tags['tokens'].apply(lambda x: FreqDist(token for token in x))
# pos_tags['freqDist'] = pos_tags['freqDist'].apply(lambda x: {k: v for k, v in sorted(x.items(), key=lambda item: item[1], reverse =True)})

In [147]:
# pos_tags

In [148]:
# [y for x in pos_tags['freqDist'] for y in x if x[y] in range(1000,9000)]

In [149]:
# f1 = [(x,y) for x,y in dict(pos_tags['freqDist'][0]).items()]
# f2 = [(x,y) for x,y in dict(pos_tags['freqDist'][1]).items()]
# f3 = [(x,y) for x,y in dict(pos_tags['freqDist'][2]).items()]
# f4 = [(x,y) for x,y in dict(pos_tags['freqDist'][3]).items()]
# f5 = [(x,y) for x,y in dict(pos_tags['freqDist'][4]).items()]
# f_intersection = set([x[0] for x in f1[:100]]).intersection(set([x[0] for x in f2[:100]])).\
#       intersection(set([x[0] for x in f3[:100]])).intersection(set([x[0] for x in f4[:100]])).\
#       intersection(set([x[0] for x in f5[:100]]))

In [150]:
# feature_1 = filter(f1, f_intersection)
# feature_2 = filter(f2, f_intersection)
# feature_3 = filter(f3, f_intersection)
# feature_4 = filter(f4, f_intersection)
# feature_5 = filter(f5, f_intersection)

In [151]:
# features = pd.DataFrame({'rank': [x for x in range(1,len(feature_1)+1)],
#                          'rated_1': [x[0] for x in feature_1],
#                          'rated_2': [x[0] for x in feature_2],
#                          'rated_3': [x[0] for x in feature_3],
#                          'rated_4': [x[0] for x in feature_4],
#                          'rated_5': [x[0] for x in feature_5],
#                          })

In [152]:
# features

In [153]:
# feat1 = pd.DataFrame({'rating':1, 'word': [x[0] for x in feature_1], 'count': [x[1] for x in feature_1]})
# feat2 = pd.DataFrame({'rating':2, 'word': [x[0] for x in feature_2], 'count': [x[1] for x in feature_2]})
# feat3 = pd.DataFrame({'rating':3, 'word': [x[0] for x in feature_3], 'count': [x[1] for x in feature_3]})
# feat4 = pd.DataFrame({'rating':4, 'word': [x[0] for x in feature_4], 'count': [x[1] for x in feature_4]})
# feat5 = pd.DataFrame({'rating':5, 'word': [x[0] for x in feature_5], 'count': [x[1] for x in feature_5]})
# all_feat = pd.concat([feat1,feat2,feat3,feat4,feat5])
# all_feat

# LDA


In [154]:
# import gensim

In [155]:
# ratings1 = all_data[all_data['score'] == 1]
# ratings5 = all_data[all_data['score'] == 5]

In [156]:
# ratings1_dict = gensim.corpora.Dictionary(ratings1['tokens'])
# ratings5_dict = gensim.corpora.Dictionary(ratings5['tokens'])

# ratings1_corpus = [ratings1_dict.doc2bow(x) for x in ratings1['tokens']]
# ratings5_corpus = [ratings5_dict.doc2bow(x) for x in ratings5['tokens']]

In [157]:
# # I played around the num of topics lang sorry HAHA lanz originally set this to 10 -yssa

# ratings1_lda = gensim.models.ldamodel.LdaModel(ratings1_corpus, 
#                                                id2word = ratings1_dict, 
#                                                num_topics = 6, 
#                                                random_state = 42)

# ratings5_lda = gensim.models.ldamodel.LdaModel(ratings5_corpus, 
#                                                id2word = ratings5_dict, 
#                                                num_topics = 6, 
#                                                random_state = 42)

In [158]:
# ratings1_lda.print_topics(num_topics = 6,
#                           num_words = 10)

In [159]:
# ratings5_lda.print_topics(num_topics = 6, 
#                           num_words = 10)

In [160]:
# # Sample on a rating 1 review

# sample_review = ratings1['tokens'][2]

# sample_review_bow = ratings1_dict.doc2bow(sample_review)

# # Identify the topic that each review corresponds to the most using max then we can base our pie chart on that ba?

# ratings1_lda.get_document_topics(sample_review_bow)
# # max(ratings1_lda.get_document_topics(sample_review_bow)) -> apply lambda after for all reviews
# # may idea ba kayo how we can get the max in this list? I realized it reads the numbers on the left first (0 to 5) so it always returns the value of topic 5

In [161]:
# #sample on a rating 5 review

# sample_review = ratings5['tokens'][1]

# sample_review_bow = ratings5_dict.doc2bow(sample_review)

# ratings5_lda.get_document_topics(sample_review_bow)

In [162]:
# sample_review_bow['ratings5'].value_counts().idxmax()

# LDA v2

In [163]:
# Additional cleaning

# Hand-picked filler words to remove (added more here -yssa)
select_clean = ['gcash','app','nyo','niyo','ung','yung','lng','lang','daw','raw','din','rin','yan','yun','nag','mag','pag', 'po', 'ba', 'mo','ayaw']

# Keep a token only if it has more than one character and is not a filler word.
all_data['tokens'] = all_data['tokens'].apply(
    lambda words: [w for w in words if len(w) > 1 and w not in select_clean]
)

In [164]:
import gensim

In [165]:
# Separate the 1-star and 5-star reviews, each renumbered from 0.
ratings1 = all_data.loc[all_data['score'] == 1].reset_index(drop=True)
ratings5 = all_data.loc[all_data['score'] == 5].reset_index(drop=True)

In [166]:
# One gensim dictionary per rating group, built from that group's token lists.
ratings1_dict, ratings5_dict = (
    gensim.corpora.Dictionary(frame['tokens']) for frame in (ratings1, ratings5)
)

# Bag-of-words form of every review, encoded with its own group's dictionary.
ratings1_corpus = list(map(ratings1_dict.doc2bow, ratings1['tokens']))
ratings5_corpus = list(map(ratings5_dict.doc2bow, ratings5['tokens']))

## Making the topic models

In [167]:
# Both models share the same settings: four topics and a fixed random_state.
_lda_settings = dict(num_topics = 4, random_state = 42)

ratings1_lda = gensim.models.ldamodel.LdaModel(ratings1_corpus,
                                               id2word = ratings1_dict,
                                               **_lda_settings)

ratings5_lda = gensim.models.ldamodel.LdaModel(ratings5_corpus,
                                               id2word = ratings5_dict,
                                               **_lda_settings)

In [168]:
# Show the ten highest-weighted words of each topic found in the 1-star reviews.
ratings1_lda.print_topics(num_words = 10)

[(0,
  '0.024*"money" + 0.018*"account" + 0.011*"cash" + 0.010*"already" + 0.009*"bank" + 0.009*"even" + 0.009*"mpin" + 0.008*"email" + 0.008*"transaction" + 0.008*"still"'),
 (1,
  '0.021*"always" + 0.020*"update" + 0.017*"use" + 0.015*"fix" + 0.014*"please" + 0.014*"load" + 0.014*"time" + 0.013*"cant" + 0.012*"even" + 0.011*"account"'),
 (2,
  '0.031*"service" + 0.026*"customer" + 0.021*"verified" + 0.019*"account" + 0.016*"card" + 0.014*"verify" + 0.013*"fully" + 0.013*"get" + 0.012*"id" + 0.011*"verification"'),
 (3,
  '0.013*"tapos" + 0.012*"kayo" + 0.011*"code" + 0.011*"update" + 0.009*"load" + 0.007*"kwenta" + 0.006*"mpin" + 0.006*"globe" + 0.006*"cash" + 0.006*"eh"')]

In [169]:
# Show the ten highest-weighted words of each topic found in the 5-star reviews.
ratings5_lda.print_topics(num_words = 10)

[(0,
  '0.028*"best" + 0.014*"account" + 0.013*"globe" + 0.012*"update" + 0.010*"cool" + 0.009*"user" + 0.009*"happy" + 0.008*"paypal" + 0.008*"makes" + 0.007*"cash"'),
 (1,
  '0.294*"good" + 0.123*"useful" + 0.092*"apps" + 0.050*"helpful" + 0.049*"ok" + 0.039*"excellent" + 0.036*"amazing" + 0.019*"service" + 0.016*"job" + 0.011*"satisfied"'),
 (2,
  '0.065*"great" + 0.062*"love" + 0.056*"use" + 0.048*"easy" + 0.028*"thank" + 0.027*"like" + 0.023*"thanks" + 0.021*"convenient" + 0.020*"bills" + 0.016*"free"'),
 (3,
  '0.132*"nice" + 0.046*"load" + 0.044*"money" + 0.030*"convenient" + 0.029*"awesome" + 0.021*"pay" + 0.019*"buy" + 0.015*"bills" + 0.015*"please" + 0.013*"transfer"')]

## Finding optimal number of topics (coherence values)

The following code uses Mallet and was adapted from multiple sources. Do not run it again; the results were k ~ 2 topics for rating-1 reviews and k ~ 4 topics for rating-5 reviews.

In [170]:
# # DO NOT RUN
# # The following code was copied from multiple sources

# import os       #importing os to set environment variable
# def install_java():
#   !apt-get install -y openjdk-8-jdk-headless -qq > /dev/null      #install openjdk
#   os.environ["JAVA_HOME"] = "/usr/lib/jvm/java-8-openjdk-amd64"     #set environment variable
#   !java -version       #check java version
# install_java()


# !wget http://mallet.cs.umass.edu/dist/mallet-2.0.8.zip
# !unzip mallet-2.0.8.zip


# os.environ['MALLET_HOME'] = '/content/mallet-2.0.8'
# mallet_path = '/content/mallet-2.0.8/bin/mallet'


# from gensim.models.coherencemodel import CoherenceModel

In [171]:
# # DO NOT RUN
# def compute_coherence_values(dictionary, corpus, texts, limit, start=2, step=1):

#     """
#     Compute c_v coherence for various number of topics

#     Parameters:
#     ----------
#     dictionary : Gensim dictionary
#     corpus : Gensim corpus
#     texts : List of input texts
#     limit : Max num of topics

#     Returns:
#     -------
#     model_list : List of LDA topic models
#     coherence_values : Coherence values corresponding to the LDA model with respective number of topics
#     """

#     coherence_values = []
#     model_list = []
#     for num_topics in range(start, limit, step):
#         model = gensim.models.wrappers.LdaMallet(mallet_path, corpus=corpus, num_topics=num_topics)
#         model_list.append(model)
#         coherencemodel = CoherenceModel(model=model, texts=texts, dictionary=dictionary, coherence='c_v')
#         coherence_values.append(coherencemodel.get_coherence())

#     return model_list, coherence_values

In [172]:
# # DO NOT RUN
# # WARNING: TAKES TIME TO LOAD
# model_list, coherence_values = compute_coherence_values(ratings1_dict, ratings1_corpus, ratings1['tokens'], start=2, limit=10, step=2)

In [173]:
# # DO NOT RUN

# limit=10; start=2; step=2;
# x = range(start, limit, step)
# plt.plot(x, coherence_values)
# plt.xlabel("Num Topics")
# plt.ylabel("Coherence score")
# plt.legend(("coherence_values"), loc='best')
# plt.show()

In [174]:
# # DO NOT RUN

# for m, cv in zip(x, coherence_values):
#     print("Num Topics =", m, " has Coherence Value of", round(cv, 4))

**Verdict:** For ratings1, # of topics k ~ 2

In [175]:
# # DO NOT RUN

# # WARNING: TAKES TIME TO LOAD
# model_list, coherence_values = compute_coherence_values(ratings5_dict, ratings5_corpus, ratings5['tokens'], start=2, limit=10, step=2)

In [176]:
# # DO NOT RUN

# limit=10; start=2; step=2;
# x = range(start, limit, step)
# plt.plot(x, coherence_values)
# plt.xlabel("Num Topics")
# plt.ylabel("Coherence score")
# plt.legend(("coherence_values"), loc='best')
# plt.show()

In [177]:
# # DO NOT RUN

# for m, cv in zip(x, coherence_values):
#     print("Num Topics =", m, " has Coherence Value of", round(cv, 4))

**Verdict:** For ratings5, # of topics k ~ 4

## Topics to pie charts

In [178]:
# One row per review (content, timestamp, tokens) plus one probability column
# per LDA topic. The topic columns start at 0.0 - a float rather than an int -
# so that writing probabilities into them later does not change their dtype.
ratings1_wtp = ratings1.loc[:,['content','at','tokens']].assign(
    **{'Topic %d' % k: 0.0 for k in range(1, 5)}
)

ratings5_wtp = ratings5.loc[:,['content','at','tokens']].assign(
    **{'Topic %d' % k: 0.0 for k in range(1, 5)}
)

In [179]:
def get_topics(entry, rating): #entry = list, rating = int (1 or 5)
  """Return the LDA topic distribution of one tokenised review.

  entry  -- list of tokens of a single review.
  rating -- 1 selects the rating-1 dictionary and model; any other value
            selects the rating-5 dictionary and model.

  Returns whatever the model's get_document_topics gives for the review's
  bag-of-words (the callers iterate it as (topic id, probability) pairs).
  """
  if rating == 1:
    dictionary, model = ratings1_dict, ratings1_lda
  else:
    dictionary, model = ratings5_dict, ratings5_lda
  return model.get_document_topics(dictionary.doc2bow(entry))

In [180]:
# get_topics(ratings1['tokens'][0],1)

In [181]:
# WARNING: TAKES TIME TO LOAD
# for j in range(len(ratings1)):
#   topics = get_topics(ratings1['tokens'][j],1)
#    for i in topics:
#      ratings1_wtp.iloc[j,3+i[0]] = i[1]

In [182]:
# Write every (topic id, probability) pair of each 1-star review into its
# topic column. Topic columns begin at position 3, after content/at/tokens.
for row in range(len(ratings1)):
  for topic_id, prob in get_topics(ratings1['tokens'][row],1):
    ratings1_wtp.iloc[row, 3 + topic_id] = prob

In [183]:
# I uploaded the dataframe that came from the above code and downloaded it below
!gdown --id 1dFHd-_s0DWO912ASZvkLotFXvhEC6mZg
ratings1_wtp = pd.read_csv('ratings1_wtp.csv')
ratings1_wtp = ratings1_wtp.drop('Unnamed: 0', axis = 1)
ratings1_wtp.head()

Downloading...
From: https://drive.google.com/uc?id=1dFHd-_s0DWO912ASZvkLotFXvhEC6mZg
To: /content/ratings1_wtp.csv
100% 13.8M/13.8M [00:00<00:00, 64.4MB/s]


Unnamed: 0,content,at,tokens,Topic 1,Topic 2,Topic 3,Topic 4
0,i always use cash in and pay bills with gcash ...,2021-04-17 11:05:13,"['always', 'use', 'cash', 'pay', 'bills', 'gsc...",0.021094,0.219607,0.021512,0.737787
1,tang inang verification yan ang arte,2021-04-17 10:36:05,"['tang', 'inang', 'verification', 'arte']",0.683002,0.050853,0.21613,0.050015
2,👎,2021-04-17 10:27:23,[],0.25,0.25,0.25,0.25
3,this is scam when i put my money in save money...,2021-04-17 09:53:18,"['scam', 'put', 'money', 'save', 'money', 'won...",0.028416,0.172247,0.771362,0.027974
4,nangangain ng pera sayang naman ung 200 ko,2021-04-17 09:24:28,"['nangangain', 'pera', 'sayang', '200']",0.645074,0.05233,0.050078,0.252519


In [184]:
# WARNING: TAKES TIME TO LOAD
# for j in range(len(ratings5)):
#   topics = get_topics(ratings5['tokens'][j],1)
#   for i in topics:
#     ratings5_wtp.iloc[j,3+i[0]] = i[1]

In [None]:
# Write every (topic id, probability) pair of each 5-star review into its
# topic column. Topic columns begin at position 3, after content/at/tokens.
# The rating argument must be 5 here: passing 1 would score these reviews with
# the dictionary and LDA model that were fitted on the 1-star reviews.
for j in range(len(ratings5)):
  topics = get_topics(ratings5['tokens'][j],5)
  for topic_id, prob in topics:
    ratings5_wtp.iloc[j, 3 + topic_id] = prob

In [None]:
# I uploaded the dataframe that came from the above code and downloaded it below
!gdown --id 1-5COVjdAQCkwvc72J3huln00Oe2w5Ky4
ratings5_wtp = pd.read_csv('ratings5_wtp.csv')
ratings5_wtp = ratings5_wtp.drop('Unnamed: 0', axis = 1)
ratings5_wtp.head()

Downloading...
From: https://drive.google.com/uc?id=1-5COVjdAQCkwvc72J3huln00Oe2w5Ky4
To: /content/ratings5_wtp.csv
  0% 0.00/16.9M [00:00<?, ?B/s] 56% 9.44M/16.9M [00:00<00:00, 94.2MB/s]100% 16.9M/16.9M [00:00<00:00, 103MB/s] 


Unnamed: 0,content,at,tokens,Topic 1,Topic 2,Topic 3,Topic 4
0,it's so nice app...,2021-04-17 11:06:19,['nice'],0.62473,0.125002,0.125001,0.125268
1,good app,2021-04-17 11:05:15,['good'],0.125017,0.125071,0.125124,0.624788
2,ok,2021-04-17 11:04:56,[],0.25,0.25,0.25,0.25
3,amazing 😍,2021-04-17 10:52:33,['amazing'],0.125007,0.622588,0.125006,0.127399
4,hahaha,2021-04-17 10:52:08,['hahaha'],0.623746,0.125003,0.125002,0.126249


## Sample

In [None]:
# Sum each topic's probability over all 1-star reviews.
r1topicprobs_data = {
    'Topic' : ['1', '2', '3', '4'],
    'Probs' : [ratings1_wtp['Topic ' + number].sum() for number in ['1', '2', '3', '4']],
}

r1topicprobs_df = pd.DataFrame(r1topicprobs_data)
r1topicprobs_df

Unnamed: 0,Topic,Probs
0,1,8222.672424
1,2,10144.115026
2,3,12729.774489
3,4,10020.958749


In [None]:
# Pie chart of the summed topic probabilities for the 1-star reviews.
px.pie(r1topicprobs_df, values = 'Probs', names = 'Topic')

In [None]:
# Sum each topic's probability over all 5-star reviews.
r5topicprobs_data = {
    'Topic' : ['1', '2', '3', '4'],
    'Probs' : [ratings5_wtp['Topic ' + number].sum() for number in ['1', '2', '3', '4']],
}

r5topicprobs_df = pd.DataFrame(r5topicprobs_data)
r5topicprobs_df

Unnamed: 0,Topic,Probs
0,1,25078.756006
1,2,19059.85187
2,3,23590.867011
3,4,43941.366313


In [None]:
# Pie chart of the summed topic probabilities for the 5-star reviews.
px.pie(r5topicprobs_df, values = 'Probs', names = 'Topic')

# Dash

In [None]:
! pip install datetime-truncate
from datetime_truncate import truncate
import plotly.graph_objects as go



In [None]:
# Reduce every review timestamp to the date of the first day of its month.
all_data['month'] = all_data['at'].map(lambda stamp: truncate(stamp, 'month').date())

In [None]:
# features.columns

In [None]:
def see_Hist(word, word2, df=all_data): #df=all_data, word = word to search
  """Plot, per month, how many reviews contain each of up to two words.

  word, word2 -- words to look up in each review's token list; a falsy value
                 (empty string, None) skips that word.
  df          -- frame with a 'tokens' column (list of tokens per review) and
                 a 'month' column. The default is bound to `all_data` as it
                 exists when this cell is run.

  Returns a plotly Figure with one line per given word. When both words are
  falsy the figure is returned empty, with no title or axis labels.
  """
  def _monthly_counts(term):
    # Number of reviews whose tokens include `term`, per month, oldest first.
    contains_term = df['tokens'].apply(lambda tokens: term in tokens).astype(bool)
    hits = df.loc[contains_term, ['tokens', 'month']]
    return (hits.groupby('month')['tokens'].count().reset_index()
                .sort_values('month')
                .rename(columns={'tokens': 'Use Frequency'}))

  graph = go.Figure()
  for term in (word, word2):
    if term:
      month_freq = _monthly_counts(term)
      graph.add_trace(go.Scatter(x=month_freq['month'], y=month_freq['Use Frequency'], name=term))

  if word and word2:
    title = 'Distribution of reviews with "' + word +'" vs reviews with "' + word2 +'"'
  elif word:
    title = 'Distribution of Reviews with "' + word +'"'
  elif word2:
    title = 'Distribution of Reviews with "' + word2 +'"'
  else:
    return graph
  graph.update_layout(title=title,
                   xaxis_title='Month',
                   yaxis_title='Use Count')

  return graph
  

In [None]:
def filter_top_features(time1, time2, df = all_data): #df - filterdate- postag- filter - grouby - filter - return #time1 = (yr,month)
  """Rank, per star rating, the frequent words shared by all five ratings.

  time1, time2 -- inclusive (year, month) bounds of the reviews to use.
  df           -- frame with 'score', 'content' and 'month' columns; the
                  default is bound to `all_data` as it exists when this cell
                  is run.

  For each rating the reviews in the window are pooled, tokenized, stripped of
  stop words and reduced to nouns, verbs and adjectives (by POS tag). The words
  that appear in the 100 most frequent words of every rating are kept, and the
  returned frame lists them per rating in descending frequency:
  columns 'rank', 'rated_1' ... 'rated_5'. If any rating has no review in the
  window there are no shared words, and an empty frame is returned.
  """
  wanted_tags = {'NN','NNP','NNPS','NNS','VB','VBG','VBD','VBN','VBP','VBZ','JJ','JJR','JJS'}
  scores = range(1, 6)

  in_window = df['month'].apply(lambda x: time1 <= (x.year, x.month) <= time2).astype(bool)
  window = df.loc[in_window, ['score', 'content']]

  # score -> that rating's words, most frequent first
  ranked = {}
  for score, reviews in window.groupby('score')['content']:
    # Raw string so that \W reaches the regex engine as written.
    tokens = word_tokenize(re.sub(r'(\W+)', " ", ' '.join(reviews).lower()))
    tokens = [w for w in tokens if w not in stopwords_list]
    kept = [w for w, tag in pos_tag(tokens) if tag in wanted_tags]
    ranked[score] = [w for w, _ in FreqDist(kept).most_common()]

  per_score = [ranked.get(score, []) for score in scores]
  shared = set(per_score[0][:100]).intersection(*(words[:100] for words in per_score[1:]))

  # The original called `filter(f1, f_intersection)`, which in this notebook
  # resolves to the builtin and cannot work (wrong argument order, and the
  # result has no len()). Presumably a helper of that name used to keep only
  # the shared words, in frequency order - that is what is done here.
  data = {'rank': list(range(1, len(shared) + 1))}
  for score, words in zip(scores, per_score):
    data['rated_%d' % score] = [w for w in words if w in shared]

  features = pd.DataFrame(data)

  return features

In [None]:
def pie_maker(value, rating):
  """Build the topic pie chart for one rating over a slider-selected period.

  value  -- [start, end] slider offsets, each counted in months from the first
            month of data (`first_mo`); both ends are inclusive.
  rating -- 1 uses `ratings1_wtp`, 5 uses `ratings5_wtp`.

  Returns a plotly pie figure of the summed probability of each of the four
  topics over the reviews in the period. Raises ValueError for other ratings.
  """
  if rating == 1:
    topics_df = ratings1_wtp
  elif rating == 5:
    topics_df = ratings5_wtp
  else:
    raise ValueError('rating must be 1 or 5, got %r' % (rating,))

  # Work in "months since year 0" so that an offset carries over year ends
  # correctly; adding (offset // 12, offset % 12) to (year, month) can yield a
  # month number above 12.
  base = int(first_mo[0]) * 12 + int(first_mo[1]) - 1
  start, end = base + int(value[0]), base + int(value[1])
  stamps = pd.to_datetime(topics_df['at'])
  month_index = stamps.dt.year * 12 + stamps.dt.month - 1
  selected = topics_df[month_index.between(start, end)]

  totals = pd.DataFrame({'Topic' : ['1', '2', '3', '4'],
                         'Probs' : [selected['Topic ' + number].sum()
                                    for number in ['1', '2', '3', '4']]})
  pie = px.pie(totals,
       names = 'Topic',
       values = 'Probs')
  return pie

In [None]:
#slider stuff
# (year, month) of the earliest and latest review months
_earliest, _latest = all_data['month'].min(), all_data['month'].max()
first_mo = np.array((_earliest.year, _earliest.month))
# Span of the data as (years, months) and as a single month count.
time_length = np.array((_latest.year, _latest.month)) - first_mo
tot_mo = time_length[0]*12 + time_length[1]

In [None]:
# features

In [None]:
!pip install jupyter-dash



In [None]:
import plotly.express as px
from jupyter_dash import JupyterDash
import dash_core_components as dcc
import dash_html_components as html
from dash.dependencies import Input, Output, State
import dash_table
import calendar

In [None]:
# #test value = [0,109] zero to last month
# def test_table(value):
#     time1 = tuple(first_mo + np.array((value[0]//12, value[0]%12)))
#     time2 = tuple(first_mo + np.array((value[1]//12, value[1]%12)))
#     return filter_top_features(time1,time2)

# test_feature = test_table([80,100])

In [None]:
app = JupyterDash(__name__)

markdown_text = '''
## Top Words and their History

Below are the most frequently used words under each rating for the GCash reviews dataset. Click on a cell or type a word on the search bar to find out more!
You can also move around the slider to change the timeframe of the table!
'''

# Page layout, top to bottom: intro text, month-range slider with its caption,
# top-words table, two word search boxes with the history graph, and the
# rating selector with the topic pie chart.
app.layout = html.Div(
    [
        html.Div([
            dcc.Markdown(markdown_text),
            dcc.RangeSlider(
                id='my-range-slider',
                min=0,
                max=tot_mo,
                step=1,
                # Start with the whole period selected, whatever its length.
                value=[0, int(tot_mo)]
            ),
            html.Div(id='date-range-slider'),
            html.Br(),
            dash_table.DataTable(
                id='table',
                columns=[{"name": i, "id": i} for i in ['rank', 'rated_1', 'rated_2', 'rated_3', 'rated_4', 'rated_5']],
                editable=True,
                # data=features.to_dict('records'),
                # Column index 1 is 'rated_1' (index 0 is 'rank'), so the
                # column_id must name that same column.
                active_cell={'row': 4, 'column': 1, 'column_id': 'rated_1'},
                fixed_rows={'headers': True},
                style_table={'height': 300}),
        ]),
        html.H4("Word History"),
        html.Div(["Search: ",
                  dcc.Input(id='search_word', value='verify', type='text')]),
        html.Br(),
        html.Div(["Search: ",
                  dcc.Input(id='search_word2', value='verify', type='text')]),
        dcc.Graph(id='graph'),
        dcc.RadioItems(
            id='pie_choice',
            options=[
                {'label': 'Rating: 1', 'value': 1},
                {'label': 'Rating: 5', 'value': 5},
            ],
            value=1,
            labelStyle={'display': 'inline-block'}
        ),
        dcc.Graph(id='pie'),
    ],
    style={'margin-left': '5%',
           'margin-right': '5%',
           },
)



def _slider_to_year_month(offset):
    """Convert a slider offset (months after `first_mo`) to a (year, month) tuple.

    The arithmetic is done on a single month count so that the month always
    stays within 1-12; adding (offset // 12, offset % 12) to (year, month)
    directly can produce month numbers above 12.
    """
    months = int(first_mo[0]) * 12 + int(first_mo[1]) - 1 + int(offset)
    return (months // 12, months % 12 + 1)


@app.callback(
    Output('date-range-slider', 'children'),
    [Input('my-range-slider', 'value')]
    )
def update_slider(value):
    """Describe the slider's selected period as text, e.g. 'March 2019'."""
    time1 = _slider_to_year_month(value[0])
    time2 = _slider_to_year_month(value[1])
    text1 = str(calendar.month_name[time1[1]]) + " " + str(time1[0])
    text2 = str(calendar.month_name[time2[1]]) + " " + str(time2[0])
    return 'Here are the top words from %s to %s'%(text1, text2)

@app.callback(
    Output("table", "data"),
    Input('my-range-slider', 'value')
    )
def update_table(value):
    """Recompute the top-words table for the slider's selected period."""
    time1 = _slider_to_year_month(value[0])
    time2 = _slider_to_year_month(value[1])
    ftable = filter_top_features(time1,time2).to_dict('records')
    return ftable

##________


@app.callback(
    Output('search_word', 'value'),
    [Input("table", "active_cell"),
     Input("table", "data")]
)
def update_search_value(active_cell, data):
    """Copy the word in the table's active cell into the first search box.

    The search box is left untouched when no cell is active, the table has no
    data, the active row lies beyond the current data, or the active column is
    'rank' (column 0), which holds no word.
    """
    from dash.exceptions import PreventUpdate

    if not active_cell or not data:
        raise PreventUpdate
    row, column = active_cell['row'], active_cell['column']
    if column < 1 or row >= len(data):
        raise PreventUpdate
    # Table columns are rank, rated_1 ... rated_5, so index N maps to 'rated_N'.
    return str(data[row]['rated_' + str(column)])

@app.callback(
    Output('graph', 'figure'),
    [Input("search_word", "value"), Input("search_word2",'value')]
)
def update_figure(search_word,search_word2):
    """Redraw the word-history chart whenever either search box changes."""
    history = see_Hist(search_word, search_word2)
    return history

@app.callback(
    Output('pie', 'figure'),
    [Input('my-range-slider', 'value'),
     Input('pie_choice', 'value')]
)
def update_pie(value, pie_choice):
    """Redraw the topic pie chart when the period or the chosen rating changes.

    Named update_pie so that it no longer redefines the `update_figure`
    callback that drives the word-history graph.
    """
    return pie_maker(value, pie_choice)


# Launch the Dash app with the layout and callbacks registered above, using
# the default run_server arguments.
app.run_server()
# Alternative kept for reference: pass mode='inline' to run_server instead.
# app.run_server(mode='inline')

Dash app running on:


<IPython.core.display.Javascript object>

# Failed Attempts

In [None]:
# app = JupyterDash(__name__)
# app.layout = html.Div(children=[
#     html.Div([
#         dcc.Dropdown(id='dropdown',
#         options=[{'label': 'Rating 1', 'value': '1'},
#                  {'label': 'Rating 2', 'value': '2'},
#                  {'label': 'Rating 3', 'value': '3'},
#                  {'label': 'Rating 4', 'value': '4'},
#                  {'label': 'Rating 5', 'value': '5'}],
#         value='1',
#         ),
#         dash_table.DataTable(id='data-table'
#             )],
#             style={'margin-left': '5%',
#                    'margin-right': '5%'},
#             ),
#     html.H1("Word History"),
#     html.Div(["Search: ",
#               dcc.Input(id='search_word', value='verify', type='text')]),
#     dcc.Graph(id='graph'),
    
# ])




# @app.callback(
#     Output('data-table', 'row'),
#     [Input('dropdown', 'value')]
# )

# def update_rows(selected_value):
#     data = all_feat[all_feat['rating'] == selected_value]
#     print(data)
#     columns = [{"name": i, "id": i} for i in data.columns]
#     return dash_table.DataTable(data=data, columns=columns)


# @app.callback(
#     Output('graph', 'figure'),
#     [Input("search_word", "value")]
# )
# def update_figure(search_word):
#     return see_Hist(all_data,search_word)



# app.run_server()
# # app.run_server(mode='inline')

In [None]:
# app = JupyterDash(__name__)
# app.layout = html.Div([
#         dcc.Dropdown(id='dropdown',
#         options=[{'label': 'Rating 1', 'value': '1'},
#                  {'label': 'Rating 2', 'value': '2'},
#                  {'label': 'Rating 3', 'value': '3'},
#                  {'label': 'Rating 4', 'value': '4'},
#                  {'label': 'Rating 5', 'value': '5'}],
#         value='1',
#         ),
#         html.Div(id='datatable',style={'margin-left': '5%', 'margin-right': '5%'},
#             )
#         ])


# @app.callback(
#     Output('datatable', 'children'),
#     [Input('dropdown', 'value')]
# )
# def update_output_div(selected_value):
#     data = all_feat[all_feat['rating'] == selected_value]
#     columns = [{"name": i, "id": i} for i in data.columns]
#     return dash_table.DataTable(data)

# app.run_server()

In [None]:
# app = JupyterDash(__name__)

# markdown_text = '''
# ## Top Words and their History

# Below are the most frequently used words under each rating for the GCash reviews dataset. Click on a cell or type a word on the search bar to find out more!
# You can also move around the slider to change the timeframe of the table!
# '''
# app.layout = html.Div([dcc.Markdown(markdown_text),
#               html.Div([
#                         html.Div([dcc.Graph(id="pie", style={'width': '40%', 'display': 'inline-block'}),
#                                   html.Div([
#                                             dcc.RangeSlider(
#                                                             id='my-range-slider',
#                                                             min=0,
#                                                             max=tot_mo,
#                                                             step=1,
#                                                             value=[0, 109]
#                                                         ),
#                                             html.Div(id='date-range-slider'),
#                                             html.Br(),
#                                             dash_table.DataTable(
#                                                                   id='table',
#                                                                   columns=[{"name": i, "id": i} for i in ['rank', 'rated_1', 'rated_2', 'rated_3', 'rated_4', 'rated_5']],
#                                                                   editable=True,
#                                                                   # data=features.to_dict('records'),
#                                                                   active_cell={'row': 4, 'column': 1, 'column_id': 'rated_2'},
#                                                                   fixed_rows={'headers': True},
#                                                                   style_table={'height': 300})],
#                                                                   style={'width': '40%', 'align': 'right', 'display': 'inline-block'}),
#                                   ]),
#                         ]),
#               html.H4("Word History"),
#               html.Div(["Search: ",
#                         dcc.Input(id='search_word', value='verify', type='text')]),
#               html.Br(),
#               html.Div(["Search: ",
#                         dcc.Input(id='search_word2', value='verify', type='text')]),
#               dcc.Graph(id='graph'),
#                       ])



# @app.callback(
#     Output('date-range-slider', 'children'),
#     [Input('my-range-slider', 'value')]
#     )
# def update_slider(value):
#     time1 = tuple(first_mo + np.array((value[0]//12, value[0]%12)))
#     time2 = tuple(first_mo + np.array((value[1]//12, value[1]%12)))
#     text1 = str(calendar.month_name[time1[1]]) + " " + str(time1[0])
#     text2 = str(calendar.month_name[time2[1]]) + " " + str(time2[0])
#     return 'Here are the top words from %s to %s'%(text1, text2)

# @app.callback(
#     Output("table", "data"),
#     Input('my-range-slider', 'value')
#     )
# def update_table(value):
#     time1 = tuple(first_mo + np.array((value[0]//12, value[0]%12)))
#     time2 = tuple(first_mo + np.array((value[1]//12, value[1]%12)))
#     ftable = filter_top_features(time1,time2).to_dict('records')
#     return ftable

# ##________


# @app.callback(
#     Output('search_word', 'value'),
#     [Input("table", "active_cell"),
#      Input("table", "data")]
# )
# def update_search_value(active_cell, data):
#     # return str(active_cell)
#     # return str(features.iloc[active_cell['row'],active_cell['column']])
#     return str(data[active_cell['row']]['rated_' + str(active_cell['column'])])

# @app.callback(
#     Output('graph', 'figure'),
#     [Input("search_word", "value"),
#      Input("search_word2",'value')]
# )
# def update_figure(search_word,search_word2):
#     return see_Hist(search_word,search_word2)

# @app.callback(
#     Output('pie', 'figure'),
#     [Input('my-range-slider', 'value')]
# )
# def update_figure(value):
#     return pie_maker(value, 1)


# app.run_server()
# # app.run_server(mode='inline')