In [58]:
!pip install vaderSentiment

import numpy as np 
import pandas as pd 
import re
import nltk
from nltk.stem import WordNetLemmatizer, snowball
from nltk.stem.porter import *
from nltk import word_tokenize
from nltk.tag import pos_tag
import gensim
from gensim.utils import simple_preprocess
from gensim.parsing.preprocessing import STOPWORDS
from gensim import corpora, models
from gensim.models import word2vec
import seaborn as sns
from pprint import pprint
import statsmodels.formula.api as smf

from sklearn.manifold import TSNE
import matplotlib.pyplot as plt
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer

nltk.download('wordnet')
%matplotlib inline

In [2]:
# Load comments.csv into a DataFrame; later cells read its 'comment_body' column.
df = pd.read_csv('../input/covid19-vaccine-news-reddit-discussions/comments.csv')
df.head()

## 2. Topic Modeling 

#### 2.1.1 Topic Modeling in Verb

In [3]:
""" Testing the function
# SnowballStemmer Example
#the stemmer requires a language parameter
snow_stemmer = snowball.SnowballStemmer(language='english')
  
#list of tokenized words
words = ['cared','university','fairly','easily','singing',
       'sings','sung','singer','sportingly']
  
#stem's of each word
stem_words = []
for w in words:
    x = snow_stemmer.stem(w)
    stem_words.append(x)
      
#print stemming results
for e1,e2 in zip(words,stem_words):
    print(e1+' ----> '+e2)

# SnowballStemmer trial
snow_stemmer = snowball.SnowballStemmer(language='english')
snow_stemmer.stem('post')  

# Creating function trial
result = []
i = 0
for token in gensim.utils.simple_preprocess(df['comment_body'][0]):
    print(f"{i}'th term'")
    if token not in gensim.parsing.preprocessing.STOPWORDS and len(token) > 3:
        print(token)
        token = WordNetLemmatizer().lemmatize(token, pos='v')
        print(token)
        snow_stemmer = snowball.SnowballStemmer(language='english')
        result.append(snow_stemmer.stem(token))
    i +=1
print(result)"""

In [4]:
def lemmatize_stemming_verb(text):
    """Return the English Snowball stem of the WordNet verb lemma of `text`."""
    verb_lemma = WordNetLemmatizer().lemmatize(text, pos='v')
    english_stemmer = snowball.SnowballStemmer(language='english')
    return english_stemmer.stem(verb_lemma)
def preprocess_verb(text):
    """Tokenize `text` with gensim and return the verb-normalised tokens.

    Tokens that are gensim stop words or have 3 or fewer characters are
    skipped; every remaining token goes through lemmatize_stemming_verb.
    """
    stop_words = gensim.parsing.preprocessing.STOPWORDS
    return [
        lemmatize_stemming_verb(word)
        for word in gensim.utils.simple_preprocess(text)
        if len(word) > 3 and word not in stop_words
    ]

In [5]:
# Turn every comment into a list of verb-normalised tokens, then preview the first ten.
verb_processed_docs = df['comment_body'].map(preprocess_verb)
verb_processed_docs[:10]

In [6]:
dictionary = gensim.corpora.Dictionary(verb_processed_docs)
# Sanity check: print the first 11 (id, token) pairs of the vocabulary.
for shown, (token_id, token) in enumerate(dictionary.iteritems(), start=1):
    print(token_id, token)
    if shown > 10:
        break

In [7]:
# Prune the vocabulary in place: drop tokens that occur in fewer than 15 documents or in more
# than 50% of the documents, then keep at most the 5000 most frequent remaining tokens.
dictionary.filter_extremes(no_below=15, no_above=0.5, keep_n=5000)
# Encode each tokenised comment as a bag-of-words vector over the pruned vocabulary.
bow_corpus = [dictionary.doc2bow(doc) for doc in verb_processed_docs]

In [8]:
# Fit a tf-idf model on the bag-of-words corpus and wrap the corpus with it.
tfidf = models.TfidfModel(bow_corpus)
corpus_tfidf = tfidf[bow_corpus]
# Pretty-print only the first tf-idf weighted document as a sanity check.
for doc in corpus_tfidf:
    pprint(doc)
    break

In [9]:
# Plain LDA on bag-of-words counts with 10 topics.
# random_state pins the random initialisation so topic numbering is repeatable between runs
# (multi-worker training may still introduce small differences — TODO confirm).
lda_model = gensim.models.LdaMulticore(bow_corpus, num_topics=10, id2word=dictionary,
                                       passes=2, workers=2, random_state=42)
for idx, topic in lda_model.print_topics(-1):
    # label fixed: it used to print "Word)s:"
    print(f'Topic: {idx} \nWords: {topic}')

Topic modeling with the plain LDA model yields interesting results. Although 'vaccine' appears redundantly across topics (it is the most frequent word in the comments), the topics are reasonably well separated, and some of them can even be identified just by reading their top words. Since a few topics are still redundant, we reduce the number of topics and check whether they stay well separated without the redundancy.

In [10]:
# Same plain LDA, reduced to 6 topics to check whether redundant topics disappear.
# random_state pins the random initialisation so the run can be repeated
# (multi-worker training may still introduce small differences — TODO confirm).
lda_model = gensim.models.LdaMulticore(bow_corpus, num_topics=6, id2word=dictionary,
                                       passes=2, workers=2, random_state=42)
for idx, topic in lda_model.print_topics(-1):
    # label fixed: it used to print "Word)s:"
    print(f'Topic: {idx} \nWords: {topic}')

The LDA model with 'num_topics = 6' is at least less redundant, but the words in each topic are now less clear than in the model with 'num_topics = 10'. Next we apply tf-idf weighting before LDA and see whether the result differs from the plain LDA model.

In [11]:
# LDA trained on the tf-idf weighted corpus, 10 topics.
# random_state pins the random initialisation so the comparison with the plain model can be
# repeated (multi-worker training may still introduce small differences — TODO confirm).
lda_model_tfidf = gensim.models.LdaMulticore(corpus_tfidf, num_topics=10, id2word=dictionary,
                                             passes=2, workers=4, random_state=42)
for idx, topic in lda_model_tfidf.print_topics(-1):
    print(f'Topic: {idx} Word: {topic}')

In [12]:
# LDA trained on the tf-idf weighted corpus, reduced to 6 topics.
# random_state pins the random initialisation so the run can be repeated
# (multi-worker training may still introduce small differences — TODO confirm).
lda_model_tfidf = gensim.models.LdaMulticore(corpus_tfidf, num_topics=6, id2word=dictionary,
                                             passes=2, workers=4, random_state=42)
for idx, topic in lda_model_tfidf.print_topics(-1):
    print(f'Topic: {idx} Word: {topic}')

With tf-idf applied before LDA, the weight of 'vaccine' is dramatically reduced. However, frequently used but uninformative words such as 'sure', 'reddit', and 'yeah' now show up in the topics. The tf-idf variant has both strengths and weaknesses, but we decide to use the plain LDA model for further study because its topic words carry more information.

#### 2.1.2 Topic Modeling in Adjective

In [13]:
def lemmatize_stemming_adj(text):
    """Return the English Snowball stem of the WordNet adjective lemma of `text`."""
    adjective_lemma = WordNetLemmatizer().lemmatize(text, pos='a')
    stemmer = snowball.SnowballStemmer(language='english')
    return stemmer.stem(adjective_lemma)
def preprocess_adj(text):
    """Tokenize `text` with gensim and return the adjective-normalised tokens.

    Gensim stop words and tokens of 3 or fewer characters are dropped; the
    rest are passed through lemmatize_stemming_adj.
    """
    kept = []
    for word in gensim.utils.simple_preprocess(text):
        if len(word) <= 3 or word in gensim.parsing.preprocessing.STOPWORDS:
            continue
        kept.append(lemmatize_stemming_adj(word))
    return kept

In [14]:
# Turn every comment into a list of adjective-normalised tokens, then preview the first ten.
adj_processed_docs = df['comment_body'].map(preprocess_adj)
adj_processed_docs[:10]

In [15]:
dictionary = gensim.corpora.Dictionary(adj_processed_docs)
# Sanity check: print the first 11 (id, token) pairs of the vocabulary.
for shown, (token_id, token) in enumerate(dictionary.iteritems(), start=1):
    print(token_id, token)
    if shown > 10:
        break

In [16]:
# Prune the vocabulary in place: drop tokens that occur in fewer than 15 documents or in more
# than 50% of the documents, then keep at most the 5000 most frequent remaining tokens.
dictionary.filter_extremes(no_below=15, no_above=0.5, keep_n=5000)
# Rebuild the bag-of-words corpus from the adjective-normalised documents.
bow_corpus = [dictionary.doc2bow(doc) for doc in adj_processed_docs]

In [17]:
# LDA on the adjective-normalised corpus, 10 topics.
# random_state pins the random initialisation so the run can be repeated
# (multi-worker training may still introduce small differences — TODO confirm).
lda_model = gensim.models.LdaMulticore(bow_corpus, num_topics=10, id2word=dictionary,
                                       passes=2, workers=2, random_state=42)
for idx, topic in lda_model.print_topics(-1):
    # label fixed: it used to print "Word)s:"
    print(f'Topic: {idx} \nWords: {topic}')

In [18]:
# LDA on the adjective-normalised corpus, reduced to 5 topics.
# random_state pins the random initialisation so the run can be repeated
# (multi-worker training may still introduce small differences — TODO confirm).
lda_model = gensim.models.LdaMulticore(bow_corpus, num_topics=5, id2word=dictionary,
                                       passes=2, workers=2, random_state=42)
for idx, topic in lda_model.print_topics(-1):
    # label fixed: it used to print "Word)s:"
    print(f'Topic: {idx} \nWords: {topic}')

Running the LDA model with adjective lemmatization only gave poor results. The likely reasons are: 1) the lemmatization was not perfect, so we still observe verbs although we lemmatized for adjectives, and 2) adjectives are used less frequently than some dominant words (e.g. vaccine, dose, trial), so few adjectives surface in the topics. Let's see whether the same applies to nouns and adverbs as well.

#### 2.1.3 Topic Modeling in Noun

In [19]:
def lemmatize_stemming_noun(text):
    """Return the English Snowball stem of the WordNet noun lemma of `text`."""
    noun_lemma = WordNetLemmatizer().lemmatize(text, pos='n')
    return snowball.SnowballStemmer(language='english').stem(noun_lemma)
def preprocess_noun(text):
    """Tokenize `text` with gensim and return the noun-normalised tokens.

    A token is kept only if it is longer than 3 characters and is not a
    gensim stop word; kept tokens go through lemmatize_stemming_noun.
    """
    def _is_content_word(word):
        return word not in gensim.parsing.preprocessing.STOPWORDS and len(word) > 3

    candidates = gensim.utils.simple_preprocess(text)
    return [lemmatize_stemming_noun(word) for word in candidates if _is_content_word(word)]

In [20]:
# Turn every comment into a list of noun-normalised tokens, then preview the first ten.
noun_processed_docs = df['comment_body'].map(preprocess_noun)
noun_processed_docs[:10]

In [21]:
dictionary = gensim.corpora.Dictionary(noun_processed_docs)
# Sanity check: print the first 11 (id, token) pairs of the vocabulary.
for shown, (token_id, token) in enumerate(dictionary.iteritems(), start=1):
    print(token_id, token)
    if shown > 10:
        break

In [22]:
# Prune the vocabulary in place: drop tokens that occur in fewer than 15 documents or in more
# than 50% of the documents, then keep at most the 5000 most frequent remaining tokens.
dictionary.filter_extremes(no_below=15, no_above=0.5, keep_n=5000)
# Rebuild the bag-of-words corpus from the noun-normalised documents.
bow_corpus = [dictionary.doc2bow(doc) for doc in noun_processed_docs]

In [23]:
# LDA on the noun-normalised corpus, 10 topics.
# random_state pins the random initialisation so the run can be repeated
# (multi-worker training may still introduce small differences — TODO confirm).
lda_model = gensim.models.LdaMulticore(bow_corpus, num_topics=10, id2word=dictionary,
                                       passes=2, workers=2, random_state=42)
for idx, topic in lda_model.print_topics(-1):
    # label fixed: it used to print "Word)s:"
    print(f'Topic: {idx} \nWords: {topic}')

In [24]:
# LDA on the noun-normalised corpus, reduced to 6 topics.
# random_state pins the random initialisation so the run can be repeated
# (multi-worker training may still introduce small differences — TODO confirm).
lda_model = gensim.models.LdaMulticore(bow_corpus, num_topics=6, id2word=dictionary,
                                       passes=2, workers=2, random_state=42)
for idx, topic in lda_model.print_topics(-1):
    # label fixed: it used to print "Word)s:"
    print(f'Topic: {idx} \nWords: {topic}')

Using only the noun tag gives less information about the topics. Since most of the words are nouns, the topics read like random sequences of words. Next we check whether adverbs give a meaningful result.

#### 2.1.4 Topic Modeling in Adverb

In [25]:
def lemmatize_stemming_adv(text):
    """Return the English Snowball stem of the WordNet adverb lemma of `text`."""
    lemmatizer = WordNetLemmatizer()
    stemmer = snowball.SnowballStemmer(language='english')
    adverb_lemma = lemmatizer.lemmatize(text, pos='r')
    return stemmer.stem(adverb_lemma)
def preprocess_adv(text):
    """Tokenize `text` with gensim and return the adverb-normalised tokens.

    Gensim stop words and tokens of 3 or fewer characters are filtered out
    before lemmatize_stemming_adv is applied.
    """
    stop_words = gensim.parsing.preprocessing.STOPWORDS
    content_words = filter(
        lambda word: word not in stop_words and len(word) > 3,
        gensim.utils.simple_preprocess(text),
    )
    return [lemmatize_stemming_adv(word) for word in content_words]

In [26]:
# Turn every comment into a list of adverb-normalised tokens, then preview the first ten.
adv_processed_docs = df['comment_body'].map(preprocess_adv)
adv_processed_docs[:10]

In [27]:
dictionary = gensim.corpora.Dictionary(adv_processed_docs)
# Sanity check: print the first 11 (id, token) pairs of the vocabulary.
for shown, (token_id, token) in enumerate(dictionary.iteritems(), start=1):
    print(token_id, token)
    if shown > 10:
        break

In [28]:
# Prune the vocabulary in place: drop tokens that occur in fewer than 15 documents or in more
# than 50% of the documents, then keep at most the 5000 most frequent remaining tokens.
dictionary.filter_extremes(no_below=15, no_above=0.5, keep_n=5000)
# Rebuild the bag-of-words corpus from the adverb-normalised documents.
bow_corpus = [dictionary.doc2bow(doc) for doc in adv_processed_docs]

In [29]:
# LDA on the adverb-normalised corpus, 10 topics.
# random_state pins the random initialisation so the run can be repeated
# (multi-worker training may still introduce small differences — TODO confirm).
lda_model = gensim.models.LdaMulticore(bow_corpus, num_topics=10, id2word=dictionary,
                                       passes=2, workers=2, random_state=42)
for idx, topic in lda_model.print_topics(-1):
    # label fixed: it used to print "Word)s:"
    print(f'Topic: {idx} \nWords: {topic}')

In [30]:
# LDA on the adverb-normalised corpus, reduced to 6 topics.
# random_state pins the random initialisation so the run can be repeated
# (multi-worker training may still introduce small differences — TODO confirm).
lda_model = gensim.models.LdaMulticore(bow_corpus, num_topics=6, id2word=dictionary,
                                       passes=2, workers=2, random_state=42)
for idx, topic in lda_model.print_topics(-1):
    # label fixed: it used to print "Word)s:"
    print(f'Topic: {idx} \nWords: {topic}')

For adverbs, as for adjectives, the plain LDA model with adverb-only lemmatization gave poor results; the lemmatizer has not identified adverbs precisely. A combination of POS tags might give a better result, so we run the LDA model one last time with nouns, verbs, and adjectives together. Before running the model, we eliminate words that appear often but are not really necessary, such as "reddit", "https", "comment", and "wiki", as well as foul language and informal words such as "fuck", "boof", and "legit".

In [31]:
def lemmatize_verb_adj_noun(sentence):
    """Normalise a token according to its NLTK part-of-speech tag.

    The input is tokenized and POS-tagged, and only the FIRST tagged token is
    used (this notebook calls the function with single words):

    * tag starts with "NN": WordNet noun lemma (not stemmed)
    * tag starts with "VB": WordNet verb lemma, then English Snowball stem
    * tag starts with "JJ": WordNet adjective lemma, then English Snowball stem
    * any other tag:        the token unchanged

    Returns:
        str: the normalised first token. If tokenization yields no tokens,
        `sentence` is returned unchanged (the loop-based version fell through
        and returned None in that case).
    """
    tagged = pos_tag(word_tokenize(sentence))
    if not tagged:
        return sentence

    # The former `for` loop returned during its first iteration on every branch,
    # so taking the first pair explicitly is the same behaviour, made visible.
    word, tag = tagged[0]
    wnl = WordNetLemmatizer()
    if tag.startswith("NN"):
        return wnl.lemmatize(word, pos='n')
    if tag.startswith('VB'):
        lemma = wnl.lemmatize(word, pos='v')
    elif tag.startswith('JJ'):
        lemma = wnl.lemmatize(word, pos='a')
    else:
        return word
    return snowball.SnowballStemmer(language='english').stem(lemma)

def preprocess_verb_adj_noun(text):
    """Tokenize `text` with gensim and return POS-aware normalised tokens.

    Gensim stop words and tokens of 3 or fewer characters are dropped; each
    remaining token is normalised by lemmatize_verb_adj_noun.
    """
    stop_words = gensim.parsing.preprocessing.STOPWORDS
    return [
        lemmatize_verb_adj_noun(word)
        for word in gensim.utils.simple_preprocess(text)
        if len(word) > 3 and word not in stop_words
    ]

In [32]:
# Turn every comment into a list of POS-aware normalised tokens, then preview the first ten.
verb_adj_noun_processed_docs = df['comment_body'].map(preprocess_verb_adj_noun)
verb_adj_noun_processed_docs[:10]

In [33]:
# Drop noise terms (site boilerplate, URL fragments, profanity, over-frequent words).
# NOTE(review): 'vaccin', 'delet' and 'remov' are stemmed forms, while lemmatize_verb_adj_noun
# does not stem noun-tagged tokens — e.g. a noun-tagged 'vaccine' would not match; confirm intent.
noise_terms = {
    "reddit", 'subreddit', 'vaccin', 'huuuuuuuge', "https", "http",
    "comment", "wiki", "fuck", "boof", "legit", 'post', 'question',
    'discussion', "wiki_rule_", "delet", "remov", "like",
}
verb_adj_noun_processed_docs = [
    [term for term in doc if term not in noise_terms]
    for doc in verb_adj_noun_processed_docs
]


In [34]:
dictionary = gensim.corpora.Dictionary(verb_adj_noun_processed_docs)
# Sanity check: print the first 11 (id, token) pairs of the vocabulary.
for shown, (token_id, token) in enumerate(dictionary.iteritems(), start=1):
    print(token_id, token)
    if shown > 10:
        break

In [35]:
# Prune the vocabulary in place: drop tokens that occur in fewer than 15 documents or in more
# than 50% of the documents, then keep at most the 5000 most frequent remaining tokens.
dictionary.filter_extremes(no_below=15, no_above=0.5, keep_n=5000)
# Final bag-of-words corpus; the dominant-topic cells further down read this variable.
bow_corpus = [dictionary.doc2bow(doc) for doc in verb_adj_noun_processed_docs]

In [36]:
# Final LDA model (4 topics); the topic/sentiment cells below refer to its topics by number.
# random_state pins the random initialisation so that "topic 0", "topic 1", ... mean the same
# thing on every run (multi-worker training may still introduce small differences — TODO confirm).
lda_model = gensim.models.LdaMulticore(bow_corpus, num_topics=4, id2word=dictionary,
                                       passes=2, workers=2, random_state=42)
for idx, topic in lda_model.print_topics(-1):
    # label fixed: it used to print "Word)s:"
    print(f'Topic: {idx} \nWords: {topic}')

Some of the topics are hard to identify because many words are shared among the topics, but we can still distinguish the topics by their other keywords. Since we have eliminated a few words that are insignificant or detrimental, we assume this model yields the best result. As for the number of topics: the subject matter is fairly narrow (it is all about covid and vaccination, so the topics are not as wide-ranging as those of an LDA on random text), so we reduce num_topics to 4.

## 2.2 Sentiment Analysis

In [37]:
# English stop words only, stored as a set so membership tests in clean_sentence are O(1).
# Calling words() without a language returned the stop-word lists of every language NLTK
# ships, so English words that are stop words elsewhere (e.g. German "die") were removed too.
STOP_WORDS = set(nltk.corpus.stopwords.words('english'))

def clean_sentence(val):
    """Strip punctuation, lowercase, and remove stop words from one comment.

    Every character that is not whitespace, a word character, '?' or '!' is
    deleted (underscores are deleted as well). The text is then lowercased,
    split on single spaces, tokens found in STOP_WORDS are dropped, and the
    remaining tokens are joined back with single spaces.

    Args:
        val (str): raw comment text.

    Returns:
        str: the cleaned comment.
    """
    # Raw string: '\s' and '\w' are regex classes, not Python string escapes.
    stripped = re.sub(r'([^\s\w?!]|_)', '', val).lower()
    kept_words = [word for word in stripped.split(" ") if word not in STOP_WORDS]
    return " ".join(kept_words)

def clean_dataframe(data):
    """Drop rows containing any NaN, then clean the 'comment_body' column.

    Args:
        data (pd.DataFrame): frame with a 'comment_body' text column.

    Returns:
        pd.DataFrame: a new frame (the input is not modified) without rows
        that had a NaN in any column, and with 'comment_body' passed through
        clean_sentence. Surviving rows keep their original index labels.
    """
    complete_rows = data.dropna(how="any")
    # assign() builds a new frame instead of writing into the dropna() result,
    # which avoids pandas' chained-assignment warning.
    return complete_rows.assign(
        comment_body=complete_rows['comment_body'].apply(clean_sentence)
    )

In [38]:
# Shared VADER analyser instance; later cells call analyser.polarity_scores on each comment.
analyser = SentimentIntensityAnalyzer()

def sentiment_analyzer_scores(sentence):
    """Print `sentence` followed by the polarity scores the module-level `analyser` gives it."""
    polarity = analyser.polarity_scores(sentence)
    print("{} {}".format(sentence, str(polarity)))

In [39]:
# Replace df with its cleaned version (rows containing NaN dropped, comment text normalised).
# NOTE(review): the bag-of-words corpora above were built from the un-cleaned df — confirm
# that no rows are dropped here, otherwise row counts no longer match bow_corpus.
df = clean_dataframe(df)
df['comment_body']

In [40]:
# Spot check: compound VADER score of the comment stored under index label 3.
analyser.polarity_scores(df['comment_body'][3])['compound']

In [41]:
# Compound VADER score of every comment.
# Mapping over the Series keeps each score attached to its own row's index label, so the
# result stays aligned with df even if the index has gaps (the positional loop plus
# pd.concat relied on a contiguous 0..N-1 index).
sentiment = df['comment_body'].map(
    lambda comment: analyser.polarity_scores(comment)['compound']
)
# assign() adds (or overwrites) the 'sentiment' column, so this cell can be re-run safely.
# The existing columns keep the names they were loaded with; the former hard-coded list of
# 15 column names is no longer needed.
df = df.assign(sentiment=sentiment)

In [42]:
# Preview the frame with the new 'sentiment' column.
df.head()

In [43]:
# Kernel density estimate of the compound sentiment scores.
# `ind` is the set of points at which the density is evaluated; the former 5-point list
# produced a 4-segment polyline rather than a density curve. A fine grid is used instead
# and the five reference values are kept as x-axis ticks.
sentiment_kde_ax = df['sentiment'].plot.kde(ind=np.linspace(-1, 1, 201), figsize=(15, 15))
sentiment_kde_ax.set_xticks([-1, -0.5, 0, 0.5, 1])
sentiment_kde_ax.set(xlabel='compound sentiment', title='Density of compound sentiment scores');

The huge spike at 0.00 appears because many comments have a compound sentiment of exactly 0.00, which means the split into positive, neutral, and negative tone came out as 100% neutral (neu: 1.0). Since the comments are short and some of them contain only factual information, a substantial number of compound scores of 0.0 is plausible.

In [44]:
# Count positive / neutral / negative comments by the sign of the compound score.
# Vectorised comparisons do not depend on the index labels (the former loop looked rows up
# by label 0..N-1) and do not count missing scores as negative.
sentiment_scores = df['sentiment']
pos = int((sentiment_scores > 0.0).sum())
neu = int((sentiment_scores == 0.0).sum())
neg = int((sentiment_scores < 0.0).sum())

print(f"positive: {pos}, neutral: {neu}, negative: {neg}")

In [45]:
# Bar chart of the three sentiment counts computed in the previous cell.
data = {'positive': pos, 'neutral': neu, 'negative': neg}
sentiment_series = pd.Series(data, index = ['positive', 'neutral', 'negative'])
sentiment_series.plot.bar(rot = 0, figsize=(15,15))

Looking at the bar plot, we can see that although the density has its highest spike at 0.0, neutral comments are the least numerous overall. An interesting finding is that positive comments far outnumber both neutral and negative comments. A possible reason for this is that the sentiment dictionary tends to rate comments as positive.

## Adding covariates to topic modeling

In [46]:
# Show all topics of the current lda_model, 7 words per topic.
lda_model.print_topics(-1,7)

The cell above lists the topics of the finalized lda_model. We now need to extract the dominant topic of each comment and combine it with the sentiment scores to check how the topics vary with sentiment. First, we need a way to find the dominant topic of a single comment.

In [47]:
# Topics of document 1, strongest first, each printed with its top 20 words.
ranked_topics = sorted(lda_model[bow_corpus[1]], key=lambda pair: pair[1], reverse=True)
for topic_id, weight in ranked_topics:
    print("\nScore: {}\t \nTopic: {}".format(weight, lda_model.print_topic(topic_id, 20)))

This is a test that prints the score of each topic for the comment at index 1. The topic scores of any comment can be read with
'lda_model[bow_corpus][sentence index]'.

In [48]:
# Topic distribution of the document at position 4 of bow_corpus.
lda_model[bow_corpus][4]

In [49]:
# For every document, record the id and probability of its most probable topic.
dominant_topic = []
dominant_topic_score = []
for bow in bow_corpus:
    # lda_model[bow] is a list of (topic_id, probability) pairs and may leave out topics whose
    # probability is below the model's minimum_probability. The position of the best pair in
    # that list is therefore NOT a topic id; take the id from the pair itself.
    topic_id, topic_score = max(lda_model[bow], key=lambda pair: pair[1])
    dominant_topic.append(int(topic_id))
    dominant_topic_score.append(float(topic_score))
print(dominant_topic[:10])
print(dominant_topic_score[:10])

In [50]:
# Attach each comment's dominant topic and that topic's probability as two new columns.
# pd.concat(axis=1) aligns rows on index labels; dominant_topic_df carries a default 0..N-1 index.
# NOTE(review): this assumes df's index labels correspond to positions in bow_corpus — confirm
# that no rows were dropped from df between building bow_corpus and this cell.
dominant_topic_df = pd.DataFrame({'dominant_topic':dominant_topic, 'dominant_topic_score':dominant_topic_score})
df = pd.concat([df,dominant_topic_df], axis=1)
df.head()

Now we divide up the df into 3 dfs: df with positive sentiment, df with neutral sentiment, and df with negative sentiment.

In [51]:
# Split the comments by the sign of their compound sentiment score.
df_positive = df.loc[df['sentiment'] > 0.0]
df_neutral = df.loc[df['sentiment'] == 0.0]
df_negative = df.loc[df['sentiment'] < 0.0]

# Row counts of the three subsets, one per line.
print(df_positive.shape[0], df_neutral.shape[0], df_negative.shape[0], sep='\n')

In [52]:
# Number of comments per dominant topic inside each sentiment subset.
pos_grouped = df_positive.groupby('dominant_topic').size()
neu_grouped = df_neutral.groupby('dominant_topic').size()
neg_grouped = df_negative.groupby('dominant_topic').size()

# One column per subset (negative, neutral, positive), one row per topic.
grouped_size = pd.concat(
    [neg_grouped, neu_grouped, pos_grouped],
    axis=1,
    keys=['neg_dominant', 'neu_dominant', 'pos_dominant'],
)
grouped_size.head()

Since the number of observations differs between the three dataframes, we normalize the counts by dividing each column by the number of rows in the corresponding dataframe.

In [53]:
# Convert counts to shares: divide every column by the size of its sentiment subset.
subset_sizes = {
    'pos_dominant': df_positive.shape[0],
    'neu_dominant': df_neutral.shape[0],
    'neg_dominant': df_negative.shape[0],
}
for column, n_rows in subset_sizes.items():
    grouped_size[column] = grouped_size[column].div(n_rows)
grouped_size.head()

Now we can see some patterns in the dataframe. We now visualize the result.

In [54]:
# After transposing, each topic becomes one line and the x-axis runs over the sentiment subsets.
line_plot = grouped_size.T.plot.line(figsize=(15,15))

There are already some interesting trends in the graph, for example for topic 0. However, a line plot with only 3 bins is not sufficient, so we repeat the analysis with narrower bins of width 0.2 and examine the trends more closely there.

In [55]:
# Split the comments into 11 sentiment bins: exactly 0.0 is its own (neutral) bin, and each
# side of it is cut into five bins of width 0.2.
df_most_positive = df.query('sentiment > 0.8')
df_quite_positive = df.query('sentiment <= 0.8 & sentiment > 0.6')
df_somewhat_positive = df.query('sentiment <= 0.6 & sentiment > 0.4')
df_abit_positive = df.query('sentiment <= 0.4 & sentiment > 0.2')
df_merely_positive = df.query('sentiment <= 0.2 & sentiment > 0.0')
df_neutral = df.query('sentiment == 0.0')
df_merely_negative = df.query('sentiment < 0.0 & sentiment >= -0.2')
df_abit_negative = df.query('sentiment < -0.2 & sentiment >= -0.4')
df_somewhat_negative = df.query('sentiment < -0.4 & sentiment >= -0.6')
df_quite_negative = df.query('sentiment < -0.6 & sentiment >= -0.8')
df_most_negative = df.query('sentiment < -0.8')

# Column label -> bin, ordered from most negative to most positive (this is the x-axis order
# of the line plot in the next cell).
sentiment_bins = {
    'most_neg_dom': df_most_negative,
    'quite_neg_dom': df_quite_negative,
    'somewhat_neg_dom': df_somewhat_negative,
    'abit_neg_dom': df_abit_negative,
    'merely_neg_dom': df_merely_negative,
    'neu_dom': df_neutral,
    'merely_pos_dom': df_merely_positive,
    'abit_pos_dom': df_abit_positive,
    'somewhat_pos_dom': df_somewhat_positive,
    'quite_pos_dom': df_quite_positive,
    'most_pos_dom': df_most_positive,
}

# For each bin: comments per dominant topic divided by the bin size, i.e. the share of the
# bin's comments that belong to each topic. This replaces 22 copy-pasted groupby/div lines;
# the per-bin `*_grouped` intermediates are no longer created.
grouped_size = pd.concat(
    [bin_df.groupby('dominant_topic').size().div(bin_df.shape[0])
     for bin_df in sentiment_bins.values()],
    axis=1,
    keys=list(sentiment_bins),
)

grouped_size.head()

In [56]:
# After transposing, each topic becomes one line and the x-axis runs over the 11 sentiment bins.
line_plot = grouped_size.T.plot.line(figsize=(16,8))

The trends are more angular, but the overall picture does not differ much from the previous graph with 3 bins. The interesting findings are that people who leave comments related to topic 0 tend to have neutral or positive feelings, while comments on topic 1 tend to carry some kind of emotion and lean slightly negative. Topics 2 and 3 show no trend.

To quantify this, let's look at the regression details for each topic and see its tendency to be positive, neutral, or negative.

In [59]:
# One sub-frame per dominant topic (the final model has topics 0-3).
df_dom_top_0 = df.loc[df['dominant_topic'] == 0]
df_dom_top_1 = df.loc[df['dominant_topic'] == 1]
df_dom_top_2 = df.loc[df['dominant_topic'] == 2]
df_dom_top_3 = df.loc[df['dominant_topic'] == 3]

In [62]:
# OLS of the dominant-topic score on the compound sentiment, for comments whose dominant topic is 0.
top0_results = smf.ols('dominant_topic_score ~ sentiment', data=df_dom_top_0).fit()
print(top0_results.summary())

In [63]:
# OLS of the dominant-topic score on the compound sentiment, for comments whose dominant topic is 1.
top1_results = smf.ols('dominant_topic_score ~ sentiment', data=df_dom_top_1).fit()
print(top1_results.summary())

In [64]:
# OLS of the dominant-topic score on the compound sentiment, for comments whose dominant topic is 2.
top2_results = smf.ols('dominant_topic_score ~ sentiment', data=df_dom_top_2).fit()
print(top2_results.summary())

In [65]:
# OLS of the dominant-topic score on the compound sentiment, for comments whose dominant topic is 3.
top3_results = smf.ols('dominant_topic_score ~ sentiment', data=df_dom_top_3).fit()
print(top3_results.summary())

The results differ from those obtained in R for two reasons. 1) The sentiment was evaluated differently: in Python we used the vaderSentiment analyzer, an unsupervised sentiment model that returns positive, negative, and neutral scores for each sentence, while in R we imported a dictionary of positive and negative words, counted their appearances in each sentence, and divided the counts by the number of tokens in the sentence. 2) The covariates were evaluated differently: in Python we created two new columns (the dominant topic and its intensity), divided the dataframe by dominant topic, and fitted a linear regression of the topic intensity on the sentiment, while in R we used the stm package. Since both the method of evaluating sentiment and the method of fitting sentiment as a covariate were different, the two models gave entirely different results.