In [1]:
# importing required libraries
import re
import os
import time
import gensim
import requests
import numpy as np
import pandas as pd
from gensim.utils import simple_preprocess
from gensim.parsing.preprocessing import STOPWORDS
from nltk.stem import WordNetLemmatizer, SnowballStemmer
from nltk.stem.porter import *
# Seed NumPy's global RNG so later steps that draw from it start from a fixed state.
np.random.seed(2021)
import nltk
# Make sure the WordNet corpus is available locally (needed by WordNetLemmatizer below).
nltk.download('wordnet')

[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\surya\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [2]:
# Load the combined posts export; the visible cells only use its 'post_content' column.
# NOTE(review): rows displayed further down contain sequences such as "Iâ€™m", and the
# token lists contain fragments like 'isnâ' — this looks like text that was decoded with
# the wrong codec somewhere upstream. Confirm the file's real encoding before trusting topics.
df = pd.read_csv('combined-csv-files.csv')

  has_raised = await self.run_ast_nodes(code_ast.body, cell_name,


In [3]:
# Keep only the post text, as a fresh single-column frame with its own default index.
df_lda = pd.DataFrame({'post_content': df['post_content'].values})

In [4]:
# Discard rows with missing text, then renumber so the index is contiguous from 0.
df_lda = df_lda.dropna().reset_index(drop=True)

In [5]:
# `documents` is a second name for the same frame as `df_lda` (no copy is made),
# so the 'index' column added here is visible through both names.
documents = df_lda
documents['index'] = documents.index
documents

Unnamed: 0,post_content,index
0,"Hey, so Iâ€™m experiencing a burning sensation...",0
1,Sounds like a panic attack. I had the same thi...,1
2,I really hope so. Every time I think I have it...,2
3,I tested POSITIVE for COVID19. I had something...,3
4,"Wow, I really wish you a safe and speedy recov...",4
...,...,...
39054,I would call the hospital. See if they reccome...,39054
39055,"Get the antibodies, I would get help sooner th...",39055
39056,"I'm late seeing this post, to the original pos...",39056
39057,91% isnâ€™t that bad. \nYou really shouldnâ€™t...,39057


In [6]:
# Shared normalizer instances, created on first use. The function below is called once
# per token, so rebuilding these objects on every call is wasted work.
_STEMMER = None
_LEMMATIZER = None


def lemmatize_stemming(text):
    """Reduce a single token to its stemmed verb lemma.

    The token is first lemmatized as a verb (``pos='v'``) with WordNet and the
    result is then stemmed with the English Snowball stemmer.

    Parameters
    ----------
    text : str
        One token (word) to normalize.

    Returns
    -------
    str
        The stemmed lemma of ``text``.
    """
    global _STEMMER, _LEMMATIZER
    if _STEMMER is None:
        _STEMMER = SnowballStemmer('english')
    if _LEMMATIZER is None:
        _LEMMATIZER = WordNetLemmatizer()
    return _STEMMER.stem(_LEMMATIZER.lemmatize(text, pos='v'))

def preprocess(text):
    """Turn raw text into a list of normalized tokens.

    The text is tokenized with gensim's ``simple_preprocess``; tokens that are
    gensim stopwords or have 3 or fewer characters are dropped, and each
    remaining token is passed through ``lemmatize_stemming``.

    Parameters
    ----------
    text : str
        Raw document text.

    Returns
    -------
    list of str
        Normalized tokens, in their original order.
    """
    return [
        lemmatize_stemming(token)
        for token in gensim.utils.simple_preprocess(text)
        if len(token) > 3 and token not in gensim.parsing.preprocessing.STOPWORDS
    ]

In [7]:
# Sanity check: show one post before and after preprocessing.
doc_sample = documents.loc[documents['index'] == 4, 'post_content'].iloc[0]

print('original document: ')
words = doc_sample.split(' ')
print(words)
print('\n\n tokenized and lemmatized document: ')
print(preprocess(doc_sample))

original document: 
['Wow,', 'I', 'really', 'wish', 'you', 'a', 'safe', 'and', 'speedy', 'recovery.', 'How', 'long', 'have', 'you', 'been', 'positive?', 'Are', 'you', 'scared', 'that', 'you', 'might', 'need', 'to', 'go', 'to', 'the', 'ER?']


 tokenized and lemmatized document: 
['wish', 'safe', 'speedi', 'recoveri', 'long', 'posit', 'scar', 'need']


In [8]:
# Apply the tokenizer/normalizer to every post; the result holds one token list per row.
processed_docs = documents['post_content'].apply(preprocess)
processed_docs

0        [experi, burn, sensat, right, middl, chest, co...
1        [sound, like, panic, attack, thing, begin, iso...
2        [hope, time, think, hold, breath, long, hold, ...
3        [test, posit, covid, similar, lung, headach, f...
4        [wish, safe, speedi, recoveri, long, posit, sc...
                               ...                        
39054    [hospit, reccomend, come, inhal, tell, concern...
39055               [antibodi, help, sooner, later, fight]
39056    [late, see, post, origin, poster, share, stori...
39057    [isnâ, shouldnâ, concern, reach, go, indic, yo...
39058    [thank, submiss, rememb, read, rule, ensur, po...
Name: post_content, Length: 39059, dtype: object

#### BOW

In [9]:
# Map every distinct token to an integer id, then preview the first 11 (id, token) pairs.
dictionary = gensim.corpora.Dictionary(processed_docs)
for position, (token_id, token) in enumerate(dictionary.iteritems()):
    print(token_id, token)
    if position >= 10:
        break

0 burn
1 chest
2 cough
3 covid
4 experi
5 havenâ
6 middl
7 overreact
8 relat
9 right
10 sensat


In [10]:
# Prune the vocabulary in place.
dictionary.filter_extremes(
    no_below=15,    # drop tokens that appear in fewer than 15 documents
    no_above=0.5,   # drop tokens that appear in more than 50% of documents
    keep_n=100000,  # of the remainder, keep at most the 100000 most frequent
)

#### Gensim doc2bow

In [11]:
# Convert each token list into a bag-of-words vector of (token_id, count) pairs.
bow_corpus = list(map(dictionary.doc2bow, processed_docs))
bow_corpus[4]

[(26, 1), (34, 1), (40, 1), (41, 1), (42, 1), (43, 1), (44, 1), (45, 1)]

In [12]:
# Spell out the bag-of-words vector of document 4 with the tokens behind each id.
bow_doc_4 = bow_corpus[4]
for token_id, occurrences in bow_doc_4:
    print("Word {} (\"{}\") appears {} time.".format(token_id, dictionary[token_id], occurrences))

Word 26 ("long") appears 1 time.
Word 34 ("posit") appears 1 time.
Word 40 ("need") appears 1 time.
Word 41 ("recoveri") appears 1 time.
Word 42 ("safe") appears 1 time.
Word 43 ("scar") appears 1 time.
Word 44 ("speedi") appears 1 time.
Word 45 ("wish") appears 1 time.


#### TF-IDF

In [13]:
from gensim import corpora, models
from pprint import pprint

# Fit TF-IDF weights on the bag-of-words corpus and wrap the corpus so that
# iterating over it yields the re-weighted vectors.
tfidf = models.TfidfModel(bow_corpus)
corpus_tfidf = tfidf[bow_corpus]

# Preview only the first weighted document (nothing is printed for an empty corpus).
first_weighted_doc = next(iter(corpus_tfidf), None)
if first_weighted_doc is not None:
    pprint(first_weighted_doc)

[(0, 0.29952230827039616),
 (1, 0.21299857368565608),
 (2, 0.18070336398167702),
 (3, 0.08893386976084357),
 (4, 0.19854932216994936),
 (5, 0.30058642538983177),
 (6, 0.3676221123492878),
 (7, 0.46620073546823254),
 (8, 0.2805737356475635),
 (9, 0.1324365446187328),
 (10, 0.3470843989810606),
 (11, 0.2713600210339164),
 (12, 0.1972323962427399),
 (13, 0.11325352605661813)]


#### LDA using BOW

In [14]:
# Train a 10-topic LDA model on raw token counts.
# random_state pins the model's own RNG (same value as the notebook-level NumPy seed),
# so re-running this cell without restarting the kernel does not start from whatever
# state the global RNG has drifted to.
lda_model = gensim.models.LdaMulticore(
    bow_corpus,
    num_topics=10,
    id2word=dictionary,
    passes=2,
    workers=2,
    random_state=2021,
)

In [15]:
# List every topic of the bag-of-words model with its highest-weighted words.
for topic_id, description in lda_model.print_topics(num_topics=-1):
    print('Topic: {} \nWords: {}'.format(topic_id, description))

Topic: 0 
Words: 0.058*"time" + 0.032*"thank" + 0.030*"hand" + 0.030*"post" + 0.030*"question" + 0.030*"rule" + 0.030*"kind" + 0.030*"read" + 0.029*"comment" + 0.029*"concern"
Topic: 1 
Words: 0.032*"test" + 0.019*"virus" + 0.017*"posit" + 0.013*"take" + 0.012*"time" + 0.012*"sure" + 0.011*"infect" + 0.011*"know" + 0.011*"negat" + 0.011*"go"
Topic: 2 
Words: 0.058*"https" + 0.044*"covid" + 0.043*"remov" + 0.026*"reddit" + 0.024*"post" + 0.017*"posit" + 0.016*"articl" + 0.014*"comment" + 0.013*"messag" + 0.011*"question"
Topic: 3 
Words: 0.041*"smell" + 0.025*"test" + 0.022*"tast" + 0.019*"covid" + 0.018*"posit" + 0.016*"symptom" + 0.014*"like" + 0.012*"day" + 0.012*"lose" + 0.011*"thing"
Topic: 4 
Words: 0.046*"test" + 0.041*"covid" + 0.017*"symptom" + 0.016*"posit" + 0.013*"antibodi" + 0.011*"negat" + 0.011*"doctor" + 0.011*"vaccin" + 0.010*"result" + 0.009*"get"
Topic: 5 
Words: 0.043*"test" + 0.038*"symptom" + 0.025*"fever" + 0.023*"day" + 0.023*"cough" + 0.020*"feel" + 0.018*"posit

#### LDA using TF-IDF

In [16]:
# Train a second 10-topic LDA model, this time on the TF-IDF-weighted corpus.
# random_state pins the model's own RNG (same value as the notebook-level NumPy seed),
# so re-running this cell without restarting the kernel does not depend on how far the
# global RNG has already advanced.
lda_model_tfidf = gensim.models.LdaMulticore(
    corpus_tfidf,
    num_topics=10,
    id2word=dictionary,
    passes=2,
    workers=4,
    random_state=2021,
)
for idx, topic in lda_model_tfidf.print_topics(-1):
    print('Topic: {} Word: {}'.format(idx, topic))

Topic: 0 Word: 0.018*"better" + 0.016*"glad" + 0.015*"hope" + 0.012*"youâ" + 0.012*"thank" + 0.011*"feel" + 0.009*"okay" + 0.007*"time" + 0.006*"lose" + 0.006*"smell"
Topic: 1 Word: 0.011*"https" + 0.007*"covid" + 0.007*"hope" + 0.007*"test" + 0.007*"soon" + 0.007*"reddit" + 0.007*"blood" + 0.006*"peopl" + 0.006*"good" + 0.005*"symptom"
Topic: 2 Word: 0.053*"delet" + 0.010*"test" + 0.007*"symptom" + 0.006*"like" + 0.006*"day" + 0.006*"week" + 0.005*"sound" + 0.005*"fever" + 0.005*"know" + 0.005*"covid"
Topic: 3 Word: 0.011*"test" + 0.009*"peopl" + 0.007*"vaccin" + 0.007*"mask" + 0.006*"know" + 0.006*"wish" + 0.006*"virus" + 0.006*"think" + 0.006*"antibodi" + 0.005*"symptom"
Topic: 4 Word: 0.007*"help" + 0.007*"peopl" + 0.006*"hope" + 0.006*"chat" + 0.006*"situat" + 0.006*"place" + 0.006*"covid" + 0.005*"current" + 0.005*"thread" + 0.005*"allow"
Topic: 5 Word: 0.043*"time" + 0.031*"despair" + 0.031*"align" + 0.030*"automat" + 0.030*"compos" + 0.030*"perform" + 0.030*"submiss" + 0.030*"p

In [17]:
# Token list of the document labelled 4, for comparison with the topic scores below.
processed_docs.loc[4]

['wish', 'safe', 'speedi', 'recoveri', 'long', 'posit', 'scar', 'need']

In [18]:
# Topic mixture of document 4 under the bag-of-words model, strongest topic first.
ranked_topics = sorted(lda_model[bow_corpus[4]], key=lambda pair: pair[1], reverse=True)
for topic_id, weight in ranked_topics:
    print("\nScore: {}\t \nTopic: {}".format(weight, lda_model.print_topic(topic_id, 10)))


Score: 0.8999670743942261	 
Topic: 0.028*"peopl" + 0.023*"vaccin" + 0.015*"know" + 0.013*"thank" + 0.013*"mask" + 0.013*"covid" + 0.010*"think" + 0.009*"like" + 0.009*"go" + 0.009*"wear"

Score: 0.011115937493741512	 
Topic: 0.058*"time" + 0.032*"thank" + 0.030*"hand" + 0.030*"post" + 0.030*"question" + 0.030*"rule" + 0.030*"kind" + 0.030*"read" + 0.029*"comment" + 0.029*"concern"

Score: 0.011115716770291328	 
Topic: 0.020*"feel" + 0.018*"like" + 0.018*"better" + 0.015*"tast" + 0.015*"hope" + 0.013*"thing" + 0.012*"help" + 0.012*"think" + 0.012*"sorri" + 0.011*"smell"

Score: 0.011115482077002525	 
Topic: 0.046*"test" + 0.041*"covid" + 0.017*"symptom" + 0.016*"posit" + 0.013*"antibodi" + 0.011*"negat" + 0.011*"doctor" + 0.011*"vaccin" + 0.010*"result" + 0.009*"get"

Score: 0.011115405708551407	 
Topic: 0.041*"smell" + 0.025*"test" + 0.022*"tast" + 0.019*"covid" + 0.018*"posit" + 0.016*"symptom" + 0.014*"like" + 0.012*"day" + 0.012*"lose" + 0.011*"thing"

Score: 0.011115049012005329	 

In [19]:
# Topic mixture of document 4 under the TF-IDF model, strongest topic first.
# NOTE(review): lda_model_tfidf was trained on corpus_tfidf but is queried here with the
# raw-count vector bow_corpus[4] — confirm whether corpus_tfidf[4] was intended.
doc_topic_weights = lda_model_tfidf[bow_corpus[4]]
for topic_id, weight in sorted(doc_topic_weights, key=lambda pair: pair[1], reverse=True):
    topic_words = lda_model_tfidf.print_topic(topic_id, 10)
    print("\nScore: {}\t \nTopic: {}".format(weight, topic_words))


Score: 0.8999711871147156	 
Topic: 0.011*"test" + 0.009*"peopl" + 0.007*"vaccin" + 0.007*"mask" + 0.006*"know" + 0.006*"wish" + 0.006*"virus" + 0.006*"think" + 0.006*"antibodi" + 0.005*"symptom"

Score: 0.0111159048974514	 
Topic: 0.018*"better" + 0.016*"glad" + 0.015*"hope" + 0.012*"youâ" + 0.012*"thank" + 0.011*"feel" + 0.009*"okay" + 0.007*"time" + 0.006*"lose" + 0.006*"smell"

Score: 0.011115079745650291	 
Topic: 0.047*"thank" + 0.022*"sorri" + 0.021*"test" + 0.011*"updat" + 0.010*"loss" + 0.010*"know" + 0.009*"day" + 0.008*"negat" + 0.008*"hear" + 0.008*"posit"

Score: 0.01111462339758873	 
Topic: 0.007*"help" + 0.007*"peopl" + 0.006*"hope" + 0.006*"chat" + 0.006*"situat" + 0.006*"place" + 0.006*"covid" + 0.005*"current" + 0.005*"thread" + 0.005*"allow"

Score: 0.011114274151623249	 
Topic: 0.016*"smell" + 0.014*"tast" + 0.009*"symptom" + 0.009*"week" + 0.007*"month" + 0.007*"like" + 0.007*"vaccin" + 0.007*"day" + 0.007*"lose" + 0.006*"think"

Score: 0.011114194057881832	 
Topic:

In [20]:
# Score a headline that was not part of the training corpus against the BoW model.
unseen_document = 'How a Pentagon deal became an identity crisis for Google'
unseen_tokens = preprocess(unseen_document)
bow_vector = dictionary.doc2bow(unseen_tokens)
ranked_topics = sorted(lda_model[bow_vector], key=lambda pair: pair[1], reverse=True)
for topic_id, weight in ranked_topics:
    print("Score: {}\t Topic: {}".format(weight, lda_model.print_topic(topic_id, 5)))

Score: 0.8198050260543823	 Topic: 0.029*"pain" + 0.024*"like" + 0.024*"feel" + 0.021*"symptom" + 0.016*"go"
Score: 0.020043853670358658	 Topic: 0.058*"https" + 0.044*"covid" + 0.043*"remov" + 0.026*"reddit" + 0.024*"post"
Score: 0.020023411139845848	 Topic: 0.046*"test" + 0.041*"covid" + 0.017*"symptom" + 0.016*"posit" + 0.013*"antibodi"
Score: 0.02001967839896679	 Topic: 0.028*"peopl" + 0.023*"vaccin" + 0.015*"know" + 0.013*"thank" + 0.013*"mask"
Score: 0.020019616931676865	 Topic: 0.020*"feel" + 0.018*"like" + 0.018*"better" + 0.015*"tast" + 0.015*"hope"
Score: 0.02001858316361904	 Topic: 0.032*"test" + 0.019*"virus" + 0.017*"posit" + 0.013*"take" + 0.012*"time"
Score: 0.020018573850393295	 Topic: 0.018*"breath" + 0.016*"feel" + 0.016*"like" + 0.013*"go" + 0.013*"heart"
Score: 0.0200177114456892	 Topic: 0.041*"smell" + 0.025*"test" + 0.022*"tast" + 0.019*"covid" + 0.018*"posit"
Score: 0.020017357543110847	 Topic: 0.043*"test" + 0.038*"symptom" + 0.025*"fever" + 0.023*"day" + 0.023*"c

#### Document Topic Matrix

In [None]:
# Build a (documents x topics) matrix of topic weights from the TF-IDF LDA model and
# export it next to the original post text.
# Sizes come from the corpus and the model instead of hard-coded 39059 / 10, so the cell
# keeps working when the input file or the number of topics changes.
num_docs = len(bow_corpus)
num_topics = lda_model_tfidf.num_topics
dtm = np.zeros((num_docs, num_topics), dtype=float)

# get_document_topics returns (topic_id, weight) pairs; topics it leaves out keep the
# initial 0.0 in the matrix.
# NOTE(review): the model was trained on corpus_tfidf but is queried with raw counts
# here — confirm whether corpus_tfidf should be used instead of bow_corpus.
for row, doc in enumerate(bow_corpus):
    for topic_id, weight in lda_model_tfidf.get_document_topics(doc):
        dtm[row, topic_id] = weight

cp = pd.DataFrame(dtm, columns=['Topic{}'.format(i + 1) for i in range(num_topics)])

# The original referenced `documents_ccf`, which no cell in this notebook defines
# (NameError on a fresh run). bow_corpus has one entry per row of `documents`, so its
# 'post_content' column is the matching text. `.values` inserts it positionally.
cp.insert(0, 'post_content', documents['post_content'].values)

cp.to_csv("Doc_Out.csv")