In [39]:
# for text preprocessing
import re
import spacy

from nltk.corpus import stopwords 
from nltk.stem.wordnet import WordNetLemmatizer
import string

# import vectorizers
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer

# import numpy for matrix operation
import numpy as np

# import LDA from sklearn
from sklearn.decomposition import LatentDirichletAllocation

import pandas as pd

# import .py file in another directory
import sys  
import connect_to_db as cn

from gensim import corpora
import gensim

import csv

In [40]:
# Suppress warnings for the rest of the kernel session: the 'ignore' action is
# installed without any category/module filter, so it applies to every warning.
# NOTE(review): this also hides deprecation notices from the libraries used below.
from warnings import filterwarnings
filterwarnings('ignore')

In [41]:
# Load the spaCy pipeline named 'en_core_web_sm'.
# NOTE(review): `nlp` is not referenced by any cell visible here - confirm it is still needed.
nlp = spacy.load('en_core_web_sm')

In [42]:
# English stop words from NLTK, held in a set for fast membership tests in clean()
stop = set(stopwords.words('english'))

# punctuation characters that clean() deletes from the text
exclude = set(string.punctuation) 

# lemmatizer used by clean() to reduce each token to its base form.
# NOTE(review): the original comment gave "am, are, is -> be" and dropping
# "ed"/"s" as examples, but clean() calls lemmatize() without a POS argument -
# confirm the default handles those verb forms as intended.
lemma = WordNetLemmatizer() 

# One function for all the steps:
def clean(doc):
    """Normalize one raw document into a space-separated string of lemmas.

    Pipeline, applied token by token:
      1. lower-case the text and split it on whitespace;
      2. drop tokens that are stop words as written (this catches entries of
         `stop` that themselves contain punctuation);
      3. delete every character listed in `exclude` from the token;
      4. drop the token if nothing is left or if it is a stop word now that
         the punctuation is gone (e.g. "the," -> "the");
      5. lemmatize what remains.

    Step 4 is the fix over the previous version, which filtered stop words
    only before stripping punctuation, so any stop word that touched a
    punctuation mark leaked into the output.

    Uses the module-level `stop`, `exclude` and `lemma` objects.

    Args:
        doc: raw text of a single document (str).

    Returns:
        str: the surviving lemmas joined by single spaces; "" if none survive.
    """
    kept = []
    for token in doc.lower().split():
        # Pass 1: stop words exactly as they appear in the text.
        if token in stop:
            continue

        bare = ''.join(ch for ch in token if ch not in exclude)

        # Pass 2: tokens that were pure punctuation, or that only became
        # recognisable as stop words after the punctuation was removed.
        if not bare or bare in stop:
            continue

        kept.append(lemma.lemmatize(bare))
    return " ".join(kept)

In [43]:
# Select up to 100 communities (column community_id_fastgreedy_is) that have
# fewer than 10000 rows in `nodes`, largest first, together with their row counts.
sql = "select community_id_fastgreedy_is, count(*) from nodes group by community_id_fastgreedy_is having count(*) < 10000 order by count(*) desc limit 100;"
result_df = cn.select_query_result_to_df(sql)

# Series.tolist() already returns plain Python scalars, so the former
# list -> ndarray -> list round trip (which produced numpy scalars) is dropped.
communities = result_df['community_id_fastgreedy_is'].tolist()
counts = result_df['count(*)'].tolist()

In [48]:
def comments_lda(communities, counts,
                 output_dir="/home/mykim/source/plotting-and-graph-analysis/lda/3_to_5_topics/comments",
                 random_state=None):
    """Fit one LDA model per community on its comment bodies and save the topics.

    For every (community, count) pair the function:
      * queries the bodies of valid comments (is_valid=1) whose link_key equals
        their parent_key and whose author belongs to the community;
      * cleans each body with clean() and builds a gensim dictionary and
        bag-of-words corpus;
      * picks the number of topics from `count` (>= 1000 -> 5, >= 100 -> 4,
        otherwise 3);
      * trains gensim's LdaMulticore with 10 passes and writes the top 25 words
        of every topic to
        ``{output_dir}/lda_{num_topics}_topics_community_{community}_comments.csv``.

    Communities whose cleaned text yields an empty vocabulary are reported and
    skipped, because gensim cannot train on an empty dictionary and would
    otherwise abort the whole batch.

    Args:
        communities: iterable of community ids, interpolated into the SQL as-is.
        counts: iterable of community sizes, aligned with `communities`.
        output_dir: directory receiving the CSV files; it must already exist.
            Defaults to the location that was previously hard-coded.
        random_state: forwarded to LdaMulticore. None (the default) keeps the
            previous unseeded behaviour.

    Returns:
        None. Results are written to disk.
    """
    for community, count in zip(communities, counts):
        sql = f'select c.body from comments c, nodes n where c.author = n.node_id and c.is_valid=1 and n.community_id_fastgreedy_is = {community} and c.link_key = c.parent_key;'
        result_df = cn.select_query_result_to_df(sql)
        bodies = result_df['body'].astype(str).tolist()

        # Tokenised, cleaned documents -> vocabulary -> bag-of-words vectors.
        clean_corpus = [clean(body).split() for body in bodies]
        dictionary = corpora.Dictionary(clean_corpus)
        if len(dictionary) == 0:
            print(f"community {community}: no usable comment text, skipping LDA")
            continue
        bow_corpus = [dictionary.doc2bow(tokens) for tokens in clean_corpus]

        # Larger communities get more topics.
        if count >= 1000:
            num_topics = 5
        elif count >= 100:
            num_topics = 4
        else:
            num_topics = 3

        ldamodel = gensim.models.LdaMulticore(
            bow_corpus,
            id2word=dictionary,
            num_topics=num_topics,
            passes=10,
            random_state=random_state,
        )
        topics = ldamodel.show_topics(num_topics=num_topics, num_words=25, formatted=False)

        # show_topics(formatted=False) yields (topic_id, [(word, weight), ...]);
        # only the words are kept.
        topics_words = [(topic_id, [word for word, _ in pairs]) for topic_id, pairs in topics]

        out_path = f"{output_dir}/lda_{num_topics}_topics_community_{community}_comments.csv"
        # `with` closes the file even if a write fails; newline='' is what the
        # csv module expects for files it writes to.
        with open(out_path, 'w', newline='') as f:
            out = csv.writer(f, delimiter=",")
            out.writerow(['topic', 'topic_words'])
            for topic, words in topics_words:
                # The word list is written as its Python list repr, as before.
                out.writerow([topic, words])

In [49]:
# Run the comment-body LDA for every selected community; one CSV file is written per community.
comments_lda(communities, counts)

In [54]:
def posts_lda(communities, counts,
              output_dir="/home/mykim/source/plotting-and-graph-analysis/lda/3_to_5_topics/posts",
              random_state=None):
    """Fit one LDA model per community on post titles and save the topics.

    For every (community, count) pair the function:
      * looks up the node_ids (authors) that belong to the community;
      * for each author, queries the distinct (post_key, title) pairs of posts
        with is_valid_author=1 on which that author has a comment whose
        link_key equals its parent_key; titles are collected per author, so a
        post reached by several authors contributes its title several times;
      * cleans each title with clean() and builds a gensim dictionary and
        bag-of-words corpus;
      * picks the number of topics from `count` (>= 1000 -> 5, >= 100 -> 4,
        otherwise 3);
      * trains gensim's LdaMulticore with 10 passes and writes the top 25 words
        of every topic to
        ``{output_dir}/lda_{num_topics}_topics_community_{community}_posts.csv``.

    Communities whose cleaned titles yield an empty vocabulary are reported and
    skipped, because gensim cannot train on an empty dictionary and would
    otherwise abort the whole batch.

    Args:
        communities: iterable of community ids, interpolated into the SQL as-is.
        counts: iterable of community sizes, aligned with `communities`.
        output_dir: directory receiving the CSV files; it must already exist.
            Defaults to the location that was previously hard-coded.
        random_state: forwarded to LdaMulticore. None (the default) keeps the
            previous unseeded behaviour.

    Returns:
        None. Results are written to disk.
    """
    for community, count in zip(communities, counts):
        sql = f'select node_id from nodes where community_id_fastgreedy_is = {community}'
        result_df = cn.select_query_result_to_df(sql)
        authors = result_df['node_id'].astype(str).tolist()

        # NOTE(review): this issues one query per author; a single joined query
        # would be faster but needs care to keep the per-author title
        # multiplicity described in the docstring.
        titles = []
        for author in authors:
            sql2 = f"select distinct p.post_key, p.title from posts p, comments c where p.post_key = c.link_key and c.author = '{author}' and c.link_key = c.parent_key and p.is_valid_author=1;"
            result_df2 = cn.select_query_result_to_df(sql2)
            if not result_df2.empty:
                titles.extend(result_df2['title'].astype(str).tolist())

        # Tokenised, cleaned documents -> vocabulary -> bag-of-words vectors.
        clean_corpus = [clean(title).split() for title in titles]
        dictionary = corpora.Dictionary(clean_corpus)
        if len(dictionary) == 0:
            print(f"community {community}: no usable post titles, skipping LDA")
            continue
        bow_corpus = [dictionary.doc2bow(tokens) for tokens in clean_corpus]

        # Larger communities get more topics.
        if count >= 1000:
            num_topics = 5
        elif count >= 100:
            num_topics = 4
        else:
            num_topics = 3

        ldamodel = gensim.models.LdaMulticore(
            bow_corpus,
            id2word=dictionary,
            num_topics=num_topics,
            passes=10,
            random_state=random_state,
        )
        topics = ldamodel.show_topics(num_topics=num_topics, num_words=25, formatted=False)

        # show_topics(formatted=False) yields (topic_id, [(word, weight), ...]);
        # only the words are kept.
        topics_words = [(topic_id, [word for word, _ in pairs]) for topic_id, pairs in topics]

        out_path = f"{output_dir}/lda_{num_topics}_topics_community_{community}_posts.csv"
        # `with` closes the file even if a write fails; newline='' is what the
        # csv module expects for files it writes to.
        with open(out_path, 'w', newline='') as f:
            out = csv.writer(f, delimiter=",")
            out.writerow(['topic', 'topic_words'])
            for topic, words in topics_words:
                # The word list is written as its Python list repr, as before.
                out.writerow([topic, words])

In [56]:
# Run the post-title LDA for every selected community; one CSV file is written per community.
posts_lda(communities, counts)