In [50]:
import re
import spacy
from nltk.corpus import stopwords 
from nltk.stem.wordnet import WordNetLemmatizer
import string
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
import numpy as np
from sklearn.decomposition import LatentDirichletAllocation
import pandas as pd
import sys  
import connect_to_db as cn
from gensim import corpora
import gensim
import csv
import parmap

In [44]:
# Silence warnings for the rest of the session: the 'ignore' action is
# registered with no category/module restriction, so it applies to all of them.
from warnings import filterwarnings
filterwarnings('ignore')

In [69]:
# spaCy small English pipeline. NOTE(review): not referenced by any code
# visible in this notebook — confirm whether it is still needed.
nlp = spacy.load('en_core_web_sm')

# NLTK English stop words, held in a set for fast membership tests.
stop = set(stopwords.words('english'))

# Punctuation characters that clean() strips from the text.
exclude = set(string.punctuation) 

# Lemmatizer used by clean() to reduce each token to its dictionary form.
lemma = WordNetLemmatizer() 

# One function for all the steps:
def clean(doc):
    """Normalize one document into a space-separated string of lemmas.

    Steps, in order: lower-case the text and split it on whitespace, drop the
    tokens found in ``stop``, delete every character found in ``exclude``,
    then lemmatize each remaining token with ``lemma`` and re-join the result
    with single spaces.
    """
    # Stop words are matched before punctuation is stripped, so each token is
    # compared with any attached punctuation still in place.
    kept_tokens = [token for token in doc.lower().split() if token not in stop]
    joined = " ".join(kept_tokens)

    # Character-level punctuation removal on the re-joined text.
    without_punct = ''.join(filter(lambda ch: ch not in exclude, joined))

    # Tokens that consisted only of punctuation disappear in this second split.
    lemmas = [lemma.lemmatize(token) for token in without_punct.split()]
    return " ".join(lemmas)

In [160]:
# Extra corpus-specific stop words, applied by extract_custom_stop_words().
# NOTE(review): the entries containing "â€™" look like mis-decoded curly-apostrophe
# contractions; presumably they are kept verbatim to match the stored text — confirm.
# "like" is listed twice, which is harmless for membership tests.
custom_stop_words = ["im", "going", "would", "like", "cant", "donâ€™t", "canâ€™t", "iâ€™ve", "iâ€™m", "me", "someone", "whatâ€™s", "it", "really", "feel", "live", "like", "fucking", "myself", "another", "help", "got", "get", "dont", "want", "anymore", "know", "make", "self", "everything", "see", "else", "oh", "there", "thing", "wanna", "wouldnâ€™t", "might", "itâ€™s", "didnâ€™t", "yâ€™all", "do", "anyone", "people"]

def extract_custom_stop_words(word_lists, stop_words=None):
    """Remove custom stop words, digit-only tokens and one-character tokens.

    Each inner list is filtered, and inner lists left empty are dropped from
    the outer list. Both levels are updated in place and the same outer list
    object is returned.

    Args:
        word_lists: list of token lists (one per document).
        stop_words: iterable of words to drop; defaults to the notebook-level
            ``custom_stop_words``.

    Returns:
        ``word_lists`` itself, now holding only non-empty, filtered lists.
    """
    if stop_words is None:
        stop_words = custom_stop_words
    blocked = set(stop_words)

    # Build the surviving documents separately instead of calling
    # word_lists.remove() inside the loop: removing from a list while
    # iterating over it skips the element that follows each removal, which
    # left some documents unfiltered.
    kept_lists = []
    for word_list in word_lists:
        word_list[:] = [
            word for word in word_list
            if not (word in blocked or word.isdigit() or len(word) == 1)
        ]
        if word_list:
            kept_lists.append(word_list)

    word_lists[:] = kept_lists
    return word_lists

In [158]:
def _num_topics_for_count(count):
    """Choose the number of LDA topics from a community's member count."""
    if count >= 10000:
        return 10
    if count >= 1000:
        return 5
    if count >= 100:
        return 4
    return 3


def save_topic_words_csv(table_name, community, count):
    """Fit an LDA model on one community's post titles and export its topics.

    Looks up the community's members in ``table_name``, collects the titles of
    the non-December posts they commented on at top level, cleans them, fits
    an LDA model and writes two CSV files (top-40 words per topic and the
    matching weights) under the hard-coded output directories.

    Args:
        table_name: node table holding ``node_id`` and
            ``community_id_fastgreedy_is``.
        community: community id to process.
        count: member count of the community; selects the number of topics.
    """
    # NOTE(review): the queries below are assembled by string interpolation.
    # The interpolated values come from the database itself, but a value
    # containing a quote would break the statement; switch to parameterized
    # queries if the connect_to_db helper supports them.
    sql = f'select node_id from {table_name} where community_id_fastgreedy_is = {community}'
    result_df = cn.select_query_result_to_df(sql)
    authors = result_df['node_id'].astype(str).tolist()

    # One query per author; titles accumulate across all authors.
    titles = []
    for author in authors:
        sql2 = f"select distinct p.post_key, p.title from posts p, comments c where p.post_key = c.link_key and c.author = '{author}' and c.link_key = c.parent_key and p.is_valid_author=1 and MONTH(p.created_utc) <> 12;"
        result_df2 = cn.select_query_result_to_df(sql2)
        if not result_df2.empty:
            titles.extend(result_df2['title'].astype(str).tolist())

    # Tokenize and clean every title, then drop the custom stop words.
    clean_corpus = [clean(title).split() for title in titles]
    clean_corpus = extract_custom_stop_words(clean_corpus)
    dictionary = corpora.Dictionary(clean_corpus)
    bow_corpus = [dictionary.doc2bow(text) for text in clean_corpus]

    num_topics = _num_topics_for_count(count)

    # Fixed seed so repeated runs do not produce different topics. The seed is
    # also handed to the model as random_state so it does not depend on the
    # global NumPy generator being untouched between seeding and training.
    SOME_FIXED_SEED = 5
    np.random.seed(SOME_FIXED_SEED)

    ldamodel = gensim.models.LdaMulticore(
        bow_corpus,
        id2word=dictionary,
        num_topics=num_topics,
        passes=10,
        random_state=SOME_FIXED_SEED,
    )
    topics = ldamodel.show_topics(num_topics=num_topics, num_words=40, formatted=False)
    topics_words = [[word for word, _ in pairs] for _, pairs in topics]
    topics_words_weights = [[weight for _, weight in pairs] for _, pairs in topics]

    words_df = pd.DataFrame(topics_words)
    weights_df = pd.DataFrame(topics_words_weights)
    words_df.to_csv(f"/home/mykim/source/plotting-and-graph-analysis/lda/topics_words_stop_words_removed/community_{community}_topics_40_words.csv", header=None, index=None)
    weights_df.to_csv(f"/home/mykim/source/plotting-and-graph-analysis/lda/topics_words_weights_stop_words_removed/community_{community}_topics_40_weights.csv", header=None, index=None)

In [120]:
# The 105 largest communities and their member counts, largest first.
sql = "select community_id_fastgreedy_is, count(*) from nodes_until_november group by community_id_fastgreedy_is order by count(*) desc limit 105;"
result_df = cn.select_query_result_to_df(sql)
communities, counts = (
    list(np.array(result_df[column].values.tolist()))
    for column in ('community_id_fastgreedy_is', 'count(*)')
)

In [159]:
# Fit and export one LDA model per community, pairing each id with its size.
for community, count in zip(communities, counts):
    save_topic_words_csv(
        table_name='nodes_until_november',
        community=community,
        count=count,
    )

KeyboardInterrupt: 

In [55]:
# Parsed topic files keyed by community id, so each CSV is read once per
# process instead of once per sentence. Re-run this cell to pick up files
# that changed on disk.
_TOPIC_WORDS_CACHE = {}


def _load_topic_words(community):
    """Return ``[(frozenset(row), len(row)), ...]`` for a community's topic CSV."""
    if community not in _TOPIC_WORDS_CACHE:
        with open(f"/home/mykim/source/plotting-and-graph-analysis/lda/50_words/community_{community}_topics_50_words.csv", newline='') as f:
            _TOPIC_WORDS_CACHE[community] = [
                (frozenset(row), len(row)) for row in csv.reader(f)
            ]
    return _TOPIC_WORDS_CACHE[community]


def community_matching(sentence):
    """Return the community whose topic word list best overlaps ``sentence``.

    The sentence is cleaned and tokenized, then scored against every topic
    row of every community in ``communities`` with
    ``matches / (sqrt(row_length) * sqrt(token_count))``, where ``matches``
    counts sentence tokens (repeats included) present in the topic row.

    Args:
        sentence: raw text, e.g. a post title.

    Returns:
        The community id with the highest score; ``0`` when no token matches
        any topic. Ties keep the first community reaching the score.
    """
    clean_corpus = clean(sentence).split()

    most_related_community = 0
    largest_cosine_similarity = 0

    # Nothing to match: the result is the default and no file is needed.
    if not clean_corpus:
        return most_related_community

    for community in communities:
        for topic_words, topic_size in _load_topic_words(community):
            count = sum(1 for word in clean_corpus if word in topic_words)
            if count == 0:
                continue
            # The score grows with count, so evaluating it once with the final
            # count picks the same winner as updating it after every match.
            cosine_similarity = count / ((topic_size ** 0.5) * (len(clean_corpus) ** 0.5))
            if cosine_similarity > largest_cosine_similarity:
                largest_cosine_similarity = cosine_similarity
                most_related_community = community

    return most_related_community

In [56]:
def jaccard_coefficient_between_prediction_and_answer(post_key, predicted_community):
    """Jaccard coefficient between a post's commenters and a community.

    Compares the set of authors of valid top-level comments on ``post_key``
    with the set of members of ``predicted_community`` in
    ``nodes_until_november``.

    Args:
        post_key: key of the post whose top-level commenters are the answer.
        predicted_community: community id predicted for that post.

    Returns:
        ``|A ∩ P| / |A ∪ P|`` as a float in ``[0, 1]``, or ``-1`` when the
        post has no qualifying comments.
    """
    # NOTE(review): both queries interpolate values straight into the SQL
    # text; use parameterized queries if the connect_to_db helper allows it.
    sql = f"select author from comments where link_key = '{post_key}' and is_valid=1 and is_valid_author=1 and link_key = parent_key;"
    result_df = cn.select_query_result_to_df(sql)
    if result_df.empty:
        return -1

    sql2 = f"select node_id from nodes_until_november where community_id_fastgreedy_is = {predicted_community};"
    result_df2 = cn.select_query_result_to_df(sql2)

    # Work on sets: an author with several top-level comments appears once per
    # comment in the query result, and counting those repeats could push the
    # ratio above 1. Sets also make the membership test constant-time.
    actual_authors = set(result_df['author'].tolist())
    predicted_authors = set(result_df2['node_id'].tolist())

    shared = len(actual_authors & predicted_authors)
    # The union is never empty here because actual_authors has at least one entry.
    jaccard_coefficient = shared / len(actual_authors | predicted_authors)

    return jaccard_coefficient

In [57]:
def prediction_main(index, chunk_size=10000):
    """Predict communities for one chunk of December posts and save a CSV.

    The chunk ends (exclusively) at ``index`` and starts at the previous
    multiple of ``chunk_size``; e.g. ``index=20000`` covers rows
    10000..19999 and ``index=110037`` covers rows 110000..110036.

    Args:
        index: exclusive end position of the chunk in the ordered post list.
        chunk_size: number of posts per full chunk.

    Side effects:
        Writes ``prediction_result_{index}.csv`` with the columns
        ``post_key``, ``predicted_community`` and ``jaccard_coefficient``.
    """
    # Every worker runs this query on its own and then slices by position, so
    # the rows need a deterministic order for the chunks to line up.
    sql = "select post_key, title from posts where is_valid_author=1 and MONTH(created_utc) = 12 order by post_key;"
    result_df = cn.select_query_result_to_df(sql)
    post_keys = result_df['post_key'].tolist()
    titles = result_df['title'].tolist()

    # Previous multiple of chunk_size strictly below index, never negative.
    start_index = max(0, (index - 1) // chunk_size * chunk_size)
    # Clamp so an index beyond the actual row count cannot raise IndexError.
    end_index = min(index, len(post_keys))

    result_for_csv = []
    for i in range(start_index, end_index):
        community = community_matching(titles[i])
        jaccard_coefficient = jaccard_coefficient_between_prediction_and_answer(post_keys[i], community)
        result_for_csv.append([post_keys[i], community, format(float(jaccard_coefficient), '.10f')])

    fields = ['post_key', 'predicted_community', 'jaccard_coefficient']
    cn.write_csv_for_db_update(f"/home/mykim/source/plotting-and-graph-analysis/lda/prediction/prediction_result_{index}.csv", fields, result_for_csv)

In [58]:
# post_count = 110037
# Exclusive chunk ends: every multiple of 10000 up to 110000, then the total.
index_list = [*range(10000, 110000 + 1, 10000), 110037]

if __name__ == '__main__':
    # One worker process per chunk, with a progress bar.
    parmap.map(prediction_main, index_list, pm_processes=12, pm_pbar=True)

100%|██████████| 12/12 [1:02:33<00:00, 312.77s/it]
