In [62]:
# Make modules that live in ../custom_library importable from this notebook.
# (connect_to_db, imported as `cn` in the import cell, is presumably one of them — confirm.)
import sys
sys.path.append('../custom_library')

In [63]:
import spacy
from nltk.corpus import stopwords 
from nltk.stem.wordnet import WordNetLemmatizer
import string
import numpy as np
import pandas as pd 
import connect_to_db as cn
from gensim import corpora
import gensim
import csv
import parmap

In [64]:
# Silence warnings for the rest of the session.
# NOTE: 'ignore' with no category/module filter hides *every* warning,
# including deprecation notices from the libraries used below.
from warnings import filterwarnings
filterwarnings('ignore')

In [65]:
# spaCy pipeline 'en_core_web_sm'.
# NOTE(review): `nlp` is not referenced by any cell visible here — confirm it is still needed.
nlp = spacy.load('en_core_web_sm')

# English stop words from NLTK, held in a set for fast membership tests.
stop = {*stopwords.words('english')}

# Punctuation characters to strip from the text.
exclude = {*string.punctuation}

# WordNet lemmatizer; clean() calls lemma.lemmatize(word) on every remaining token.
lemma = WordNetLemmatizer()

# One function for all the steps:
def clean(doc):
    """Normalize one document into a space-separated string of lemmatized tokens.

    Pipeline, applied per whitespace-separated token of the lower-cased text:

    1. drop the token if it is in ``stop``;
    2. delete every character that is in ``exclude`` (punctuation);
    3. drop the token if nothing is left, or if what is left is in ``stop``;
    4. lemmatize what remains with ``lemma.lemmatize``.

    Step 3 is the fix over the previous version, which checked stop words only
    before punctuation was stripped: a token such as "it." or "me," was not
    recognised as a stop word and leaked into the corpus as "it" / "me".

    Args:
        doc: Raw text (anything with ``.lower()`` returning a ``str``).

    Returns:
        The surviving tokens joined by single spaces ("" if none survive).
    """
    normalized_tokens = []

    for token in doc.lower().split():
        # First pass: catches stop-list entries that themselves contain
        # punctuation (e.g. contractions, if the stop list has them).
        if token in stop:
            continue

        stripped = ''.join(ch for ch in token if ch not in exclude)

        # Second pass: tokens that only become stop words (or vanish entirely)
        # once the attached punctuation has been removed.
        if not stripped or stripped in stop:
            continue

        normalized_tokens.append(lemma.lemmatize(stripped))

    return " ".join(normalized_tokens)

In [66]:
# Extra, corpus-specific stop words removed on top of the generic stop list.
# NOTE(review): entries such as "donâ€™t" look like mis-decoded curly apostrophes;
# presumably they mirror how the text is stored — confirm before normalising them.
custom_stop_words = [
    "im", "going", "would", "like", "cant", "donâ€™t", "canâ€™t", "iâ€™ve", "iâ€™m",
    "me", "someone", "whatâ€™s", "it", "really", "feel", "live", "like", "fucking",
    "myself", "another", "help", "got", "get", "dont", "want", "anymore", "know",
    "make", "self", "everything", "see", "else", "oh", "there", "thing", "wanna",
    "wouldnâ€™t", "might", "itâ€™s", "didnâ€™t", "yâ€™all", "do", "anyone", "people",
    "ever", "please",
]


def remove_custom_stop_words(word_lists):
    """Strip custom stop words from every document and drop emptied documents.

    A word is removed when it is in ``custom_stop_words``, consists only of
    digits, or is a single character.

    The previous implementation called ``word_lists.remove(...)`` while
    iterating over ``word_lists``; each removal shifted the remaining items so
    the document right after an emptied one was skipped and kept its stop
    words. Filtering first and pruning afterwards avoids that.

    Args:
        word_lists: list of documents, each a list of string tokens.
            Modified in place (both the outer list and the inner lists).

    Returns:
        The same ``word_lists`` object, without stop words and without empty
        documents.
    """
    blocked = set(custom_stop_words)

    for word_list in word_lists:
        # Slice assignment keeps the original list objects, as before.
        word_list[:] = [
            word for word in word_list
            if not (word in blocked or word.isdigit() or len(word) == 1)
        ]

    # Prune only after every document has been filtered.
    word_lists[:] = [word_list for word_list in word_lists if word_list]

    return word_lists

# LDA Topic Modeling

In [76]:
def save_topic_words_and_weights(table_name, community, count, remove_sw):
    """Fit an LDA model on a community's post titles and save the topic words.

    Collects the titles of posts that the community's members commented on
    directly (``c.link_key = c.parent_key``), cleans them with ``clean`` and
    fits ``gensim.models.LdaMulticore``. The top words of each topic are
    written as one CSV row per topic to
    ``../lda/csv/lda_results/{table_name}/posts/{folder}_{num_words}_for_tsne/``.

    Note: despite the name, only the words are written; the per-word weights
    are ``weight`` in the ``show_topics`` pairs if they are needed again.

    Args:
        table_name: Node table to read ``node_id`` from (interpolated into SQL).
        community: Value matched against ``community_id_fastgreedy_is``.
        count: Community size; selects the number of topics
            (>=10000 -> 10, >=1000 -> 5, >=100 -> 4, otherwise 3).
        remove_sw: If true, also apply ``remove_custom_stop_words`` and keep
            10 words per topic instead of 50.

    Returns:
        ``community`` when a CSV was written; ``None`` when fewer than two
        titles were found or the cleaned corpus/dictionary is empty.
    """
    import os  # local import so this cell does not depend on the import cell

    sql = f'select node_id from {table_name} where community_id_fastgreedy_is = {community}'
    result_df = cn.select_query_result_to_df(sql)
    authors = result_df['node_id'].astype(str).tolist()

    titles = []
    for author in authors:
        # NOTE(review): `author` is interpolated straight into the SQL text; a
        # name containing a quote would break the query. Switch to bound
        # parameters if cn.select_query_result_to_df supports them.
        sql2 = f"select distinct p.post_key, p.title from posts p, comments c where p.post_key = c.link_key and c.author = '{author}' and c.link_key = c.parent_key and p.is_valid_author=1;"
        result_df2 = cn.select_query_result_to_df(sql2)
        if not result_df2.empty:
            titles.extend(result_df2['title'].astype(str).tolist())

    if len(titles) < 2:
        return None

    num_words = 50
    folder = 'topic_words'

    # Tokenized, cleaned documents.
    clean_corpus = [clean(title).split() for title in titles]
    # Optionally remove the custom stop words as well.
    if remove_sw:
        clean_corpus = remove_custom_stop_words(clean_corpus)
        num_words = 10
        folder = 'topic_words_stop_words_removed'
    dictionary = corpora.Dictionary(clean_corpus)
    corpus = [dictionary.doc2bow(text) for text in clean_corpus]

    if count >= 10000:
        num_topics = 10
    elif count >= 1000:
        num_topics = 5
    elif count >= 100:
        num_topics = 4
    else:
        num_topics = 3

    # Fix the seed so repeated runs do not produce different topics.
    SOME_FIXED_SEED = 624
    np.random.seed(SOME_FIXED_SEED)

    # Nothing to model if cleaning removed everything.
    if not (corpus and dictionary):
        return None

    print(community, end=' ')
    # random_state binds the seed to the model itself instead of relying only
    # on NumPy's global generator being untouched until the model is built.
    ldamodel = gensim.models.LdaMulticore(corpus, id2word=dictionary, num_topics=num_topics,
                                          passes=10, random_state=SOME_FIXED_SEED)
    topics = ldamodel.show_topics(num_topics=num_topics, num_words=num_words, formatted=False)
    topics_words = [[word for word, _weight in topic_terms] for _topic_id, topic_terms in topics]

    # Create the target directory up front so a finished fit is not lost to a
    # missing folder.
    output_dir = f"../lda/csv/lda_results/{table_name}/posts/{folder}_{num_words}_for_tsne"
    os.makedirs(output_dir, exist_ok=True)
    pd.DataFrame(topics_words).to_csv(f"{output_dir}/community_{community}_topics_{num_words}_words.csv", header=None, index=None)
    return community

In [45]:
# The 105 most populous community_id_fastgreedy_is values in nodes_until_november,
# together with their member counts (largest first).
sql = (
    "select community_id_fastgreedy_is, count(*) "
    "from nodes_until_november "
    "group by community_id_fastgreedy_is "
    "order by count(*) desc limit 105;"
)
result_df = cn.select_query_result_to_df(sql)
communities, counts = (
    list(np.array(result_df[column].values.tolist()))
    for column in ('community_id_fastgreedy_is', 'count(*)')
)

In [40]:
# For every community currently held in `communities`, fit and save the LDA topic
# words twice: with the custom stop words removed (True) and without (False).
# NOTE: relies on `communities` / `counts` from whichever query cell ran last.
for community, count in zip(communities, counts):
    save_topic_words_and_weights('nodes_until_november', community, count, True)
    save_topic_words_and_weights('nodes_until_november', community, count, False)

# For t-SNE

In [77]:
# Every community_id_fastgreedy_is in `nodes` with more than 2 members,
# together with its member count (smallest first).
sql = (
    "select community_id_fastgreedy_is, count(*) "
    "from nodes "
    "group by community_id_fastgreedy_is "
    "having count(*) > 2 "
    "order by count(*) asc;"
)
result_df = cn.select_query_result_to_df(sql)
communities, counts = (
    list(np.array(result_df[column].values.tolist()))
    for column in ('community_id_fastgreedy_is', 'count(*)')
)

In [None]:
# Fit and save topic words (custom stop words removed) for every community in
# `communities`, remembering the ids for which a topic file was actually written.
valid_communities = []
for community, count in zip(communities, counts):
    valid_community = save_topic_words_and_weights('nodes', community, count, True)
    # save_topic_words_and_weights returns None when it had too little text.
    # Use an identity test: the ids may be NumPy scalars, for which `!= None`
    # goes through element-wise comparison machinery.
    if valid_community is not None:
        valid_communities.append(valid_community)

14069 9074 2972 13835 8616 5890 5311 5 5455 25 27 28 1676 38 9124 57 78 5976 6360 109 115 8174 122 123 129 11104 13739 138 7914 152 154 159 13129 164 173 8785 221 222 230 6601 241 2721 3420 2329 249 7869 263 269 2334 270 274 1281 282 9023 287 296 305 8762 6381 314 7084 318 335 8196 9944 11030 349 366 387 388 3502 391 393 394 9205 10308 406 7798 410 881 12417 442 2425 449 470 476 481 5452 483 11902 488 1629 497 499 4199 8766 543 8046 9251 548 12853 550 551 562 563 2390 579 587 598 605 7320 608 901 613 666 10969 652 665 10248 12483 673 688 701 706 708 10040 712 730 968 732 742 752 783 786 6342 813 1938 831 5865 848 6269 9588 856 10630 887 4969 893 895 897 899 904 4871 925 6954 5562 5586 3837 970 1816 978 993 5617 1033 1035 1042 1043 9024 6539 1062 1069 1078 1093 1096 1107 8695 1123 9767 1131 1136 10307 1143 1148 1159 1167 1177 1210 1258 1217 4144 7388 1233 1235 938 1249 5104 1256 1265 1267 1530 1273 1277 1293 10551 1318 1190 3031 1325 7667 1334 8095 1339 1340 4737 5500 1361 1362 1370 658

7847 11826 2332 10005 10027 10033 9435 2553 10044 3280 10076 13402 10103 7037 10743 12388 10157 453 667 10193 6639 3954 10210 12354 280 7159 10225 11580 6892 4668 10247 10255 5347 10263 10264 10657 8381 11213 8672 10925 10327 10332 10342 1587 10352 11054 14007 12553 9904 422 12760 10492 9109 10500 12840 9722 5745 10529 10535 10539 2821 10563 10566 1101 10593 11504 10629 10634 10663 6755 6562 11814 10698 13746 5672 10717 13163 11956 6558 10772 10388 2145 10793 7433 10823 10870 10878 7536 10895 9032 8147 6414 9350 1284 10948 10960 4321 10977 11040 937 11063 1913 6626 3240 11101 11485 11688 5911 11170 6602 9627 11231 3177 11315 11319 11325 7485 11348 11358 7732 11366 2416 11378 8988 9669 5891 12218 9340 11542 8234 11556 1706 1407 11612 11650 11658 11748 11763 12153 1207 11882 9430 11963 11977 11999 9455 12477 757 12740 9861 12179 6141 12250 10131 12346 7367 12360 1199 7510 8542 12380 6050 9034 9758 14184 12846 6418 12505 12532 12561 13657 12661 13444 9453 12863 12951 14230 13009 13929 225

1964 1967 1124 3234 2866 3303 2067 1153 2113 2122 2136 2139 2161 2183 2187 2194 4277 12611 2211 6151 1154 5071 6471 6575 2313 5720 2941 2975 2339 2344 943 6524 2436 5412 2307 2503 2014 2541 3423 11106 3692 191 2635 2645 399 3866 2542 1753 2707 2732 5469 8250 2769 2770 2779 2818 1426 1590 2868 5485 2914 6190 2964 2968 2985 2994 2127 5926 2827 8835 3214 3248 12108 3299 3330 3418 3444 3477 3479 3500 5797 2910 3605 3572 3614 4750 3732 9328 3779 3608 5856 11957 306 3845 3468 3938 3942 4330 4026 7354 3994 3207 7378 7527 2812 1604 4336 2813 1346 4363 4416 4427 10431 8765 4551 4564 7897 2637 7786 5947 344 1663 3898 10666 390 192 4927 4930 4946 4951 2180 4301 2302 4980 12103 4983 4157 5039 5076 954 3543 5031 5142 5889 5157 5182 696 2508 5314 5329 205 3254 7654 1767 2536 8048 6326 5550 629 5754 2141 4497 341 5823 3193 5925 5154 1790 2278 1678 6000 572 6067 998 8644 8626 3719 8255 6329 1978 8824 10942 6376 6739 7518 4695 6569 6592 6683 8996 6133 6880 5451 411 2640 10753 11332 3814 4698 2281 6526 

In [6]:
# The 105 most populous community_id_fastgreedy_is values in `nodes`,
# together with their member counts (largest first).
sql = (
    "select community_id_fastgreedy_is, count(*) "
    "from nodes "
    "group by community_id_fastgreedy_is "
    "order by count(*) desc limit 105;"
)
result_df = cn.select_query_result_to_df(sql)
communities, counts = (
    list(np.array(result_df[column].values.tolist()))
    for column in ('community_id_fastgreedy_is', 'count(*)')
)

In [7]:
# Fit and save topic words (custom stop words removed) for every community
# currently held in `communities`, reading members from the `nodes` table.
# NOTE: relies on `communities` / `counts` from whichever query cell ran last.
for community, count in zip(communities, counts):
    save_topic_words_and_weights('nodes', community, count, True)
In [7]:
def calculate_cosine_similarity(numerator, topic_words, clean_corpus, largest_cosine_similarity, community, most_related_community):
    """Score one topic against the sentence and keep the running best match.

    The score is ``numerator / (sqrt(len(topic_words)) * sqrt(len(clean_corpus)))``.

    Args:
        numerator: Accumulated overlap between the sentence and the topic.
        topic_words: Words of the topic being scored.
        clean_corpus: Cleaned tokens of the sentence.
        largest_cosine_similarity: Best score seen so far.
        community: Community the topic belongs to.
        most_related_community: Community that produced the best score so far.

    Returns:
        ``(best_score, best_community)``. The pair is replaced only when the
        new score is strictly greater than the previous best, so ties keep the
        earlier community.
    """
    # An empty topic row (e.g. a blank CSV line) or an empty sentence would
    # make the denominator zero; such a pair cannot improve the best match.
    if len(topic_words) == 0 or len(clean_corpus) == 0:
        return largest_cosine_similarity, most_related_community

    cosine_similarity = numerator / ((len(topic_words) ** 0.5) * (len(clean_corpus) ** 0.5))
    if cosine_similarity > largest_cosine_similarity:
        return cosine_similarity, community

    return largest_cosine_similarity, most_related_community

# Community Matching by LDA

In [13]:
def calculate_similarity_with_the_community(clean_corpus, community, remove_sw, option, largest_cosine_similarity, most_related_community):
    """Score a sentence against every saved topic of one community.

    Topic words are read from the CSV written for ``community`` (the
    stop-words-removed, 40-word variant when ``remove_sw`` is true, otherwise
    the 50-word variant). Each sentence word that occurs in a topic adds to
    that topic's numerator:

    * ``option == "count"`` adds 1;
    * ``option == "rank"`` adds ``1 / (position + 1)``, where ``position`` is
      the word's index in the topic row (rows are stored by descending weight);
    * any other value (intended: ``"weight"``) adds the word's stored weight,
      read from the matching ``*_weights.csv`` file.

    Every topic is then passed to ``calculate_cosine_similarity`` to update the
    running best.

    Args:
        clean_corpus: Cleaned tokens of the sentence.
        community: Community whose topic files are read.
        remove_sw: Selects which file variant to read (see above).
        option: Weighting scheme (see above).
        largest_cosine_similarity: Best score seen so far.
        most_related_community: Community that produced that score.

    Returns:
        The updated ``(largest_cosine_similarity, most_related_community)``.
    """
    def read_rows(path):
        # One list of strings per CSV row.
        with open(path, newline='') as handle:
            return list(csv.reader(handle))

    root = "../lda/csv/lda_results/nodes_until_november/comments"
    if remove_sw:
        variant, size = "topic_words_stop_words_removed", 40
    else:
        variant, size = "topic_words", 50

    topics = read_rows(f"{root}/{variant}_{size}/community_{community}_topics_{size}_words.csv")

    if option == "count":
        # Unweighted: every hit is worth 1.
        scored_rows = [(topic_words, None) for topic_words in topics]

        def gain(position, weights):
            return 1
    elif option == "rank":
        # Reciprocal rank: the 2nd word of a topic is worth 1 / (1 + 1) = 1/2.
        scored_rows = [(topic_words, None) for topic_words in topics]

        def gain(position, weights):
            return 1 / (position + 1)
    else:
        # Stored LDA weight of the word; the weights file mirrors the words file.
        topics_weights = read_rows(f"{root}/{variant}_weights_{size}/community_{community}_topics_{size}_weights.csv")
        scored_rows = zip(topics, topics_weights)

        def gain(position, weights):
            return float(weights[position])

    for topic_words, weights in scored_rows:
        numerator = 0
        for word in clean_corpus:
            if word in topic_words:
                numerator += gain(topic_words.index(word), weights)
        largest_cosine_similarity, most_related_community = calculate_cosine_similarity(
            numerator, topic_words, clean_corpus,
            largest_cosine_similarity, community, most_related_community)

    return largest_cosine_similarity, most_related_community

In [11]:
def community_matching(sentence, remove_sw, option, candidate_communities=None):
    """Return the community whose saved topics best match ``sentence``.

    Args:
        sentence: Raw text to classify; it is cleaned with ``clean`` first.
        remove_sw: Forwarded to ``calculate_similarity_with_the_community``
            (selects which topic files are read).
        option: Weighting scheme, forwarded as well ("count", "rank" or
            anything else for stored weights).
        candidate_communities: Communities to compare against. Defaults to the
            notebook-level ``communities`` list, which keeps the old behaviour.
            Passing it explicitly avoids depending on which of the query cells
            that assign ``communities`` happened to run last.

    Returns:
        ``-1`` when nothing is left of the sentence after cleaning; otherwise
        the best-scoring community, or ``0`` when no topic scored above 0.
        NOTE(review): that ``0`` cannot be told apart from a real community id
        of 0, if one exists — confirm.
    """
    clean_corpus = clean(sentence).split()

    if len(clean_corpus) < 1:
        return -1

    if candidate_communities is None:
        candidate_communities = communities

    largest_cosine_similarity = 0
    most_related_community = 0

    for community in candidate_communities:
        largest_cosine_similarity, most_related_community = calculate_similarity_with_the_community(
            clean_corpus, community, remove_sw, option,
            largest_cosine_similarity, most_related_community)

    return most_related_community

In [10]:
# Smoke test: match one sentence under every (remove_sw, option) combination.
sentence = "My family is at my house and Iâ€™m locked in Jeff room crying."

for remove_sw in (True, False):
    for option in ('count', 'rank', 'weight'):
        print(community_matching(sentence, remove_sw, option))

2
1522
140
1039
1039
140


# Result Evaluation

In [11]:
def jaccard_coefficient_between_prediction_and_answer(post_key, predicted_community):
    """Jaccard index between a post's actual commenters and a predicted community.

    The "answer" set is the authors of valid comments made directly on the post
    (``link_key = parent_key``); the "prediction" set is the ``node_id`` values
    of ``predicted_community`` in ``nodes_until_november``.

    Both sides are treated as sets. Previously an author with several
    top-level comments on the post was counted once per comment, which
    inflated both the intersection and the union.

    Args:
        post_key: Key of the post (interpolated into SQL).
        predicted_community: Community id predicted for the post.

    Returns:
        ``|A ∩ P| / |A ∪ P|`` as a float, or ``-1`` when the post has no
        matching comments.
    """
    # NOTE(review): values are interpolated straight into the SQL text; switch
    # to bound parameters if cn.select_query_result_to_df supports them.
    sql = f"select author from comments where link_key = '{post_key}' and is_valid=1 and is_valid_author=1 and link_key = parent_key;"
    result_df = cn.select_query_result_to_df(sql)
    if result_df.empty:
        return -1

    sql2 = f"select node_id from nodes_until_november where community_id_fastgreedy_is = {predicted_community};"
    result_df2 = cn.select_query_result_to_df(sql2)

    authors = set(result_df['author'].tolist())
    predicted_authors = set(result_df2['node_id'].tolist())

    # `authors` is non-empty here, so the union is never empty.
    shared = authors & predicted_authors
    combined = authors | predicted_authors

    return len(shared) / len(combined)

In [14]:
def prediction_main(index, folder, remove_sw, option, chunk_size=10000):
    """Predict communities for one chunk of December posts and write them to CSV.

    The chunk is the slice of posts ending at ``index`` (exclusive) and
    starting at the previous multiple of ``chunk_size``; e.g. ``index=20000``
    covers rows 10000..19999 and ``index=110037`` covers rows 110000..110036.

    Args:
        index: Exclusive end position of the chunk; also used in the file name.
        folder: Sub-folder of ``../lda/csv/prediction_results`` to write into.
        remove_sw: Forwarded to ``community_matching``.
        option: Forwarded to ``community_matching``.
        chunk_size: Number of posts per chunk (default 10000, the value that
            was previously hard-coded).

    Writes:
        ``prediction_result_{index}.csv`` with the columns ``post_key``,
        ``predicted_community`` and ``jaccard_coefficient``. Posts whose title
        is empty after cleaning get community ``-1`` and coefficient ``-1``.
    """
    # ORDER BY makes row positions stable: every worker re-runs this query and
    # slices it by position, so an unspecified row order could make chunks
    # overlap or miss posts.
    sql = "select post_key, title from posts where is_valid_author=1 and MONTH(created_utc) = 12 order by post_key;"
    result_df = cn.select_query_result_to_df(sql)
    post_keys = result_df['post_key'].tolist()
    titles = result_df['title'].tolist()

    # Previous multiple of chunk_size below `index`; clamp both ends so a
    # stale index list cannot run past (or wrap around) the data.
    start_index = max(((index - 1) // chunk_size) * chunk_size, 0)
    end_index = min(index, len(titles))

    result_for_csv = []
    for i in range(start_index, end_index):
        community = community_matching(titles[i], remove_sw=remove_sw, option=option)
        if community != -1:
            jaccard_coefficient = jaccard_coefficient_between_prediction_and_answer(post_keys[i], community)
        else:
            # No prediction was possible. Previously this path reused the value
            # from the preceding post, or raised UnboundLocalError on the first one.
            jaccard_coefficient = -1
        result_for_csv.append([post_keys[i], community, format(float(jaccard_coefficient), '.10f')])

    filename = f"../lda/csv/prediction_results/{folder}/prediction_result_{index}.csv"
    fields = ['post_key', 'predicted_community', 'jaccard_coefficient']
    cn.write_csv_for_db_update(filename, fields, result_for_csv)

In [18]:
# post_count = 110037
# Exclusive end index of each chunk: every 10,000 posts up to 110,000,
# plus the final partial chunk ending at post_count.
index_list = [*range(10000, 110001, 10000), 110037]

if __name__ == '__main__':
    # One prediction_main call per chunk, spread over 12 worker processes.
    parmap.map(
        prediction_main,
        index_list,
        folder='prediction_weight',
        remove_sw=False,
        option='weight',
        pm_pbar=True,
        pm_processes=12,
    )

100%|██████████| 12/12 [42:23<00:00, 211.97s/it]  
