In [9]:
import re
import spacy
from nltk.corpus import stopwords 
from nltk.stem.wordnet import WordNetLemmatizer
import string
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
import numpy as np
from sklearn.decomposition import LatentDirichletAllocation
import pandas as pd
import sys  
import connect_to_db as cn
from gensim import corpora
import gensim
import csv

# save model
from gensim.test.utils import datapath

In [2]:
# Install a blanket 'ignore' filter so that no Python warning is shown from here on.
# NOTE(review): this hides deprecation warnings from every library as well; consider
# narrowing the filter to the specific category/module that is noisy.
from warnings import filterwarnings
filterwarnings('ignore')

In [3]:
# spaCy English pipeline (not referenced by the other cells visible in this notebook).
nlp = spacy.load('en_core_web_sm')

# English stop words from NLTK, kept in a set for fast membership tests.
stop = set(stopwords.words('english'))

# Punctuation characters (string.punctuation) that clean() strips out.
exclude = set(string.punctuation) 

# WordNet lemmatizer used by clean() to reduce words to their lemma.
# NOTE(review): clean() calls lemmatize() without a POS tag; NLTK presumably treats every
# token as a noun in that case, so verb forms (am/are/is -> be) may not be reduced — confirm.
lemma = WordNetLemmatizer() 

# One function for all the steps:
def clean(doc):
    """Normalize one text for topic modelling.

    The steps run in this order: lower-case and split on whitespace, drop tokens
    found in the module-level ``stop`` set, delete every character found in the
    module-level ``exclude`` set, then lemmatize each remaining token with the
    module-level ``lemma`` object.

    Args:
        doc: The raw text (a string).

    Returns:
        A single string with the lemmatized tokens separated by one space.
    """
    # Step 1: lower-case, tokenize on whitespace and discard stop words.
    # Stop words are matched before punctuation is removed, so a token such as
    # "the," is not recognised as a stop word here.
    kept_tokens = [token for token in doc.lower().split() if token not in stop]
    without_stopwords = " ".join(kept_tokens)

    # Step 2: remove punctuation one character at a time.
    kept_chars = [char for char in without_stopwords if char not in exclude]
    without_punctuation = "".join(kept_chars)

    # Step 3: lemmatize every remaining token and glue the result back together.
    lemmas = [lemma.lemmatize(token) for token in without_punctuation.split()]
    return " ".join(lemmas)

In [24]:
# Fetch the 105 most populous `community_id_fastgreedy_is` values together with their row counts.
sql = "select community_id_fastgreedy_is, count(*) from nodes_until_november group by community_id_fastgreedy_is order by count(*) desc limit 105;"
result_df = cn.select_query_result_to_df(sql)

# Series.tolist() already returns plain Python scalars, so the previous
# list -> ndarray -> list round trip (which left NumPy scalars in the list) is not needed.
communities = result_df['community_id_fastgreedy_is'].tolist()
counts = result_df['count(*)'].tolist()

print(communities)
print(counts)

[1, 2, 7, 3, 6, 12, 20, 17, 30, 43, 42, 21, 68, 1039, 45, 114, 450, 163, 384, 922, 489, 869, 755, 544, 1522, 2051, 827, 340, 951, 338, 23, 36, 1186, 1006, 26, 880, 980, 423, 396, 105, 2919, 108, 4, 224, 763, 685, 326, 104, 34, 258, 168, 0, 1679, 1240, 281, 836, 787, 254, 14, 86, 531, 2179, 1615, 1630, 1089, 51, 964, 297, 798, 1712, 271, 1242, 1129, 76, 1343, 1865, 140, 3381, 2010, 313, 493, 300, 3499, 744, 825, 3769, 1244, 727, 1502, 2262, 900, 604, 746, 217, 1623, 446, 1762, 3700, 1564, 2892, 991, 321, 2050, 736, 1948]
[191366, 154802, 111081, 99825, 13826, 8658, 1534, 1461, 1206, 1133, 1050, 732, 194, 141, 140, 137, 126, 118, 105, 101, 91, 88, 74, 74, 70, 62, 61, 61, 60, 58, 56, 55, 55, 54, 54, 54, 54, 54, 53, 52, 49, 49, 49, 48, 48, 47, 47, 46, 46, 45, 43, 42, 42, 41, 41, 40, 40, 40, 39, 39, 38, 38, 37, 37, 36, 36, 36, 35, 35, 35, 35, 35, 34, 34, 34, 33, 33, 33, 33, 32, 32, 32, 32, 32, 32, 31, 31, 31, 31, 31, 30, 30, 29, 29, 29, 29, 28, 28, 28, 27, 27, 26, 26, 26, 26]


In [159]:
def save_topic_words_csv(table_name, community, count, output_path="lda_test2.csv", random_state=None):
    """Fit an LDA model on the post titles linked to one community and save the topic words.

    For every ``node_id`` of the community, the titles of posts are collected for
    which that author has a comment whose ``parent_key`` equals its ``link_key``
    (presumably a top-level comment — confirm against the schema), restricted to
    posts with ``is_valid_author = 1`` that were not created in December. The titles
    are passed through ``clean``, turned into a bag-of-words corpus and used to fit
    a gensim ``LdaMulticore`` model. The 50 highest-weighted words of each topic
    are written to a header-less CSV file, one topic per row.

    Args:
        table_name: Name of the node table that is queried for the community members.
        community: Value of ``community_id_fastgreedy_is`` that selects the community.
        count: Size of the community; only used to choose the number of topics
            (>= 10000 -> 10, >= 1000 -> 5, >= 100 -> 4, otherwise 3).
        output_path: Destination of the CSV file. Defaults to the file name that
            was previously hard-coded.
        random_state: Optional seed forwarded to gensim so that a fit can be
            reproduced. ``None`` keeps gensim's default, unseeded behaviour.

    Raises:
        ValueError: If no token survives cleaning, i.e. there is nothing to fit a
            model on.
    """
    # NOTE(review): both queries are assembled by string interpolation. That is only safe
    # while table_name, community and the author names come from trusted data; switch to
    # bound parameters if the project's query helper supports them.
    sql = f'select node_id from {table_name} where community_id_fastgreedy_is = {community}'
    result_df = cn.select_query_result_to_df(sql)
    authors = result_df['node_id'].astype(str).tolist()

    # One query per author: a title that several members commented on is collected once
    # per member, so it appears several times in the training corpus.
    titles = []
    for author in authors:
        sql2 = f"select distinct p.post_key, p.title from posts p, comments c where p.post_key = c.link_key and c.author = '{author}' and c.link_key = c.parent_key and p.is_valid_author=1 and MONTH(p.created_utc) <> 12;"
        result_df2 = cn.select_query_result_to_df(sql2)
        if not result_df2.empty:
            titles.extend(result_df2['title'].astype(str).tolist())

    # Tokenize the cleaned titles and build the bag-of-words corpus.
    tokenized_titles = [clean(title).split() for title in titles]
    dictionary = corpora.Dictionary(tokenized_titles)
    if len(dictionary) == 0:
        # Fail early with context instead of letting gensim raise from inside the fit.
        raise ValueError(
            f"no usable title tokens found for community {community} in {table_name}"
        )
    bow_corpus = [dictionary.doc2bow(tokens) for tokens in tokenized_titles]

    # Larger communities get more topics.
    if count >= 10000:
        num_topics = 10
    elif count >= 1000:
        num_topics = 5
    elif count >= 100:
        num_topics = 4
    else:
        num_topics = 3

    ldamodel = gensim.models.LdaMulticore(
        bow_corpus,
        id2word=dictionary,
        num_topics=num_topics,
        passes=10,
        random_state=random_state,
    )
    topics = ldamodel.show_topics(num_topics=num_topics, num_words=50, formatted=False)

    # show_topics(formatted=False) yields (topic_id, [(word, weight), ...]); keep the words only.
    topics_words = [[word for word, _weight in word_weights] for _topic_id, word_weights in topics]

    pd.DataFrame(topics_words).to_csv(output_path, header=False, index=False)

In [161]:
# Run the pipeline for community 736. The third argument only selects the number of
# topics; any value below 100 results in 3 topics.
save_topic_words_csv('nodes_until_november', 736, 10)

In [162]:
# Load the saved topic words back: one list of words per CSV row.
# pandas writes CSV files as UTF-8 by default, so decode explicitly instead of relying on
# the platform's locale encoding, which can garble non-ASCII characters such as curly
# apostrophes.
with open('lda_test2.csv', newline='', encoding='utf-8') as f:
    reader = csv.reader(f)
    data = list(reader)

In [164]:
# Dump the parsed rows (a list of word lists, one per CSV line) for a visual check.
print(data)

[['like', 'feel', 'suicidal', 'mother', 'tell', 'birthday', 'exist', 'others', 'day', 'im', 'chat', 'kinda', 'feeling', 'anyone', 'interested', 'shit', 'put', 'brain', 'able', 'coward', 'bullet', 'selfloathing', 'anything', 'wanting', 'reason', 'impossible', 'think', 'watching', 'make', 'gore', 'obsession', 'wild', 'job', 'hunting', 'member', 'advice', 'family', 'hard', 'terrified', 'sad', 'handle', 'completely', 'edge', 'trazodone', 'growing', 'insomnia', 'matter', 'need', 'nothing', 'klonopin'], ['like', 'animal', 'found', 'work', 'kill', 'parent', 'want', 'people', 'life', 'get', 'stuffed', 'emotional', 'cant', 'support', 'one', 'dead', 'woke', 'morning', 'mom', 'look', 'happiness', 'slip', 'facade', 'someoneâ€™s', 'almost', 'encouraging', 'nobody', 'dark', 'tothings', 'way', 'crush', 'cute', 'memory', 'could', 'start', 'issue', 'list', 'long', 'driving', 'crazy', 'living', 'over', 'keep', 'canâ€™t', 'left', '16', 'let', 'wish', 'klonopin', 'day'], ['talk', 'need', 'kill', 'future',

In [168]:
# Score a sample sentence against every topic row loaded into `data`.
sentence = "shit brain talk lucky"
clean_corpus = clean(sentence).split()

# Best match found so far. `most_related_community` holds the index of the winning row in
# `data` and stays at 0 when no row scores above zero.
# NOTE(review): the rows of `data` are the topics of a single community here; the variable
# name suggests the final pipeline compares one row per community — confirm.
most_related_community = 0
largest_cosine_similarity = 0

for topic_index, topic in enumerate(data):
    # Set membership keeps the lookup cheap; the score itself is computed as before.
    topic_words = set(topic)
    count = sum(1 for word in clean_corpus if word in topic_words)

    # Overlap normalised by sqrt(len(topic)) * sqrt(len(sentence tokens)).
    # NOTE(review): repeated words in the sentence are counted once per occurrence, so this
    # equals the binary cosine similarity only when the sentence has no duplicate tokens.
    denominator = (len(topic) ** 0.5) * (len(clean_corpus) ** 0.5)

    # An empty sentence (e.g. only stop words) or an empty row would divide by zero.
    similarity = count / denominator if denominator else 0.0
    print(similarity)

    if similarity > largest_cosine_similarity:
        largest_cosine_similarity = similarity
        most_related_community = topic_index

0.1414213562373095
0.0
0.1414213562373095
