# LDA 분석
### LDA 분석을 위한 main
- - -

## init

In [1]:
import os, subprocess, json, pickle
import pandas as pd
import numpy as np

# for DB connection
from urllib import parse
import sqlalchemy

# local
import lda_utils

# lda용 임시 import
from sklearn.decomposition import LatentDirichletAllocation
from sklearn.model_selection import GridSearchCV, StratifiedKFold, LeaveOneOut, ShuffleSplit

In [2]:
# Load the database credentials; establish_conn() reads the 'user', 'password',
# 'ip', 'port' and 'database' entries from this mapping.
# The encoding is pinned so decoding does not depend on the platform locale.
with open('./../.API_KEY_/db_key.json', 'r', encoding='utf-8') as f:
    keys = json.load(f)

def log(msg, flag=None):
    """Append one timestamped line to ``debug.log`` in the working directory.

    The line has the form ``[HH:MM:SS][<label>] > <msg>``. The file is
    created on first use and appended to afterwards. The message is written
    verbatim; it is never passed through a shell, so quotes, ``$`` and
    backticks in ``msg`` are recorded as-is.

    Args:
        msg: Text to record.
        flag: Index of the severity label: 0 "debug" (also used when
            ``None``), 1 "error", 2 "status".

    Raises:
        IndexError: If ``flag`` is outside the label list.
        OSError: If ``debug.log`` cannot be opened or written.
    """
    from time import gmtime, strftime

    head = ["debug", "error", "status"]
    if flag is None:
        flag = 0
    # gmtime() means the timestamp is UTC, not local time.
    now = strftime("%H:%M:%S", gmtime())
    # Mode "a" creates the file when missing and appends otherwise.
    with open("./debug.log", "a", encoding="utf-8") as log_file:
        log_file.write(f"[{now}][{head[flag]}] > {msg}\n")


def retrieve_df(table, limit=1000):
    """Read rows of ``english_news_<table>`` into a DataFrame.

    Args:
        table: Suffix appended to ``english_news_`` to form the table name.
        limit: Row cap placed in the ``limit`` clause; ``-1`` selects every row.

    Returns:
        Whatever ``pd.read_sql_query`` returns for the composed statement.
    """
    conn_engine = establish_conn()
    # NOTE(review): table and limit are interpolated straight into the SQL
    # text, so only trusted values should be passed in.
    if limit != -1:
        return pd.read_sql_query(f"select * from english_news_{table} limit {limit}", con=conn_engine)
    return pd.read_sql_query(f"select * from english_news_{table}", con=conn_engine)
    

def establish_conn()->sqlalchemy.Engine:
    """Build a SQLAlchemy engine from the credentials held in ``keys``.

    The password is percent-encoded with ``quote_plus`` before it is placed
    in the connection URL; the other fields are inserted unchanged.

    Returns:
        The engine produced by ``sqlalchemy.create_engine``.
    """
    # Read every field first (same order as the URL), then encode the password.
    user, password, host, port, database = (
        keys[field] for field in ('user', 'password', 'ip', 'port', 'database')
    )
    password = parse.quote_plus(password)
    return sqlalchemy.create_engine(
        f"mysql://{user}:{password}@{host}:{port}/{database}?charset=utf8mb4"
    )


def documents_generator(processed_df: pd.DataFrame, col:str):
    """Yield the non-empty documents stored in one column of a DataFrame.

    Each value of ``processed_df[col]`` is yielded whole. Values that are
    missing (``None``, NaN, ``pd.NA``) or otherwise falsy (e.g. ``""``) are
    skipped, and an error-level entry naming the row index is logged instead.

    Args:
        processed_df: Frame holding the documents.
        col: Name of the column to read documents from.

    Yields:
        The column values that pass the emptiness check, in row order.

    Raises:
        KeyError: On first iteration if ``col`` is not a column of the frame.
    """
    log("Generating documents from dataframe...")
    log("Iteration init")
    # Walk the single column rather than iterrows(), which materialises a
    # Series for every row only to read one cell from it.
    for idx, doc in processed_df[col].items():
        # NaN is truthy and pd.NA raises on bool(), so a plain truthiness
        # test would leak missing values; check for them explicitly first.
        is_missing = pd.api.types.is_scalar(doc) and pd.isna(doc)
        if is_missing or not doc:
            log(f"null context found in {idx}!", 1)
            continue
        yield doc  # Yield entire document

- - -

In [3]:
# Load the pickled clustering result into clustering_result_df.
# NOTE: unpickling can execute arbitrary code — only load pickles from a trusted source.
result_pickle_path = './../data_/result_df_15k.pkl'
with open(result_pickle_path, 'rb') as pkl_file:
    clustering_result_df = pickle.load(pkl_file)

In [4]:
# Show the loaded frame as the cell output (columns: docKey, tokens, cluster_num).
clustering_result_df

Unnamed: 0,docKey,tokens,cluster_num
0,15267299181868174010,japan commits deploying satellite navigation s...,1
1,3857982117027626046,russia medvedev backs putin another presidenti...,2
2,4680005275022517970,jackson film absolute disaster pr expert told ...,2
3,2154078379147666248,uk announces deal us change diplomatic immunit...,2
4,13473316220249598904,goddard moving early next discovery competitio...,1
...,...,...,...
14995,10664914161492425434,federer hails wonderful friend ljubicic ahead ...,4
14996,15402084940834656814,kevin porter jr announcer apologizes mistaken ...,6
14997,10060053740378811890,hacking inquiry puts british minister spotligh...,2
14998,15078707064281132597,wimbledon two thousand fifteen djokovic beats ...,4


In [5]:
# Split the frame into one independent copy per distinct cluster_num.
# NOTE(review): unique() keeps order of first appearance, so df_list[i] is the
# i-th label encountered, not necessarily the label with value i + 1.
cluster_column = clustering_result_df['cluster_num']
df_list = [
    clustering_result_df.loc[cluster_column == label].copy()
    for label in cluster_column.unique().tolist()
]

In [8]:
# Number of per-cluster frames that were built.
len(df_list)

6

- - -

In [56]:
# Select the sixth per-cluster frame as the input for the cells below.
# NOTE(review): later cells record results under label 6 — confirm that
# df_list[5] really holds cluster_num 6.
target_df = df_list[5]

In [57]:
# Stream the "tokens" column of the selected cluster into the tf-idf helper
# and unpack its four results; progress is written to debug.log around the call.
log(f"tfidf calculation init")
token_docs = documents_generator(target_df, "tokens")
tfidf_outputs = lda_utils.get_tfidf_for_lda(token_docs)
tfidf_matrix, vocabulary, voca_feature_names, vectorizer_ = tfidf_outputs
log(f"tfidf calculation done")

In [58]:
# Extract the top n words ranked by tf-idf.
top_n = 10

# Disabled float32 downcast of the matrix, kept for reference:
#tfidf_matrix = tfidf_matrix.astype(np.float32)
# The printed result below is a list of (word, score) pairs.
top_tfidf_words = lda_utils.get_top_tfidf_words(tfidf_matrix, vocabulary, top_n)
log(f"success get top {top_n} of tfidf words")

In [59]:
# Hand the top tf-idf words to lda_utils for recording.
# NOTE(review): 6 is presumably the cluster label — confirm it matches target_df.
lda_utils.record_tfidf_top_words_by_cluster(top_tfidf_words, 6)

In [60]:
# Print the (word, score) pairs selected above.
print(top_tfidf_words)

[('gamepr', 32.97940095174935), ('confined', 25.599542336603186), ('aging', 24.62500044163629), ('heartbreaker', 21.508645718345313), ('infrastructure', 20.858250064443936), ('riet', 20.699619638749823), ('happen', 18.05602739149836), ('nineread', 17.97469778517522), ('starscar', 15.88126280103921), ('prodigieshistory', 14.42247452175076)]


In [61]:
# Hyperparameters for this LDA run, gathered in one place so they are easy to adjust.
lda_params = dict(
    n_components=10,
    learning_method='online',
    learning_decay=0.85,
    doc_topic_prior=0.01,
    topic_word_prior=0.1,
    random_state=42,
    max_iter=30,
)
lda_model = LatentDirichletAllocation(**lda_params)

# Fit on the tf-idf matrix; left as the last expression so the notebook
# still shows the fitted estimator as the cell output.
lda_model.fit(tfidf_matrix)

In [62]:
# Inspect the fitted components_ array and its shape
# (the output below shows 10 rows by 15090 columns).
print(lda_model.components_)
print(lda_model.components_.shape) 

[[0.11841277 0.11716549 0.11698168 ... 0.11741632 0.11915533 0.11847232]
 [0.11911145 0.11714199 0.11896424 ... 0.11336292 0.11561256 0.11832828]
 [0.11814574 0.11846165 0.1178748  ... 0.11992867 0.11825296 0.11738376]
 ...
 [0.11802964 0.11690773 0.11860926 ... 0.11686482 0.11832287 0.11811142]
 [0.11984667 0.11758827 0.11751714 ... 0.11819617 0.11666993 0.11549668]
 [0.12212441 0.11192723 0.11445573 ... 0.11629407 0.11455553 0.11689493]]
(10, 15090)


In [64]:
# Print the leading terms of every topic; the output below lists 5
# (term, weight) pairs per topic.
lda_utils.print_top_topics(lda_model.components_, voca_feature_names, 5)

Topic 1: [('ko', 0.13), ('golf', 0.13), ('woods', 0.13), ('two', 0.13), ('onewhile', 0.13)]
Topic 2: [('accident', 0.13), ('watchedwill', 0.13), ('doubly', 0.13), ('leadbetter', 0.13), ('bag', 0.13)]
Topic 3: [('mcilroy', 0.13), ('two', 0.13), ('world', 0.13), ('photos', 0.13), ('rejoining', 0.13)]
Topic 4: [('two', 0.14), ('thousand', 0.13), ('willett', 0.13), ('golf', 0.13), ('woods', 0.13)]
Topic 5: [('two', 32.34), ('one', 25.04), ('woods', 24.3), ('thousand', 21.12), ('golf', 20.42)]
Topic 6: [('tour', 0.13), ('one', 0.13), ('price', 0.13), ('haas', 0.13), ('interior', 0.13)]
Topic 7: [('dechambeau', 0.14), ('one', 0.14), ('pnc', 0.13), ('two', 0.13), ('woods', 0.13)]
Topic 8: [('two', 0.13), ('one', 0.13), ('dufner', 0.13), ('photos', 0.13), ('woods', 0.13)]
Topic 9: [('photos', 0.14), ('one', 0.14), ('two', 0.14), ('hide', 0.14), ('caption', 0.14)]
Topic 10: [('varner', 0.13), ('lootah', 0.13), ('phenomenal', 0.13), ('tie', 0.13), ('seasoned', 0.13)]


In [65]:
# Hand the topic components to lda_utils for recording under label_name=6.
# NOTE(review): presumably the cluster label — confirm it matches target_df.
lda_utils.record_top_topics_by_cluster(lda_model.components_, voca_feature_names, 5, label_name=6)

In [18]:
import pyLDAvis.lda_model

# Build the pyLDAvis panel for the fitted model (t-SNE layout) and render it
# inline in the notebook.
pyLDAvis.enable_notebook()
panel = pyLDAvis.lda_model.prepare(lda_model, tfidf_matrix, vectorizer_, mds='tsne')
pyLDAvis.display(panel)

In [None]:
# Export the prepared panel to lda_visualization.html in the working directory.
pyLDAvis.save_html(panel, 'lda_visualization.html')