# 계층적 클러스터링 결과 기반 각 군집에 대한 문서 토픽 모델링
### Target Data:
- Clustering label값이 붙은 문서들

### 토픽 모델링 접근 기법
- LDA
- TF-IDF 상위 순위

In [1]:
import os, subprocess, json
import pandas as pd
import numpy as np

# for DB connection
from urllib import parse
import sqlalchemy

# local
from calculate_tfidf import get_top_tfidf_words, get_tfidf_for_lda

In [8]:
# Load the database connection settings (read later by establish_conn) from the JSON key file.
with open('./../.API_KEY_/db_key.json', 'r') as key_file:
    keys = json.load(key_file)

def log(msg, flag=None):
    """Append one timestamped line to ``./debug.log``.

    The line has the form ``[HH:MM:SS][level] > msg`` where the time is UTC.
    The file is created on first use and appended to afterwards.

    Args:
        msg: Text to record; it is written verbatim after the prefix.
        flag: Index into ``("debug", "error", "status")``. ``None`` selects
            index 0 (``"debug"``).

    Raises:
        IndexError: If ``flag`` is not a valid index into the level names.
        OSError: If the log file cannot be opened or written.
    """
    from time import gmtime, strftime

    levels = ("debug", "error", "status")
    if flag is None:
        flag = 0
    stamp = strftime("%H:%M:%S", gmtime())
    # Write the file directly rather than running `echo` through a shell:
    # quotes, `$` and backticks in msg are then stored literally instead of
    # being interpreted, and failures surface as real exceptions rather than
    # an `assert` that disappears under `python -O`.
    # Mode "a" creates the file when it is missing, so no existence check is needed.
    with open("./debug.log", "a", encoding="utf-8") as log_file:
        log_file.write(f"[{stamp}][{levels[flag]}] > {msg}\n")


def retrieve_df(table, limit=1000):
    """Load rows of the ``english_news_<table>`` table into a DataFrame.

    Args:
        table: Suffix appended to ``english_news_`` to form the table name.
        limit: Maximum number of rows to fetch; ``-1`` fetches every row.

    Returns:
        The result of ``pd.read_sql_query`` for the generated SELECT statement.
    """
    # NOTE: table and limit are interpolated into the SQL text as-is, so they
    # must come from trusted code, not from user input.
    query = f"select * from english_news_{table}"
    if limit != -1:
        query = f"{query} limit {limit}"
    return pd.read_sql_query(query, con=establish_conn())
    

def establish_conn()->sqlalchemy.Engine:
    """Build a SQLAlchemy engine from the module-level ``keys`` mapping.

    Reads the ``user``, ``password``, ``ip``, ``port`` and ``database`` entries
    of ``keys`` and assembles a ``mysql://`` URL with ``charset=utf8mb4``.

    Returns:
        The engine returned by ``sqlalchemy.create_engine``.
    """
    user, raw_password, host, port, database = (
        keys[name] for name in ('user', 'password', 'ip', 'port', 'database')
    )
    # Percent-encode the password so reserved URL characters in it cannot
    # break the connection string.
    safe_password = parse.quote_plus(raw_password)
    url = f"mysql://{user}:{safe_password}@{host}:{port}/{database}?charset=utf8mb4"
    return sqlalchemy.create_engine(url)


def documents_generator(processed_df: pd.DataFrame, col:str):
    """Yield the truthy values of column ``col``, one row at a time.

    Because this is a generator, nothing (including the two start-up log
    lines) runs until the first item is requested.

    Args:
        processed_df: DataFrame to iterate over.
        col: Name of the column holding the documents.

    Yields:
        The value of ``col`` for every row where it is truthy. Rows with a
        falsy value (for example an empty string or ``None``) are skipped and
        reported through ``log`` with the error flag.
    """
    log("Generating documents from dataframe...")
    log("Iteration init")
    for row_label, record in processed_df.iterrows():
        document = record[col]
        if not document:
            log(f"null context found in {row_label}!", 1)
            continue
        yield document  # the whole document, not individual tokens

In [16]:
def indexing_cluster_random(df, num_rows, *, seed=42, replace=True):
    """Return a reproducible random sample of ``num_rows // 2`` rows of ``df``.

    Args:
        df: DataFrame to sample from; rows are selected by index label.
        num_rows: Reference row count; half of it (floor division) is drawn.
        seed: Value passed to ``np.random.seed`` before drawing. This reseeds
            NumPy's global RNG. ``None`` leaves the global RNG state untouched.
        replace: Draw with replacement. The default ``True`` means the same row
            may appear more than once; pass ``False`` for distinct labels.

    Returns:
        DataFrame holding the selected rows in the order they were drawn.

    Raises:
        ValueError: From ``np.random.choice`` when ``replace`` is ``False`` and
            more labels are requested than ``df.index`` contains.
    """
    if seed is not None:
        # Same seed -> same labels on every call.
        np.random.seed(seed)
    sample_size = num_rows // 2
    chosen_labels = np.random.choice(df.index, sample_size, replace=replace)
    return df.loc[chosen_labels]

In [10]:
# Fetch the whole 'tokenized' table (limit=-1 means no LIMIT clause in retrieve_df).
log("exucute load df...")
init_df = retrieve_df('tokenized', -1)
# Keep only tokens longer than three characters and re-join them with single spaces.
init_df['tokens'] = init_df['tokens'].map(
    lambda text: ' '.join(tok for tok in text.split() if len(tok) > 3)
)
log("Success load df")

In [19]:
# Draw the random working sample; the row count of init_df is passed as num_rows.
target_df = indexing_cluster_random(init_df, len(init_df))
log('Success Collect df rows.')

In [20]:
# Stream the 'tokens' column of the sample into the project's TF-IDF helper.
# NOTE(review): get_tfidf_for_lda is defined in calculate_tfidf; the meaning of its
# four return values is inferred from the names used here - confirm against the helper.
log("tfidf calculation init")
token_documents = documents_generator(target_df, "tokens")
tfidf_matrix, vocabulary, voca_feature_names, vectorizer_ = get_tfidf_for_lda(token_documents)
log("tfidf calculation done")

- - -
## 1. TF-IDF 값이 높은 상위 N개 단어 추출

In [23]:
# Number of highest-scoring vocabulary entries to keep.
top_n = 10
# Cast the TF-IDF matrix to float32 before ranking.
tfidf_matrix = tfidf_matrix.astype(np.float32)
# get_top_tfidf_words is a project helper (calculate_tfidf); the recorded output suggests
# it returns (word, score) pairs - TODO confirm against the helper.
top_tfidf_words = get_top_tfidf_words(tfidf_matrix, vocabulary, top_n)
log(f"success get top {top_n} of tfidf words")

In [44]:
# Bare expression: shows top_tfidf_words as this notebook cell's output.
top_tfidf_words

[('polatis', 1095.4767),
 ('journeyhumble', 982.72046),
 ('livecholfc', 877.8208),
 ('obenschain', 619.291),
 ('partake', 614.5731),
 ('collyer', 613.96985),
 ('kulpa', 552.3176),
 ('rejectedon', 542.4552),
 ('violatedthe', 527.94525),
 ('wolverhapton', 492.8305)]

- - -
## 2. TF-IDF 기반 LDA 분석

In [25]:
from sklearn.decomposition import LatentDirichletAllocation
from sklearn.model_selection import GridSearchCV, StratifiedKFold, LeaveOneOut, ShuffleSplit

In [42]:
# Base estimator for the search; the grid below supplies the remaining hyperparameters.
lda_model = LatentDirichletAllocation(
    learning_method='online',
    random_state=42,
)

# Hyperparameter grid for GridSearchCV (3 * 3 * 3 * 3 = 81 combinations)
param_grid = {
    'n_components': [3, 4, 5],  # number of topics
    'learning_decay': [0.5, 0.7, 0.9],  # learning-rate decay for online learning
    'doc_topic_prior': [0.1, 0.5, 1],  # alpha
    'topic_word_prior': [0.1, 0.5, 1]  # beta
}

# cv options
# skf = StratifiedKFold(n_splits=5) # n_splits = 3, 5, 10
# LeaveOneOut()
# NOTE(review): sfs is created here, but the search below is given cv=3, so this
# splitter is not used in this cell.
sfs = ShuffleSplit(n_splits=10, test_size=0.2, random_state=42)

# Configure GridSearchCV: 3-fold CV, n_jobs=-1, verbose progress output
grid_search = GridSearchCV(lda_model, param_grid, cv=3, n_jobs=-1, verbose=2)

# Run the grid search on the TF-IDF matrix
grid_search.fit(tfidf_matrix)

Fitting 3 folds for each of 81 candidates, totalling 243 fits


In [43]:
# 최적의 하이퍼파라미터 조합 출력
best_params = grid_search.best_params_
print("최적의 하이퍼파라미터 조합:", best_params)

AttributeError: 'GridSearchCV' object has no attribute 'best_params_'

In [50]:
# 최적의 하이퍼파라미터로 LDA 모델 학습
optimal_lda = LatentDirichletAllocation(
    n_components=10,
    learning_method='online',
    learning_decay=0.7,
    doc_topic_prior=0.01,
    topic_word_prior=0.1,
    random_state=42,
    max_iter=30
)

optimal_lda.fit(tfidf_matrix)

In [54]:
# 최적의 하이퍼파라미터로 LDA 모델 학습
optimal_lda = LatentDirichletAllocation(
    n_components=best_params['n_components'],
    learning_method='online',
    learning_decay=best_params['learning_decay'],
    doc_topic_prior=best_params['doc_topic_prior'],
    topic_word_prior=best_params['topic_word_prior'],
    random_state=42,
    max_iter=3
)

optimal_lda.fit(tfidf_matrix)

In [51]:
# Inspect the fitted model's components_ array and its shape.
print(optimal_lda.components_)
print(optimal_lda.components_.shape) 

[[0.10000027 0.10000027 0.10000027 ... 0.10000027 0.10000027 0.10000027]
 [0.10004137 0.10002945 0.10004278 ... 0.1001145  0.10004482 0.10005373]
 [0.10000027 0.10000027 0.10000027 ... 0.10000027 0.10000027 0.10000027]
 ...
 [0.10000027 0.10000027 0.10000027 ... 0.10000027 0.10000027 0.10000027]
 [0.10000027 0.10000027 0.10000027 ... 0.10000027 0.10000027 0.10000027]
 [0.10000027 0.10000027 0.10000027 ... 0.10000027 0.10000027 0.10000027]]
(15, 196479)


In [52]:
def get_topics(components, feature_names, n=5):
    """Print the ``n`` highest-weighted features of each row in ``components``.

    Args:
        components: Iterable of 1-D NumPy arrays, one weight vector per topic.
        feature_names: Sequence mapping a column position to its feature name.
        n: Number of top features to print per topic.

    Returns:
        None. One line per topic is printed as ``Topic k: [(name, weight), ...]``
        with weights rounded to two decimals and topics numbered from 1.
    """
    for topic_no, weights in enumerate(components, start=1):
        # argsort is ascending, so read it backwards to get the n largest weights first.
        top_positions = weights.argsort()[:-n - 1:-1]
        ranked = [(feature_names[pos], weights[pos].round(2)) for pos in top_positions]
        print("Topic %d:" % topic_no, ranked)

In [53]:
# Print the top features (default n=5) of every topic in the fitted model.
get_topics(optimal_lda.components_, voca_feature_names)

Topic 1: [('protestsukraine', 2.27), ('azarov', 0.1), ('institutskaya', 0.1), ('nakonechniy', 0.1), ('mikhailovsky', 0.1)]
Topic 2: [('said', 1095.22), ('space', 983.73), ('thousand', 878.03), ('hundred', 619.61), ('launch', 614.78)]
Topic 3: [('miller', 0.1), ('kite', 0.1), ('soyuz', 0.1), ('osiris', 0.1), ('jackson', 0.1)]
Topic 4: [('copernicus', 0.1), ('thousand', 0.1), ('biden', 0.1), ('photostwo', 0.1), ('photos', 0.1)]
Topic 5: [('football', 0.1), ('time', 0.1), ('salah', 0.1), ('caption', 0.1), ('last', 0.1)]
Topic 6: [('kenteris', 4.15), ('thanou', 3.55), ('tzekos', 1.13), ('faking', 0.83), ('segas', 0.1)]
Topic 7: [('mccann', 8.5), ('praia', 2.52), ('algarve', 0.1), ('budens', 0.1), ('sketches', 0.1)]
Topic 8: [('azoulay', 0.1), ('jews', 0.1), ('semitism', 0.1), ('myriam', 0.1), ('holocaust', 0.1)]
Topic 9: [('liquid', 0.1), ('comet', 0.1), ('flight', 0.1), ('investigation', 0.1), ('medvedev', 0.1)]
Topic 10: [('border', 0.1), ('belarus', 0.1), ('ethree', 0.1), ('ursa', 0.1),

In [54]:
import pyLDAvis.lda_model

# Enable inline rendering of pyLDAvis output in the notebook.
pyLDAvis.enable_notebook()
# Build the interactive panel from the fitted LDA model, the TF-IDF matrix and the
# vectorizer that produced it; mds='tsne' is passed as the topic-layout method.
panel = pyLDAvis.lda_model.prepare(optimal_lda, tfidf_matrix, vectorizer_, mds='tsne')
pyLDAvis.display(panel)

In [19]:
# Write the prepared panel to lda_visualization.html in the current working directory.
pyLDAvis.save_html(panel, 'lda_visualization.html')