## 1. AGNES 중분류

In [None]:
import matplotlib.pyplot as plt
from scipy.cluster.hierarchy import dendrogram, linkage
from sklearn.cluster import AgglomerativeClustering
from sklearn.metrics import silhouette_score, silhouette_samples
import numpy as np
import pandas as pd
from joblib import Parallel, delayed
from tqdm import tqdm

# Dendrogram visualisation (callers in this file invoke it only for 2-D data)
def plot_dendrogram(data, method="average", metric="euclidean", title="Dendrogram"):
    """Build a hierarchical linkage for ``data`` and show its dendrogram.

    Args:
        data: Observations passed straight to ``scipy.cluster.hierarchy.linkage``.
        method: Linkage method forwarded to ``linkage``.
        metric: Distance metric forwarded to ``linkage``.
        title: Title drawn above the figure.
    """
    merge_tree = linkage(data, method=method, metric=metric)

    _, axis = plt.subplots(figsize=(12, 6))
    dendrogram(merge_tree, ax=axis)
    axis.set_title(title)
    axis.set_xlabel("Sample Index")
    axis.set_ylabel("Distance")
    plt.show()

# Search for the distance threshold that yields the best silhouette score
def find_optimal_threshold(data, low=1.0, high=2.0, max_iter=1000,
                           metric="euclidean", linkage_method="average", plot=True):
    """Search ``[low, high]`` for the AGNES ``distance_threshold`` with the best silhouette.

    The interval is repeatedly halved: the midpoint is used as
    ``distance_threshold`` for ``AgglomerativeClustering`` and the resulting
    labelling is scored with ``silhouette_score``. A score that ties or beats
    the best one so far moves ``low`` up to the midpoint, a worse score moves
    ``high`` down. This is a heuristic bisection, not an exhaustive search.

    Degenerate labellings cannot be scored, so they only steer the search:
    a single cluster means the threshold is too large (``high`` is lowered),
    one cluster per sample means it is too small (``low`` is raised).
    Previously these cases either left the interval untouched (repeating the
    same fit until ``max_iter``) or raised inside ``silhouette_score``.

    Args:
        data: Feature matrix, one row per sample.
        low: Lower bound of the threshold search interval.
        high: Upper bound of the threshold search interval.
        max_iter: Maximum number of bisection steps.
        metric: Distance metric used both for clustering and for scoring.
        linkage_method: Linkage criterion passed to ``AgglomerativeClustering``.
        plot: When true, show the silhouette score per evaluated threshold.

    Returns:
        Tuple ``(best_threshold, best_labels, best_score, best_n_clusters)``.

    Raises:
        ValueError: If no threshold in the interval produced a labelling that
            could be scored (the original code failed here with a
            ``TypeError`` while formatting ``None``).
    """
    n_samples = len(data)
    best_threshold = None
    best_score = -1
    best_labels = None
    best_n_clusters = None
    thresholds = []
    silhouette_scores = []
    iter_count = 0

    print("▶ Finding the optimal distance_threshold using binary search...\n")

    while high - low > 0.001 and iter_count < max_iter:
        iter_count += 1
        mid = (low + high) / 2
        agnes = AgglomerativeClustering(
            n_clusters=None, metric=metric, linkage=linkage_method, distance_threshold=mid
        )
        labels = agnes.fit_predict(data)
        n_clusters = len(np.unique(labels))

        if n_clusters < 2:
            # Everything was merged into one cluster: the threshold is too large.
            high = mid
            continue
        if n_clusters >= n_samples:
            # Every sample is its own cluster: the threshold is too small and
            # silhouette_score would reject this labelling.
            low = mid
            continue

        silhouette = silhouette_score(data, labels, metric=metric)
        thresholds.append(mid)
        silhouette_scores.append(silhouette)
        print(f"Distance Threshold: {mid:.3f}, Clusters: {n_clusters}, Silhouette Score: {silhouette:.4f}")

        if silhouette >= best_score:
            if silhouette > best_score:
                best_score = silhouette
                best_threshold = mid
                best_labels = labels
                best_n_clusters = n_clusters
            low = mid
        else:
            high = mid

    if plot:
        # Thresholds are visited out of order; sort them so the line reads left to right.
        order = np.argsort(thresholds)
        plt.figure(figsize=(8, 5))
        plt.plot(np.asarray(thresholds)[order], np.asarray(silhouette_scores)[order],
                 marker='o', linestyle='-', color='blue')
        plt.title("Silhouette Score by Distance Threshold")
        plt.xlabel("Distance Threshold")
        plt.ylabel("Silhouette Score")
        plt.grid(True)
        plt.show()

    if best_threshold is None:
        raise ValueError(
            "No distance_threshold in the search interval produced a clustering "
            "with between 2 and n_samples - 1 clusters."
        )

    print(f"\n▶ Optimal Distance Threshold Found!")
    print(f"Best distance_threshold: {best_threshold:.3f}, Best Clusters: {best_n_clusters}, Best Silhouette Score: {best_score:.4f}")
    return best_threshold, best_labels, best_score, best_n_clusters

# Cluster scatter plot (callers in this file invoke it only for 2-D data)
def plot_silhouette(data, labels, title):
    """Scatter the first two columns of ``data`` coloured by cluster label.

    Despite its name this draws a plain scatter plot, not a silhouette plot.

    Args:
        data: 2-D array; columns 0 and 1 are used as x and y.
        labels: Per-row cluster labels used for colouring.
        title: Title drawn above the figure.
    """
    figure, axis = plt.subplots(figsize=(8, 8))
    points = axis.scatter(data[:, 0], data[:, 1], c=labels, cmap="viridis", alpha=0.7)
    axis.set_title(title)
    axis.set_xlabel("Dimension 1")
    axis.set_ylabel("Dimension 2")
    figure.colorbar(points, ax=axis, label="Cluster Label")
    plt.show()

# AGNES clustering plus the surrounding visualisation and result assembly
def process_agnes_embedding(name, data, metadata_df=None):
    """Cluster one embedding with AGNES and package the outcome.

    Steps: optional dendrogram (2-D input only), threshold search via
    ``find_optimal_threshold``, optional dendrogram with the chosen cut,
    optional scatter of the final labels, then a result table holding the
    data, the cluster label and the per-sample silhouette value.

    Args:
        name: Label used in log lines and figure titles.
        data: 2-D array of samples; plots are drawn only when it has 2 columns.
        metadata_df: Optional DataFrame concatenated column-wise in front of
            the result table (its index is reset first).

    Returns:
        Dict with keys ``name``, ``best_threshold``, ``n_clusters``,
        ``silhouette_score`` and ``data`` (the result DataFrame).
    """
    print(f"Processing: {name}")
    is_2d = data.shape[1] == 2

    # 1. Dendrogram (2-D only)
    if is_2d:
        print("\n1. Dendrogram Visualization")
        plot_dendrogram(data, method="average", metric="euclidean", title=f"Dendrogram ({name})")
    else:
        print("\n1. Dendrogram skipped (not 2D)")

    # 2. Search for the best threshold
    print("\n2. Finding the optimal distance_threshold...")
    best_threshold, best_labels, best_score, best_n_clusters = find_optimal_threshold(data)

    if is_2d:
        # 3. Dendrogram annotated with the chosen cut
        print("\n3. Dendrogram with Optimal Threshold")
        merge_tree = linkage(data, method="average", metric="euclidean")
        plt.figure(figsize=(12, 6))
        dendrogram(merge_tree, color_threshold=best_threshold)
        plt.axhline(y=best_threshold, color="r", linestyle="--", label=f"Best threshold = {best_threshold:.3f}")
        plt.title(f"Dendrogram with Optimal Distance Threshold ({name})")
        plt.legend()
        plt.show()

        # 4. Scatter of the final clustering
        print("\n4. Final Clustering Result")
        plot_silhouette(data, best_labels, title=f"Final Clustering ({name})")
    else:
        print("\n4. Final Clustering Result skipped (not 2D)")

    # 5. Assemble the result table
    df_result = pd.DataFrame(data)
    df_result['Cluster'] = best_labels
    # Per-sample silhouettes need at least two clusters; -1 marks "not available".
    df_result['Silhouette'] = (
        silhouette_samples(data, best_labels, metric="euclidean")
        if best_n_clusters > 1
        else -1
    )

    # Prepend the metadata columns when supplied
    if metadata_df is not None:
        df_result = pd.concat([metadata_df.reset_index(drop=True), df_result], axis=1)

    summary = {
        'name': name,
        'best_threshold': best_threshold,
        'n_clusters': best_n_clusters,
        'silhouette_score': best_score,
        'data': df_result,
    }
    return summary

In [None]:
def agnes_subcluster_for_id(cluster_id, df, embedding_cols):
    """Sub-cluster the rows of one parent cluster with ``find_optimal_threshold``.

    Args:
        cluster_id: Value of ``df['Cluster']`` selecting the rows to process.
        df: DataFrame holding a ``Cluster`` column and the embedding columns.
        embedding_cols: Names of the columns used as the feature matrix.

    Returns:
        ``pd.Series`` of sub-cluster labels indexed like the selected rows.
        If anything fails, the error is printed and every selected row gets
        the label ``-1`` (deliberate best-effort behaviour).
    """
    members = df[df['Cluster'] == cluster_id]
    features = members[embedding_cols].values
    row_index = members.index

    try:
        _threshold, sub_labels, _score, _count = find_optimal_threshold(features)
        return pd.Series(sub_labels, index=row_index)
    except Exception as err:
        print(f"[ERROR] Cluster {cluster_id}: {err}")
        return pd.Series([-1] * len(members), index=row_index)

In [None]:
# TODO (original author's note): switch the code above to cosine distance and
# rerun this part; it takes a long time to run.

file_path = "./ref_origin_hi_clu.csv"
df = pd.read_csv(file_path)

# Select only the needed columns: the 64 embedding dimensions named "0".."63"
embedding_cols = [str(i) for i in range(64)]
X = df[embedding_cols].values  # not used again in this cell

# Load the package needed for clustering
from sklearn.cluster import AgglomerativeClustering

# IDs of the existing top-level clusters (BIRCH clusters, per the original note)
cluster_ids = df['Cluster'].unique()  # not used again in this cell

# Container for the AGNES results (immediately replaced by the Parallel call below)
agnes_all_results = []

# Run AGNES separately inside each existing cluster, one joblib task per cluster
print("▶ Starting AGNES sub-clustering (parallelized)...\n")
agnes_all_results = Parallel(n_jobs=-1)(
    delayed(agnes_subcluster_for_id)(cid, df, embedding_cols)
    for cid in tqdm(df['Cluster'].unique())
)

# Combine the per-cluster label Series; assignment aligns them on df's index
df['AGNES_Subcluster'] = pd.concat(agnes_all_results).sort_index()

In [None]:
# Display the DataFrame, now carrying the AGNES_Subcluster column
df

In [None]:
# Rename the sub-cluster column and save the result (index not written)
df.rename(columns={'AGNES_Subcluster': 'Subcluster'}, inplace=True)
df.to_csv('./ref_origin_emb_red_clu_mid.csv', index=False)

## 2. AGNES 중분류 결과 분석

In [None]:
import pandas as pd
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity

# 1. Load the data
df = pd.read_csv("./ref_origin_emb_red_clu_mid.csv")  # includes the AGNES clustering result
ref_df = pd.read_csv("./ref_origin_emb_red.csv")  # includes the original keyword column

# 2. Rename columns to the top/mid naming used from here on
df.rename(columns={
    'Cluster': 'Topcluster',
    'Subcluster': 'Midcluster'
}, inplace=True)

# 3. Build the Midcluster ID as "<Topcluster>_<Midcluster>"
df['Midcluster_ID'] = df['Topcluster'].astype(str) + "_" + df['Midcluster'].astype(str)

# 4. Centroid of each mid-cluster: mean of the 64 embedding columns
embedding_cols = [str(i) for i in range(64)]
midcluster_centers = df.groupby("Midcluster_ID")[embedding_cols].mean()

# 5. Pairwise cosine similarity between centroids; reset_index turns the
#    Midcluster_ID index into a regular column for the melt below
similarity_matrix = cosine_similarity(midcluster_centers)
similarity_df = pd.DataFrame(similarity_matrix,
                             index=midcluster_centers.index,
                             columns=midcluster_centers.index).reset_index()

# 6. Melt the similarity matrix into long form (one row per cluster pair)
similarity_melted = similarity_df.melt(id_vars='Midcluster_ID',
                                       var_name='Compared_Midcluster',
                                       value_name='Cosine_Similarity')

# 7. Keep the 3 most similar other mid-clusters per mid-cluster
#    (self-pairs are dropped first), then pivot to one row per mid-cluster
top_similar_expanded = (
    similarity_melted[similarity_melted['Midcluster_ID'] != similarity_melted['Compared_Midcluster']]
    .sort_values(['Midcluster_ID', 'Cosine_Similarity'], ascending=[True, False])
    .groupby('Midcluster_ID')
    .head(3)
    .reset_index(drop=True)
)
top_similar_expanded['Rank'] = top_similar_expanded.groupby('Midcluster_ID').cumcount() + 1
top_similar_pivot = top_similar_expanded.pivot(index='Midcluster_ID', columns='Rank',
                                                values=['Compared_Midcluster', 'Cosine_Similarity'])
# Flatten the (value, rank) column pairs into Most_Similar_<rank> / Similarity_<rank>
top_similar_pivot.columns = [f"Most_Similar_{col[1]}" if col[0] == 'Compared_Midcluster'
                             else f"Similarity_{col[1]}" for col in top_similar_pivot.columns]
top_similar_pivot.reset_index(inplace=True)

# 8. Pick the first column whose name contains "keyword" (case-insensitive)
# NOTE(review): keyword_col is None when no such column exists, which makes
# the column selection in step 9 fail — confirm the reference file's schema.
keyword_col = next((col for col in ref_df.columns if 'keyword' in col.lower()), None)

# 9. Attach keywords to each row's cluster / mid-cluster via ticket_id_hashed
merged = pd.merge(
    df[['ticket_id_hashed', 'Topcluster', 'Midcluster']],
    ref_df[['ticket_id_hashed', keyword_col]],
    on='ticket_id_hashed',
    how='left'
)

# 10. Count keywords per (Topcluster, Midcluster): split each cell on commas,
#     strip whitespace, then count occurrences
# NOTE(review): the four-name column assignment below relies on reset_index()
# yielding exactly four columns — verify against the installed pandas version.
grouped_keywords = (
    merged.groupby(["Topcluster", "Midcluster"])[keyword_col]
    .apply(lambda x: pd.Series(x.dropna().explode().str.split(",")).explode().str.strip().value_counts())
    .reset_index()
)
grouped_keywords.columns = ["Topcluster", "Midcluster", "Keyword", "Count"]
grouped_keywords['Midcluster_ID'] = grouped_keywords['Topcluster'].astype(str) + "_" + grouped_keywords['Midcluster'].astype(str)

# 11. Join the keyword counts with the top-3 similarity table and save
final_keywords_top3 = pd.merge(grouped_keywords, top_similar_pivot, on='Midcluster_ID', how='left')
final_keywords_top3.to_csv("./midcluster_top_keywords_with_top3_similarity.csv", index=False, encoding="utf-8-sig")

In [None]:
# Reload the saved keyword/similarity table and display it
midcluster = pd.read_csv('./midcluster_top_keywords_with_top3_similarity.csv')
midcluster

In [None]:
# List the distinct Midcluster_ID values in the saved table and count them
unique_ids = midcluster['Midcluster_ID'].unique()
print('Unique Midcluster_IDs:', unique_ids)
print('Number of unique Midcluster_IDs:', len(unique_ids))

In [None]:
# Size of each top-level cluster (row count per Topcluster, ordered by cluster ID)
cluster_sizes = df['Topcluster'].value_counts().sort_index()
print("대분류 클러스터 크기:", cluster_sizes)

In [None]:
# Size of each mid-level cluster (row count per Midcluster_ID)
pd.set_option('display.max_rows', 200)  # let up to 200 rows print untruncated

subcluster_sizes = df['Midcluster_ID'].value_counts().sort_index()
print("서브클러스터 크기:", subcluster_sizes)

In [None]:
# Bar chart of the mid-level cluster sizes computed above
subcluster_sizes.plot(kind='bar', figsize=(12, 6), title='Subcluster Sizes')
plt.xlabel('Midcluster ID')
plt.ylabel('Size')
plt.show()

In [None]:
# Count rows per Midcluster_ID and list the clusters large enough to split further
cluster_counts = df['Midcluster_ID'].value_counts()

min_samples = 100  # user-set input: minimum cluster size for sub-clustering
# NOTE(review): this keeps clusters with exactly min_samples rows (>=), while
# the sub-clustering loops elsewhere in this file skip groups with
# len(sub_df) <= 100 — confirm which boundary is intended.
target_clusters = cluster_counts[cluster_counts >= min_samples].index.tolist()
print(f"소분류 대상 클러스터 개수 : {len(target_clusters)}")

## 3. AGNES 소분류 (AgglomerativeClustering + distance_threshold)

In [None]:
from scipy.cluster.hierarchy import linkage, fcluster, dendrogram
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import LatentDirichletAllocation
import matplotlib.pyplot as plt
import numpy as np

# Initialise the result column: every row starts with Subcluster 0
df['Subcluster'] = 0
# Keyword lists per sub-cluster, filled in by the loop later in this cell
sub_keywords_and_texts = {}

def auto_cut_dendrogram(data, method="average", metric="cosine", plot=False, cluster_name=""):
    """Cut a hierarchical clustering just below its largest jump in merge distance.

    The linkage's merge distances are differenced; the distance of the merge
    that precedes the largest increase becomes the cut threshold, and flat
    clusters are formed with ``fcluster(..., criterion="distance")``.

    Args:
        data: Observations passed to ``scipy.cluster.hierarchy.linkage``.
        method: Linkage method forwarded to ``linkage``.
        metric: Distance metric forwarded to ``linkage``.
        plot: When true, show the dendrogram with the cut drawn as a red line.
        cluster_name: Label used in the plot title.

    Returns:
        Tuple ``(labels, threshold)``: the flat cluster labels from
        ``fcluster`` and the distance at which the tree was cut.
    """
    linked = linkage(data, method=method, metric=metric)
    distances = linked[:, 2]
    deltas = np.diff(distances)

    if deltas.size:
        threshold = distances[np.argmax(deltas)]
    else:
        # Only one merge (two observations): there is no jump to detect, so
        # cut at that single merge distance instead of failing in argmax.
        threshold = distances[-1]

    labels = fcluster(linked, t=threshold, criterion="distance")

    if plot:
        plt.figure(figsize=(8, 4))
        dendrogram(
            linked,
            no_labels=True,
            color_threshold=0,              # disable per-cluster colouring
            above_threshold_color='black'   # draw every link in black
        )
        plt.axhline(y=threshold, color='red', linestyle='--', label=f"cut@{threshold:.2f}")
        plt.title(f"Dendrogram: {cluster_name} (Clusters={len(set(labels))})")
        plt.legend()
        plt.show()

    return labels, threshold

# Vector columns: the 64 embedding dimensions, named "0".."63"
vec_cols = list(map(str, range(64)))

# Run the AGNES fine-grained split for every Topcluster -> Midcluster group
for top_id in sorted(df['Topcluster'].unique()):
    top_df = df[df['Topcluster'] == top_id]

    for mid_id in sorted(top_df['Midcluster'].unique()):
        sub_df = top_df[top_df['Midcluster'] == mid_id]
        if len(sub_df) <= 100:
            continue  # too small for a finer split to be meaningful; rows keep Subcluster 0

        vectors = sub_df[vec_cols].values
        texts = sub_df['generated_summary'].tolist()
        indices = sub_df.index.tolist()

        # Cut the AGNES dendrogram automatically and plot it
        sub_labels, threshold = auto_cut_dendrogram(
            data=vectors,
            plot=True,
            cluster_name=f"Top {top_id} - Mid {mid_id}"
        )

        # Store the sub-cluster labels on the matching rows
        df.loc[indices, 'Subcluster'] = sub_labels

        # Extract keywords for each sub-cluster
        for sub_id in sorted(set(sub_labels)):
            sub_idx = [i for i, l in enumerate(sub_labels) if l == sub_id]
            sub_texts = [texts[i] for i in sub_idx]
            if len(sub_texts) < 3:
                continue  # too few documents to run LDA

            # Vocabulary capped at 30 terms; 'TFIDF' below stores that whole vocabulary
            tfidf = TfidfVectorizer(max_features=30)
            X_tfidf = tfidf.fit_transform(sub_texts)
            tfidf_keywords = tfidf.get_feature_names_out()

            # Single-topic LDA fitted on the TF-IDF matrix; keep its 10
            # highest-weighted terms, strongest first
            lda = LatentDirichletAllocation(n_components=1, random_state=42)
            lda.fit(X_tfidf)
            lda_keywords = [
                tfidf.get_feature_names_out()[i]
                for i in lda.components_[0].argsort()[-10:][::-1]
            ]

            sub_keywords_and_texts[f"{top_id}-{mid_id}-{sub_id}"] = {
                'TFIDF': tfidf_keywords.tolist(),
                'LDA': lda_keywords
            }

# Build the Subcluster ID as "<Topcluster>_<Midcluster>_<Subcluster>"
df['Subcluster_ID'] = (
    df['Topcluster'].astype(str) + "_" +
    df['Midcluster'].astype(str) + "_" +
    df['Subcluster'].astype(str)
)


In [None]:
from scipy.cluster.hierarchy import linkage, fcluster, dendrogram
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import LatentDirichletAllocation
import matplotlib.pyplot as plt
import matplotlib.font_manager as fm
import matplotlib as mpl
import numpy as np
import pandas as pd

# Korean font setup for matplotlib (the 'AppleGothic' family targets macOS)
plt.rcParams['font.family'] = 'AppleGothic'
plt.rcParams['axes.unicode_minus'] = False

# Reset the result holders: every row starts with Subcluster 0
df['Subcluster'] = 0
sub_keywords_and_texts = {}

# Plot how much the merge distance grows from one merge step to the next
def plot_deltas(deltas, threshold_index, cluster_name):
    """Line-plot the merge-distance increases and mark the largest one.

    Args:
        deltas: Sequence of distance increases between consecutive merges.
        threshold_index: Position in ``deltas`` of the largest increase; the
            marker is drawn at ``threshold_index + 1`` because steps are
            numbered from 1.
        cluster_name: Label appended to the plot title.
    """
    figure, axis = plt.subplots(figsize=(10, 4))
    merge_steps = range(1, len(deltas) + 1)

    axis.plot(merge_steps, deltas, label='거리 증가량 (delta)')
    axis.axvline(x=threshold_index + 1, color='red', linestyle='--', label='최대 증가 지점')
    axis.set_xlabel("병합 단계")
    axis.set_ylabel("거리 증가량")
    axis.set_title(f"병합 거리 변화량: {cluster_name}")
    axis.legend()
    figure.tight_layout()
    plt.show()

# Tabulate merge distances and their increases, and derive the cut threshold
def show_linkage_distances(linked):
    """Summarise a linkage matrix's merge distances and locate the cut point.

    Args:
        linked: Linkage matrix; column 2 holds the merge distances.

    Returns:
        Tuple ``(df_merge, threshold_index, threshold_value)`` where
        ``df_merge`` has one row per merge (number, distance, increase over
        the previous merge, and two boolean marker columns),
        ``threshold_index`` is the position of the largest increase in the
        differenced distances, and ``threshold_value`` is the merge distance
        at that position.
    """
    merge_dist = linked[:, 2]
    n_merges = len(merge_dist)
    jumps = np.diff(merge_dist)

    threshold_index = np.argmax(jumps)
    threshold_value = merge_dist[threshold_index]

    # Row used as the cut distance, and the following row, whose delta is the maximum
    is_cut_row = [pos == threshold_index for pos in range(n_merges)]
    is_max_jump_row = [pos == threshold_index + 1 for pos in range(n_merges)]

    table = pd.DataFrame({
        "병합번호": range(1, n_merges + 1),
        "병합거리 (distance)": merge_dist,
        # The first merge has no predecessor, hence the leading NaN
        "거리 증가량 (delta)": [np.nan, *jumps.tolist()],
        "컷팅 기준 여부 (distance[i])": is_cut_row,
        "delta 최대 여부 (distance[i+1])": is_max_jump_row,
    })

    return table, threshold_index, threshold_value

# Show the merges with the largest distance increases, colour-highlighted
def display_top_deltas_with_max(df_merge, threshold_index):
    """Display the top rows of the merge table with the key rows highlighted.

    The 9 rows with the largest increase are combined with the row holding
    the maximum increase (``threshold_index + 1``, shown in blue) and the row
    used as the cut distance (``threshold_index``, shown in red); duplicates
    are removed and the result is shown sorted by increase, largest first.

    Args:
        df_merge: Merge table as produced by ``show_linkage_distances``.
        threshold_index: Position of the cut row in ``df_merge``.
    """
    delta_col = "거리 증가량 (delta)"
    cut_pos = threshold_index
    max_jump_pos = threshold_index + 1

    selected = pd.concat([
        df_merge.sort_values(delta_col, ascending=False).head(9),
        df_merge.iloc[[max_jump_pos]],
        df_merge.iloc[[cut_pos]],
    ]).drop_duplicates(subset=["병합번호"])

    def highlight_row(row):
        # row.name is the row's index label in the merge table
        if row.name == max_jump_pos:
            css = 'background-color: blue'
        elif row.name == cut_pos:
            css = 'background-color: red'
        else:
            css = ''
        return [css] * len(row)

    ranked = selected.sort_values(delta_col, ascending=False)
    display(ranked.style.apply(highlight_row, axis=1))

# Vector columns: the 64 embedding dimensions, named "0".."63"
vec_cols = list(map(str, range(64)))

# Run AGNES for every Topcluster -> Midcluster group
for top_id in sorted(df['Topcluster'].unique()):
    top_df = df[df['Topcluster'] == top_id]

    for mid_id in sorted(top_df['Midcluster'].unique()):
        sub_df = top_df[top_df['Midcluster'] == mid_id]
        if len(sub_df) <= 100:
            continue  # too small, skip; rows keep Subcluster 0

        vectors = sub_df[vec_cols].values
        texts = sub_df['generated_summary'].tolist()
        indices = sub_df.index.tolist()

        # Build the linkage and compute the cut threshold
        linked = linkage(vectors, method="average", metric="cosine")
        df_merge, threshold_index, threshold_value = show_linkage_distances(linked)
        deltas = np.diff(linked[:, 2])

        # Diagnostics: distance-increase plot and highlighted merge table
        cluster_name = f"Top {top_id} - Mid {mid_id}"
        plot_deltas(deltas, threshold_index, cluster_name=cluster_name)
        display_top_deltas_with_max(df_merge, threshold_index)

        # Cut the dendrogram at the threshold and store the labels
        labels = fcluster(linked, t=threshold_value, criterion="distance")
        df.loc[indices, 'Subcluster'] = labels

        # Extract keywords for each sub-cluster
        for sub_id in sorted(set(labels)):
            sub_idx = [i for i, l in enumerate(labels) if l == sub_id]
            sub_texts = [texts[i] for i in sub_idx]
            if len(sub_texts) < 3:
                continue  # fewer than 3 documents: skip keyword extraction

            # Vocabulary capped at 30 terms; 'TFIDF' below stores that whole vocabulary
            tfidf = TfidfVectorizer(max_features=30)
            X_tfidf = tfidf.fit_transform(sub_texts)
            tfidf_keywords = tfidf.get_feature_names_out()

            # Single-topic LDA fitted on the TF-IDF matrix; keep its 10
            # highest-weighted terms, strongest first
            lda = LatentDirichletAllocation(n_components=1, random_state=42)
            lda.fit(X_tfidf)
            lda_keywords = [
                tfidf.get_feature_names_out()[i]
                for i in lda.components_[0].argsort()[-10:][::-1]
            ]

            sub_keywords_and_texts[f"{top_id}-{mid_id}-{sub_id}"] = {
                'TFIDF': tfidf_keywords.tolist(),
                'LDA': lda_keywords
            }

# Build the final Subcluster ID as "<Topcluster>_<Midcluster>_<Subcluster>"
df['Subcluster_ID'] = (
    df['Topcluster'].astype(str) + "_" +
    df['Midcluster'].astype(str) + "_" +
    df['Subcluster'].astype(str)
)

In [None]:
# Display the DataFrame
df
# Topcluster, Midcluster and Subcluster can now be inspected

In [None]:
# Number of top-level clusters
df['Topcluster'].nunique()

In [None]:
# Number of mid-level clusters
df['Midcluster_ID'].nunique()

In [None]:
# Number of distinct Subcluster_ID values
df['Subcluster_ID'].nunique()

In [None]:
# Count rows per Subcluster_ID, ordered by ID
subcluster_counts = df['Subcluster_ID'].value_counts().sort_index()
print(subcluster_counts)

In [None]:
# Save the DataFrame with all cluster levels to CSV (index not written)
df.to_csv('./ref_origin_emb_red_clu_final.csv', index=False)

## 4. TFIDF & LDA

In [None]:
import os
import pandas as pd
import nltk
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction import text
from gensim import corpora, models
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

# nltk 리소스 다운로드
# nltk.download('punkt')
# nltk.download('stopwords')

### 1. TF-IDF keyword extraction
def extract_keywords(texts, top_n=5, excluded_words=None):
    """Return the ``top_n`` terms with the highest mean TF-IDF weight.

    Changes from the earlier version:
      * keywords come back strongest first (they used to be in ascending
        score order, unlike the ``[::-1]`` ranking used elsewhere in this file);
      * ``top_n=0`` yields an empty list instead of every term;
      * when ``max_df=0.85`` cannot be satisfied — typically a group with only
        one or two documents — the fit is retried with ``max_df=1.0`` rather
        than reporting an error straight away.

    Args:
        texts: Documents of one cluster.
        top_n: Number of keywords to return.
        excluded_words: Extra stop words added to scikit-learn's English list;
            defaults to the domain words this function always excluded.

    Returns:
        List of keywords, ``["No Keywords"]`` for empty input, or ``["Error"]``
        when vectorisation fails even without the ``max_df`` cap.
    """
    if len(texts) == 0:
        return ["No Keywords"]

    if excluded_words is None:
        excluded_words = ['refrigerator', 'ref', 'user', 'suggests', 'suggest', 'suggestion']
    stop_words = list(text.ENGLISH_STOP_WORDS.union(excluded_words))

    last_error = None
    # First try the cap on very common terms; fall back to no cap for tiny groups.
    for max_df in (0.85, 1.0):
        try:
            vectorizer = TfidfVectorizer(max_df=max_df, min_df=1, stop_words=stop_words)
            tfidf_matrix = vectorizer.fit_transform(texts)
        except ValueError as e:
            last_error = e
            continue

        feature_names = vectorizer.get_feature_names_out()
        avg_tfidf = tfidf_matrix.mean(axis=0).A1
        ranked = avg_tfidf.argsort()[::-1][:top_n]  # highest mean weight first
        return [feature_names[i] for i in ranked]

    print(f"Error while extracting keywords: {last_error}")
    return ["Error"]

### 2. LDA topic modelling
def preprocess_texts(texts, extra_stopwords=None):
    """Tokenise documents and drop stop words and non-alphabetic tokens.

    Args:
        texts: Iterable of document strings.
        extra_stopwords: Optional additional words to remove on top of
            NLTK's English stop-word list.

    Returns:
        List of token lists, one per input document (lower-cased tokens).
    """
    blocked = set(stopwords.words('english'))
    if extra_stopwords:
        blocked |= set(extra_stopwords)

    return [
        [
            token
            for token in word_tokenize(document.lower(), preserve_line=True)
            if token.isalpha() and token not in blocked
        ]
        for document in texts
    ]

def cluster_topic_modeling(final_df, cluster_col, text_col, num_topics=1, num_words=5, excluded_words=None):
    """Fit a gensim LDA model per cluster and collect the topic descriptions.

    Args:
        final_df: DataFrame holding the cluster and text columns.
        cluster_col: Name of the column identifying each row's cluster.
        text_col: Name of the column holding the document text.
        num_topics: Number of LDA topics per cluster.
        num_words: Number of words shown per topic.
        excluded_words: Extra stop words forwarded to ``preprocess_texts``.

    Returns:
        Dict mapping each cluster value to a list of topic strings as
        returned by ``LdaModel.print_topics``, or ``["No topic"]`` when the
        cluster has no usable tokens.
    """
    cluster_topics = {}

    for cluster in final_df[cluster_col].unique():
        in_cluster = final_df[cluster_col] == cluster
        candidates = final_df.loc[in_cluster, text_col].dropna().tolist()
        documents = [doc for doc in candidates if doc.strip()]  # drop blank texts
        token_lists = preprocess_texts(documents, extra_stopwords=excluded_words)

        # Nothing to model: no documents, or every document lost all its tokens
        if not any(token_lists):
            cluster_topics[cluster] = ["No topic"]
            continue

        dictionary = corpora.Dictionary(token_lists)
        corpus = [dictionary.doc2bow(tokens) for tokens in token_lists]

        if not dictionary.token2id:
            cluster_topics[cluster] = ["No topic"]
            continue

        lda_model = models.LdaModel(corpus, num_topics=num_topics, id2word=dictionary, passes=10)
        cluster_topics[cluster] = [
            description for _, description in lda_model.print_topics(num_words=num_words)
        ]

    return cluster_topics

### 3. Merge TF-IDF keywords with LDA topics
def merge_keywords_and_topics(tfidf_dict, topic_dict):
    """Combine per-cluster TF-IDF keywords and LDA topics into one table.

    Args:
        tfidf_dict: Mapping of cluster -> list of TF-IDF keywords.
        topic_dict: Mapping of cluster -> list of LDA topic strings.

    Returns:
        DataFrame with columns ``Cluster``, ``TFIDF_Keywords`` and
        ``LDA_Topic_Keywords``, one row per cluster found in either mapping
        (sorted by cluster); list entries are joined with ``", "`` and a
        missing side is filled with ``No Keywords`` / ``No topic``.
    """
    cluster_ids = sorted(tfidf_dict.keys() | topic_dict.keys())

    rows = [
        (
            cluster_id,
            ', '.join(tfidf_dict.get(cluster_id, ["No Keywords"])),
            ', '.join(topic_dict.get(cluster_id, ["No topic"])),
        )
        for cluster_id in cluster_ids
    ]

    return pd.DataFrame(rows, columns=["Cluster", "TFIDF_Keywords", "LDA_Topic_Keywords"])

### 4. Per-cluster keyword extraction (TF-IDF only)
def get_cluster_keywords(final_df, cluster_col, text_col, top_n=5):
    """Run ``extract_keywords`` on the texts of every cluster.

    Args:
        final_df: DataFrame holding the cluster and text columns.
        cluster_col: Name of the column identifying each row's cluster.
        text_col: Name of the column holding the document text.
        top_n: Number of keywords requested per cluster.

    Returns:
        Dict mapping each cluster value to its keyword list.
    """
    def _texts_of(cluster):
        # Non-null, non-blank texts belonging to one cluster
        column = final_df.loc[final_df[cluster_col] == cluster, text_col]
        return [doc for doc in column.dropna().tolist() if doc.strip()]

    return {
        cluster: extract_keywords(_texts_of(cluster), top_n)
        for cluster in final_df[cluster_col].unique()
    }

### 5. Execution pipeline
def run_pipeline(final_df, level_name, cluster_col, text_col):
    """Extract TF-IDF keywords and LDA topics for one cluster level and save them.

    Reads the module-level ``excluded_words`` list for the LDA stop words and
    writes ``./ref_<level_name>_TFIDF_LDA.csv``.

    Args:
        final_df: DataFrame holding the cluster and text columns.
        level_name: Label used in the log line and the output file name.
        cluster_col: Name of the column identifying each row's cluster.
        text_col: Name of the column holding the document text.
    """
    print(f"\n▶ Processing: {level_name}")

    summary = merge_keywords_and_topics(
        get_cluster_keywords(final_df, cluster_col, text_col),
        cluster_topic_modeling(final_df, cluster_col, text_col, excluded_words=excluded_words),
    )

    destination = f"./ref_{level_name}_TFIDF_LDA.csv"
    summary.to_csv(destination, index=False, encoding='utf-8-sig')
    print(f"저장 완료: {destination}")

# Module-level settings; run_pipeline reads excluded_words as a global
text_col = 'generated_summary'
excluded_words = ['refrigerator', 'ref', 'user', 'suggests', 'suggest', 'suggestion']

### 6. Main execution
if __name__ == "__main__":
    # Assumes the clustered DataFrame (`df`) already exists in memory
    text_col = 'generated_summary'
    excluded_words = ['refrigerator', 'ref', 'user', 'suggests', 'suggest', 'suggestion']
    
    # Run the analysis at the Topcluster, Midcluster and Subcluster levels
    run_pipeline(df, "Topcluster", "Topcluster", text_col)
    run_pipeline(df, "Midcluster", "Midcluster_ID", text_col)
    run_pipeline(df, "Subcluster", "Subcluster_ID", text_col)

    # Why errors occur: the number of texts is very small (e.g. a Subcluster with only 1-2 documents)
    # If 1-2 documents are acceptable, consider relaxing the settings, or not sub-clustering at all