In [17]:
import pandas as pd
# Load the two input tables. `df` is read from the summary_kmeans output
# folder and `df_raw` from the embedding folder; the two reads are independent.
df = pd.read_csv(filepath_or_buffer='outputs/summary_kmeans/data_with_clusters.csv')
df_raw = pd.read_csv(filepath_or_buffer="data/embedding/bulletin_data_with_embeddings.csv")

In [23]:
# From here on the 'cluster' column is referred to as 'major_topic'.
df = df.rename({'cluster': 'major_topic'}, axis='columns')

In [24]:
# Row counts of the raw table and of the clustered table, for comparison.
print(f"클러스터링 전 갯수: {len(df_raw)}")
print(f"클러스터링 후 갯수: {len(df)}")

클러스터링 전 갯수: 6944
클러스터링 후 갯수: 6942


In [25]:
# Preview the first two rows of the clustered table.
df.iloc[:2]

Unnamed: 0,연번,상담일자,상담유형,상담요약,상담인 유형,상담내용,연도,embedding,summary_embedding,major_topic
0,1,2023.1.2-6,전화,누수피해,구분소유자,배수관 역류에 따라 바닥재(마루) 및 씽크대 등의 피해가 발생하였습니다. 역류원인을...,2023,"[0.015961677, 0.03234928, 0.07411159, 0.007246...","[0.02874046191573143, -0.016275031492114067, 0...",7
1,2,2023.1.2-6,전화,통합정보마당,임차인,통합정보마당 사용 방법에 대한 이해,2023,"[-0.019310059, 0.0036593825, 0.005384315, -0.0...","[-0.022464005276560783, 0.0037450117524713278,...",8


In [26]:
# Number of rows per major topic, largest first.
df.loc[:, 'major_topic'].value_counts()

major_topic
5    2316
8    1932
3     668
2     492
6     454
1     451
7     236
0     217
4     176
Name: count, dtype: int64

In [29]:
# One filtered frame per major topic id 0..8, each keeping df's row order and index.
(
    topic_0, topic_1, topic_2,
    topic_3, topic_4, topic_5,
    topic_6, topic_7, topic_8,
) = (df[df['major_topic'] == topic_id] for topic_id in range(9))

In [30]:
import pandas as pd
import numpy as np
import json
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score, calinski_harabasz_score, davies_bouldin_score
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
import logging
from pathlib import Path
import warnings
# Silence every warning for the rest of the session.
warnings.filterwarnings(action='ignore')

# Logging setup: INFO level, timestamped records tagged with the logger name.
logging.basicConfig(
    format='%(asctime)s - %(name)s - %(levelname)s - %(message)s',
    level=logging.INFO,
)
logger = logging.getLogger(__name__)

class SubTopicClusterer:
    """Split each major topic into sub-topics with K-means.

    For every major topic the embeddings are standardized, K-means is fitted
    for each candidate k and the k with the highest silhouette score is kept.
    Results accumulate on the instance and can be written to disk with
    ``save_results`` / ``visualize_results``.
    """

    def __init__(self, k_range=(2, 10)):
        """Create an empty clusterer.

        Args:
            k_range: ``(min_k, max_k)`` inclusive range of cluster counts to try.
        """
        self.k_range = k_range
        # topic_id -> result dict returned by optimize_kmeans
        self.results = {}
        # topic_id -> selected number of clusters
        self.optimal_k = {}
        # topic_id -> {'indices': row index labels, 'labels': sub-topic labels}
        self.cluster_assignments = {}

    @staticmethod
    def _parse_embedding(value):
        """Return ``value`` as a 1-D finite float array, or None if unusable.

        Strings are decoded as JSON first; any other value is handed to NumPy
        directly. Scalars (e.g. a NaN for a missing cell), nested or empty
        sequences, non-numeric content and non-finite numbers are rejected.
        """
        try:
            if isinstance(value, str):
                value = json.loads(value)
            vector = np.asarray(value, dtype=float)
        except (TypeError, ValueError):
            # json.JSONDecodeError is a ValueError subclass, so malformed JSON
            # and non-numeric / ragged content both end up here.
            return None
        if vector.ndim != 1 or vector.size == 0 or not np.isfinite(vector).all():
            return None
        return vector

    @classmethod
    def _collect_embeddings(cls, frame, embedding_column, topic_label):
        """Extract the usable embeddings of ``frame``.

        Rows whose embedding cannot be parsed, or whose length differs from the
        first usable embedding, are skipped (and counted in one warning).

        Returns:
            ``(indices, matrix)``: the index labels of the kept rows and a 2-D
            float array with one row per kept embedding. ``matrix`` is None
            when no row is usable.

        Raises:
            KeyError: if ``embedding_column`` is not a column of ``frame``.
        """
        indices, vectors = [], []
        skipped = 0
        for idx, raw in frame[embedding_column].items():
            vector = cls._parse_embedding(raw)
            # The first usable vector fixes the expected dimensionality.
            if vector is None or (vectors and vector.shape != vectors[0].shape):
                skipped += 1
                continue
            indices.append(idx)
            vectors.append(vector)

        if skipped:
            logger.warning(f"major_topic {topic_label}: 유효하지 않은 임베딩 {skipped}개를 건너뜁니다.")

        matrix = np.vstack(vectors) if vectors else None
        return indices, matrix

    def load_embeddings(self, df, topic_column='major_topic', embedding_column='embedding'):
        """Group the embeddings of ``df`` by topic.

        Args:
            df: DataFrame holding a topic column and an embedding column.
            topic_column: name of the column with the topic id.
            embedding_column: name of the column with the embedding, either as
                a JSON string or as a numeric sequence.

        Returns:
            dict mapping each topic id to a 2-D array of its usable embeddings.
            Topics without any usable embedding are omitted.
        """
        embeddings = {}
        for topic in df[topic_column].unique():
            topic_df = df[df[topic_column] == topic]
            _, matrix = self._collect_embeddings(topic_df, embedding_column, topic)
            if matrix is not None:
                embeddings[topic] = matrix
                logger.info(f"major_topic {topic}: {len(matrix)}개 임베딩 로드")

        return embeddings

    def optimize_kmeans(self, embeddings, topic_id):
        """Pick the best k for one topic and return its clustering.

        Args:
            embeddings: 2-D array-like, one embedding per row.
            topic_id: identifier used in log messages only.

        Returns:
            None when there are fewer than 3 rows or every candidate k failed;
            otherwise a dict with keys ``metrics`` (DataFrame, one row per
            evaluated k), ``optimal_k``, ``cluster_labels``, ``embeddings``
            (the input) and ``scaled_embeddings``.
        """
        if len(embeddings) < 3:
            logger.warning(f"major_topic {topic_id}: 데이터가 너무 적어 클러스터링을 건너뜁니다.")
            return None

        # Standardize every embedding dimension before clustering.
        scaled_embeddings = StandardScaler().fit_transform(embeddings)

        metrics = {
            'k': [],
            'silhouette': [],
            'calinski_harabasz': [],
            'davies_bouldin': [],
            'inertia': []
        }
        # Labels of every successful fit, so the winner needs no second fit.
        labels_by_k = {}

        # The scores below need at least 2 clusters and at most n - 1.
        min_k = max(2, self.k_range[0])
        max_k = min(self.k_range[1], len(embeddings) - 1)

        for k in range(min_k, max_k + 1):
            try:
                kmeans = KMeans(n_clusters=k, random_state=42, n_init=10)
                cluster_labels = kmeans.fit_predict(scaled_embeddings)

                silhouette = silhouette_score(scaled_embeddings, cluster_labels)
                calinski = calinski_harabasz_score(scaled_embeddings, cluster_labels)
                davies = davies_bouldin_score(scaled_embeddings, cluster_labels)
                inertia = kmeans.inertia_
            except Exception as e:
                # Best effort: a failing k is logged and the search continues.
                logger.error(f"major_topic {topic_id}, k={k}에서 오류: {e}")
                continue

            metrics['k'].append(k)
            metrics['silhouette'].append(silhouette)
            metrics['calinski_harabasz'].append(calinski)
            metrics['davies_bouldin'].append(davies)
            metrics['inertia'].append(inertia)
            labels_by_k[k] = cluster_labels

            logger.info(f"major_topic {topic_id}, k={k}: silhouette={silhouette:.4f}, calinski={calinski:.2f}")

        if not metrics['k']:
            return None

        # Highest silhouette score wins; ties go to the smallest k.
        optimal_k = metrics['k'][int(np.argmax(metrics['silhouette']))]

        return {
            'metrics': pd.DataFrame(metrics),
            'optimal_k': optimal_k,
            'cluster_labels': labels_by_k[optimal_k],
            'embeddings': embeddings,
            'scaled_embeddings': scaled_embeddings
        }

    def cluster_all_topics(self, topic_dfs, embedding_column='embedding'):
        """Cluster every major topic and store the outcome on the instance.

        Args:
            topic_dfs: dict mapping topic id to the DataFrame of that topic.
            embedding_column: name of the embedding column in each frame.

        Topics with fewer than 3 usable embeddings, or for which no k could be
        evaluated, are skipped. Successful topics are recorded in
        ``results``, ``optimal_k`` and ``cluster_assignments``.
        """
        for topic_id, topic_df in topic_dfs.items():
            logger.info(f"major_topic {topic_id} 클러스터링 시작...")

            valid_indices, embeddings = self._collect_embeddings(
                topic_df, embedding_column, topic_id
            )

            if embeddings is None or len(embeddings) < 3:
                logger.warning(f"major_topic {topic_id}: 유효한 임베딩이 부족합니다.")
                continue

            result = self.optimize_kmeans(embeddings, topic_id)
            if result is None:
                continue

            self.results[topic_id] = result
            self.optimal_k[topic_id] = result['optimal_k']
            self.cluster_assignments[topic_id] = {
                'indices': valid_indices,
                'labels': result['cluster_labels']
            }

            logger.info(f"major_topic {topic_id}: 최적 k={result['optimal_k']}, 클러스터 수={len(np.unique(result['cluster_labels']))}")

    def save_results(self, output_dir="outputs/subtopics"):
        """Write metrics, assignments and the optimal-k summary as CSV files.

        Args:
            output_dir: target directory; created if it does not exist.

        The assignment and summary files always carry a header row, even when
        nothing was clustered. The metrics file is only written when at least
        one topic has results.
        """
        out_path = Path(output_dir)
        out_path.mkdir(parents=True, exist_ok=True)

        # Per-k optimization metrics of every topic, tagged with the topic id.
        all_metrics = [
            result['metrics'].assign(major_topic=topic_id)
            for topic_id, result in self.results.items()
        ]
        if all_metrics:
            combined_metrics = pd.concat(all_metrics, ignore_index=True)
            combined_metrics.to_csv(out_path / "subtopic_optimization_metrics.csv", index=False)

        # One record per clustered row: its topic, sub-topic label and index label.
        cluster_results = [
            {'major_topic': topic_id, 'sub_topic': label, 'row_index': idx}
            for topic_id, assignment in self.cluster_assignments.items()
            for idx, label in zip(assignment['indices'], assignment['labels'])
        ]
        cluster_df = pd.DataFrame(
            cluster_results, columns=['major_topic', 'sub_topic', 'row_index']
        )
        cluster_df.to_csv(out_path / "subtopic_cluster_assignments.csv", index=False)

        # Selected k per topic.
        optimal_k_summary = pd.DataFrame(
            [
                {'major_topic': topic_id, '최적_클러스터_수': k}
                for topic_id, k in self.optimal_k.items()
            ],
            columns=['major_topic', '최적_클러스터_수'],
        )
        optimal_k_summary.to_csv(out_path / "optimal_k_summary.csv", index=False)

        logger.info(f"결과가 {output_dir}에 저장되었습니다.")

    def visualize_results(self, output_dir="outputs/subtopics"):
        """Save two bar charts: optimal k and best silhouette score per topic.

        Args:
            output_dir: target directory; created if it does not exist.
        """
        out_path = Path(output_dir)
        out_path.mkdir(parents=True, exist_ok=True)

        # Optimal k per topic. Lists rather than dict views are passed to bar().
        plt.figure(figsize=(10, 6))
        plt.bar(list(self.optimal_k.keys()), list(self.optimal_k.values()))
        plt.title('major_topic별 최적 클러스터 수')
        plt.xlabel('major_topic')
        plt.ylabel('최적 클러스터 수')
        plt.xticks(rotation=45)
        plt.tight_layout()
        plt.savefig(out_path / "optimal_k_distribution.png", dpi=300, bbox_inches='tight')
        plt.close()

        # Silhouette score at the selected k. Ids and scores are built in the
        # same loop so each bar is guaranteed to match its topic.
        scored_topics = []
        silhouette_scores = []
        for topic_id, result in self.results.items():
            metrics_df = result['metrics']
            at_optimum = metrics_df['k'] == result['optimal_k']
            scored_topics.append(topic_id)
            if at_optimum.any():
                silhouette_scores.append(metrics_df.loc[at_optimum, 'silhouette'].iloc[0])
            else:
                silhouette_scores.append(0)

        plt.figure(figsize=(10, 6))
        plt.bar(scored_topics, silhouette_scores)
        plt.title('major_topic별 최적 Silhouette Score')
        plt.xlabel('major_topic')
        plt.ylabel('Silhouette Score')
        plt.xticks(rotation=45)
        plt.tight_layout()
        plt.savefig(out_path / "silhouette_scores.png", dpi=300, bbox_inches='tight')
        plt.close()

# 사용 예시
def main(topic_dfs=None, k_range=(2, 15), output_dir="outputs/subtopics"):
    """Run sub-topic clustering end to end and print the chosen k per topic.

    Args:
        topic_dfs: dict mapping major-topic id to its DataFrame. When None, the
            notebook-level frames ``topic_0`` .. ``topic_8`` are used.
        k_range: ``(min_k, max_k)`` inclusive range of cluster counts to try.
        output_dir: directory that receives the CSV files and charts.

    Returns:
        The fitted ``SubTopicClusterer``, so later cells can inspect
        ``results``, ``optimal_k`` and ``cluster_assignments`` without
        re-reading the files.
    """
    if topic_dfs is None:
        # Default input: the per-topic frames built earlier in the notebook.
        topic_dfs = {
            0: topic_0,
            1: topic_1,
            2: topic_2,
            3: topic_3,
            4: topic_4,
            5: topic_5,
            6: topic_6,
            7: topic_7,
            8: topic_8
        }

    clusterer = SubTopicClusterer(k_range=k_range)

    # Cluster every topic, then persist the tables and the charts.
    clusterer.cluster_all_topics(topic_dfs)
    clusterer.save_results(output_dir)
    clusterer.visualize_results(output_dir)

    # Summary of the selected number of clusters per topic.
    print("\n=== 소주제 클러스터링 결과 ===")
    for topic_id, optimal_k in clusterer.optimal_k.items():
        print(f"major_topic {topic_id}: 최적 클러스터 수 = {optimal_k}")

    return clusterer

# Run the pipeline. The guard holds when the cell runs in the notebook itself
# (the log records produced by this run are tagged with the name `__main__`).
if __name__ == "__main__":
    main()

2025-08-20 13:27:40,974 - __main__ - INFO - major_topic 0 클러스터링 시작...
2025-08-20 13:27:41,080 - __main__ - INFO - major_topic 0, k=2: silhouette=0.0314, calinski=8.55
2025-08-20 13:27:41,347 - __main__ - INFO - major_topic 0, k=3: silhouette=0.0359, calinski=7.70
2025-08-20 13:27:41,473 - __main__ - INFO - major_topic 0, k=4: silhouette=0.0289, calinski=6.50
2025-08-20 13:27:41,672 - __main__ - INFO - major_topic 0, k=5: silhouette=0.0298, calinski=5.79
2025-08-20 13:27:41,823 - __main__ - INFO - major_topic 0, k=6: silhouette=0.0265, calinski=5.01
2025-08-20 13:27:41,970 - __main__ - INFO - major_topic 0, k=7: silhouette=0.0273, calinski=4.54
2025-08-20 13:27:42,085 - __main__ - INFO - major_topic 0, k=8: silhouette=0.0221, calinski=4.19
2025-08-20 13:27:42,198 - __main__ - INFO - major_topic 0, k=9: silhouette=0.0301, calinski=4.16
2025-08-20 13:27:42,335 - __main__ - INFO - major_topic 0, k=10: silhouette=0.0213, calinski=3.77
2025-08-20 13:27:42,507 - __main__ - INFO - major_topic 


=== 소주제 클러스터링 결과 ===
major_topic 0: 최적 클러스터 수 = 3
major_topic 1: 최적 클러스터 수 = 11
major_topic 2: 최적 클러스터 수 = 2
major_topic 3: 최적 클러스터 수 = 8
major_topic 4: 최적 클러스터 수 = 3
major_topic 5: 최적 클러스터 수 = 4
major_topic 6: 최적 클러스터 수 = 3
major_topic 7: 최적 클러스터 수 = 2
major_topic 8: 최적 클러스터 수 = 9
