In [1]:
import ast
import numpy as np
import pandas as pd
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, TensorDataset
from sklearn.preprocessing import MinMaxScaler
from sklearn.decomposition import PCA
from sklearn.manifold import TSNE, trustworthiness
from sklearn.metrics import pairwise_distances
from scipy.stats import spearmanr
import umap
from sklearn.neighbors import NearestNeighbors

# Load the source table from CSV.
df = pd.read_csv('./all_origin_emb.csv')

# The embedding column is stored as text; parse every cell back into a Python
# literal and stack the rows into a single 2-D array.
# (The BERT embedding column is currently not processed; only SBERT is used.)
df['sbert_embeddings'] = df['sbert_embeddings'].map(ast.literal_eval)
sbert_embeddings = np.stack(df['sbert_embeddings'].to_numpy())

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Autoencoder definition
class Autoencoder(nn.Module):
    """Fully connected autoencoder with one hidden layer on each side of the code.

    Layout: input_dim -> hidden_dim -> encoding_dim -> hidden_dim -> input_dim.
    A ReLU follows each hidden layer; the code layer and the reconstruction
    layer have no activation.
    """

    def __init__(self, input_dim, hidden_dim, encoding_dim):
        super().__init__()
        # Encoder layers are instantiated before decoder layers, so parameter
        # initialisation consumes the RNG in encoder-then-decoder order.
        encoder_layers = [
            nn.Linear(input_dim, hidden_dim),
            nn.ReLU(),
            nn.Linear(hidden_dim, encoding_dim),
        ]
        self.encoder = nn.Sequential(*encoder_layers)

        decoder_layers = [
            nn.Linear(encoding_dim, hidden_dim),
            nn.ReLU(),
            nn.Linear(hidden_dim, input_dim),
        ]
        self.decoder = nn.Sequential(*decoder_layers)

    def forward(self, x):
        """Encode ``x`` to the bottleneck and return its reconstruction."""
        return self.decoder(self.encoder(x))

# Evaluation metrics
def continuity(X, X_embedded, n_neighbors=5):
    """Continuity of an embedding (Venna & Kaski), in [0, 1]; 1 is best.

    For every point, each of its ``n_neighbors`` nearest neighbours in the
    original space that is *not* among its ``n_neighbors`` nearest neighbours
    in the embedding is penalised by ``rank_in_embedding - n_neighbors``:

        C(k) = 1 - 2 / (N k (2N - 3k - 1)) * sum_i sum_{j in V_k(i)} (r_emb(i, j) - k)

    The point itself is never counted as its own neighbour, so an embedding
    that preserves all neighbourhoods scores exactly 1.0.

    Parameters
    ----------
    X : array-like of shape (n_samples, n_features)
        Points in the original space.
    X_embedded : array-like of shape (n_samples, n_components)
        The same points, in the same row order, in the embedded space.
    n_neighbors : int, default=5
        Neighbourhood size k; must satisfy ``1 <= k < n_samples / 2`` so the
        normalisation term stays positive.

    Returns
    -------
    float
        The continuity score.

    Raises
    ------
    ValueError
        If the two inputs have different numbers of rows or ``n_neighbors``
        is outside the valid range.
    """
    X = np.asarray(X)
    X_embedded = np.asarray(X_embedded)
    n_samples = X.shape[0]
    k = n_neighbors
    if X_embedded.shape[0] != n_samples:
        raise ValueError("X and X_embedded must contain the same number of rows")
    if not 1 <= k < n_samples / 2:
        raise ValueError(
            f"n_neighbors must be in [1, n_samples / 2); got {k} for {n_samples} samples"
        )

    # k nearest neighbours in the original space; an infinite diagonal keeps
    # each point out of its own neighbour list even when duplicates exist.
    dist_orig = pairwise_distances(X)
    np.fill_diagonal(dist_orig, np.inf)
    orig_neighbors = np.argsort(dist_orig, axis=1)[:, :k]
    del dist_orig  # free the N x N matrix before building the next one

    dist_emb = pairwise_distances(X_embedded)
    np.fill_diagonal(dist_emb, np.inf)

    rows = np.arange(n_samples)
    penalty = 0
    for col in range(k):
        # Embedded-space rank of the col-th original neighbour of every point:
        # 1 + number of other points that are strictly closer in the embedding.
        neighbor_dist = dist_emb[rows, orig_neighbors[:, col]]
        emb_rank = (dist_emb < neighbor_dist[:, None]).sum(axis=1) + 1
        # Only neighbours pushed outside the embedded k-neighbourhood count.
        penalty += np.maximum(emb_rank - k, 0).sum()

    scale = 2.0 / (n_samples * k * (2.0 * n_samples - 3.0 * k - 1.0))
    return float(1.0 - scale * penalty)

def mrre(X, X_embedded, n_neighbors=5):
    """Mean relative rank error over the embedded-space neighbourhoods.

    For every point ``i`` and for ``j = 1 .. n_neighbors``, take the ``j``-th
    nearest neighbour of ``i`` in the embedding, look up its rank ``r`` among
    the neighbours of ``i`` in the original space, and accumulate
    ``|r - j| / j``. The sum is divided by ``n_samples * n_neighbors``, so the
    result is a plain mean (0 means the neighbour ranks agree exactly).

    The point itself is excluded from both rankings by setting the diagonal
    of each distance matrix to infinity, so duplicate rows cannot cause a
    point to be treated as its own neighbour.

    Parameters
    ----------
    X : array-like of shape (n_samples, n_features)
        Points in the original space.
    X_embedded : array-like of shape (n_samples, n_components)
        The same points, in the same row order, in the embedded space.
    n_neighbors : int, default=5
        Number of embedded-space neighbours examined per point; must satisfy
        ``1 <= n_neighbors < n_samples``.

    Returns
    -------
    float
        The mean relative rank error.

    Raises
    ------
    ValueError
        If the inputs have different numbers of rows or ``n_neighbors`` is
        out of range.
    """
    X = np.asarray(X)
    X_embedded = np.asarray(X_embedded)
    n_samples = X.shape[0]
    if X_embedded.shape[0] != n_samples:
        raise ValueError("X and X_embedded must contain the same number of rows")
    if not 1 <= n_neighbors < n_samples:
        raise ValueError(
            f"n_neighbors must be in [1, n_samples); got {n_neighbors} for {n_samples} samples"
        )

    # 1-based rank of every point j in the original-space ordering of point i.
    orig_dist = pairwise_distances(X)
    np.fill_diagonal(orig_dist, np.inf)
    orig_order = np.argsort(orig_dist, axis=1)
    del orig_dist
    # Invert each row permutation once instead of searching it per neighbour.
    orig_rank = np.empty_like(orig_order)
    np.put_along_axis(
        orig_rank, orig_order, np.arange(1, n_samples + 1)[None, :], axis=1
    )
    del orig_order

    # The n_neighbors closest points of every sample in the embedding.
    emb_dist = pairwise_distances(X_embedded)
    np.fill_diagonal(emb_dist, np.inf)
    emb_neighbors = np.argsort(emb_dist, axis=1)[:, :n_neighbors]
    del emb_dist

    emb_ranks = np.arange(1, n_neighbors + 1)
    rank_in_orig = np.take_along_axis(orig_rank, emb_neighbors, axis=1)
    relative_error = np.abs(rank_in_orig - emb_ranks) / emb_ranks
    return relative_error.sum() / (n_samples * n_neighbors)

def rank_correlation(X, X_embedded):
    """Spearman correlation between pairwise distances before and after embedding.

    Only the ``n * (n - 1) / 2`` distances between distinct points are
    compared. Self-distances (always zero in both spaces) are left out
    because they would enter as tied, perfectly concordant observations and
    bias the coefficient upwards; each unordered pair is counted once.

    Parameters
    ----------
    X : array-like of shape (n_samples, n_features)
        Points in the original space.
    X_embedded : array-like of shape (n_samples, n_components)
        The same points, in the same row order, in the embedded space.

    Returns
    -------
    float
        Spearman's rho between the two condensed Euclidean distance vectors.
    """
    # Local import: the notebook's import cell only pulls in scipy.stats.
    from scipy.spatial.distance import pdist

    # pdist returns the condensed upper triangle (Euclidean by default), so the
    # full N x N matrices are never materialised.
    dist_X = pdist(np.asarray(X, dtype=float))
    dist_embedded = pdist(np.asarray(X_embedded, dtype=float))
    rho, _ = spearmanr(dist_X, dist_embedded)
    return rho

In [3]:
# Prepare the inputs for the reduction loop.
# (The BERT embedding matrix is currently not built; only SBERT is processed.)
sbert_embeddings = np.stack(df['sbert_embeddings'].to_numpy())

# Embedding matrices keyed by the model name used in output column/file names.
embeddings_dict = {"SBERT": sbert_embeddings}

# One metrics record (a dict) is appended here per evaluated run.
results_all = list()
# Size of the reduced representation.
encoding_dim = 64

In [4]:
# Autoencoder training configuration, gathered in one place so a run can be
# reproduced and tuned without hunting for literals in the loop body.
AE_SEED = 42
AE_HIDDEN_DIM = 128
AE_BATCH_SIZE = 64
AE_EPOCHS = 50
AE_LEARNING_RATE = 1e-3
AE_LOG_EVERY = 10

for model_name, embedding_matrix in embeddings_dict.items():
    print(f"\n==== {model_name} 처리 시작 ====")

    # Seed torch's global RNG before the model and DataLoader are created:
    # weight initialisation and batch shuffling both draw from it, so without
    # a seed the saved embeddings (and every clustering built on them) differ
    # from run to run.
    torch.manual_seed(AE_SEED)

    # Scale every feature to [0, 1].
    # NOTE(review): the fitted scaler is not persisted; the saved autoencoder
    # was trained on inputs scaled this way.
    scaler = MinMaxScaler()
    X = scaler.fit_transform(embedding_matrix)

    # Build the dataset; input and target are the same tensor (reconstruction).
    tensor_X = torch.tensor(X, dtype=torch.float32)
    dataset = TensorDataset(tensor_X, tensor_X)
    dataloader = DataLoader(dataset, batch_size=AE_BATCH_SIZE, shuffle=True)

    # # PCA
    # print(f"{model_name} - PCA")
    # pca = PCA(n_components=encoding_dim)
    # X_pca = pca.fit_transform(X)
    # df[f"{model_name}_PCA_{encoding_dim}d"] = [",".join(map(str, row)) for row in X_pca]

    # results_all.append({
    #     "Model": model_name,
    #     "Method": "PCA",
    #     "Dim": encoding_dim,
    #     "ExplainedVariance": pca.explained_variance_ratio_.sum(),
    #     "Trustworthiness": trustworthiness(X, X_pca),
    #     "Continuity": continuity(X, X_pca),
    #     "MRRE": mrre(X, X_pca),
    #     "Spearman": rank_correlation(X, X_pca)
    # })

    # # UMAP
    # print(f"{model_name} - UMAP")
    # reducer = umap.UMAP(n_components=encoding_dim, random_state=42)
    # X_umap = reducer.fit_transform(X)
    # df[f"{model_name}_UMAP_{encoding_dim}d"] = list(X_umap)
    # results_all.append({
    #     "Model": model_name,
    #     "Method": "UMAP",
    #     "Dim": encoding_dim,
    #     "ExplainedVariance": None,
    #     "Trustworthiness": trustworthiness(X, X_umap),
    #     "Continuity": continuity(X, X_umap),
    #     "MRRE": mrre(X, X_umap),
    #     "Spearman": rank_correlation(X, X_umap)
    # })

    # # t-SNE
    # print(f"{model_name} - t-SNE")
    # tsne = TSNE(n_components=encoding_dim, method='exact', random_state=42, perplexity=30)
    # X_tsne = tsne.fit_transform(X)
    # df[f"{model_name}_TSNE_{encoding_dim}d"] = list(X_tsne)
    # results_all.append({
    #     "Model": model_name,
    #     "Method": "t-SNE",
    #     "Dim": encoding_dim,
    #     "ExplainedVariance": None,
    #     "Trustworthiness": trustworthiness(X, X_tsne),
    #     "Continuity": continuity(X, X_tsne),
    #     "MRRE": mrre(X, X_tsne),
    #     "Spearman": rank_correlation(X, X_tsne)
    # })

    # Autoencoder
    print(f"{model_name} - Autoencoder")
    model = Autoencoder(input_dim=X.shape[1], hidden_dim=AE_HIDDEN_DIM, encoding_dim=encoding_dim)
    criterion = nn.MSELoss()
    optimizer = optim.Adam(model.parameters(), lr=AE_LEARNING_RATE)
    for epoch in range(AE_EPOCHS):
        epoch_loss = 0
        for batch, _ in dataloader:
            optimizer.zero_grad()
            reconstructed = model(batch)
            loss = criterion(reconstructed, batch)
            loss.backward()
            optimizer.step()
            epoch_loss += loss.item()
        # Log the first epoch, every AE_LOG_EVERY-th epoch and the last epoch.
        if (epoch + 1) % AE_LOG_EVERY == 0 or epoch == 0 or epoch == AE_EPOCHS - 1:
            print(f"Epoch [{epoch+1}/{AE_EPOCHS}], Loss: {epoch_loss/len(dataloader):.6f}")

    # Encode the full dataset without tracking gradients.
    model.eval()
    with torch.no_grad():
        encoded = model.encoder(tensor_X).numpy()
    # Each reduced vector is stored as one comma-separated string per row.
    df[f"{model_name}_AE_{encoding_dim}d"] = [",".join(map(str, row)) for row in encoded]

    results_all.append({
        "Model": model_name,
        "Method": "Autoencoder",
        "Dim": encoding_dim,
        "ExplainedVariance": None,
        "Trustworthiness": trustworthiness(X, encoded),
        "Continuity": continuity(X, encoded),
        "MRRE": mrre(X, encoded),
        "Spearman": rank_correlation(X, encoded)
    })

    # Save the trained weights.
    model_path = f"autoencoder_{model_name}_{encoding_dim}d.pt"
    torch.save(model.state_dict(), model_path)
    print(f"Autoencoder 저장 완료 → {model_path}")

# Persist the metrics table and the frame with the new embedding column.
results_df = pd.DataFrame(results_all)
results_df.to_csv("./emb_result_ref.csv", index=False)
df.to_csv("./all_origin_emb_red.csv", index=False)
print("모든 차원 축소 및 저장 완료!")


==== SBERT 처리 시작 ====
SBERT - Autoencoder
Epoch [1/50], Loss: 0.021580
Epoch [10/50], Loss: 0.006854
Epoch [20/50], Loss: 0.005821
Epoch [30/50], Loss: 0.005595
Epoch [40/50], Loss: 0.005426
Epoch [50/50], Loss: 0.005269
Autoencoder 저장 완료 → autoencoder_SBERT_64d.pt
모든 차원 축소 및 저장 완료!


In [None]:
# Display the evaluation metrics table built from results_all.
results_df

In [5]:
# 라이브러리 불러오기
import re
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.neighbors import NearestNeighbors
from sklearn.model_selection import GridSearchCV
import matplotlib.cm as cm
from scipy.cluster.hierarchy import dendrogram, linkage
from sklearn.cluster import AgglomerativeClustering
from sklearn.cluster import Birch
from sklearn.metrics import silhouette_score, silhouette_samples
import random
import pickle

In [6]:
# Reload the table written by the dimensionality-reduction cell. This rebinds
# `df`; the embedding columns come back from CSV as strings.
df = pd.read_csv('./all_origin_emb_red.csv')
df

Unnamed: 0,ticket_id_hashed,Privacy_Detect_Col,components,keyword,사업부,지역,language,대분류,중분류,소분류,...,thinQmodel,salesmodel,generated_summary,generated_translation,merge_key,generated_response,mapped_summary,matched_terms,sbert_embeddings,SBERT_AE_64d
0,569aa41,본 데이터는 HADATAPLFM-10114에서 제공한 개인정보로 2025-07-31...,Dryer,버튼설정,리빙,US,,,,,...,"['BDV_TX_BT_LED_6203', 'TLXU7BBP']","['DLE8400BE.AEBEEUS', 'WT8400CB.AEBETUS']",The user suggests having the control panel lig...,,,,The user suggests having the control panel lig...,[],"[-0.020517314, -0.047066975, -0.0118728345, 0....","0.07576729,0.07902212,0.016557923,-0.16225083,..."
1,d71a6d7,본 데이터는 HADATAPLFM-10114에서 제공한 개인정보로 2025-07-31...,Dryer,애플홈킷연동,리빙,US,['en'],,,,...,"['BDH_D30007_US', 'FAFXU22027']","['DLHC5502V.ASSEEUS', 'WM5500HVA.ASSEVUS']",The user suggests allowing full integration wi...,,,,The user suggests allowing full integration wi...,[],"[0.04891209, -0.015489842, 0.007906878, -0.059...","-0.05278446,0.040736273,-0.13176265,0.07386213..."
2,04d7c25,본 데이터는 HADATAPLFM-10114에서 제공한 개인정보로 2025-07-31...,Dryer,3rd_Party_연동,리빙,US,,,,,...,"['BDV_TX_BT_LED_6203', 'TLXU7BBP']","['DLG8401WE.AEWEEUS', 'WT8405CW.AEWEUUS']",The user suggests improving local control over...,,,,The user suggests improving local control over...,[],"[0.057805084, -0.015579243, 0.04613969, -0.015...","-0.02832905,0.013609283,-0.05970249,0.05727831..."
3,b6452d1,본 데이터는 HADATAPLFM-10114에서 제공한 개인정보로 2025-07-31...,Dryer,원격기능유지,리빙,US,,,,,...,['BDVG_FX0003_US'],['DLEX5500V.ASSEEUS'],The user suggests enabling remote start withou...,,,,The user suggests enabling remote controller s...,['remote'],"[0.042198393, -0.06663066, -0.0379671, -0.0171...","-0.1414326,-0.018785818,0.016333956,0.223997,-..."
4,d25d98b,본 데이터는 HADATAPLFM-10114에서 제공한 개인정보로 2025-07-31...,Dryer,타임싱크,리빙,US,,,,,...,"['BDVG_FX0003_US', 'FAFXU22007']","['DLEX6700B.ABLEEUS', 'WM6700HBA.ABLEVUS']",The user suggests providing a way to correct t...,,,,The user suggests providing a way to correct t...,[],"[0.002019161, 0.110980995, 0.040430292, 0.0094...","-0.2388581,0.078645445,-0.26221836,0.18130253,..."
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
24318,e0d440d,본 데이터는 HADATAPLFM-10114에서 제공한 개인정보로 2025-07-31...,에어컨,HW개선(소재개선),에어,KR,,제품,HW개선,HW 기능 추가/개선,...,,,The user suggests changing the connection part...,Changing the connection part of the wall-mount...,,,The user suggests changing the connection part...,[],"[0.025649324, 0.05508208, 0.06596928, 0.049144...","-0.024508232,-0.12841865,-0.17069769,-0.195582..."
24319,45e8080,본 데이터는 HADATAPLFM-10114에서 제공한 개인정보로 2025-07-31...,에어컨,UP콘텐츠_무드라이팅컬러추가,에어,KR,,제품,설정 제어,설정/변경,...,,,The user suggests a feature that allows settin...,I am currently using a model of the Whisen Tow...,,,The user suggests a feature that allows settin...,['feature'],"[-0.05098548, 0.024941046, 0.02539901, 0.06017...","-0.1293251,0.037915766,-0.3202865,0.10780667,-..."
24320,864eeff,본 데이터는 HADATAPLFM-10114에서 제공한 개인정보로 2025-07-31...,에어컨,바람방향설정/변경,에어,KR,,제품,설정 제어,설정/변경,...,,,"The user suggests adding an option for up, dow...","If there is an option for up, down, left, and ...",,,"The user suggests adding an option for up, dow...",['APP'],"[-0.04032098, -0.011781391, 0.024384914, 0.034...","-0.21959513,-0.07519117,0.036713343,-0.1908241..."
24321,6f042cb,본 데이터는 HADATAPLFM-10114에서 제공한 개인정보로 2025-07-31...,에어컨,바람방향설정/변경,에어,KR,,제품,설정 제어,설정/변경,...,,,The user suggests saving the wind direction se...,"Saving wind direction (left-right angle, up-do...",,,The user suggests saving the wind direction se...,[],"[0.02718613, 0.03308086, 0.07442402, 0.0356220...","0.015032046,0.06438572,-0.119413204,-0.0541646..."


In [7]:
def embedding_column_to_dataframe(df, embedding_col, embedding_dim=64):
    """Parse a column of serialised embedding vectors into a numeric DataFrame.

    Each cell of ``df[embedding_col]`` may be a string of numbers separated by
    commas and/or whitespace, optionally wrapped in square brackets
    (``"[0.1, 0.2]"``, ``"0.1,0.2"``, ``"0.1 0.2"``), or an already parsed
    array/list/tuple.

    Side effect: after successful validation, ``df[embedding_col]`` is
    replaced in place by the parsed float arrays, so calling the function
    again on the same frame is a no-op for that column.

    Parameters
    ----------
    df : pandas.DataFrame
        Source frame.
    embedding_col : str
        Name of the column that holds the embeddings.
    embedding_dim : int or None, default=64
        Expected vector length. Pass ``None`` to infer it from the data.

    Returns
    -------
    pandas.DataFrame
        One row per input row, columns ``dim_0 .. dim_{embedding_dim - 1}``,
        with a fresh default index.

    Raises
    ------
    ValueError
        If a cell is missing, empty or unparsable, if the vectors do not all
        have the same length, or if that length differs from
        ``embedding_dim``.
    """

    def _parse_embedding(value):
        # Text form: treat brackets and commas as separators, then split on
        # any run of whitespace.
        if isinstance(value, str):
            tokens = re.sub(r"[\[\],]", " ", value).split()
            if not tokens:
                raise ValueError(f"Empty embedding string in column {embedding_col!r}")
            try:
                return np.array([float(token) for token in tokens])
            except ValueError as exc:
                raise ValueError(
                    f"Cannot parse embedding in column {embedding_col!r}: {value[:60]!r}"
                ) from exc
        # Already parsed on an earlier call (or supplied as a sequence).
        if isinstance(value, (np.ndarray, list, tuple)):
            return np.asarray(value, dtype=float)
        # Anything else (e.g. NaN from an empty CSV cell) cannot be stacked.
        raise ValueError(
            f"Missing or unsupported embedding value in column {embedding_col!r}: {value!r}"
        )

    parsed = df[embedding_col].apply(_parse_embedding)

    # Validate before touching the caller's frame.
    lengths = parsed.map(len)
    if lengths.nunique() > 1:
        raise ValueError(
            f"Embeddings in column {embedding_col!r} have inconsistent lengths: "
            f"{sorted(lengths.unique().tolist())}"
        )
    actual_dim = int(lengths.iloc[0]) if len(lengths) else (embedding_dim or 0)
    if embedding_dim is None:
        embedding_dim = actual_dim
    elif actual_dim != embedding_dim:
        raise ValueError(
            f"Column {embedding_col!r} holds {actual_dim}-dimensional vectors, "
            f"but embedding_dim={embedding_dim} was requested"
        )

    df[embedding_col] = parsed

    # Stack into a 2-D array (rows = samples).
    if len(parsed):
        embedding_array = np.vstack(parsed.to_numpy())
    else:
        embedding_array = np.empty((0, embedding_dim), dtype=float)
    print(embedding_array.shape)
    print(embedding_array.dtype)

    # Wrap as a DataFrame with one named column per dimension.
    df_embeddings = pd.DataFrame(embedding_array, columns=[f"dim_{i}" for i in range(embedding_dim)])

    return df_embeddings

In [8]:
# Example: convert the 64-dimensional SBERT_AE embedding column
df_embeddings = embedding_column_to_dataframe(df, 'SBERT_AE_64d', embedding_dim=64) # <-- only the column name and the dimensionality need to be changed here

# Inspect the result
df_embeddings.head()

(24323, 64)
float64


Unnamed: 0,dim_0,dim_1,dim_2,dim_3,dim_4,dim_5,dim_6,dim_7,dim_8,dim_9,...,dim_54,dim_55,dim_56,dim_57,dim_58,dim_59,dim_60,dim_61,dim_62,dim_63
0,0.075767,0.079022,0.016558,-0.162251,-0.069939,0.176784,0.344817,0.145883,-0.019547,-0.052286,...,-0.291032,-0.001047,0.234873,-0.06481,0.02745,-0.143102,-0.197554,-0.01096,-0.001367,0.213566
1,-0.052784,0.040736,-0.131763,0.073862,0.318433,-0.123994,-0.159574,-0.052375,0.154869,0.067556,...,-0.11262,0.23226,-0.052793,-0.053453,0.184811,0.100419,-0.277661,0.392118,-0.274975,0.24109
2,-0.028329,0.013609,-0.059702,0.057278,0.343442,-0.005791,-0.218238,0.090118,0.140481,-0.075181,...,-0.085646,0.134212,0.033091,-0.127528,0.092318,0.188004,-0.490866,0.327098,0.023757,0.108377
3,-0.141433,-0.018786,0.016334,0.223997,-0.112966,0.06439,-0.218879,0.050858,0.171491,-0.055794,...,-0.253747,0.123764,0.102222,-0.242341,0.029058,0.014906,-0.181656,0.142165,-0.291786,-0.186976
4,-0.238858,0.078645,-0.262218,0.181303,-0.024056,-0.108034,-0.032344,0.006672,0.004191,0.0328,...,-0.079107,0.102014,-0.094346,0.160867,-0.191842,0.02423,-0.378175,0.068542,0.088154,0.020848


In [9]:
def save(file_name, result=None):
    """Write a clustering result table to ``<file_name>.csv`` and return it.

    Parameters
    ----------
    file_name : str
        Output path without the ``.csv`` extension (it is appended here).
    result : dict or None, default=None
        A result dict with a ``'data'`` DataFrame. When omitted, the first
        entry of the notebook-level ``results`` list is used.

    Returns
    -------
    pandas.DataFrame
        The table that was written.

    Raises
    ------
    RuntimeError
        If ``result`` is omitted and the global ``results`` list is undefined
        or empty (i.e. no clustering cell has been run yet).
    """
    if result is None:
        try:
            result = results[0]
        except (NameError, IndexError) as exc:
            raise RuntimeError(
                "No clustering result available: run a clustering cell first "
                "or pass `result=` explicitly."
            ) from exc
    save_result = result['data']
    # utf-8-sig writes a BOM at the start of the file.
    save_result.to_csv(f"{file_name}.csv", index=False, encoding="utf-8-sig")
    return save_result

In [10]:

from sklearn.metrics import calinski_harabasz_score, davies_bouldin_score, pairwise_distances

def dunn_index(X, labels):
    """Dunn index: smallest inter-cluster distance / largest cluster diameter.

    The inter-cluster distance of two clusters is the minimum distance between
    any pair of points taken one from each; the diameter of a cluster is the
    maximum distance between two of its members (0 for a singleton). Higher
    values indicate compact, well separated clusters.

    Parameters
    ----------
    X : array-like of shape (n_samples, n_features)
        Data points.
    labels : array-like of shape (n_samples,)
        Cluster label of every row of ``X``.

    Returns
    -------
    float
        The Dunn index. ``inf`` when every cluster has zero diameter.

    Raises
    ------
    ValueError
        If fewer than two distinct labels are present.
    """
    # Accept lists as well as arrays: `labels == label` must be element-wise.
    labels = np.asarray(labels)
    unique_labels = np.unique(labels)
    n_clusters = len(unique_labels)
    if n_clusters < 2:
        raise ValueError("dunn_index requires at least two clusters")

    distances = pairwise_distances(X)
    # Member indices per cluster, computed once rather than per cluster pair.
    members = [np.flatnonzero(labels == label) for label in unique_labels]

    intra_dists = [
        np.max(distances[np.ix_(idx, idx)]) if len(idx) > 1 else 0
        for idx in members
    ]

    inter_dists = []
    for i in range(n_clusters):
        for j in range(i + 1, n_clusters):
            inter_dists.append(np.min(distances[np.ix_(members[i], members[j])]))

    # A zero maximum diameter yields inf (or nan for 0/0) without a warning.
    with np.errstate(divide='ignore', invalid='ignore'):
        return np.min(inter_dists) / np.max(intra_dists)


In [15]:
def birch_clustering(data, threshold=0.5, branching_factor=50, n_clusters=None):
    """Fit a BIRCH model on ``data`` and return the cluster label of every row."""
    estimator = Birch(
        threshold=threshold,
        branching_factor=branching_factor,
        n_clusters=n_clusters,
    )
    return estimator.fit_predict(data)

# Silhouette score computation
def calculate_silhouette(data, labels):
    """Return the silhouette score of ``labels`` on ``data``, or -1 for a single cluster."""
    has_multiple_clusters = len(np.unique(labels)) > 1
    if not has_multiple_clusters:
        # The silhouette is undefined for one cluster; -1 marks that case.
        return -1
    return silhouette_score(data, labels)

# Silhouette plot (2-D data only)
def plot_silhouette(data, labels, title):
    """Draw one horizontal band of sorted silhouette values per cluster.

    Nothing is drawn (a notice is printed instead) unless ``data`` has exactly
    two columns. The red dashed vertical line marks the overall silhouette
    score.
    """
    if data.shape[1] != 2:
        print("\n* [Not 2D] Silhouette plot skipped")
        return

    mean_silhouette = silhouette_score(data, labels)
    per_sample = silhouette_samples(data, labels)
    cluster_ids = np.unique(labels)
    n_found = len(cluster_ids)

    plt.figure(figsize=(10, 6))
    band_start = 10  # vertical offset of the first band
    for cluster_id in cluster_ids:
        band_values = np.sort(per_sample[labels == cluster_id])
        band_size = len(band_values)
        band_end = band_start + band_size
        band_color = plt.cm.nipy_spectral(float(cluster_id) / n_found)
        plt.fill_betweenx(np.arange(band_start, band_end), 0, band_values, facecolor=band_color, alpha=0.7)
        plt.text(-0.05, band_start + 0.5 * band_size, str(cluster_id))
        # Leave a 10-unit gap before the next cluster's band.
        band_start = band_end + 10

    plt.axvline(x=mean_silhouette, color="red", linestyle="--")
    plt.title(title)
    plt.xlabel("Silhouette Coefficient")
    plt.ylabel("Cluster Label")
    plt.show()

# Cluster scatter plot (2-D data only)
def visualize_clusters(data, labels, title):
    """Scatter-plot 2-D points coloured by cluster label.

    Parameters
    ----------
    data : array-like of shape (n_samples, 2)
        Points to plot; NumPy arrays and DataFrames are both accepted. Input
        that is not two-dimensional with exactly two columns is skipped with
        a printed notice.
    labels : array-like of shape (n_samples,)
        Cluster label per point, used for colouring.
    title : str
        Figure title.
    """
    # Convert first so positional slicing below also works for DataFrames,
    # where `data[:, 0]` is not valid indexing.
    points = np.asarray(data)
    if points.ndim != 2 or points.shape[1] != 2:
        print("\n* [Not 2D] Cluster scatter plot skipped")
        return
    plt.figure(figsize=(8, 8))
    scatter = plt.scatter(points[:, 0], points[:, 1], c=labels, cmap="viridis", alpha=0.7)
    plt.title(title)
    plt.xlabel("Dimension 1")
    plt.ylabel("Dimension 2")
    plt.colorbar(scatter, label="Cluster Label")
    plt.show()

# Random-search parameter optimisation
def _score_birch_candidate(data, threshold, branching_factor):
    """Cluster once; return (labels, n_clusters, silhouette) or None if degenerate."""
    labels = birch_clustering(data, threshold=threshold, branching_factor=branching_factor)
    n_clusters = len(np.unique(labels))
    # Reject a single cluster and solutions with more clusters than half the samples.
    if n_clusters > len(data) * 0.5 or n_clusters <= 1:
        return None
    return labels, n_clusters, calculate_silhouette(data, labels)


def optimize_birch(data, initial_samples=100, fine_tuning_samples=5,
                   threshold_range=(1, 2), branching_factor_range=(30, 70),
                   fallback_threshold_range=(0.4, 0.6),
                   fallback_branching_factor_range=(40, 60),
                   random_state=None):
    """Random search for the BIRCH parameters with the best silhouette score.

    Stage 1 draws ``initial_samples`` candidates from ``threshold_range`` /
    ``branching_factor_range`` and logs silhouette, Calinski-Harabasz,
    Davies-Bouldin and Dunn scores for every non-degenerate one. If no
    candidate is accepted, stage 2 draws ``fine_tuning_samples`` candidates
    from the fallback ranges (without the extra logging).

    Parameters
    ----------
    data : array-like of shape (n_samples, n_features)
        Points to cluster.
    initial_samples : int, default=100
        Number of stage-1 candidates.
    fine_tuning_samples : int, default=5
        Number of stage-2 candidates (only used when stage 1 finds nothing).
    threshold_range, fallback_threshold_range : tuple of (low, high)
        Bounds passed to ``uniform`` when sampling ``threshold``.
    branching_factor_range, fallback_branching_factor_range : tuple of (low, high)
        Inclusive integer bounds passed to ``randint`` for ``branching_factor``.
    random_state : int or None, default=None
        Seed for a private ``random.Random``; ``None`` keeps using the global
        ``random`` module state (non-reproducible unless seeded elsewhere).

    Returns
    -------
    dict
        ``{'threshold', 'branching_factor', 'n_clusters'}`` of the best
        candidate, or ``{'threshold': 0.5, 'branching_factor': 50,
        'n_clusters': 1}`` when no candidate was accepted.
    """
    rng = random if random_state is None else random.Random(random_state)
    best_score = -1
    best_params = None
    best_n_clusters = 0

    # Stage 1: random search over the primary ranges.
    print("1. Random protecting")
    for _ in range(initial_samples):
        threshold = rng.uniform(*threshold_range)
        branching_factor = rng.randint(*branching_factor_range)
        candidate = _score_birch_candidate(data, threshold, branching_factor)
        if candidate is None:
            continue
        labels, n_clusters, score = candidate
        if score > best_score:
            best_score = score
            best_params = {'threshold': threshold, 'branching_factor': branching_factor}
            best_n_clusters = n_clusters
        # Extra indices are computed for the log line only; they do not
        # influence which candidate is selected.
        ch_score = calinski_harabasz_score(data, labels)
        db_score = davies_bouldin_score(data, labels)
        dunn = dunn_index(data, labels)
        print(f"threshold :{threshold:.4f}, branching factor :{branching_factor}, k :{n_clusters}, score :{score:.4f},ch_score :{ch_score:.4f}, db_score :{db_score:.4f}, dunn_index :{dunn:.4f}")
    print(f"Initial best params: {best_params}, Best clusters: {best_n_clusters}, Best silhouette score: {best_score:.4f}")

    # Stage 2: nothing accepted, so retry with the fallback ranges.
    if best_params is None:
        print("\nInitial optimization failed. Reducing parameter range and retrying.")
        for _ in range(fine_tuning_samples):
            threshold = rng.uniform(*fallback_threshold_range)
            branching_factor = rng.randint(*fallback_branching_factor_range)
            candidate = _score_birch_candidate(data, threshold, branching_factor)
            if candidate is None:
                continue
            _, n_clusters, score = candidate
            if score > best_score:
                best_score = score
                best_params = {'threshold': threshold, 'branching_factor': branching_factor}
                best_n_clusters = n_clusters

    if best_params is None:
        print("Warning: No optimal parameters found after multiple attempts. Using default values.")
        return {'threshold': 0.5, 'branching_factor': 50, 'n_clusters': 1}
    print('\n[최적값 도출] -------------------------------------------------------------------------')
    print(f'Best Params: {best_params}, Best Clusters: {best_n_clusters}, Best Silhouette Score: {best_score:.4f}')
    return {**best_params, 'n_clusters': best_n_clusters}

# Clustering driver (optionally merges metadata_df)
def process_birch_embedding(data, metadata_df=None):
    """Search BIRCH parameters, cluster ``data`` and return a result summary.

    Parameters
    ----------
    data : array-like or DataFrame of shape (n_samples, n_features)
        Points to cluster.
    metadata_df : pandas.DataFrame or None, default=None
        Optional per-row metadata placed to the left of the result columns.
        Rows are matched by position, not by index label.

    Returns
    -------
    dict
        ``threshold`` / ``branching_factor`` chosen by ``optimize_birch``,
        ``n_clusters`` (number of distinct labels actually produced),
        ``silhouette_score`` (-1 for a single cluster) and ``data``: the
        features plus ``Cluster`` and ``Silhouette`` columns.
    """
    best_params = optimize_birch(data)
    labels = birch_clustering(data, threshold=best_params['threshold'], branching_factor=best_params['branching_factor'])
    # Count the clusters actually produced: when the search falls back to its
    # default parameters it reports n_clusters=1 regardless of these labels.
    n_found = len(np.unique(labels))

    # Copy so that adding columns cannot leak into the caller's frame.
    df_result = pd.DataFrame(data).copy()
    df_result['Cluster'] = labels

    if n_found > 1:
        # One O(n^2) pass: the silhouette score is the mean of the per-sample values.
        silhouette_vals = silhouette_samples(data, labels)
        silhouette = np.mean(silhouette_vals)
        df_result['Silhouette'] = silhouette_vals
    else:
        silhouette = -1
        df_result['Silhouette'] = -1

    # Both plots skip themselves unless the data is 2-D.
    visualize_clusters(data, labels, title=f"BIRCH Clustering")
    plot_silhouette(data, labels, title=f"Silhouette Plot")

    # Merge metadata by position; both indexes are reset so a non-default
    # index on either side cannot misalign rows.
    if metadata_df is not None:
        df_result = pd.concat(
            [metadata_df.reset_index(drop=True), df_result.reset_index(drop=True)],
            axis=1,
        )

    return {
        'threshold': best_params['threshold'],
        'branching_factor': best_params['branching_factor'],
        'n_clusters': n_found,
        'silhouette_score': silhouette,
        'data': df_result
    }

In [16]:
# Run the random parameter search followed by clustering; the single result
# dict is kept in `results`, which save() reads from.
results = []
results.append(process_birch_embedding(df_embeddings,metadata_df=df[['ticket_id_hashed', 'generated_summary']]))

1. Random protecting
threshold :1.0637, branching factor :62, k :36, score :0.0790,ch_score :454.3427, db_score :2.5704, dunn_index :0.0389
threshold :1.0829, branching factor :34, k :22, score :0.0693,ch_score :566.6174, db_score :2.7102, dunn_index :0.0362
threshold :1.1536, branching factor :48, k :18, score :0.0673,ch_score :632.4130, db_score :2.7304, dunn_index :0.0722
threshold :1.0887, branching factor :58, k :23, score :0.0745,ch_score :573.0070, db_score :2.7273, dunn_index :0.0307
threshold :1.1166, branching factor :54, k :20, score :0.0668,ch_score :584.8511, db_score :2.8216, dunn_index :0.0329
threshold :1.0075, branching factor :42, k :74, score :0.0701,ch_score :280.7384, db_score :2.6478, dunn_index :0.0287
threshold :1.1379, branching factor :37, k :24, score :0.0683,ch_score :550.5184, db_score :2.8697, dunn_index :0.0557
threshold :1.0987, branching factor :33, k :23, score :0.0707,ch_score :567.1396, db_score :2.8069, dunn_index :0.0579
threshold :1.0771, branchin

In [18]:
def birch_clustering(data, threshold=1.0834781007926841, branching_factor=65, n_clusters=None):
    """Fit BIRCH on ``data`` and return ``(labels, fitted_model)``.

    The returned model is the same estimator instance that produced
    ``labels``, so it can be pickled and later used on new data.

    NOTE(review): this definition shadows the earlier ``birch_clustering``
    that returns labels only. ``optimize_birch`` and
    ``process_birch_embedding`` treat the return value as a label array, so
    they should not be called after this cell has run.
    """
    model = Birch(threshold=threshold, branching_factor=branching_factor, n_clusters=n_clusters)
    labels = model.fit_predict(data)
    return labels, model

def process_birch_embedding_fixed(data, threshold=1.0834781007926841, branching_factor=65, n_clusters=None, metadata_df=None,model_path=None):
    """Cluster ``data`` with fixed BIRCH parameters and return a result summary.

    Parameters
    ----------
    data : array-like or DataFrame of shape (n_samples, n_features)
        Points to cluster; a DataFrame is converted to a NumPy array, so the
        feature columns of the result are numbered ``0 .. n_features - 1``.
    threshold, branching_factor, n_clusters
        Forwarded to ``birch_clustering``.
    metadata_df : pandas.DataFrame or None, default=None
        Optional per-row metadata placed to the left of the result columns
        (matched by position after resetting its index).
    model_path : str or None, default=None
        If given, the model returned by ``birch_clustering`` is pickled there.

    Returns
    -------
    dict
        ``threshold``, ``branching_factor``, ``n_clusters`` (distinct labels
        found), ``silhouette_score`` (-1 for a single cluster) and ``data``:
        the features plus ``Cluster`` and ``Silhouette`` columns.
    """
    if isinstance(data, pd.DataFrame):
        data = data.to_numpy()
    # Clustering
    labels, model = birch_clustering(data, threshold=threshold, branching_factor=branching_factor, n_clusters=n_clusters)
    n_found = len(np.unique(labels))

    # Assemble the result table
    df_result = pd.DataFrame(data)
    df_result['Cluster'] = labels

    if n_found > 1:
        # One O(n^2) pass: the silhouette score is the mean of the per-sample values.
        silhouette_vals = silhouette_samples(data, labels)
        silhouette = np.mean(silhouette_vals)
        df_result['Silhouette'] = silhouette_vals
    else:
        silhouette = -1
        df_result['Silhouette'] = -1

    # Plots (each skips itself unless the data is 2-D)
    visualize_clusters(data, labels, title=f"BIRCH Clustering (threshold={threshold})")
    plot_silhouette(data, labels, title=f"Silhouette Plot (threshold={threshold})")

    # Merge metadata by position
    if metadata_df is not None:
        df_result = pd.concat([metadata_df.reset_index(drop=True), df_result], axis=1)

    if model_path:
        with open(model_path, 'wb') as f:
            pickle.dump(model, f)
        print(f"Model saved to {model_path}")

    return {
        'threshold': threshold,
        'branching_factor': branching_factor,
        'n_clusters': n_found,
        'silhouette_score': silhouette,
        'data': df_result
    }

In [19]:
# Cluster once with the fixed threshold and branching factor passed below
results = []
results.append(process_birch_embedding_fixed(
    data=df_embeddings,
    threshold=1.0834781007926841,
    branching_factor=65,
    metadata_df=df[['ticket_id_hashed', 'generated_summary']],
    model_path='birch_model.pkl'  # path the model is pickled to
))


* [Not 2D] Cluster scatter plot skipped

* [Not 2D] Silhouette plot skipped
Model saved to birch_model.pkl


In [21]:
# Write the table from results[0] to ./all_origin_hi_clu.csv (save() appends ".csv").
save('./all_origin_hi_clu')

Unnamed: 0,ticket_id_hashed,generated_summary,0,1,2,3,4,5,6,7,...,56,57,58,59,60,61,62,63,Cluster,Silhouette
0,569aa41,The user suggests having the control panel lig...,0.075767,0.079022,0.016558,-0.162251,-0.069939,0.176784,0.344817,0.145883,...,0.234873,-0.064810,0.027450,-0.143102,-0.197554,-0.010960,-0.001367,0.213566,7,0.091018
1,d71a6d7,The user suggests allowing full integration wi...,-0.052784,0.040736,-0.131763,0.073862,0.318433,-0.123994,-0.159574,-0.052375,...,-0.052793,-0.053453,0.184811,0.100419,-0.277661,0.392118,-0.274975,0.241090,0,-0.008502
2,04d7c25,The user suggests improving local control over...,-0.028329,0.013609,-0.059702,0.057278,0.343442,-0.005791,-0.218238,0.090118,...,0.033091,-0.127528,0.092318,0.188004,-0.490866,0.327098,0.023757,0.108377,0,-0.019235
3,b6452d1,The user suggests enabling remote start withou...,-0.141433,-0.018786,0.016334,0.223997,-0.112966,0.064390,-0.218879,0.050858,...,0.102222,-0.242341,0.029058,0.014906,-0.181656,0.142165,-0.291786,-0.186976,6,0.106826
4,d25d98b,The user suggests providing a way to correct t...,-0.238858,0.078645,-0.262218,0.181303,-0.024056,-0.108034,-0.032344,0.006672,...,-0.094346,0.160867,-0.191842,0.024230,-0.378175,0.068542,0.088154,0.020848,12,0.141516
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
24318,e0d440d,The user suggests changing the connection part...,-0.024508,-0.128419,-0.170698,-0.195582,0.054395,0.188260,-0.028575,-0.063503,...,0.050949,0.280975,-0.165160,0.384893,-0.466052,0.099158,0.047679,-0.176595,13,0.020911
24319,45e8080,The user suggests a feature that allows settin...,-0.129325,0.037916,-0.320286,0.107807,-0.081001,0.065715,0.450952,-0.097956,...,0.020701,-0.151016,-0.171744,0.095755,-0.314283,0.065789,-0.038813,0.204168,14,0.074073
24320,864eeff,"The user suggests adding an option for up, dow...",-0.219595,-0.075191,0.036713,-0.190824,-0.254214,0.116395,0.048776,-0.096348,...,0.101272,-0.046288,-0.034956,-0.058602,-0.420390,0.176029,0.159094,-0.157889,13,0.066487
24321,6f042cb,The user suggests saving the wind direction se...,0.015032,0.064386,-0.119413,-0.054165,-0.056012,0.143908,-0.008847,-0.081922,...,0.178498,0.125502,-0.118026,-0.080283,-0.468905,0.182202,0.119406,0.020134,13,0.037834
