## DBSCAN 최적 파라미터 찾기

### eps 찾기 - k-거리 계산법

`directory_path`에는 경로 CSV 파일들이 들어 있는 상위 폴더의 경로를 지정하면 됩니다.

In [None]:
import numpy as np
import pandas as pd
import os
from sklearn.neighbors import NearestNeighbors
import matplotlib.pyplot as plt

def extract_lat_lng_from_csv(directory):
    """Collect the (lat, lng) pairs from every CSV file in a directory.

    Args:
        directory: Path of the folder to scan. Only entries whose name ends
            with ".csv" are read; the listing is not recursive.

    Returns:
        A 2-D array with one row per data point and two columns (lat, lng),
        stacked across all files in sorted file-name order. An empty array
        of shape (0, 2) is returned when no rows are found.

    Raises:
        KeyError: If a CSV file has no 'lat' or 'lng' column.
    """
    per_file_points = []
    # Sort so the row order does not depend on the OS directory listing order.
    for filename in sorted(os.listdir(directory)):
        if not filename.endswith(".csv"):
            continue
        df = pd.read_csv(os.path.join(directory, filename))
        # Selecting both columns at once replaces the per-row iterrows() loop.
        points = df[['lat', 'lng']].to_numpy()
        # Skip files without rows so they cannot disturb the concatenation.
        if len(points):
            per_file_points.append(points)

    if not per_file_points:
        # np.concatenate rejects an empty list; hand back a well-shaped result.
        return np.empty((0, 2))
    return np.concatenate(per_file_points)  # merge all points into one array

def plot_k_distance_graph(data, k):
    """Plot the sorted k-distance curve and return the distance at its elbow.

    Three stacked panels are drawn: the sorted k-distances, their first
    difference and their second difference. The elbow is taken to be the
    point where the second difference is largest.

    Args:
        data: Samples to analyse, passed unchanged to NearestNeighbors.
        k: Value used as ``n_neighbors`` for the neighbour query.

    Returns:
        The sorted k-distance value located at the elbow point.

    Raises:
        ValueError: If fewer than three k-distances are available, because
            a second difference cannot be formed.
    """
    # Distance from every sample to the last of its k neighbours.
    # NOTE(review): the query set is the fitted data itself, so each sample is
    # presumably returned as its own first neighbour (distance 0) - confirm
    # that this is the intended meaning of "k-th neighbour".
    neighbors = NearestNeighbors(n_neighbors=k)
    neighbors.fit(data)
    distances, _ = neighbors.kneighbors(data)
    k_distances = np.sort(distances[:, -1], axis=0)  # keep last column, sort ascending

    if k_distances.size < 3:
        raise ValueError(
            "at least 3 data points are required to locate the elbow, "
            f"got {k_distances.size}"
        )

    # Discrete first and second derivatives of the sorted curve.
    first_derivative = np.diff(k_distances)
    second_derivative = np.diff(first_derivative)

    # One panel per series: (values, y-axis label, optional title).
    panels = (
        (k_distances, 'Distance to the k-th Nearest Neighbor', 'K-Distance Graph'),
        (first_derivative, '1st Derivative', None),
        (second_derivative, '2nd Derivative', None),
    )
    plt.figure(figsize=(15, 10))
    for position, (series, y_label, title) in enumerate(panels, start=1):
        plt.subplot(3, 1, position)
        plt.plot(series)
        if title is not None:
            plt.title(title)
        plt.xlabel('Data Points Sorted by Distance')
        plt.ylabel(y_label)
        plt.grid(True)
    plt.show()

    # Locate the elbow. second_derivative[i] is
    # k_distances[i + 2] - 2 * k_distances[i + 1] + k_distances[i], i.e. it is
    # centred on index i + 1, hence the shift by one.
    elbow_index = int(np.argmax(second_derivative)) + 1
    return k_distances[elbow_index]

# Parent folder that holds the route CSV files.
directory_path = "../어디쉐어/어디쉐어 dbscan"
data = extract_lat_lng_from_csv(directory_path)

# Draw the k-distance graph with its slope changes and keep the elbow distance.
neighbor_count = 5
optimal_eps = plot_k_distance_graph(data, k=neighbor_count)
print(f'Optimal eps value: {optimal_eps}')


### min_samples, eps 변경에 따른 silhouette_score 비교

silhouette_score는 -1에서 1 사이의 값을 가지며, 1에 가까울수록 군집화의 적합도가 높다고 평가합니다.

In [None]:
from sklearn.metrics import silhouette_score

def evaluate_dbscan(data, eps_values, min_samples_values):
    """Grid-search DBSCAN parameters and keep the best silhouette score.

    Every (eps, min_samples) combination is fitted on ``data``. Combinations
    that yield fewer than two real clusters are skipped; for the others the
    silhouette score is printed and the best one is remembered.

    Args:
        data: Samples to cluster, passed unchanged to DBSCAN and
            silhouette_score.
        eps_values: Iterable of candidate ``eps`` values.
        min_samples_values: Iterable of candidate ``min_samples`` values. It
            is iterated once per eps value, so it must be re-iterable.

    Returns:
        A ``(best_eps, best_min_samples)`` tuple, or ``(None, None)`` when no
        combination produced a score above the initial -1.
    """
    # Imported locally: the import cells visible in this notebook do not
    # bring DBSCAN into scope.
    from sklearn.cluster import DBSCAN

    best_score = -1
    best_eps = None
    best_min_samples = None

    for eps in eps_values:
        for min_samples in min_samples_values:
            dbscan = DBSCAN(eps=eps, min_samples=min_samples)
            labels = dbscan.fit_predict(data)

            # DBSCAN labels noise as -1; that label is not a cluster, so it
            # must not count towards the "at least two clusters" requirement.
            cluster_ids = set(labels.tolist())
            cluster_ids.discard(-1)
            if len(cluster_ids) < 2:
                continue

            # The score is still computed over all samples, noise included.
            score = silhouette_score(data, labels, metric='euclidean')
            print(f'eps={eps}, min_samples={min_samples}, silhouette_score={score}')
            if score > best_score:
                best_score = score
                best_eps = eps
                best_min_samples = min_samples

    return best_eps, best_min_samples

# Example usage
# NOTE(review): the step (0.05) is larger than the span (0.01 to 0.015), so
# this grid contains only eps=0.01 - confirm the intended range and step.
eps_values = np.arange(0.01, 0.015, 0.05)
min_samples_values = range(20, 50, 5)
# NOTE(review): lat_lng_values is not defined in the visible cells - confirm
# where it comes from before running this cell.
data = np.concatenate(lat_lng_values)  # merge all data points into one array
best_eps, best_min_samples = evaluate_dbscan(data, eps_values, min_samples_values)

print(f'Best eps: {best_eps}, Best min_samples: {best_min_samples}')
