In [1]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics import silhouette_score
from sklearn.cluster import DBSCAN


In [2]:
df = pd.read_csv("housing.csv")
df = df.drop(['ocean_proximity'], axis=1)
df.head()

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,median_house_value
0,-122.23,37.88,41.0,880.0,129.0,322.0,126.0,8.3252,452600.0
1,-122.22,37.86,21.0,7099.0,1106.0,2401.0,1138.0,8.3014,358500.0
2,-122.24,37.85,52.0,1467.0,190.0,496.0,177.0,7.2574,352100.0
3,-122.25,37.85,52.0,1274.0,235.0,558.0,219.0,5.6431,341300.0
4,-122.25,37.85,52.0,1627.0,280.0,565.0,259.0,3.8462,342200.0


In [3]:
df.isnull().sum().sort_values(ascending=False).head()


total_bedrooms        207
longitude               0
latitude                0
housing_median_age      0
total_rooms             0
dtype: int64

In [4]:
df.loc[(df['total_bedrooms'].isnull() == True), 'total_bedrooms'] = 0


In [5]:
df.isnull().sum().sort_values(ascending=False).head()

longitude             0
latitude              0
housing_median_age    0
total_rooms           0
total_bedrooms        0
dtype: int64

In [6]:
scaler = MinMaxScaler()
df = scaler.fit_transform(df)
df


array([[0.21115538, 0.5674814 , 0.78431373, ..., 0.02055583, 0.53966842,
        0.90226638],
       [0.21215139, 0.565356  , 0.39215686, ..., 0.18697583, 0.53802706,
        0.70824656],
       [0.21015936, 0.5642933 , 1.        , ..., 0.02894261, 0.46602805,
        0.69505074],
       ...,
       [0.31175299, 0.73219979, 0.31372549, ..., 0.07104095, 0.08276438,
        0.15938285],
       [0.30179283, 0.73219979, 0.33333333, ..., 0.05722743, 0.09429525,
        0.14371281],
       [0.30976096, 0.72582359, 0.29411765, ..., 0.08699227, 0.13025338,
        0.15340349]])

In [7]:
# Modify your silhouette_coefficient function to handle single-point clusters
def silhouette_coefficient(X, labels):
    num_samples = len(X)
    cluster_labels = np.unique(labels)
    num_clusters = len(cluster_labels)

    if num_clusters == 1:
        return 0.0  # Silhouette Coefficient undefined for one cluster

    s = 0.0
    for i in range(num_samples):
        a_i = 0.0
        b_i = float('inf')
        if i >= len(labels):
            continue  # Handle cases where the index is out of range
        cluster_i = labels[i]  # Cluster label of the current point
        if np.sum(labels == cluster_i) <= 1:  # Handle single-point clusters
            continue
        for j in range(num_samples):
            if i != j:
                if j >= len(labels):
                    continue  # Handle cases where the index is out of range
                if labels[j] == cluster_i:
                    a_i += np.linalg.norm(X[i] - X[j])
                else:
                    b_ij = np.mean([np.linalg.norm(X[i] - X[p])
                                   for p in range(num_samples) if labels[p] == labels[j]])
                    b_i = min(b_i, b_ij)

        s_i = (b_i - a_i) / max(a_i, b_i)
        s += s_i

    return s / num_samples


In [8]:
def dbscan(data, eps, min_samples):
    # Helper function to calculate Euclidean distance between two points
    def euclidean_distance(point1, point2):
        return np.linalg.norm(point1 - point2)

    # Helper function to find neighboring points
    def find_neighbors(point_index, epsilon):
        neighbors = []
        for i in range(len(data)):
            if i != point_index and euclidean_distance(data[point_index], data[i]) <= epsilon:
                neighbors.append(i)
        return neighbors

    # Initialize labels (0: unvisited, -1: noise, positive integers: cluster labels)
    labels = [0] * len(data)
    cluster_label = 0

    for point_index in range(len(data)):
        if labels[point_index] != 0:
            continue

        neighbors = find_neighbors(point_index, eps)

        if len(neighbors) < min_samples:
            labels[point_index] = -1  # Mark as noise
        else:
            cluster_label += 1
            labels[point_index] = cluster_label

            i = 0
            while i < len(neighbors):
                current_neighbor = neighbors[i]

                if labels[current_neighbor] == -1:
                    labels[current_neighbor] = cluster_label

                if labels[current_neighbor] == 0:
                    labels[current_neighbor] = cluster_label
                    new_neighbors = find_neighbors(current_neighbor, eps)

                    if len(new_neighbors) >= min_samples:
                        neighbors += new_neighbors

                i += 1

    return labels   


In [9]:

# 1. Set your DBSCAN parameters
eps = 0.5
min_samples = 5

# 2. Run DBSCAN on the scaled data
labels = dbscan(df, eps, min_samples)

# db = DBSCAN(eps=0.5, min_samples=5).fit(df)
# labels = db.labels_

# 3. Filter out noise points (-1) and get valid data
valid_labels = [label for label in labels if label != -1]
valid_data = df[labels != -1]

# # 4. Calculate Silhouette Coefficient for the best parameters
silhouette_avg = silhouette_score(valid_data, valid_labels)
# silhouette_avg = silhouette_score(df, labels)

print(
    f"Best parameters: eps={eps}, min_samples={min_samples}, Best Silhouette Coefficient={silhouette_avg}")


Best parameters: eps=0.5, min_samples=5, Best Silhouette Coefficient=0.6245720645625877


In [10]:
# Calculate Silhouette Score
silhouette_avg = silhouette_score(df, labels)                                               
print(f"Silhouette Score for eps=0.5 and min_pts=5: {silhouette_avg}")          

Silhouette Score for eps=0.5 and min_pts=5: 0.6245720645625877


In [13]:

# 1. Set your DBSCAN parameters
eps = 0.01
min_samples = 3

# 2. Run DBSCAN on the scaled data
labels = dbscan(df, eps, min_samples)

# 3. Filter out noise points (-1) and get valid data
valid_labels = [label for label in labels if label != -1]
valid_data = df[labels != -1]

# # 4. Calculate Silhouette Coefficient for the best parameters
silhouette_avg = silhouette_score(valid_data, valid_labels)

print(
    f"Best parameters: eps={eps}, min_samples={min_samples}, Best Silhouette Coefficient={silhouette_avg}")


Best parameters: eps=0.01, min_samples=3, Best Silhouette Coefficient=-0.2091164468712811


In [14]:

# 1. Set your DBSCAN parameters
eps = 0.1
min_samples = 6

# 2. Run DBSCAN on the scaled data
labels = dbscan(df, eps, min_samples)

# 3. Filter out noise points (-1) and get valid data
valid_labels = [label for label in labels if label != -1]
valid_data = df[labels != -1]

# # 4. Calculate Silhouette Coefficient for the best parameters
silhouette_avg = silhouette_score(valid_data, valid_labels)

print(
    f"Best parameters: eps={eps}, min_samples={min_samples}, Best Silhouette Coefficient={silhouette_avg}")


Best parameters: eps=0.1, min_samples=6, Best Silhouette Coefficient=-0.3827686941565909


In [16]:

# 1. Set your DBSCAN parameters
eps = 0.2
min_samples = 9

# 2. Run DBSCAN on the scaled data
labels = dbscan(df, eps, min_samples)
# db = DBSCAN(eps=0.2, min_samples=9).fit(df)
# labels = db.labels_

# 3. Filter out noise points (-1) and get valid data
valid_labels = [label for label in labels if label != -1]
valid_data = df[labels != -1]

# # 4. Calculate Silhouette Coefficient for the best parameters
silhouette_avg = silhouette_score(valid_data, valid_labels)
# silhouette_avg = silhouette_score(df, labels)

print(
    f"Best parameters: eps={eps}, min_samples={min_samples}, Best Silhouette Coefficient={silhouette_avg}")


Best parameters: eps=0.2, min_samples=9, Best Silhouette Coefficient=0.36298796879455447


In [26]:

# 1. Set your DBSCAN parameters
eps = 0.6
min_samples = 15

# 2. Run DBSCAN on the scaled data
labels = dbscan(df, eps, min_samples)
# db = DBSCAN(eps=0.6, min_samples=15).fit(df)
# labels = db.labels_

# 3. Filter out noise points (-1) and get valid data
valid_labels = [label for label in labels if label != -1]
valid_data = df[labels != -1]

# # 4. Calculate Silhouette Coefficient for the best parameters
silhouette_avg = silhouette_score(valid_data, valid_labels)
# silhouette_avg = silhouette_score(df, labels)

print(
    f"Best parameters: eps={eps}, min_samples={min_samples}, Best Silhouette Coefficient={silhouette_avg}")


Best parameters: eps=0.6, min_samples=15, Best Silhouette Coefficient=0.6267714862377932


In [25]:

# # 1. Set your DBSCAN parameters
# eps = 0.7
# min_samples = 9

# # 2. Run DBSCAN on the scaled data
# # labels = dbscan(df, eps, min_samples)

# db = DBSCAN(eps=0.8, min_samples=12).fit(df)
# labels = db.labels_


# # 3. Filter out noise points (-1) and get valid data
# valid_labels = [label for label in labels if label != -1]
# valid_data = df[labels != -1]

# # # 4. Calculate Silhouette Coefficient for the best parameters
# # silhouette_avg = silhouette_score(valid_data, valid_labels)
# silhouette_avg = silhouette_score(df, labels)

# print(
#     f"Best parameters: eps={eps}, min_samples={min_samples}, Best Silhouette Coefficient={silhouette_avg}")


In [24]:
# # Set parameters for DBSCAN
# eps_values = [0.01, 0.1, 0.2, 0.5, 0.8, 1.0]
# min_samples_values = [3, 6, 9, 12, 15, 18]

# # Initialize the best_silhouette value
# best_silhouette = -1.0

# # Iterate through different combinations of eps and min_samples
# for i in range(len(eps_values)):
#     eps = eps_values[i]
#     min_samples = min_samples_values[i]

#     # Fit DBSCAN to the data
#     db = DBSCAN(eps=eps, min_samples=min_samples).fit(df)
#     # labels = db.fit_predict(df)
    
#     # db = DBSCAN(eps=0.5, min_samples=5).fit(X)
#     labels = db.labels_
#     # print(labels)

#     # Filter out noise points (-1)
#     valid_labels = [label for label in labels if label != -1]
#     valid_data = df[labels != -1]

#     # Calculate Silhouette Coefficient
#     silhouette_avg = silhouette_score(valid_data, valid_labels)

#     print(
#         f"eps={eps}, min_samples={min_samples}, Silhouette Coefficient={silhouette_avg}")

#     if silhouette_avg > best_silhouette:
#         best_silhouette = silhouette_avg
#         best_eps = eps
#         best_min_samples = min_samples

# print(
#     f"Best parameters: eps={best_eps}, min_samples={best_min_samples}, Best Silhouette Coefficient={best_silhouette}")
