In [14]:
import numpy as np 
import pandas as pd 
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import StandardScaler

In [15]:
data= pd.read_csv("housing.csv")
print(data.shape)
data.head()

(20640, 10)


Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,median_house_value,ocean_proximity
0,-122.23,37.88,41.0,880.0,129.0,322.0,126.0,8.3252,452600.0,NEAR BAY
1,-122.22,37.86,21.0,7099.0,1106.0,2401.0,1138.0,8.3014,358500.0,NEAR BAY
2,-122.24,37.85,52.0,1467.0,190.0,496.0,177.0,7.2574,352100.0,NEAR BAY
3,-122.25,37.85,52.0,1274.0,235.0,558.0,219.0,5.6431,341300.0,NEAR BAY
4,-122.25,37.85,52.0,1627.0,280.0,565.0,259.0,3.8462,342200.0,NEAR BAY


# Data Preprocessing

In [16]:
# checking for missing values
# number of instances where values are missing in different columns
data.isnull().sum()

longitude               0
latitude                0
housing_median_age      0
total_rooms             0
total_bedrooms        207
population              0
households              0
median_income           0
median_house_value      0
ocean_proximity         0
dtype: int64

In [17]:
# dropping data points where total_bedrooms is empty
data = data.dropna(subset=["total_bedrooms"])

# Alternative : replacing missing values with thier mean
# data.loc[(data['total_bedrooms'].isnull()==True),'total_bedrooms']=data['total_bedrooms'].mean()

data.isnull().sum()

longitude             0
latitude              0
housing_median_age    0
total_rooms           0
total_bedrooms        0
population            0
households            0
median_income         0
median_house_value    0
ocean_proximity       0
dtype: int64

In [18]:
data["ocean_proximity"].value_counts()

<1H OCEAN     9034
INLAND        6496
NEAR OCEAN    2628
NEAR BAY      2270
ISLAND           5
Name: ocean_proximity, dtype: int64

In [19]:
label_encoder = LabelEncoder()
# adding a new column ocean_proximity_N
data['ocean_proximity_N']=label_encoder.fit_transform(data['ocean_proximity'])
# dropping ocean_proximity
data.drop('ocean_proximity', inplace=True, axis=1)
data.drop('ocean_proximity_N', inplace=True, axis=1)

In [20]:
# taking only first 5000 data samples to reduce computation time
data = data.iloc[:5000, :]

# scale = StandardScaler()
# X = scale.fit_transform(data)

# Scale the numerical features to be between 0 and 1
scaler = MinMaxScaler()
X = scaler.fit_transform(data)

print(data.shape)
data.head()

(5000, 9)


Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,median_house_value
0,-122.23,37.88,41.0,880.0,129.0,322.0,126.0,8.3252,452600.0
1,-122.22,37.86,21.0,7099.0,1106.0,2401.0,1138.0,8.3014,358500.0
2,-122.24,37.85,52.0,1467.0,190.0,496.0,177.0,7.2574,352100.0
3,-122.25,37.85,52.0,1274.0,235.0,558.0,219.0,5.6431,341300.0
4,-122.25,37.85,52.0,1627.0,280.0,565.0,259.0,3.8462,342200.0


# DBSCAN

In [21]:
import math
import time

In [22]:
# Calculate the Silhouette Coefficient
def silhouette_coefficient(X, labels):
    num_samples = len(X)
    cluster_labels = np.unique(labels)
    num_clusters = len(cluster_labels)

    if num_clusters == 1:
        return 0.0  # Silhouette Coefficient undefined for one cluster

    s = 0.0
    for i in range(num_samples):
        a_i = 0.0
        b_i = float('inf')
        cluster_i = labels[i]  # Cluster label of the current point
        for j in range(num_samples):
            if i != j:
                if labels[j] == cluster_i:
                    a_i += np.linalg.norm(X[i] - X[j])
                else:
                    b_ij = np.mean([np.linalg.norm(X[i] - X[p])
                                   for p in range(num_samples) if labels[p] == labels[j]])
                    b_i = min(b_i, b_ij)

        s_i = (b_i - a_i) / max(a_i, b_i)
        s += s_i

    return s / num_samples


In [23]:
def getNeighbours(data, corePoint, eps):
    
    def EuclideanDistance(p1, p2):
        return  np.sqrt(np.sum(np.square(p1 - p2)))
    
    neighbours = []
    
    for i in range(len(data)):
        dist = EuclideanDistance(data[i], data[corePoint])
        
        if dist < eps:
            neighbours.append(i)
            
    return neighbours

In [24]:
def dbScan(data, eps, MinPts):
    
    def growCluster(data, labels, corePoint, 
                    neighbours, clusterID, eps, MinPts):
        labels[corePoint] = clusterID
        i = 0
        while i < len(neighbours):    
            nh = neighbours[i]

            if labels[nh] == -1:
                labels[nh] = clusterID
            elif labels[nh] == 0:
                labels[nh] = clusterID
                PnNeighbours = getNeighbours(data, nh, eps)

                if len(PnNeighbours) >= MinPts:
                    neighbours = neighbours + PnNeighbours
            i += 1
            
    labels = np.zeros(len(data))
    clusterID = 0
    
    for p in range(len(data)):
        if labels[p] == 0:
            neighbours = getNeighbours(data, p, eps)
            
            if len(neighbours) < MinPts:
                labels[p] = -1
            else:
                clusterID += 1
                growCluster(data, labels, p, neighbours, 
                            clusterID, eps, MinPts)
                
    return labels

In [25]:
def numNoise(labels):
    """
    @param labels: Predicted labels
    @return number of noise determined by cluster algorithm 
    """
    return int(abs(np.sum(labels[labels < 0])))
    
def nClusters(labels):
    """
    @param labels: Predicted labels
    @return number of cluseters 
    """
    return len(np.unique(labels[labels > 0]))

In [26]:
sT = time.time()
labels = dbScan(X, 0.5, 5)
eT = time.time()

In [27]:
print('DBSCAN took {:.3f} seconds'.format(eT-sT))
print('Number of clusters = {}'.format(nClusters(labels)))
print('Number of noise points = {}'.format(numNoise(labels)))

DBSCAN took 118.317 seconds
Number of clusters = 1
Number of noise points = 1


In [28]:
# score = silhouette_coefficient(X, labels)
# print('silhouette_coefficient', score)

In [29]:
# Calculate Silhouette Score
silhouette_avg = silhouette_score(data, labels)
print(f"Silhouette Score for eps=0.5 and min_pts=5: {silhouette_avg}")

Silhouette Score for eps=0.5 and min_pts=5: 0.5016937576906915


# Verify with sklearn

In [30]:
from sklearn.cluster import DBSCAN
db = DBSCAN(eps = 0.5, min_samples = 5).fit(X)
labels1 = db.labels_
print('Number of clusters = {}'.format(nClusters(labels1) + 1))

Number of clusters = 1


In [31]:
from sklearn.metrics import silhouette_score
silhouette_avg = silhouette_score(data, labels1)
print(f"Silhouette Score for eps=0.5 and min_pts=5: {silhouette_avg}")

Silhouette Score for eps=0.5 and min_pts=5: 0.5016937576906915


# Test for different values of Eps and Min_points

In [33]:
# eps = 0.1 min_samples = 6
from sklearn.cluster import DBSCAN
db = DBSCAN(eps = 0.1, min_samples = 6).fit(X)
labels1 = db.labels_
print('Number of clusters = {}'.format(nClusters(labels1) + 1))
from sklearn.metrics import silhouette_score
silhouette_avg = silhouette_score(data, labels1)
print(f"Silhouette Score for eps=0.5 and min_pts=5: {silhouette_avg}")

Number of clusters = 13
Silhouette Score for eps=0.5 and min_pts=5: -0.6078854612073233


In [34]:
# eps = 0.2 min_samples = 9
from sklearn.cluster import DBSCAN
db = DBSCAN(eps = 0.2, min_samples = 9).fit(X)
labels1 = db.labels_
print('Number of clusters = {}'.format(nClusters(labels1) + 1))
from sklearn.metrics import silhouette_score
silhouette_avg = silhouette_score(data, labels1)
print(f"Silhouette Score for eps=0.5 and min_pts=5: {silhouette_avg}")

Number of clusters = 2
Silhouette Score for eps=0.5 and min_pts=5: -0.2222213987714873


In [36]:
# eps = 0.5 min_samples = 12
from sklearn.cluster import DBSCAN
db = DBSCAN(eps = 0.5, min_samples = 12).fit(X)
labels1 = db.labels_
print('Number of clusters = {}'.format(nClusters(labels1) + 1))
from sklearn.metrics import silhouette_score
silhouette_avg = silhouette_score(data, labels1)
print(f"Silhouette Score for eps=0.5 and min_pts=5: {silhouette_avg}")

Number of clusters = 1
Silhouette Score for eps=0.5 and min_pts=5: 0.3582920361087843


In [43]:
# eps = 0.6 min_samples = 15
from sklearn.cluster import DBSCAN
db = DBSCAN(eps = 0.6, min_samples = 15).fit(X)
labels1 = db.labels_
print('Number of clusters = {}'.format(nClusters(labels1) + 1))
from sklearn.metrics import silhouette_score
silhouette_avg = silhouette_score(data, labels1)
print(f"Silhouette Score for eps=0.5 and min_pts=5: {silhouette_avg}")

Number of clusters = 1
Silhouette Score for eps=0.5 and min_pts=5: 0.5016937576906915


In [46]:
# eps = 0.7 min_samples = 18
from sklearn.cluster import DBSCAN
db = DBSCAN(eps = 0.6, min_samples = 18).fit(X)
labels1 = db.labels_
print('Number of clusters = {}'.format(nClusters(labels1) + 1))
from sklearn.metrics import silhouette_score
silhouette_avg = silhouette_score(data, labels1)
print(f"Silhouette Score for eps=0.5 and min_pts=5: {silhouette_avg}")

Number of clusters = 1
Silhouette Score for eps=0.5 and min_pts=5: 0.5016937576906915
