## Imports and dataframe


In [196]:
# DBSCAN from scratch
import math
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.datasets import make_blobs, make_moons

# Read the dataset from the CSV file in the working directory.
df = pd.read_csv(filepath_or_buffer='new_dataset.csv')

# Preview the leading five rows of the loaded frame.
df.head(n=5)

Unnamed: 0,Age,Attrition,BusinessTravel,DailyRate,Department,DistanceFromHome,Education,EducationField,EnvironmentSatisfaction,Gender,...,PercentSalaryHike,RelationshipSatisfaction,StockOptionLevel,TotalWorkingYears,TrainingTimesLastYear,WorkLifeBalance,YearsAtCompany,YearsInCurrentRole,YearsSinceLastPromotion,YearsWithCurrManager
0,0.547619,1,1.0,0.71582,Sales,0.0,0.25,Life Sciences,0.333333,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.15,0.222222,0.0,0.344828
1,0.814978,0,0.5,0.185735,Research & Development,0.275862,0.157895,Life Sciences,0.727273,1.0,...,0.92,1.0,0.5,0.025,0.5,0.75,0.25,0.388889,0.066667,0.482759
2,0.613136,1,1.0,0.915934,Research & Development,0.068966,0.380435,Other,1.0,1.0,...,0.6,0.5,0.0,0.025,0.5,0.75,0.0,0.0,0.0,0.0
3,0.545855,0,0.5,0.92861,Research & Development,0.103448,0.793478,Life Sciences,1.0,0.0,...,0.44,0.75,0.0,0.025,0.5,0.75,0.2,0.388889,0.2,0.0
4,0.44495,0,1.0,0.394188,Research & Development,0.068966,0.173913,Medical,0.181818,1.0,...,0.48,1.0,0.5,0.025,0.5,0.75,0.05,0.111111,0.133333,0.137931


In [197]:
class DBSCAN:
    """Density-based clustering (DBSCAN) over an indexable collection of rows.

    Points are referred to by their integer position in ``data``.  After
    ``fit()`` has run, ``clusters`` holds one list of point indices per
    cluster and ``noise`` holds the indices that belong to no cluster.

    Args:
        eps: Neighbourhood radius.  A point ``q`` is a neighbour of ``p``
            when ``distance(p, q) < eps`` (strict comparison).
        min_pts: Minimum neighbourhood size (the point itself included) for
            a point to seed or extend a cluster.
        data: Sequence of rows supporting ``len(data)`` and ``data[i]``;
            each row is a sequence of numeric and/or string features.
    """

    def __init__(self, eps, min_pts, data):
        self.eps = eps
        self.min_pts = min_pts
        self.data = data
        self.clusters = []       # list of clusters, each a list of indices
        self.noise = []          # indices that belong to no cluster
        self.core_pts = []       # kept for compatibility; not populated
        self.visited = []        # indices already examined, in visit order
        self.clustered = []      # indices assigned to some cluster
        self.cluster_num = 0     # index of the cluster currently being built
        self.clustered_pts = []  # kept for compatibility; not populated
        # Set mirrors of the public lists above, so membership checks inside
        # the expansion loop are O(1) instead of a linear list scan.
        self._visited_set = set()
        self._clustered_set = set()
        self._noise_set = set()

    def _distance(self, p1, p2):
        """Return the mixed-type Euclidean distance between two rows.

        Numeric features contribute their squared difference.  When either
        value of a pair is a string, the pair contributes 1 if the values
        differ and 0 if they are equal.
        """
        total = 0
        for a, b in zip(p1, p2):
            if isinstance(a, str) or isinstance(b, str):
                if a != b:
                    total += 1
            else:
                total += (a - b) ** 2
        return math.sqrt(total)

    def _region_query(self, point):
        """Return the indices of all rows strictly closer than ``eps`` to ``point``."""
        return [
            i for i in range(len(self.data))
            if self._distance(point, self.data[i]) < self.eps
        ]

    def _mark_visited(self, index):
        """Record ``index`` as visited; return True only on the first visit."""
        if index in self._visited_set:
            return False
        self._visited_set.add(index)
        self.visited.append(index)
        return True

    def _assign_to_current_cluster(self, index):
        """Add ``index`` to the cluster being built unless already clustered."""
        if index in self._clustered_set:
            return
        self._clustered_set.add(index)
        self.clustered.append(index)
        self.clusters[self.cluster_num].append(index)
        # A point provisionally labelled noise that turns out to be reachable
        # from a core point is a border point, so it must leave the noise list.
        if index in self._noise_set:
            self._noise_set.discard(index)
            self.noise.remove(index)

    def _expand_cluster(self, point, neighbors):
        """Grow the current cluster from core point ``point``.

        Args:
            point: Index of the seed core point.
            neighbors: Indices returned by ``_region_query`` for the seed.
                The list is copied, so the caller's list is not modified.
        """
        self._assign_to_current_cluster(point)
        self._mark_visited(point)

        # Work list of candidate indices.  It grows while it is being walked;
        # ``queued`` keeps each index from being enqueued more than once.
        queue = list(neighbors)
        queued = set(queue)
        pos = 0
        while pos < len(queue):
            idx = queue[pos]
            pos += 1
            if self._mark_visited(idx):
                reachable = self._region_query(self.data[idx])
                # Only core points propagate the cluster further.
                if len(reachable) >= self.min_pts:
                    for j in reachable:
                        if j not in queued:
                            queued.add(j)
                            queue.append(j)
            self._assign_to_current_cluster(idx)

    def fit(self):
        """Cluster ``data``, filling ``clusters`` and ``noise``."""
        for i in range(len(self.data)):
            if not self._mark_visited(i):
                continue
            neighbors = self._region_query(self.data[i])
            if len(neighbors) < self.min_pts:
                # Provisional: may be reclaimed later as a border point.
                self.noise.append(i)
                self._noise_set.add(i)
            else:
                self.clusters.append([])
                self._expand_cluster(i, neighbors)
                self.cluster_num += 1

    def get_clusters(self):
        """Return the list of clusters (each a list of point indices)."""
        return self.clusters

    def get_noise(self):
        """Return the list of point indices classified as noise."""
        return self.noise



In [198]:
# Cluster every row of the DataFrame with the from-scratch DBSCAN class.
# NOTE(review): X contains all columns of df as loaded; confirm that
# every one of them is meant to take part in the distance computation.
X = df.values
dbscan = DBSCAN(data=X, min_pts=2, eps=1.71)
dbscan.fit()
clusters, noise = dbscan.get_clusters(), dbscan.get_noise()

In [199]:
# Number of clusters found by the fitted model.
len(dbscan.clusters)

2

In [200]:
# Point indices grouped by cluster.
dbscan.clusters

[[0,
  525,
  70,
  168,
  591,
  849,
  980,
  1012,
  1057,
  1120,
  1121,
  1237,
  1270,
  131,
  154,
  212,
  376,
  754,
  897,
  963,
  43,
  75,
  115,
  137,
  167,
  228,
  282,
  374,
  397,
  402,
  445,
  450,
  461,
  493,
  518,
  574,
  670,
  672,
  705,
  719,
  771,
  834,
  885,
  1220,
  1254,
  216,
  366,
  829,
  1249,
  1255,
  21,
  370,
  662,
  952,
  1153,
  171,
  776,
  1016,
  462,
  563,
  583,
  698,
  802,
  964,
  1014,
  1070,
  1103,
  1132,
  1350,
  1454,
  358,
  431,
  659,
  974,
  573,
  695,
  947,
  1021,
  1391,
  1172,
  29,
  46,
  218,
  219,
  295,
  328,
  338,
  393,
  433,
  527,
  532,
  536,
  811,
  839,
  917,
  960,
  1011,
  1051,
  1253,
  1319,
  76,
  339,
  442,
  444,
  679,
  926,
  1218,
  63,
  446,
  569,
  606,
  805,
  18,
  39,
  82,
  96,
  117,
  227,
  238,
  306,
  355,
  401,
  417,
  507,
  517,
  548,
  580,
  641,
  704,
  751,
  755,
  872,
  984,
  990,
  993,
  1049,
  1067,
  1119,
  1191,
  1241,
  1

In [201]:
# Number of points the fitted model classified as noise.
len(dbscan.noise)

67

In [202]:
# Indices of the points classified as noise.
dbscan.noise

[45,
 83,
 105,
 112,
 119,
 201,
 237,
 271,
 275,
 398,
 411,
 422,
 453,
 469,
 494,
 504,
 544,
 592,
 595,
 663,
 701,
 746,
 748,
 749,
 752,
 780,
 789,
 813,
 875,
 913,
 914,
 966,
 1007,
 1039,
 1058,
 1086,
 1111,
 1167,
 1204,
 1213,
 1225,
 1295,
 1297,
 1301,
 1303,
 1368,
 1373,
 1379,
 1395,
 1400,
 1401,
 1413,
 1414,
 1434,
 1436,
 1438,
 1444,
 1445,
 1447,
 1450,
 1463,
 1464,
 1465,
 1466,
 1467,
 1468,
 1469]

In [203]:
# # sklearn dbscan
# from sklearn.cluster import DBSCAN
# from sklearn.preprocessing import StandardScaler

# X = df.values
# scaler = StandardScaler()
# X_scaled = scaler.fit_transform(X)

# dbscan = DBSCAN(eps=0.5, min_samples=5)
# clusters = dbscan.fit_predict(X_scaled)

# clusters