## Imports and datafraem


In [1]:
# DBscan from scratch
import math
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.datasets import make_blobs, make_moons

# Load the data
df = pd.read_csv('new_dataset.csv')

# Print the first 5 rows of the data    
df.head()

Unnamed: 0,Age,Attrition,BusinessTravel,DailyRate,Department,DistanceFromHome,Education,EducationField,EnvironmentSatisfaction,Gender,...,PercentSalaryHike,RelationshipSatisfaction,StockOptionLevel,TotalWorkingYears,TrainingTimesLastYear,WorkLifeBalance,YearsAtCompany,YearsInCurrentRole,YearsSinceLastPromotion,YearsWithCurrManager
0,0.547619,1,1.0,0.71582,Sales,0.0,0.25,Life Sciences,0.333333,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.15,0.222222,0.0,0.344828
1,0.814978,0,0.5,0.185735,Research & Development,0.275862,0.157895,Life Sciences,0.727273,1.0,...,0.92,1.0,0.5,0.025,0.5,0.75,0.25,0.388889,0.066667,0.482759
2,0.613136,1,1.0,0.915934,Research & Development,0.068966,0.380435,Other,1.0,1.0,...,0.6,0.5,0.0,0.025,0.5,0.75,0.0,0.0,0.0,0.0
3,0.545855,0,0.5,0.92861,Research & Development,0.103448,0.793478,Life Sciences,1.0,0.0,...,0.44,0.75,0.0,0.025,0.5,0.75,0.2,0.388889,0.2,0.0
4,0.44495,0,1.0,0.394188,Research & Development,0.068966,0.173913,Medical,0.181818,1.0,...,0.48,1.0,0.5,0.025,0.5,0.75,0.05,0.111111,0.133333,0.137931


In [2]:
class DBSCAN: 
    def __init__(self, eps, min_pts, data):
        self.eps = eps
        self.min_pts = min_pts
        self.data = data
        self.clusters = []
        self.noise = []
        self.core_pts = []
        self.visited = []
        self.clustered = []
        self.cluster_num = 0
        self.clustered_pts = []
        
    def _distance(self, p1, p2):
        result = 0
        for i in range(len(p1)):
            if(type(p1[i]) == str or type(p2[i]) == str):
                if(p1[i] != p2[i]):
                    result += 1
            else : result += (p1[i] - p2[i]) ** 2
        return math.sqrt(result)
        #return math.sqrt(sum([(a - b) ** 2 for a, b in zip(p1, p2)]))
    
    def _region_query(self, point):
        neighbors = []
        for i in range(len(self.data)):
            if self._distance(point, self.data[i]) < self.eps:
                neighbors.append(i)
        return neighbors
    
    def _expand_cluster(self, point, neighbors):
        self.clusters[self.cluster_num].append(point)
        self.clustered.append(point)
        self.visited.append(point)
        for i in neighbors:
            if i not in self.visited:
                self.visited.append(i)
                new_neighbors = self._region_query(self.data[i])
                if len(new_neighbors) >= self.min_pts:
                    neighbors += new_neighbors
            if i not in self.clustered:
                self.clusters[self.cluster_num].append(i)
                self.clustered.append(i)
                
    def fit(self):
        for i in range(len(self.data)):
            if i not in self.visited:
                self.visited.append(i)
                neighbors = self._region_query(self.data[i])
                if len(neighbors) < self.min_pts:
                    self.noise.append(i)
                else:
                    self.clusters.append([])
                    self._expand_cluster(i, neighbors)
                    self.cluster_num += 1
                    
    def get_clusters(self):
        return self.clusters
    
    def get_noise(self):
        return self.noise



In [3]:
Y = df['Attrition'].values
df = df.drop(['Attrition'], axis=1)
X = df.values

In [4]:
dbscan = DBSCAN(eps=2.135, min_pts=1, data=X)
dbscan.fit()
clusters = dbscan.get_clusters()
noise = dbscan.get_noise()

In [5]:
len(dbscan.get_clusters())

2

In [6]:
dbscan.get_clusters()

[[0,
  70,
  115,
  132,
  168,
  174,
  215,
  318,
  320,
  327,
  376,
  397,
  450,
  525,
  566,
  665,
  719,
  747,
  829,
  834,
  849,
  897,
  947,
  1011,
  1121,
  1255,
  1270,
  18,
  22,
  29,
  43,
  48,
  63,
  66,
  75,
  82,
  89,
  96,
  117,
  131,
  136,
  137,
  138,
  143,
  154,
  165,
  167,
  179,
  212,
  216,
  218,
  219,
  227,
  228,
  230,
  238,
  261,
  282,
  328,
  332,
  347,
  355,
  358,
  366,
  370,
  374,
  393,
  401,
  402,
  410,
  431,
  442,
  445,
  446,
  448,
  461,
  462,
  464,
  490,
  493,
  502,
  520,
  527,
  529,
  531,
  536,
  548,
  563,
  569,
  573,
  574,
  580,
  583,
  591,
  596,
  603,
  606,
  608,
  624,
  641,
  646,
  659,
  662,
  670,
  672,
  679,
  685,
  694,
  695,
  703,
  705,
  706,
  712,
  754,
  771,
  794,
  802,
  805,
  811,
  839,
  850,
  866,
  884,
  885,
  915,
  943,
  963,
  964,
  974,
  980,
  990,
  993,
  1012,
  1016,
  1051,
  1057,
  1070,
  1079,
  1103,
  1120,
  1132,
  1144,
  1172

In [7]:
len(dbscan.get_noise())

0

In [8]:
dbscan.get_noise()

[]

In [9]:
# # sklearn dbscan
# from sklearn.cluster import DBSCAN
# from sklearn.preprocessing import StandardScaler

# X = df.values
# scaler = StandardScaler()
# X_scaled = scaler.fit_transform(X)

# dbscan = DBSCAN(eps=0.5, min_samples=5)
# clusters = dbscan.fit_predict(X_scaled)

# clusters

In [38]:
class AGNES:
    def __init__(self, n_clusters=2, linkage='average'):
        self.n_clusters = n_clusters
        self.linkage = linkage
        self.labels_ = None
        self.cluster_centers_ = None
        self.n_leaves = None
        self.results = {}

    def fit(self, X):
        X = np.array([self.string_to_numerical(x) for x in X])
        self.n_leaves = X.shape[0]
        self.labels_ = np.arange(self.n_leaves)
        self.cluster_centers_ = X.copy()
        self.results[self.n_leaves+1] = [self.labels_.copy(), self.cluster_centers_.copy()]

        # while self.n_leaves >= self.n_clusters:
        #     self.merge()

        while self.n_leaves >= 1:
            self.merge()
            self.results[self.n_leaves+1] = [self.labels_.copy(), self.cluster_centers_.copy()]
        return self

    def merge(self):
        dist = self.distance(self.cluster_centers_)
        i, j = np.unravel_index(dist.argmin(), dist.shape)
        self.cluster_centers_[i] = self._linkage(i, j)
        self.cluster_centers_ = np.delete(self.cluster_centers_, j, axis=0)
        self.labels_[self.labels_ == j] = i
        self.labels_[self.labels_ > j] -= 1
        self.n_leaves -= 1
    
    def string_to_numerical(self,arr):
        """Convert a string to a numerical value"""
        # Convert the string to a list of ASCII values
        new_arr = []
        for s in arr:
            #print(type(s))
            if (type(s) == str):
                ascii_values = [ord(c) for c in s]
                # Convert the list of ASCII values to a numpy array
                ascii_array = np.array(ascii_values)
                # Return the sum of the array
                new_arr.append(ascii_array.sum())
            else: new_arr.append(s)
        return np.array(new_arr)

    def distance(self, X):
        return np.sqrt(-2 * np.dot(X, X.T) + np.sum(X ** 2, axis=1) + np.sum(X ** 2, axis=1)[:, np.newaxis])

    def _distance(self, p1, p2):
        result = 0
        for i in range(len(p1)):
            if(type(p1[i]) == str or type(p2[i]) == str):
                if(p1[i] != p2[i]):
                    result += 1
            else : result += (p1[i] - p2[i]) ** 2
        return math.sqrt(result)

    def distance2(self, X):
        distances = []
        for i in range(len(X)):
            for j in range(len(X)):
                distances.append(self._distance(X[i], X[j]))
        return np.array(distances).reshape(len(X), len(X))
                
    

    def _linkage(self, i, j):
        if self.linkage == 'average':
            # linkage = []
            # for i in range (len(self.cluster_centers_[i])):
            #     if type(self.cluster_centers_[i][i]) == str:
            #         if self.cluster_centers_[i][i] != self.cluster_centers_[j][i]:
            #             linkage.append(0)
            #         else: linkage.append(1)
            #     else: linkage.append((self.cluster_centers_[i][i] + self.cluster_centers_[j][i]))
            # linkage = np.array(linkage)
            # return linkage / 2
            return (self.cluster_centers_[i] + self.cluster_centers_[j]) / 2
        elif self.linkage == 'single':
            return np.minimum(self.cluster_centers_[i], self.cluster_centers_[j])
        elif self.linkage == 'complete':
            return np.maximum(self.cluster_centers_[i], self.cluster_centers_[j])
        else:
            raise ValueError('Unknown linkage method: {}'.format(self.linkage))

    def predict(self, X):
        return self.labels_

    def fit_predict(self, X):
        self.fit(X)
        return self.predict(X)

    def get_results(self):
        return self.results

In [39]:
agnes = AGNES(n_clusters=2, linkage='average')
res = agnes.fit_predict(X)
res



array([0, 0, 0, ..., 0, 0, 0])

In [40]:
results = agnes.get_results()

In [41]:
for each in results[2][0]:
        print(each)

0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1


In [42]:
def accuracy(y_true, y_pred):
    accuracy = np.sum(y_true == y_pred) / len(y_true)
    return accuracy

def precision(y_true, y_pred):
    tp = np.sum(y_true * y_pred)
    fp = np.sum((1 - y_true) * y_pred)
    return tp / (tp + fp)

def sensitivity(y_true, y_pred):
    tp = np.sum(y_true * y_pred)
    fn = np.sum(y_true * (1 - y_pred))
    return tp / (tp + fn)

def specificity(y_true, y_pred):
    tn = np.sum((1 - y_true) * (1 - y_pred))
    fp = np.sum((1 - y_true) * y_pred)
    return tn / (tn + fp)

def f_score(y_true, y_pred):
    tp = np.sum(y_true * y_pred)
    fn = np.sum(y_true * (1 - y_pred))
    fp = np.sum((1 - y_true) * y_pred)
    precision = tp / (tp + fp)
    recall = tp / (tp + fn)
    return 2 * precision * recall / (precision + recall)

# confusion matrix containing true positives, false negatives, false positives and true negatives
def confusion_matrix(y_true, y_pred):
    tp = np.sum(y_true * y_pred)
    fn = np.sum(y_true * (1 - y_pred))
    fp = np.sum((1 - y_true) * y_pred)
    tn = np.sum((1 - y_true) * (1 - y_pred))
    return np.array([[tp, fn], [fp, tn]])

In [43]:
print(f"accuracy = {accuracy(Y, res)}")
print(f"precision = {precision(Y, res)}")
print(f"sensitivity = {sensitivity(Y, res)}")
print(f"specificity = {specificity(Y, res)}")
print(f"f_score = {f_score(Y, res)}")
print(f"confusion_matrix = \n{confusion_matrix(Y, res)}")

accuracy = 0.8387755102040816
precision = nan
sensitivity = 0.0
specificity = 1.0
f_score = nan
confusion_matrix = 
[[   0  237]
 [   0 1233]]


  


In [44]:
dbscan_res = {}
val = 0
for cluster in dbscan.get_clusters():
        for i in cluster:
                dbscan_res[i] = val
        val += 1

In [45]:
# order dict by key
dbscan_res = {k: dbscan_res[k] for k in sorted(dbscan_res)}
dbscan_res = list(dbscan_res.values())

In [46]:
dbscan_res = np.array(dbscan_res)

In [47]:
print(f"accuracy = {accuracy(Y, dbscan_res)}")
print(f"precision = {precision(Y, dbscan_res)}")
print(f"sensitivity = {sensitivity(Y, dbscan_res)}")
print(f"specificity = {specificity(Y, dbscan_res)}")
print(f"f_score = {f_score(Y, dbscan_res)}")
print(f"confusion_matrix = \n{confusion_matrix(Y, dbscan_res)}")


accuracy = 0.8380952380952381
precision = 0.0
sensitivity = 0.0
specificity = 0.9991889699918897
f_score = nan
confusion_matrix = 
[[   0  237]
 [   1 1232]]




In [48]:
# sklearn agglomerative clustering 
from sklearn.cluster import AgglomerativeClustering
agg = AgglomerativeClustering(n_clusters=2, affinity='euclidean', linkage='average')
agg_res = agg.fit_predict(X)

ValueError: could not convert string to float: 'Sales'

In [None]:
for each in agg_res:
        print(each)