In [1]:
import numpy as np
import pandas as pd
import math
import json
import sys
import time

### data manipulation

In [2]:
def readFiles(filename):
    """Load a headerless CSV whose first row holds per-column restriction codes.

    Parameters
    ----------
    filename : str or path-like
        Path of the CSV file. It is read with ``header=None``, so columns are
        labelled 0..n-1.

    Returns
    -------
    df : pandas.DataFrame
        Every row of the file except the first one.
    restr : pandas.Series
        The first row, converted with ``pd.to_numeric``.
    """
    # Read the path that was passed in. The previous version read the global
    # `datafile`, so the `filename` argument was silently ignored.
    df = pd.read_csv(filename, header=None)

    # restrictions are in first row
    restr = pd.to_numeric(df.iloc[0])

    # drop the restriction row so that only data rows remain
    df = df.drop([0], axis=0)

    return df, restr

In [77]:
def restrictdf(df, restr):
    """Apply per-column restriction codes and drop rows with unknown values.

    For the column at position ``i`` the code ``restr[i]`` decides:

    * ``== 0``  -> the column becomes the index,
    * ``< 1``   (any other value below 1) -> the column is dropped,
    * otherwise -> the column is converted with ``pd.to_numeric``; values
      that cannot be parsed become NaN.

    Afterwards every row containing NaN, or the literal string ``'?'``, is
    removed.

    Parameters
    ----------
    df : pandas.DataFrame
        Raw data. It is copied, so the caller's frame is not modified.
    restr : sequence or pandas.Series
        One code per column of ``df``, in column order.

    Returns
    -------
    pandas.DataFrame
        The restricted frame.
    """
    # Look codes up by position, and work on a copy of the input frame.
    codes = np.asarray(restr)
    df = df.copy()

    # Iterate over a snapshot of the labels: the frame's columns shrink
    # inside the loop (set_index / drop), so positions in `df.columns` stop
    # matching `i`. Always address the column by its label `v`.
    for i, v in enumerate(list(df.columns)):
        if codes[i] == 0:
            df = df.set_index(v)
        elif codes[i] < 1:
            df = df.drop(columns=[v])
        else:
            df[v] = pd.to_numeric(df[v], errors='coerce')

    # drop unknown values
    df = df.dropna()
    df = df[(df != '?').all(axis=1)]
    return df

In [4]:
# normalizes all columns
def normalizedf(indf):
    """Min-max scale every column of a numeric DataFrame.

    Each column is mapped to ``(x - min) / (max - min)``. A column whose
    values are all equal has no range to scale by; it is set to 0.0 instead
    of being turned into NaN by a division by zero.

    Parameters
    ----------
    indf : pandas.DataFrame
        Numeric frame. It is not modified.

    Returns
    -------
    pandas.DataFrame
        A scaled copy of ``indf``.
    """
    df = indf.copy()
    for c in df.columns:
        colMin = df[c].min()
        colRange = df[c].max() - colMin

        if colRange == 0:
            # constant column: avoid 0/0 -> NaN
            df[c] = 0.0
        else:
            # vectorised over the whole column instead of a per-element apply
            df[c] = (df[c] - colMin) / colRange
    return df

### helper functions

In [5]:
def euclideanDist(point, pointArray):
    """Return the Euclidean distance from `point` to each row of `pointArray`.

    The subtraction relies on broadcasting, and the squared differences are
    summed along axis 1, so one distance is produced per row.
    """
    offsets = pointArray - point
    squaredTotals = np.sum(offsets ** 2, axis=1)
    return np.sqrt(squaredTotals)

In [21]:
def initCentroids(numdf, k, random_state=None):
    """Pick `k` random rows of `numdf` as the starting centroids.

    Parameters
    ----------
    numdf : pandas.DataFrame
        Frame to draw rows from.
    k : int
        Number of centroids. Rows are drawn with ``DataFrame.sample``, i.e.
        without replacement.
    random_state : optional
        Forwarded to ``DataFrame.sample``. Pass a fixed seed to get the same
        centroids on every run; the default ``None`` keeps the previous
        unseeded behaviour.

    Returns
    -------
    numpy.ndarray
        Array with one sampled row per centroid.
    """
    return np.array(numdf.sample(n=k, random_state=random_state))

In [101]:
# pass a ***vectorized*** distance function: dist(point, pointArray)
# dataframe must be numeric other than last column
def assignCentroids(numdf, centroids, distfunc):
    """Label every row of `numdf` with the index of its nearest centroid.

    Parameters
    ----------
    numdf : pandas.DataFrame
        Numeric coordinates. If the last column is not named ``'cluster'``
        one is appended; every column before it is treated as a coordinate.
        The frame is modified in place.
    centroids : array-like
        One centroid per row.
    distfunc : callable
        ``distfunc(point, pointArray)`` returning one distance per row of
        ``pointArray``.

    Returns
    -------
    pandas.DataFrame
        The same ``numdf`` object, with its ``'cluster'`` column filled in.
    """
    # Operate on the frame that was passed in. The previous version used the
    # global `df`, so it only worked when the caller's variable had that name.
    if len(numdf.columns) == 0 or numdf.columns[-1] != 'cluster':
        numdf['cluster'] = -1

    coords = numdf.iloc[:, :-1].to_numpy()
    # One argmin per row; assigning the whole column at once avoids row-wise
    # `.at` writes, which misbehave when index labels repeat.
    numdf['cluster'] = [int(np.argmin(distfunc(row, centroids))) for row in coords]
    return numdf

In [125]:
def reCalcCentroids(numdf, centroids):
    """Move each centroid to the mean of the rows currently assigned to it.

    Parameters
    ----------
    numdf : pandas.DataFrame
        Frame with a ``'cluster'`` column holding centroid indices. Every
        column except the last one is treated as a coordinate.
    centroids : numpy.ndarray
        Current centroids, one per row. Updated in place.

    Returns
    -------
    numpy.ndarray
        The same ``centroids`` array. A centroid with no assigned rows keeps
        its previous position.
    """
    # centroid index is tag
    for i in range(len(centroids)):
        members = numdf[numdf['cluster'] == i].iloc[:, :-1]
        if len(members) == 0:
            # The mean of zero rows is NaN, which would poison every later
            # distance computation; leave this centroid where it is.
            continue
        centroids[i] = np.mean(np.array(members), axis=0)
    return centroids

In [25]:
# pass a ***vectorized*** distance function: dist(point, pointArray)
def calcCentroid(df, distFunctionVect):
    """Build the matrix of distances between every pair of rows in `df`.

    Row i of the result is ``distFunctionVect(row_i, all_rows)``, so the
    frame holds one row and one column per input row.

    `df` must be a fully numeric (and normalized) frame.

    NOTE(review): despite its name this returns a distance matrix, not a
    centroid. A later cell of this notebook defines ``calcCentroid(numdf)``
    with a different signature, and whichever definition ran last wins.
    """
    points = np.array(df)

    # one vectorized distance call per row
    distRows = [distFunctionVect(pt, points) for pt in points]

    return pd.DataFrame(distRows)

### analytical functions

In [7]:
# gets centroid of numeric dataframe (not normalized)
def calcCentroid(numdf):
    """Return the column-wise mean of `numdf` as a numpy array.

    The values are summed along axis 0 and divided by the number of rows.
    """
    values = np.array(numdf)
    columnTotals = np.sum(values, axis=0)
    return columnTotals / len(numdf)

In [8]:
# takes df ran through dbscan with visited, cluster, and type columns
def analyzeClusters(df, numdf, distFunc):
    """Summarise each distinct value of ``df['cluster']``.

    Parameters
    ----------
    df : pandas.DataFrame
        Labelled data. Only the ``'cluster'`` column is read here; a missing
        label (None or NaN) marks noise points. A ``'distToCentroid'``
        column is written into this frame as a side effect.
    numdf : pandas.DataFrame
        Numeric coordinates, sharing index labels with ``df``.
    distFunc : callable
        ``distFunc(point, pointArray)`` returning one distance per row.

    Returns
    -------
    list of dict
        One dict per distinct label, with the keys ``clusterID`` (position
        of the label in ``unique()`` order, not the label itself), ``type``,
        ``centroid``, ``maxDistToCentroid``, ``minDistToCentroid``,
        ``avgDistToCentroid``, ``numPoints`` and ``dataPoints``.
    """
    clusters = []
    for i, c in enumerate(df['cluster'].unique()):
        info = {}
        info["clusterID"] = i
        # pd.isna catches both None and NaN. `c is None` missed NaN labels,
        # and `df['cluster'] == NaN` then selected no rows at all, so the
        # max()/min() calls below failed on an empty sequence.
        if pd.isna(c):
            pnts = df.loc[df['cluster'].isna()]
            info["type"] = "Noise"
        else:
            pnts = df[df['cluster'] == c]
            info["type"] = "Cluster"

        numpnts = numdf.loc[pnts.index]

        info["centroid"] = calcCentroid(numpnts)
        dists = distFunc(info["centroid"], np.array(numpnts))
        df.loc[pnts.index, "distToCentroid"] = dists
        # re-select so the stored points include the new distance column
        pnts = df.loc[pnts.index]
        info["maxDistToCentroid"] = max(dists)
        info["minDistToCentroid"] = min(dists)
        info["avgDistToCentroid"] = np.sum(dists) / len(pnts)
        info["numPoints"] = len(pnts)
        info["dataPoints"] = pnts
        clusters.append(info)
    return clusters

In [9]:
def printClusterInfo(clusters):
    """Print each cluster summary as ``key: value`` lines.

    The ``"dataPoints"`` entry is rendered through its ``to_markdown()``
    method on the line after the key. The ``print('\\n')`` call after each
    summary leaves two blank lines between summaries.
    """
    for summary in clusters:
        for field, value in summary.items():
            if field != "dataPoints":
                print(f"{field}: {value}")
                continue
            print(f"{field}: \n{value.to_markdown()}")
        print('\n')

### dbscan functions

### running

In [10]:
def hyperparams(df, restr, epsmin, epsmax, mptsmin, mptsmax):
    """Run `dbscan` over a 5 x 5 grid of (epsilon, minPoints) values.

    Parameters
    ----------
    df, restr
        Data and restriction codes handed to ``dbscan``; presumably the pair
        returned by ``readFiles`` -- TODO confirm against ``dbscan``. A fresh
        copy of each is passed on every run.
    epsmin, epsmax : float
        Epsilon range. Five evenly spaced values starting at ``epsmin`` are
        tried; ``epsmax`` itself is not reached.
    mptsmin, mptsmax : number
        minPoints range, stepped the same way and truncated with ``int``.

    Returns
    -------
    pandas.DataFrame
        Columns ``epsilon``, ``minPoints`` and ``numClusters``, one row per
        grid point. The frame is also printed.

    NOTE(review): ``dbscan`` is not defined in the visible part of this
    notebook; it must be in scope before this function is called.
    """
    numeps = 5
    nummpts = 5

    results = []

    for e in range(numeps):
        # epsilon depends only on the outer index
        eps = epsmin + (epsmax - epsmin) / numeps * e
        for p in range(nummpts):
            mpts = int(mptsmin + (mptsmax - mptsmin) / nummpts * p)
            # Use copies of the arguments. The previous version re-read the
            # global `datafile` here, discarding the `df`/`restr` parameters
            # and failing when that global was not defined.
            cl, _ = dbscan(df.copy(), restr.copy(), euclideanDist, eps, mpts, silent=True)
            results.append([eps, mpts, len(cl)])

    summary = pd.DataFrame(results, columns=["epsilon", "minPoints", "numClusters"])
    print(summary)
    return summary

In [127]:
k=4

# Emulate command-line arguments inside the notebook. Because sys.argv is
# overwritten here, the usage branch below is only reachable when this line
# is removed and the code runs as a script.
sys.argv = f"dbscan.py ./data/mammal_milk.csv {k}".split(" ")
if __name__ == "__main__":
    if len(sys.argv) == 3:
        _, datafile, k = sys.argv
    else:
        # The script takes a data file and the number of clusters.
        print("Usage: python3 dbscan.py <datafile.csv> <k>")
        sys.exit(1)

    # argv values are strings
    k = int(k)
    df_full, restr = readFiles(datafile)
    df = normalizedf(restrictdf(df_full, restr))
    centroids = initCentroids(df,k)
    # a single assignment pass followed by a single centroid update
    df = assignCentroids(df, centroids, euclideanDist)
    print(df.to_markdown())
    print(reCalcCentroids(df, centroids))


| 0          |        1 |         2 |         3 |        4 |         5 |   cluster |
|:-----------|---------:|----------:|----------:|---------:|----------:|----------:|
| Horse      | 0.993407 | 0.17094   | 0         | 1        | 0.113636  |         1 |
| Orangutan  | 0.958242 | 0.0683761 | 0.0609756 | 0.869565 | 0.0636364 |         1 |
| Monkey     | 0.956044 | 0.136752  | 0.0414634 | 0.927536 | 0.0363636 |         1 |
| Donkey     | 0.997802 | 0.0940171 | 0.0097561 | 0.898551 | 0.136364  |         1 |
| Hippo      | 1        | 0         | 0.0853659 | 0.637681 | 0         |         1 |
| Camel      | 0.940659 | 0.247863  | 0.0585366 | 0.695652 | 0.277273  |         1 |
| Bison      | 0.923077 | 0.358974  | 0.0170732 | 0.826087 | 0.363636  |         1 |
| Buffalo    | 0.817582 | 0.452991  | 0.168293  | 0.681159 | 0.309091  |         2 |
| Guinea Pig | 0.813187 | 0.581197  | 0.15122   | 0.391304 | 0.340909  |         2 |
| Cat        | 0.806593 | 0.811966  | 0.129268  | 0.637681 | 0.29