In [1]:
import numpy as np
import pandas as pd
import math
import json
import sys
import time

### data manipulation

In [2]:
def readFiles(filename):
    df = pd.read_csv(filename, header=None)
    
    # restrictions are in first row
    restr = pd.to_numeric(df.iloc[0])
    
    # drop metadata columns
    df = df.drop([0], axis=0)
    
    return df, restr

In [3]:
def restrictdf(df, restr, setIndex=False, getDropped=False):
    # remove restricted cols and convert to numeric
    dropped=pd.DataFrame()
    for i, v in enumerate(df.columns):
        if restr[i] == 0 and setIndex:
            df = df.set_index(df.columns[i])
        elif restr[i] < 1:
            dropped[v] = df[v]
            df = df.drop(columns=[v])
        else:
            df[v] = pd.to_numeric(df[v], errors='coerce')
            
    # drop unknown values
    df = df.dropna()
    df = df[(df != '?').all(axis=1)]
    if getDropped:
        return df, dropped
    return df

In [4]:
# normalizes all columns
def normalizedf(indf):
    df=indf.copy()
    for c in df.columns:
        colMax = df[c].max()
        colMin = df[c].min()
        
        # probably no need to normalize if the values are very small. Might have to adjust the value
#         if colMax < 1:
#             continue
        df[c] = df[c].apply(lambda x: (x - colMin)/(colMax-colMin))
    return df

### helper functions

In [5]:
def euclideanDist(point, pointArray):
    return np.sqrt(np.sum((pointArray - point) ** 2, axis=1))

In [6]:
def initCentroids_random(numdf, k):
    if k>=len(numdf):
        print('k cannot be bigger than the length of data')
        exit(1)
    sample=np.array(numdf.sample(k))
    while len(np.unique(sample)) < k:
        sample=np.array(numdf.sample(k))
    return sample


In [101]:
def calcCentroid(dfarray):
    return np.divide(np.sum(dfarray, axis=0),len(dfarray))

In [110]:
def initCentroids_max(indf, k, distFunc):
    if indf.columns[-1] == 'cluster':
        numdf = indf.drop('cluster')
    else:
        numdf=indf.copy()
    
    dfarray = np.array(numdf)
    c = calcCentroid(dfarray)
    centroids=[]
    centroids.append(np.array(numdf.iloc[np.argmax(distFunc(c, dfarray))]))
    for i in range(k-1):
        sumdist=np
    return centroids
print(initCentroids_max(df, 4, euclideanDist))
# print(df.to_markdown())

[]
[array([42,  9], dtype=int64)]
[array([42,  9], dtype=int64)]
[array([42,  9], dtype=int64)]


In [66]:
# pass a ***vectorized*** distance function: dist(point, pointArray)
# dataframe must be numeric other than last column
def assignCentroids(numdf, centroids, distfunc, getNumAssign = False):
    if df.columns[-1] != 'cluster':
        df['cluster'] = -1
    numAssign=0
    for i, row in df.iloc[:,:-1].iterrows():
        prev = df.at[i,'cluster']
        new = np.argmin(distfunc(row.values, centroids))
        if prev != new:
            numAssign += 1
        df.at[i,'cluster'] = new
    if getNumAssign:
        return df, numAssign
    return df

In [8]:
def reCalcCentroids(numdf, centroids):
    # centroid index is tag
    for i in range(len(centroids)):
        cluster = numdf[numdf['cluster'] == i]
        centroids[i] = np.mean(np.array(cluster.iloc[:,:-1]), axis=0)
    return centroids

In [9]:
def calcSSE(dfarray, c, distFunc):
    return np.sum(np.square(distFunc(c, dfarray)))

In [10]:
# takes labeled numeric df
def calcTotSSE(numdf, centroids, distFunc):
    totSSE=None
    for i, c in enumerate(centroids):
        cSSE = calcSSE(np.array(numdf[numdf['cluster'] == i].iloc[:,:-1]), c, distFunc)
        if totSSE is None:
            totSSE = cSSE
        else:
            totSSE += cSSE
    return totSSE

### kmeans 

In [40]:
def kmeans_sse(df, centroids, distFunc, minSSE):
    df = assignCentroids(df, centroids, distFunc)
    prevSSE = calcTotSSE(df, centroids, distFunc)
    
    centroids = reCalcCentroids(df, centroids)
    df = assignCentroids(df, centroids, distFunc)
    currSSE = calcTotSSE(df, centroids, distFunc)
    print(f"current sse: {currSSE}")
    
    counter = 0
    while counter < 10 and prevSSE - currSSE > minSSE:
        centroids = reCalcCentroids(df, centroids)
        df = assignCentroids(df, centroids, distFunc)
        prevSSE = currSSE
        currSSE = calcTotSSE(df, centroids, distFunc)
        print(f"current sse: {currSSE}")
        
        counter += 1
    return df, centroids

In [12]:
def kmeans_reassign(df, centroids, distFunc, minReassign):
    df, numAssign = assignCentroids(df, centroids, distFunc, getNumAssign=True)
    print(f"{numAssign} points reassigned")
    
    centroids = reCalcCentroids(df, centroids)
    df, numAssign = assignCentroids(df, centroids, distFunc, getNumAssign=True)
    print(f"{numAssign} points reassigned")
    
    counter = 0
    while counter < 100 and numAssign > minReassign:
        centroids = reCalcCentroids(df, centroids)
        df, numAssign = assignCentroids(df, centroids, distFunc, getNumAssign=True)
        print(f"{numAssign} points reassigned")
        counter += 1
    return df, centroids

### analytical functions

In [36]:
# takes numeric df with cluster labe
def analyzeClusters(df, centroids, distFunc, extraCols = None):
    clusters=[]
    for i, c in enumerate(centroids):
        info = {}
        info["clusterID"] = i
        pnts = df[df['cluster'] == i]
        if len(pnts) == 0:
            continue
        
        info["SSE"] = calcSSE(np.array(pnts.iloc[:,:-1]), c, distFunc)
        info["centroid"] = c
        
        dists = distFunc(c, np.array(pnts.iloc[:,:-1]))
        info["maxDistToCentroid"] = max(dists)
        info["minDistToCentroid"] = min(dists)
        info["avgDistToCentroid"] = np.sum(dists)/len(pnts)
        info["numPoints"] = len(pnts)
        if extraCols is not None:
            pnts = pnts.join(extraCols)
        info["dataPoints"] = pnts
        clusters.append(info)
    return clusters

In [56]:
def printClusterInfo(clusters, noData=False):
    for clusterInfo in clusters:
        for key in clusterInfo:
            if key == "dataPoints":
                if not noData:
                    print(f"{key}: \n{clusterInfo[key].to_markdown()}")
            else:
                print(f"{key}: {clusterInfo[key]}")
        print('\n')

### running

In [15]:
# k=4

# sys.argv = f"dbscan.py ./data/4clusters.csv {k}".split(" ")
# if __name__ == "__main__":
#     if len(sys.argv) == 3:
#         _, datafile, k = sys.argv
#     else:
#         print("Usage: python3 dbscan.py <datafile.csv> <epsilon> <numPoints>")
#         exit(1)
        
#     k = int(k)
#     df_full, restr = readFiles(datafile)
#     df = restrictdf(df_full, restr, getDropped=True)
#     df = df
#     centroids = initCentroids(df,k)
#     df, centroids = kmeans_lite(df, centroids, euclideanDist)
#     df.plot.scatter(x=0,y=1,c='cluster',colormap='viridis')    
# #     print(dropped)
# #     df = df.join(dropped)
# #     print(df.to_markdown())
#     printClusterInfo(analyzeClusters(df, centroids, euclideanDist), extraCols = dropped)

### Iris

In [48]:
def calcPurity(col):
    vals = col.value_counts()
    return max(vals)/len(col)

In [70]:
df_full, restr = readFiles('./data/iris.csv')
df,dropped = restrictdf(df_full, restr, getDropped=True)
centroids = initCentroids(df, 3)

df, centroids = kmeans_reassign(df, centroids, euclideanDist, 0.1)
clusters = analyzeClusters(df, centroids, euclideanDist, extraCols = dropped)
print()
for c in clusters:
    pnts = c["dataPoints"]
    print(f"purity of cluster {c['clusterID']}: {calcPurity(pnts.iloc[:,-1])}")
print()
printClusterInfo(clusters, noData=True)
# df.plot.scatter(x=0,y=1,c='cluster',colormap='viridis')    

150 points reassigned
6 points reassigned
3 points reassigned
4 points reassigned
5 points reassigned
4 points reassigned
4 points reassigned
3 points reassigned
3 points reassigned
1 points reassigned
0 points reassigned

purity of cluster 0: 0.7704918032786885
purity of cluster 1: 1.0
purity of cluster 2: 0.9230769230769231

clusterID: 0
SSE: 38.29081967213115
centroid: [5.88360656 2.74098361 4.38852459 1.43442623]
maxDistToCentroid: 1.6468010735564589
minDistToCentroid: 0.23571239518479192
avgDistToCentroid: 0.7311084910642591
numPoints: 61


clusterID: 1
SSE: 15.2404
centroid: [5.006 3.418 1.464 0.244]
maxDistToCentroid: 1.2393514432960495
minDistToCentroid: 0.059933296255086886
avgDistToCentroid: 0.4841322496689401
numPoints: 50


clusterID: 2
SSE: 25.413846153846155
centroid: [6.85384615 3.07692308 5.71538462 2.05384615]
maxDistToCentroid: 1.5515596276348875
minDistToCentroid: 0.23945203717343622
avgDistToCentroid: 0.7318458783535899
numPoints: 39




### 4clusters

In [91]:
df_full, restr = readFiles('./data/4clusters.csv')
df = restrictdf(df_full, restr)

### many clusters

### mammal milk

### accidents 1

### accidents 3