In [96]:
import argparse 
import re
import pandas as pd
import random
import sys
import numpy as np
import itertools
class Datum:
    """One data point together with the cluster it currently belongs to.

    Attributes:
        position: the point's coordinates, stored exactly as passed in.
        cluster: identifier of the cluster this point is assigned to.
    """

    def __init__(self, position, cluster):
        self.cluster = cluster
        self.position = position

class Cluster:
    """Plain record holding a cluster's number and a position.

    Attributes:
        number: the cluster's number, stored exactly as passed in.
        position: the position associated with the cluster, stored as passed in.
    """

    def __init__(self, number, position):
        self.position = position
        self.number = number
        
def get_data(file_name):
    """Read the data set stored at ``file_name`` and z-score normalize it.

    The first line of the file is handed to ``get_restrictions_vector`` to
    obtain the indices of the columns to keep; the remaining lines are parsed
    as header-less CSV restricted to those columns.

    Args:
        file_name: path of the file to read.

    Returns:
        A DataFrame in which every kept column has had its mean subtracted
        and has been divided by its standard deviation.
    """
    with open(file_name) as handle:
        # The restriction line is consumed here, so read_csv starts at line 2.
        keep_cols = get_restrictions_vector(handle.readline())
        raw = pd.read_csv(handle, header=None, skiprows=0, usecols=keep_cols)

    centred = raw - raw.mean()
    return centred / raw.std()  # normalize

# Takes two indexable sequences as parameters.
# Assumes the two points being compared have the same number of attributes.
def sq_eucledian_dist(x,y):
    """Return the squared Euclidean distance between points ``x`` and ``y``.

    Components are paired by index over ``range(len(x))``.

    Returns:
        The sum of squared component differences, or ``float("inf")`` when
        that sum is exactly zero.
    """
    total = sum(np.power(x[k] - y[k], 2) for k in range(len(x)))
    # A zero distance is treated as "a cluster compared with itself"; it is
    # reported as infinity so it never wins a minimum-distance search.
    return float("inf") if total == 0 else total
def get_min_dist(points1, points2):
    """Return the smallest squared distance between any point of ``points1``
    and any point of ``points2``.

    Both arguments are iterables of objects exposing a ``position``
    attribute; every cross pair is measured with ``sq_eucledian_dist``.
    """
    pairwise = [
        sq_eucledian_dist(first.position, second.position)
        for first in points1
        for second in points2
    ]
    return np.min(pairwise)

def get_restrictions_vector(line):
    """Convert a comma-separated 0/1 line into the indices of the enabled fields.

    The line is split on commas and newlines. A field counts as enabled when,
    after surrounding whitespace is removed, it is exactly ``'1'``. Trimming
    makes inputs such as ``"1, 0, 1"`` or a line ending in ``"\\r\\n"`` behave
    the same as the compact ``"1,0,1\\n"`` form instead of silently dropping
    columns.

    Args:
        line: the raw text line holding the flags.

    Returns:
        A list with the zero-based position of every enabled field.
    """
    flags = re.split(r',|\n', line)
    # Strip before comparing so stray spaces or a trailing '\r' do not hide a '1'.
    by_index = [i for i, flag in enumerate(flags) if flag.strip() == '1']

    return by_index

def dist_matrix(clusters):
    """Build the pairwise distance matrix for ``clusters``.

    ``clusters`` is iterated once for the rows and once more per row for the
    columns, so it must be re-iterable. Entry ``[r][c]`` is the value of
    ``get_min_dist`` for the r-th and c-th cluster.

    Returns:
        A list of lists with one row per cluster.
    """
    return [
        [get_min_dist(row_cluster, col_cluster) for col_cluster in clusters]
        for row_cluster in clusters
    ]

# Each data point starts out as its own single-member cluster.
def init_data(df):
    """Create the initial clustering for the rows of ``df``.

    Returns:
        A dict keyed by row index label; each value is a one-element list
        holding a ``Datum`` whose position is the row's values and whose
        cluster is that same index label.
    """
    return {
        label: [Datum(list(row), label)]
        for label, row in df.iterrows()
    }

def merge_clusters(dm):
    """Locate the pair of clusters with the smallest distance in ``dm``.

    Despite the name, nothing is merged here: the function only finds the
    position of the global minimum. The minimum value is printed as a side
    effect.

    Args:
        dm: distance matrix as a list of lists (rows need a ``.index`` method).

    Returns:
        ``[row, col]`` of the first occurrence of the minimum, scanning rows
        top to bottom. If the value is not found in any row, the result is
        ``[number_of_rows, -1]``.
    """
    # Smallest entry overall: take each row's minimum, then the minimum of those.
    smallest = np.min([np.min(row) for row in dm])
    print(smallest)

    # Walk the rows until one of them contains the minimum.
    row_no = 0
    col_no = 0
    for row in dm:
        try:
            col_no = row.index(smallest)
        except ValueError:
            col_no = -1
        if col_no < 0:
            # Not in this row; move on to the next one.
            row_no += 1
            continue
        break
    return [row_no, col_no]

def agg_clustering(df):
    """Run the first step of agglomerative clustering on ``df``.

    Every row becomes its own cluster, the distance matrix between those
    clusters is computed, and the result of ``merge_clusters`` for that
    matrix is printed.

    Returns:
        The distance matrix produced by ``dist_matrix``.
    """
    # Start with one cluster per data point.
    initial_clusters = init_data(df)
    # Distance matrix for the current clusters.
    distances = dist_matrix(initial_clusters.values())

    closest_pair = merge_clusters(distances)
    print(closest_pair)
    return distances
    

In [97]:
# Load and normalize the data set, then build the cluster distance matrix.
# agg_clustering also prints the minimum distance and the pair that has it.
df = get_data("data/planets.csv")
dm = agg_clustering(df)


1.49384097617e-05
[5, 6]


In [99]:
# Sanity check: look up the matrix entry for the pair reported by the run above.
dm[5][6]

1.4938409761683472e-05