### BA8A | Implement FarthestFirstTraversal 

In [1]:
# Load the BA8A dataset: the first line holds the two integers k and m, and
# every following non-blank line holds the coordinates of one data point.
with open('../data/rosalind_ba8a.txt') as f:
    k, m = [int(num) for num in f.readline().split()]
    # Skip blank lines so a stray empty line cannot become an empty point ().
    points = [tuple(map(float, line.split())) for line in f if line.strip()]

In [2]:
def FarthestFirstTraversal(points, k):
    """Pick up to k centers from points with the farthest-first heuristic.

    The first point seeds the centers. Each following center is the point
    whose distance to its nearest already-chosen center (MinDist) is the
    largest. The chosen centers are printed one per line, coordinates
    separated by single spaces, in the order in which they were selected.

    Args:
        points: sequence of hashable coordinate tuples.
        k: number of centers requested.

    Returns:
        The set of selected centers. It is empty when points is empty and
        holds fewer than k entries when points contains fewer than k
        distinct elements.
    """
    if not points:
        return set()
    # The set gives the return value callers expect; the list remembers the
    # selection order, which a set does not preserve.
    ordered = [points[0]]
    centers = set(ordered)
    while len(centers) < k:
        datapoint = max(points, key=lambda point: MinDist(point, centers))
        if datapoint in centers:
            # Even the farthest point is already a center, so no new center
            # can be added; stop instead of looping forever.
            break
        ordered.append(datapoint)
        centers.add(datapoint)
    for center in ordered:
        # Parenthesized single-argument print works on Python 2 and 3.
        print(' '.join(str(c) for c in center))
    return centers

In [3]:
def EuclideanDist(v,w):
    """Return the Euclidean distance between the points v and w.

    The number of coordinates is taken from v, so only the first len(v)
    coordinates of w are read.
    """
    squared_total = sum((coord - w[axis]) ** 2 for axis, coord in enumerate(v))
    return squared_total ** 0.5

In [4]:
def MinDist(datapoint, centers):
    """Return the distance from datapoint to the nearest element of centers.

    Distances are measured with EuclideanDist; min() raises ValueError when
    centers is empty.
    """
    distances = (EuclideanDist(datapoint, center) for center in centers)
    return min(distances)

In [5]:
# Run the traversal on the loaded BA8A data. The call prints the chosen
# centers, and the set it returns is displayed as the cell's output.
FarthestFirstTraversal(points, k)

0.0 0.0
0.0 5.0
5.0 5.0


{(0.0, 0.0), (0.0, 5.0), (5.0, 5.0)}

### BA8B | Squared Error Distortion Problem

In [6]:
# Load the BA8B dataset: a header line with the integers k and m, then one
# center per line up to a separator line containing '--', then one data point
# per line.
with open('../data/rosalind_ba8b.txt') as f:
    k, m = [int(num) for num in f.readline().split()]
    centers = set()
    for line in f:
        if '--' in line:
            break
        # Skip blank lines so they cannot be stored as an empty center ().
        if line.strip():
            centers.add(tuple(map(float, line.split())))
    # Everything after the separator is data; blank lines are ignored so they
    # do not count as extra points.
    points = [tuple(map(float, line.split())) for line in f if line.strip()]

In [7]:
# NOTE: this re-defines the EuclideanDist from the BA8A section with the same
# behavior.
def EuclideanDist(v,w):
    """Return the Euclidean distance between the points v and w.

    The number of coordinates is taken from v, so only the first len(v)
    coordinates of w are read.
    """
    squared_total = sum((coord - w[axis]) ** 2 for axis, coord in enumerate(v))
    return squared_total ** 0.5

In [8]:
# NOTE: this re-defines the MinDist from the BA8A section with the same
# behavior.
def MinDist(datapoint, centers):
    """Return the distance from datapoint to the nearest element of centers.

    Distances are measured with EuclideanDist; min() raises ValueError when
    centers is empty.
    """
    distances = (EuclideanDist(datapoint, center) for center in centers)
    return min(distances)

In [9]:
def Distortion(Data,Centers):
    """Return the squared error distortion of Data with respect to Centers.

    This is the mean, over every point in Data, of the squared distance from
    that point to its nearest center as reported by MinDist.

    Args:
        Data: sized collection of data points.
        Centers: collection of centers.

    Returns:
        The mean squared distance as a float.

    Raises:
        ZeroDivisionError: if Data is empty.
    """
    # Measure against the Centers argument. The previous body read the
    # notebook-level `centers` variable and ignored what the caller passed.
    squared_errors = [MinDist(datapoint, Centers) ** 2 for datapoint in Data]
    return sum(squared_errors) / float(len(Data))

In [10]:
# Evaluate the distortion of the loaded BA8B points against the loaded
# centers; the returned value is displayed as the cell's output.
Distortion(points, centers)

18.24556

### BA8C | Implement the Lloyd algorithm

In [11]:
# Load the BA8C dataset: the first line holds the integers k and m, and every
# following non-blank line holds the coordinates of one data point.
with open('../data/rosalind_ba8c.txt') as f:
    k, m = [int(num) for num in f.readline().split()]
    # Skip blank lines so a stray empty line cannot become an empty point (),
    # which would fail later when its coordinates are indexed.
    points = [tuple(map(float, line.split())) for line in f if line.strip()]

In [12]:
def Gravity(points, m):
    """Return the center of gravity of points as a tuple of m coordinates.

    Coordinate i of the result is the arithmetic mean of coordinate i over
    all points. Dividing by len(points) raises ZeroDivisionError when points
    is empty and m is positive.
    """
    centroid = []
    for axis in range(m):
        axis_total = sum(point[axis] for point in points)
        centroid.append(axis_total / float(len(points)))
    return tuple(centroid)

In [13]:
# Lloyd algorithm: seed the centers with the first k points, then alternate
# the two steps below until the centers stop changing.
centers = points[:k]
clusters = {}
while True:
    # Members are kept in lists rather than sets so that repeated data points
    # each contribute to their cluster's center of gravity.
    for i in range(len(centers)):
        clusters[i] = []
    #centers to clusters
    for datapoint in points:
        # min() returns the lowest index among equally close centers.
        cluster_assign = min(range(len(centers)),
                             key=lambda idx: EuclideanDist(datapoint, centers[idx]))
        clusters[cluster_assign].append(datapoint)
    #clusters to centers
    # Walk the clusters by index so new_centers[i] always belongs to cluster i.
    # A cluster that attracted no points keeps its previous center instead of
    # dividing by zero inside Gravity.
    new_centers = [Gravity(clusters[i], m) if clusters[i] else centers[i]
                   for i in range(len(centers))]

    if new_centers == centers:
        break
    centers = new_centers

In [14]:
# Print each center on its own line, coordinates formatted to three decimals
# and separated by single spaces (no trailing space). The parenthesized
# single-argument print runs unchanged on Python 2 and Python 3.
for c in centers:
    print(' '.join("{:.3f}".format(num) for num in c))

1.800 2.867 
1.044 1.156 


### BA8D | Implement the Soft k-Means Clustering Algorithm

In [15]:
import math
# Load the BA8D dataset: a line with the integers k and m, a line with the
# float b, then one data point per non-blank line.
with open('../data/rosalind_ba8d.txt') as f:
    k, m = [int(num) for num in f.readline().split()]
    b = float(f.readline().strip())
    # Skip blank lines so a stray empty line cannot become an empty point ().
    points = [tuple(map(float, line.split())) for line in f if line.strip()]

In [16]:
# Starting state for the soft k-means cell below: an empty cluster table and
# the first k data points as the initial centers.
clusters = dict()
centers = points[:k]

In [17]:
%%time
# Soft k-means: run a fixed 100 rounds of the two steps below.
for itr in range(100):
    # E-step: for every point, weight each center by exp(-b * distance) and
    # normalize the weights so they sum to 1.
    hidden_mat = []
    for datapoint in points:
        dists = [EuclideanDist(datapoint, center) for center in centers]
        # Shifting by the nearest distance leaves the normalized weights
        # unchanged but makes the largest term exp(0) == 1, so for b >= 0 the
        # normalizer cannot underflow to zero.
        nearest = min(dists) if dists else 0.0
        weights = [math.exp(-b * (d - nearest)) for d in dists]
        dnum = sum(weights)
        hidden_mat.append([w / dnum for w in weights])
    #hidden matrix should be of size k x n
    hidden_mat = [list(vect) for vect in zip(*hidden_mat)]

    # M-step: each center becomes the weighted mean of all points, using that
    # center's row of weights. The row total is computed once per center
    # rather than once per term.
    new_centers = []
    for row in hidden_mat:
        row_total = sum(row)
        new_centers.append(tuple(
            sum(point[x] * weight for point, weight in zip(points, row)) / row_total
            for x in range(m)))
    centers = new_centers

Wall time: 29 ms


In [18]:
# Print each center on its own line, coordinates formatted to three decimals
# and separated by single spaces (no trailing space). The parenthesized
# single-argument print runs unchanged on Python 2 and Python 3.
for c in centers:
    print(' '.join("{:.3f}".format(num) for num in c))

1.662 2.623 
1.075 1.148 


### BA8E | Implement Hierarchical Clustering

In [19]:
# Load the BA8E dataset: the integer n on the first line, then one row of the
# distance matrix per non-blank line.
with open('../data/rosalind_ba8e.txt') as f:
    n = int(f.readline().strip())
    # Build real lists (not lazy map objects) so the rows can be indexed and
    # modified in place on Python 3 as well as Python 2.
    mat = [[float(num) for num in line.split()] for line in f if line.strip()]

# Replace zero distances with infinity, presumably so that the self-distances
# on the diagonal are never chosen as the closest pair.
# NOTE(review): this masks every zero entry, not only the diagonal; confirm
# that distinct items never have distance 0 in the input.
for vect in mat:
    for i in range(len(vect)):
        if vect[i] == 0:
            vect[i] = float('inf')

In [20]:
#initialize clusters
# dist maps a cluster id to its row of distances. The rows are copied so that
# the in-place edits made while merging do not alter mat, which lets this
# cell and the merge loop be re-run without reloading the file.
dist = dict((idx, list(row)) for idx, row in zip(range(n), mat))
# clusters lists every cluster id created so far; curr_clusters lists only the
# ids that have not been merged yet. list() is required because both are
# appended to later and a Python 3 range object has no append().
clusters = list(range(n))
curr_clusters = list(range(n))
# Id of the most recently created cluster; merged clusters get n, n+1, ...
count = n-1
# Maps a merged cluster id to the list of original item ids it contains.
maps = {}

In [21]:
#print dist
# Agglomerative clustering: repeatedly merge the two closest clusters until a
# single one remains, printing the (1-based) members of each merged cluster.
while len(curr_clusters) > 1:
    # find the two clusters that are most close together
    # min_vals[i] is the smallest distance in the row of cluster id i; merged
    # (retired) clusters have all-inf rows and therefore never win.
    min_vals = [min([val for val in dist[point] if val > 0]) for point in clusters]
    closest = min(min_vals)
    c1 = min_vals.index(closest)
    c2 = dist[c1].index(closest)

    # Original item ids in each of the two clusters; an id that is absent from
    # maps is a single original item.
    members1 = maps[c1] if c1 in maps else [c1]
    members2 = maps[c2] if c2 in maps else [c2]
    w1, w2 = len(members1), len(members2)

    # Distances from the merged cluster to every existing id: the average of
    # the two old rows weighted by cluster size, plus inf for the distance of
    # the new cluster to itself.
    new_dist = [(x*w1 + y*w2)/(w1+w2) for x, y in zip(dist[c1], dist[c2])] + [float('inf')]

    # Give every row a column for the new cluster and retire columns c1, c2.
    for i in dist.keys():
        dist[i].append(new_dist[i])
        dist[i][c1] = float('inf')
        dist[i][c2] = float('inf')
    # Retire the rows of the two merged clusters as well.
    dist[c1] = [float('inf')]*len(dist[c1])
    dist[c2] = [float('inf')]*len(dist[c2])

    count += 1
    dist[count] = new_dist
    clusters.append(count)
    curr_clusters.append(count)

    # The new cluster holds the members of c1 followed by the members of c2.
    result = members1 + members2
    maps[count] = list(result)

    curr_clusters.remove(c1)
    curr_clusters.remove(c2)
    # Parenthesized single-argument print works on Python 2 and Python 3.
    print(' '.join([str(x+1) for x in result]))

4 6
5 7
3 4 6
1 2
5 7 3 4 6
1 2 5 7 3 4 6
