## to import GraphFrame
##### download and unzip tar spark-1.6.3-bin-hadoop2.6
##### export SPARK_HOME="/usr/local/bin/spark-1.6.3-bin-hadoop2.6"
##### export PATH=/home/vagrant/hadoop

In [1]:
import random, operator, subprocess
from pyspark.sql.types import *
# from pyspark import SparkContext
# sc =SparkContext()

In [2]:
rdd = sc.textFile('data-smaller.csv') \
        .map(lambda line: line.split(',')) \
        .map(lambda elements: tuple([int(elements[i]) for i in range(len(elements))])) \
        .cache()

k = 10
dimension = 3
headers = ['age', 'height', 'weight', 'blood_sugar_level', 'child', 'exercise_hours']
# max_cluster = rdd.count() / k
# min_cluster = rdd.count() / (2*k-1)
loop_for_converge = 20
different_combination = 30

In [3]:
def dist(x, y):
    return sum([abs(x[i]-y[i]) for i in range(dimension)])

def get_nearest_centroid_idx(x, centroids):
    dists = {}
    for cluster in centroids:
        dists[cluster] = dist(x, centroids[cluster])
        
    cluster = min(dists, key=dists.get)
    return cluster

def assign_to_cluster(pt, available_centroids):
    nearest_centroid = get_nearest_centroid_idx(pt, available_centroids)
    return (nearest_centroid, ([pt], [dist(pt, available_centroids[nearest_centroid])]))

def calculate_pts_sum(pts):
    pts_sum = [0 for _ in range(dimension)]
    for pt in pts:
        for i in range(dimension):
            pts_sum[i] += pt[i]
    return pts_sum

def calculate_centroid(pts_sum, nb_pts):
    nb_pts = float(nb_pts)
    return [pts_sum[i]/nb_pts for i in range(dimension)]

def calculate_cost(pts, centroid):
    cost = 0
    for pt in pts:
        cost += dist(pt, centroid)
    return cost

def is_converge(old_cens, new_cens):
    diff = 0
    old_sum = 0
    for i in range(dimension):
        old_cen = old_cens[i]
        new_cen = new_cens[i]
        for j in range(dimension):
            diff += abs(new_cen[j] - old_cen[j])
            old_sum += old_cen[j]
    return abs(float(diff) / old_sum) < 0.000001

def write_to_output(assignment, centroids):
    tmp = assignment.flatMap(lambda (cluster, pts): [centroids[cluster] for _ in range(len(pts))])
    sqlContext.createDataFrame(tmp, headers[:dimension]).save('output.txt', mode='overwrite')
    
def calc_error(cluster_data):
    '''
    cluster_data : (cluster_id, [list of row of pts])
    '''
    #print cluster_data
    pts=cluster_data[1]
    pts_sum= [0 for _ in range(dimension)]
    for pt in pts:
        for i in range(dimension):
            pts_sum[i]=pts_sum[i]+pt[i]
    avg_di = [pts_sum[i]/float(len(cluster_data)) for i in range(dimension)]
    error = 0
    for pt in pts:
        error = error + dist(pt,avg_di)
    return (avg_di, error)

In [4]:
min_cost_rdd = None
min_cost = float('inf')

In [5]:
eps = 17
minPts = k
visited = []

In [6]:
from collections import deque

def fromCluster(pt1):
    print "from cluster"
    clusterSet=[]
    print pt1
    if (pt1 not in visited):
        if (pt1 in ptsFullNeighborDict):
            visited += pt1
            queue = deque(ptsFullNeighborDict[pt1]) # ptsFullNeighborDict include itself #still a list
            print queue
            while (not queue.empty()):
                nextpt = queue.dequeue()
                if (nextpt in visited): 
                    # TODO:
                    # need to merge cluster since it is parallel
                    # may fail to achieve k-anonymity if don't merge
                    # neighbor of neighbor -> can ignore
                    pass
                else:
                    visited += nextpt
                    clusterSet += nextpt
                    if (nextpt in ptsFullNeighborDict):
                        for nn_pt in ptsFullNeighborDict[nextpt]:
                            queue.push(nn_pt)
    return clusterSet # still not mark as visit, may be visited by other point

In [7]:
ptsFullNeighborRDD=rdd.cartesian(rdd)\
                        .filter(lambda (pt1,pt2): dist(pt1,pt2)<eps)\
                        .map(lambda (pt1,pt2):(pt1,[pt2]))\
                        .reduceByKey(lambda pts1,pts2: pts1+pts2)\
                        .filter(lambda (pt, pts): len(pts)>=minPts)
print ptsFullNeighborRDD.take(3)


def flattenPair(pt,pts):
    # print pts
    pairs=[]
    for neighbor in pts:
        pairs += [(pt,neighbor)]
        print pairs
    return pairs

edgeRDD=ptsFullNeighborRDD.flatMap(lambda (pt,pts):flattenPair(pt,pts))

[((43, 183, 1), [(40, 179, 3), (35, 181, 1), (37, 175, 3), (32, 188, 1), (48, 178, 3), (41, 178, 3), (53, 189, 1), (34, 180, 3), (38, 185, 1), (43, 183, 1), (47, 189, 3)]), ((74, 167, 3), [(74, 177, 4), (68, 161, 2), (71, 166, 4), (66, 170, 3), (74, 180, 1), (87, 166, 5), (74, 162, 4), (74, 157, 2), (74, 167, 3), (68, 166, 5)]), ((74, 157, 2), [(68, 161, 2), (71, 166, 4), (76, 149, 4), (77, 147, 1), (74, 162, 4), (74, 157, 2), (66, 153, 4), (75, 150, 5), (74, 167, 3), (82, 153, 3)])]


In [8]:
from graphframes import *
vertics = sqlContext.createDataFrame(rdd.map(lambda pt: (pt, "pt")),['id','name'])
edges = sqlContext.createDataFrame(edgeRDD,['src','dst'])
graph = GraphFrame(vertics, edges)
sc.setCheckpointDir("checkpoint") # required for connectedComponents version > 0.3
result = graph.connectedComponents()
print result.take(3)

[Row(id=Row(_1=29, _2=151, _3=3), name=u'pt', component=85899345920L), Row(id=Row(_1=40, _2=179, _3=3), name=u'pt', component=85899345920L), Row(id=Row(_1=17, _2=147, _3=5), name=u'pt', component=85899345920L)]


In [9]:
print rdd.take(3)
resultRDD = result.rdd.map(tuple).map(lambda (row_pt, name, component):(tuple(row_pt),component))
# TODO left outer join the original vertic point so that preserve 2 point with same location
# FullResult = rdd.leftOuterJoin(resultRDD)
groupRDD= resultRDD.map(lambda (id_pt,component):(component,[id_pt])).reduceByKey(lambda pt1,pt2:pt1+pt2)
noiseRDD= groupRDD.filter(lambda (component, pts):len(pts)<k).flatMap(lambda (component, pts):pts).cache()
print "noise: ",noiseRDD.count(), "\n", noiseRDD.take(3)
clusterRDD = groupRDD.filter(lambda (component, pts):len(pts)>=k)
print "number of cluster:", clusterRDD.count()
print "example of clusters:", clusterRDD.take(3)

[(29, 151, 3), (40, 179, 3), (17, 147, 5)]
noise:  9 
[(12, 190, 5), (53, 149, 4), (98, 178, 2)]
number of cluster: 2
example of clusters: [(25769803776L, [(86, 140, 4), (79, 183, 4), (55, 149, 4), (100, 142, 3), (74, 177, 4), (90, 140, 5), (68, 161, 2), (88, 161, 5), (71, 166, 4), (83, 184, 2), (76, 149, 4), (83, 183, 2), (68, 178, 2), (66, 170, 3), (74, 180, 1), (88, 185, 3), (87, 166, 5), (77, 147, 1), (66, 189, 2), (58, 153, 5), (57, 178, 2), (61, 171, 2), (95, 153, 4), (74, 162, 4), (85, 175, 3), (89, 171, 2), (97, 140, 1), (93, 142, 2), (74, 157, 2), (66, 153, 4), (89, 155, 4), (75, 150, 5), (85, 182, 2), (54, 154, 5), (74, 167, 3), (100, 143, 1), (68, 166, 5), (94, 142, 5), (93, 173, 5), (91, 179, 3), (57, 178, 4), (82, 147, 1), (81, 141, 2), (64, 190, 2), (96, 150, 4), (80, 142, 2), (82, 153, 3), (54, 152, 4)]), (85899345920L, [(29, 151, 3), (40, 179, 3), (17, 147, 5), (13, 178, 4), (27, 154, 3), (23, 143, 3), (17, 156, 5), (25, 158, 5), (24, 184, 5), (13, 152, 1), (19, 161, 4)

In [10]:
print clusterRDD.map(calc_error).take(3)
print "error without noise", clusterRDD.map(calc_error).map(lambda (c,e):e).reduce(lambda e1,e2:e1+e2)

[([1873.5, 3884.5, 75.5], 268341.0), ([484.0, 3075.0, 56.0], 122910.0)]
error without noise 391251.0
