# Lab 14: Using K-means clustering to identify crime hot-spots

As always, setup a Spark Context and HiveContext:

In [1]:
# Set up Spark Context
from pyspark import SparkContext, SparkConf

SparkContext.setSystemProperty('spark.executor.memory', '2g')
conf = SparkConf()
conf.set('spark.executor.instances', 15)
sc = SparkContext('yarn-client', 'Spark-lab14', conf=conf)

from pyspark.sql import HiveContext
hc = HiveContext(sc)
hc.sql("use demo")

DataFrame[result: string]

From the "crimes" table create a Spark RDD. 
* Each element of the RDD should hold a NumPy array with two values: longitude and latitude of the crime event. 
* Recall how some events have invalid longitude/latitude values - remove any events with invalid longitude/latitude values.

In [2]:
from numpy import array
crimes = hc.table('crimes').filter('latitude > 37 and latitude < 38') \
           .map(lambda row: array([float(row.longitude), float(row.latitude)])).cache()

Use ML-Lib's K-means clustering algorithm to cluster the data into either 10 clusters (1 per district) or 37 clusters (1 per neighborhood):

In [3]:
from pyspark.mllib.clustering import KMeans, KMeansModel
from math import sqrt

numClusters = 37   # Or 10 if you so choose
clusters = KMeans.train(crimes, numClusters, maxIterations=20, runs=10, initializationMode="k-means||")

In clustering, a popular measure to evaluate the compactness of clusters is the Within-Set-Sum-Of-Squared-Error (aka WSSSE). We will not compute this metric for our clustering result

In [4]:
# Evaluate clustering by computing Within Set Sum of Squared Errors
def error(point):
    center = clusters.centers[clusters.predict(point)]
    return sqrt(sum([x**2 for x in (point - center)]))

mean_WSSSE = crimes.map(lambda point: error(point)).reduce(lambda x, y: x + y) / float(crimes.count())
print("Within Set Sum of Squared Error = " + str(mean_WSSSE))

Within Set Sum of Squared Error = 0.0047798139527


We now define again the helper functions for map drawing with Folium:

In [5]:
from IPython.display import HTML
map_width=1000
map_height=600

def inline_map(m, width=map_width, height=map_height):
    m.create_map()
    srcdoc = m.HTML.replace('"', '&quot;')
    embed = HTML('<iframe srcdoc="{}" '
                 'style="width: {}px; height: {}px; '
                 'border: none"></iframe>'.format(srcdoc, width, height))
    return embed

Use Spark and the resulting clusters to create a Pandas dataframe that counts the number of data points that belong to each cluster. 

In [6]:
import pandas as pd

cdata = crimes.map(lambda point: (clusters.predict(point), 1)) \
               .reduceByKey(lambda a,b: a+b).collect()
counts = pd.DataFrame(cdata, columns=['cluster', 'count']).set_index('cluster')['count'].to_dict()

Plot a map of SF using Folium and add a marker at each crime hotspot, showing the number of crimes in that centroid

In [7]:
import folium

sf_lat = 37.77
sf_long = -122.4

map_sf = folium.Map(location=[sf_lat, sf_long], zoom_start=12, width=map_width, height=map_height)
for cl,ctr in enumerate(clusters.centers):
    num = counts[cl]
    map_sf.circle_marker(location=[ctr[1], ctr[0]], radius = num/300, 
                         popup = "hotspot %d, n = %d" % (cl, num),
                         line_color='#3186cc', fill_color='#3186cc', fill_opacity=0.5)
inline_map(map_sf)