In [1]:
# Konfigurasi Spark
import os
import sys

# 1. Mengeset variabel yang menyimpan lokasi di mana Spark diinstal
spark_path = "C:/spark"

# 2. Menentukan environment variable SPARK_HOME
os.environ['SPARK_HOME'] = spark_path

# 3. Simpan lokasi winutils.exe sebagai environment variable HADOOP_HOME
os.environ['HADOOP_HOME'] = spark_path

# 4. Lokasi Python yang dijalankan --> punya Anaconda
#    Apabila Python yang diinstall hanya Anaconda, maka tidak perlu menjalankan baris ini.
os.environ['PYSPARK_PYTHON'] = sys.executable

# 5. Konfigurasi path library PySpark
sys.path.append(spark_path + "/bin")
sys.path.append(spark_path + "/python")
sys.path.append(spark_path + "/python/pyspark/")
sys.path.append(spark_path + "/python/lib")
sys.path.append(spark_path + "/python/lib/pyspark.zip")
sys.path.append(spark_path + "/python/lib/py4j-0.10.4-src.zip")

# 6. Import library Spark
#    Dua library yang WAJIB di-import adalah **SparkContext** dan **SparkConf**.
from pyspark import SparkContext
from pyspark import SparkConf

# Setting konfigurasi (opsional)
conf = SparkConf()
conf.set("spark.executor.memory", "2g")
conf.set("spark.cores.max", "4")

sc = SparkContext("local", conf=conf)
#    Apabila berhasil, maka ketika sc di-print akan mengeluarkan nilai <pyspark.context.SparkContext object>
print sc

<pyspark.context.SparkContext object at 0x0000000004379F60>


In [65]:
from numpy import array
from math import sqrt
import re
import matplotlib.pyplot as plt
import numpy as np

from pyspark.mllib.clustering import KMeans, KMeansModel

In [49]:
data = sc.textFile('data/scrubbed.csv')

In [50]:
# Remove header
data = data.filter(lambda x:"latitude" not in x)

In [51]:
# Split and get latitude and longitude
parsedData = data.map(lambda row: array([x for x in row.split(',')[9:11]]))
# Clean Data
# Remove empty data
parsedData = parsedData.filter(lambda x:  x[0] != '' and x[0] != '0' and x[1] != '' and x[1] != '0')
# Filter data
# Only accept X and X.Y
parsedData = parsedData.filter(lambda x: re.match('^[-+]?[0-9]*\.?[0-9]+$', x[0]) and re.match('[-+]?[0-9]*\.?[0-9]+', x[1]))
# Cast to float
parsedData = parsedData.map(lambda x: array([float(x[0]), float(x[1])]))

In [85]:
clusters = KMeans.train(parsedData, 5, maxIterations=10, initializationMode="random")

In [86]:
def error(point):
    center = clusters.centers[clusters.predict(point)]
    return sqrt(sum([x**2 for x in (point - center)]))

In [87]:
WSSSE = parsedData.map(lambda point: error(point)).reduce(lambda x, y: x + y)
print("Within Set Sum of Squared Error = " + str(WSSSE))

Within Set Sum of Squared Error = 766728.134532


In [88]:
clusterData = parsedData.map(clusters.predict)

In [89]:
label = clusterData.collect()

In [90]:
len(label)

80325

In [93]:
len(parsedData.collect())

80325

In [94]:
parsedDataCollection = parsedData.collect()

In [95]:
parsedDataWithLabel = map(lambda(i,x): array([x[0], x[1], label[i]]), enumerate(parsedDataCollection))

In [96]:
parsedDataWithLabel

[array([ 29.8830556, -97.9411111,   3.       ]),
 array([ 29.38421 , -98.581082,   3.      ]),
 array([ 53.2     ,  -2.916667,   4.      ]),
 array([ 28.9783333, -96.6458333,   3.       ]),
 array([  21.4180556, -157.8036111,    1.       ]),
 array([ 36.595    , -82.1888889,   0.       ]),
 array([ 51.434722,  -3.18    ,   4.      ]),
 array([ 41.1175   , -73.4083333,   2.       ]),
 array([ 33.5861111, -86.2861111,   0.       ]),
 array([ 30.2947222, -82.9841667,   0.       ]),
 array([  33.9163889, -118.3516667,    1.       ]),
 array([ 35.2333333, -82.7344444,   0.       ]),
 array([ 40.6686111, -73.5275   ,   2.       ]),
 array([ 37.1536111, -83.7619444,   0.       ]),
 array([ 35.8238889, -80.2536111,   0.       ]),
 array([ 36.8430556, -83.3219444,   0.       ]),
 array([ 42.5377778, -83.2330556,   2.       ]),
 array([ 41.3252778, -72.1936111,   2.       ]),
 array([ 32.364167, -64.678611,   2.      ]),
 array([ 42.3916667, -71.5666667,   2.       ]),
 array([ 51.5,  -3.2,   4.

In [None]:
colors = ['r', 'g', 'b', 'y', 'k']
shapes = ['.', 'o', 'v', '*', 'x']
for current in parsedDataWithLabel:
    plt.setp(plt.plot(current[0], current[1], shapes[int(current[2])]), color=colors[int(current[2])], linewidth=2.0)

  agg_filter: unknown
  alpha: float (0.0 transparent through 1.0 opaque) 
  animated: [True | False] 
  antialiased or aa: [True | False] 
  axes: an :class:`~matplotlib.axes.Axes` instance 
  clip_box: a :class:`matplotlib.transforms.Bbox` instance 
  clip_on: [True | False] 
  clip_path: [ (:class:`~matplotlib.path.Path`, :class:`~matplotlib.transforms.Transform`) | :class:`~matplotlib.patches.Patch` | None ] 
  color or c: any matplotlib color 
  contains: a callable function 
  dash_capstyle: ['butt' | 'round' | 'projecting'] 
  dash_joinstyle: ['miter' | 'round' | 'bevel'] 
  dashes: sequence of on/off ink in points 
  drawstyle: ['default' | 'steps' | 'steps-pre' | 'steps-mid' | 'steps-post'] 
  figure: a :class:`matplotlib.figure.Figure` instance 
  fillstyle: ['full' | 'left' | 'right' | 'bottom' | 'top' | 'none'] 
  gid: an id string 
  label: string or anything printable with '%s' conversion. 
  linestyle or ls: ['solid' | 'dashed', 'dashdot', 'dotted' | (offset, on-off-dash