In [2]:
# The usual preamble
%matplotlib inline

import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
from numpy import array
from math import sqrt

from pyspark.mllib.clustering import KMeans, KMeansModel
# Make the graphs a bit prettier, and bigger
pd.set_option('display.mpl_style', 'default')
plt.rcParams['figure.figsize'] = (15, 5)
plt.rcParams['font.family'] = 'sans-serif'

# This is necessary to show lots of columns in pandas 0.12. 
# Not necessary in pandas 0.13.
pd.set_option('display.width', 5000) 
pd.set_option('display.max_columns', 60)

In [4]:
services = pd.read_csv('/home/ponighzwa/Desktop/big data/complete1.csv', dtype={'longitude': str})

In [5]:
services['latitude'].unique()

array(['29.8830556', '29.38421', '53.2', ..., '44.1269444', '39.078889',
       '50.465843'], dtype=object)

In [6]:
rows_with_slash = services['latitude'].str.contains('/').fillna(False)
len(services[rows_with_slash])

125

In [7]:
services['latitude'][rows_with_slash] = '0'

In [8]:
rows_with_slash = services['latitude'].str.contains('/').fillna(False)
len(services[rows_with_slash])

0

In [9]:
services['latitude'].unique()

array(['29.8830556', '29.38421', '53.2', ..., '44.1269444', '39.078889',
       '50.465843'], dtype=object)

In [10]:
services['longitude'].unique()

array(['-97.9411111', '-98.581082', '-2.916667', ..., '-78.427222',
       '-113.783333', '22.891814'], dtype=object)

In [11]:
print type(services['longitude'])
print type(services['latitude'])
print len(services['longitude'])
print len(services['latitude'])

<class 'pandas.core.series.Series'>
<class 'pandas.core.series.Series'>
88875
88875


In [12]:
print len(services[services['longitude'] == '0'])

1690


In [13]:
services[services['longitude'] == '0'].index

Int64Index([   19,    42,    50,    76,    88,    97,   102,   219,   310,   322,
            ...
            88148, 88151, 88177, 88178, 88267, 88554, 88627, 88632, 88680, 88819], dtype='int64', length=1690)

In [14]:
jumlahN = len(services[services['longitude'] == '0'].index)
jumlahN

1690

In [15]:
services['longitude']

0         -97.9411111
1          -98.581082
2           -2.916667
3         -96.6458333
4        -157.8036111
5         -82.1888889
6               -3.18
7         -73.4083333
8         -86.2861111
9         -82.9841667
10       -118.3516667
11        -82.7344444
12           -73.5275
13        -83.7619444
14        -80.2536111
15        -83.3219444
16        -83.2330556
17        -72.1936111
18         -64.678611
19                  0
20        -71.5666667
21               -3.2
22        -98.6597222
23            -79.975
24       -122.3522222
25          -0.783333
26        -98.4933333
27        -85.7594444
28        -73.7133333
29        -73.9213889
             ...     
88845     -74.0063889
88846     -79.1216667
88847    -119.2922222
88848     -79.9311111
88849     -83.8241667
88850     -76.2855556
88851     -78.8786111
88852     -80.6080556
88853    -117.1563889
88854     -75.9272222
88855      -78.427222
88856          -97.82
88857     -113.783333
88858     -92.0163889
88859     

In [16]:
services.drop(services[services['longitude'] == '0'].index).reset_index(drop=True)

Unnamed: 0,datetime,city,state,country,shape,duration (seconds),duration (hours/min),comments,date posted,latitude,longitude
0,10/10/1949 20:30,san marcos,tx,us,cylinder,2700,45 minutes,This event took place in early fall around 194...,4/27/2004,29.8830556,-97.9411111
1,10/10/1949 21:00,lackland afb,tx,,light,7200,1-2 hrs,1949 Lackland AFB&#44 TX. Lights racing acros...,12/16/2005,29.38421,-98.581082
2,10/10/1955 17:00,chester (uk/england),,gb,circle,20,20 seconds,Green/Orange circular disc over Chester&#44 En...,1/21/2008,53.2,-2.916667
3,10/10/1956 21:00,edna,tx,us,circle,20,1/2 hour,My older brother and twin sister were leaving ...,1/17/2004,28.9783333,-96.6458333
4,10/10/1960 20:00,kaneohe,hi,us,light,900,15 minutes,AS a Marine 1st Lt. flying an FJ4B fighter/att...,1/22/2004,21.4180556,-157.8036111
5,10/10/1961 19:00,bristol,tn,us,sphere,300,5 minutes,My father is now 89 my brother 52 the girl wit...,4/27/2007,36.595,-82.1888889
6,10/10/1965 21:00,penarth (uk/wales),,gb,circle,180,about 3 mins,penarth uk circle 3mins stayed 30ft above m...,2/14/2006,51.434722,-3.18
7,10/10/1965 23:45,norwalk,ct,us,disk,1200,20 minutes,A bright orange color changing to reddish colo...,10/2/1999,41.1175,-73.4083333
8,10/10/1966 20:00,pell city,al,us,disk,180,3 minutes,Strobe Lighted disk shape object observed clos...,3/19/2009,33.5861111,-86.2861111
9,10/10/1966 21:00,live oak,fl,us,disk,120,several minutes,Saucer zaps energy from powerline as my pregna...,5/11/2005,30.2947222,-82.9841667


In [17]:
finalLat=services.drop(services[services['longitude'] == '0'].index).reset_index(drop=True)['latitude']

In [18]:
finalLong=services.drop(services[services['longitude'] == '0'].index).reset_index(drop=True)['longitude']

In [19]:
listLat = finalLat.tolist()
listLong = finalLong.tolist()

In [20]:
listFinal = np.column_stack((listLat, listLong))

In [21]:
type(listFinal)

numpy.ndarray

In [22]:
import os
import sys

# 1. Mengeset variabel yang menyimpan lokasi di mana Spark diinstal
spark_path = "/opt/spark"

# 2. Menentukan environment variable SPARK_HOME
os.environ['SPARK_HOME'] = spark_path

# 3. Simpan lokasi winutils.exe sebagai environment variable HADOOP_HOME
os.environ['HADOOP_HOME'] = spark_path

# 4. Lokasi Python yang dijalankan --> punya Anaconda
#    Apabila Python yang diinstall hanya Anaconda, maka tidak perlu menjalankan baris ini.
os.environ['PYSPARK_PYTHON'] = sys.executable

# 5. Konfigurasi path library PySpark
sys.path.append(spark_path + "/bin")
sys.path.append(spark_path + "/python")
sys.path.append(spark_path + "/python/pyspark/")
sys.path.append(spark_path + "/python/lib")
sys.path.append(spark_path + "/python/lib/pyspark.zip")
sys.path.append(spark_path + "/python/lib/py4j-0.9-src.zip")

# 6. Import library Spark
#    Dua library yang WAJIB di-import adalah **SparkContext** dan **SparkConf**.
from pyspark import SparkContext
from pyspark import SparkConf

# Setting konfigurasi (opsional)
conf = SparkConf()
conf.set("spark.executor.memory", "2g")
conf.set("spark.cores.max", "4")

sc = SparkContext("local", conf=conf)
#    Apabila berhasil, maka ketika sc di-print akan mengeluarkan nilai <pyspark.context.SparkContext object>
print sc

<pyspark.context.SparkContext object at 0x7f44c0c56390>


In [23]:
rdd = sc.parallelize(listFinal)

In [24]:
rdd.collect()

[array(['29.8830556', '-97.9411111'], 
       dtype='|S12'), array(['29.38421', '-98.581082'], 
       dtype='|S12'), array(['53.2', '-2.916667'], 
       dtype='|S12'), array(['28.9783333', '-96.6458333'], 
       dtype='|S12'), array(['21.4180556', '-157.8036111'], 
       dtype='|S12'), array(['36.595', '-82.1888889'], 
       dtype='|S12'), array(['51.434722', '-3.18'], 
       dtype='|S12'), array(['41.1175', '-73.4083333'], 
       dtype='|S12'), array(['33.5861111', '-86.2861111'], 
       dtype='|S12'), array(['30.2947222', '-82.9841667'], 
       dtype='|S12'), array(['33.9163889', '-118.3516667'], 
       dtype='|S12'), array(['35.2333333', '-82.7344444'], 
       dtype='|S12'), array(['40.6686111', '-73.5275'], 
       dtype='|S12'), array(['37.1536111', '-83.7619444'], 
       dtype='|S12'), array(['35.8238889', '-80.2536111'], 
       dtype='|S12'), array(['36.8430556', '-83.3219444'], 
       dtype='|S12'), array(['42.5377778', '-83.2330556'], 
       dtype='|S12'), array

In [25]:
listSpark = rdd.map(lambda line: ' '.join(line[0:2]))

In [26]:
listSpark.collect()

['29.8830556 -97.9411111',
 '29.38421 -98.581082',
 '53.2 -2.916667',
 '28.9783333 -96.6458333',
 '21.4180556 -157.8036111',
 '36.595 -82.1888889',
 '51.434722 -3.18',
 '41.1175 -73.4083333',
 '33.5861111 -86.2861111',
 '30.2947222 -82.9841667',
 '33.9163889 -118.3516667',
 '35.2333333 -82.7344444',
 '40.6686111 -73.5275',
 '37.1536111 -83.7619444',
 '35.8238889 -80.2536111',
 '36.8430556 -83.3219444',
 '42.5377778 -83.2330556',
 '41.3252778 -72.1936111',
 '32.364167 -64.678611',
 '42.3916667 -71.5666667',
 '51.5 -3.2',
 '38.1055556 -98.6597222',
 '32.8544444 -79.975',
 '45.5827778 -122.3522222',
 '51.783333 -0.783333',
 '29.4238889 -98.4933333',
 '38.2541667 -85.7594444',
 '40.7008333 -73.7133333',
 '41.7002778 -73.9213889',
 '53.970571 -111.689885',
 '32.7152778 -117.1563889',
 '38.7583333 -104.7425',
 '29.7630556 -95.3630556',
 '32.7833333 -96.8',
 '42.9955556 -71.4552778',
 '-38.662334 178.017649',
 '29.7630556 -95.3630556',
 '44.7630556 -85.6205556',
 '41.0338889 -73.7633333',
 '5

In [27]:
parsedData = listSpark.map(lambda line: array([float(x.strip()) for x in line.split(' ')]))
parsedData.collect()

[array([ 29.8830556, -97.9411111]),
 array([ 29.38421 , -98.581082]),
 array([ 53.2     ,  -2.916667]),
 array([ 28.9783333, -96.6458333]),
 array([  21.4180556, -157.8036111]),
 array([ 36.595    , -82.1888889]),
 array([ 51.434722,  -3.18    ]),
 array([ 41.1175   , -73.4083333]),
 array([ 33.5861111, -86.2861111]),
 array([ 30.2947222, -82.9841667]),
 array([  33.9163889, -118.3516667]),
 array([ 35.2333333, -82.7344444]),
 array([ 40.6686111, -73.5275   ]),
 array([ 37.1536111, -83.7619444]),
 array([ 35.8238889, -80.2536111]),
 array([ 36.8430556, -83.3219444]),
 array([ 42.5377778, -83.2330556]),
 array([ 41.3252778, -72.1936111]),
 array([ 32.364167, -64.678611]),
 array([ 42.3916667, -71.5666667]),
 array([ 51.5,  -3.2]),
 array([ 38.1055556, -98.6597222]),
 array([ 32.8544444, -79.975    ]),
 array([  45.5827778, -122.3522222]),
 array([ 51.783333,  -0.783333]),
 array([ 29.4238889, -98.4933333]),
 array([ 38.2541667, -85.7594444]),
 array([ 40.7008333, -73.7133333]),
 array([

In [28]:
model = KMeans.train(parsedData, 4, maxIterations=100, initializationMode="random")

In [31]:
clusterData = parsedData.map(model.predict)
clusterData.collect()

[2,
 2,
 1,
 2,
 3,
 0,
 1,
 0,
 2,
 0,
 3,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 1,
 2,
 0,
 3,
 1,
 2,
 0,
 0,
 0,
 3,
 3,
 2,
 2,
 2,
 0,
 1,
 2,
 0,
 0,
 1,
 0,
 2,
 0,
 2,
 3,
 3,
 3,
 2,
 0,
 0,
 0,
 0,
 2,
 0,
 2,
 1,
 2,
 0,
 2,
 0,
 2,
 0,
 0,
 2,
 3,
 2,
 1,
 0,
 0,
 0,
 3,
 0,
 2,
 3,
 3,
 2,
 3,
 3,
 0,
 3,
 0,
 0,
 0,
 3,
 2,
 2,
 3,
 3,
 3,
 2,
 3,
 2,
 0,
 3,
 0,
 2,
 0,
 0,
 2,
 2,
 2,
 0,
 3,
 3,
 2,
 1,
 3,
 3,
 2,
 3,
 3,
 0,
 2,
 3,
 1,
 3,
 0,
 0,
 3,
 0,
 1,
 2,
 3,
 0,
 0,
 3,
 0,
 0,
 2,
 1,
 2,
 0,
 0,
 2,
 2,
 0,
 0,
 0,
 3,
 2,
 1,
 0,
 0,
 2,
 0,
 2,
 2,
 3,
 3,
 0,
 0,
 3,
 0,
 1,
 2,
 0,
 3,
 0,
 2,
 0,
 2,
 3,
 0,
 2,
 0,
 2,
 2,
 1,
 0,
 2,
 0,
 3,
 3,
 0,
 2,
 0,
 3,
 3,
 0,
 0,
 2,
 0,
 0,
 0,
 0,
 0,
 2,
 2,
 2,
 0,
 2,
 2,
 1,
 0,
 2,
 0,
 3,
 2,
 0,
 2,
 3,
 3,
 0,
 2,
 0,
 0,
 3,
 2,
 0,
 3,
 2,
 0,
 3,
 2,
 3,
 0,
 3,
 0,
 0,
 2,
 2,
 3,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 3,
 0,
 0,
 3,
 2,
 0,
 2,
 3,
 3,
 0,
 3,
 0,
 3,
 0,
 3,
 3,
 0,
 1,
 0,
 0,
 0,


In [None]:
import numpy as np
import matplotlib.pyplot as plt

X = parsedData.collect()
label = clusterData.collect()

for point in range(0,len(label)):
    if (label[point]==0):
        color = "r"
    elif (label[point]==1):
        color = "g"
    elif (label[point]==2):
        color = "b"
    elif (label[point]==3):
        color = "y"
    lines = plt.plot(X[point][0], X[point][1], 'ro')
    plt.setp(lines, color=color, linewidth=2.0)
plt.show()