In [1]:
from pyspark.sql import SQLContext
from pyspark.sql import DataFrameNaFunctions
import pandas as pd
from pyspark.ml import Pipeline

In [2]:
sqlContext = SQLContext(sc)

In [3]:
dataDF = sqlContext.read.load('file:///home/cloudera/coursera/courseraDataSimulation/course4-ML/MW-timestamp-NEW.csv', 
                          format='com.databricks.spark.csv', 
                          header='true',inferSchema='true')

In [4]:
dataDF.describe().toPandas().transpose()

Unnamed: 0,0,1,2,3,4
summary,count,mean,stddev,min,max
rowID,1587257,793628.0,458201.7724491035,0,1587256
air_pressure,1587257,916.8301266904964,3.051593126680745,905.0,929.5
air_temp,1587257,61.85144042834878,11.833623786835721,31.64,99.5
avg_wind_direction,1586824,161.96537927331576,95.20811970203971,0.0,359.0
avg_wind_speed,1586824,2.774272067979844,2.0607577935630355,0.0,32.3
max_wind_direction,1586824,163.40304784903682,92.3672342806429,0.0,359.0
max_wind_speed,1586824,3.3998134008569685,2.423167433617133,0.1,36.0
min_wind_direction,1586824,166.82637078844283,97.46274620077615,0.0,359.0
min_wind_speed,1586824,2.1331304542917913,1.745345084932679,0.0,32.0


In [5]:
dataDF.count()

1587257

In [6]:
#Down sample to take every 10th value
filtered = dataDF.filter((dataDF.rowID % 10) == 0)

In [7]:
filtered.count()

158726

In [11]:
#SELECT required columns and drop rows with NULL

cols = [
 'air_pressure', 'air_temp', 'avg_wind_direction', 'avg_wind_speed', 'max_wind_direction', 'max_wind_speed',
 'relative_humidity']

workingDF = filtered.select(cols).na.drop()

In [12]:
workingDF.take(2)

[Row(air_pressure=912.3, air_temp=64.76, avg_wind_direction=97.0, avg_wind_speed=1.2, max_wind_direction=106.0, max_wind_speed=1.6, relative_humidity=60.5),
 Row(air_pressure=912.3, air_temp=62.24, avg_wind_direction=144.0, avg_wind_speed=1.2, max_wind_direction=167.0, max_wind_speed=1.8, relative_humidity=38.5)]

In [13]:
#CONVERT SPEED from meters/sec to miles per hour
workingDF = workingDF.withColumn('avg_wind_speed', workingDF.avg_wind_speed * 2.236)
workingDF = workingDF.withColumn('max_wind_speed', workingDF.max_wind_speed * 2.236)

In [14]:
workingDF.columns

['air_pressure',
 'air_temp',
 'avg_wind_direction',
 'avg_wind_speed',
 'max_wind_direction',
 'max_wind_speed',
 'relative_humidity']

In [30]:
#ASSEMBLE all the features.

from pyspark.ml.feature import VectorAssembler

assembler = VectorAssembler(inputCols=cols, outputCol="features_pre")
assembled = assembler.transform(workingDF)

In [31]:
#SCALE the dataset.

from pyspark.ml.feature import StandardScaler

scaler = StandardScaler(inputCol="features_pre", outputCol="features",
                        withStd=True, withMean=True)

# Fit the StandardScaler
scalerModel = scaler.fit(assembled)

# Normalize each feature
scaledData = scalerModel.transform(assembled)

In [33]:
#Perform Clustering.

from pyspark.ml.clustering import KMeans
howManyClusters = 12

# Trains a k-means model.
kmeans = KMeans(k=howManyClusters, seed = 1)setK(2)
model = kmeans.fit(scaledData)


In [76]:
#How many members per clusters
import pyspark.sql.functions as func

transformed = model.transform(scaledData).select("features", "prediction")
transformed.groupby(transformed.prediction).agg(func.count('prediction')).show()

+----------+-----------------+
|prediction|count(prediction)|
+----------+-----------------+
|         0|            10573|
|         1|            13422|
|         2|            14833|
|         3|            18638|
|         4|             4633|
|         5|            15793|
|         6|            11535|
|         7|            18995|
|         8|            25630|
|         9|            11968|
|        10|             2249|
|        11|            10411|
+----------+-----------------+



In [50]:
# Print cluster centers.

print(cols)
centers = model.clusterCenters()
print("\nCluster Centers: \n")
for center in centers:
    print(center)

['air_pressure', 'air_temp', 'avg_wind_direction', 'avg_wind_speed', 'max_wind_direction', 'max_wind_speed', 'relative_humidity']

Cluster Centers: 

[ 1.3043177  -0.24030114 -1.15387809  1.73371066 -1.04692201  1.83870394
 -1.12200481]
[ 0.15751205 -0.74345552 -1.21984653 -0.54988543 -1.07080566 -0.56542011
  0.76523163]
[-0.84408569 -1.18714659  0.35906729  0.37705119  0.48557782  0.3622125
  1.3504302 ]
[-0.22585128  0.64177068  0.40939938  0.70750203  0.51838698  0.64736436
 -0.14786832]
[ 0.1316567   0.84599709  1.89333604 -0.62811164 -1.54748661 -0.55455231
 -0.75396777]
[ 0.20473291 -0.9831286   0.6422715  -0.54885846  0.8595796  -0.53230441
  1.16837617]
[ 0.97090599  0.23516064  0.21466114 -0.49957322  0.40200752 -0.5031493
 -0.80817985]
[-0.61793223  0.61682555  0.13398891 -0.6199921   0.30534198 -0.63439095
 -0.19456452]
[ 0.28451552  0.6349466  -1.30641379 -0.47416785 -1.167003   -0.48287242
 -0.79755924]
[ 0.13321964  0.86068327  1.37398803 -0.63856793  1.63826957 -0.58949

In [32]:
# Evaluate Within Set Sum of Squared Errors.

wssse = model.computeCost(scaledData)
print("Within Set Sum of Squared Errors = " + str(wssse))

AttributeError: 'KMeansModel' object has no attribute 'computeCost'