## Loading Packages

In [34]:
from pyspark.sql import Row
from pyspark.sql.types import *
from pyspark.ml.clustering import KMeans
from pyspark.ml.feature import VectorAssembler, StringIndexer
from pyspark.sql import functions as F
from datetime import datetime

INPUT_DATA = '../data/'

In [3]:
!head -5 {INPUT_DATA}status.csv

station_id,bikes_available,docks_available,time
2,2,25,2013/08/29 12:06:01
2,2,25,2013/08/29 12:07:01
2,2,25,2013/08/29 12:08:01
2,2,25,2013/08/29 12:09:01


## Loading Data from Mongo

In [4]:
status_df = sqlContext.read.format("com.mongodb.spark.sql.DefaultSource")\
.option("uri","mongodb://54.245.37.88:27017/bikeshare.status").load()
status_df.printSchema()

root
 |-- _id: struct (nullable = true)
 |    |-- oid: string (nullable = true)
 |-- bikes_available: integer (nullable = true)
 |-- docks_available: integer (nullable = true)
 |-- station_id: integer (nullable = true)
 |-- time: string (nullable = true)



## Processing Data

In [22]:
#functions to transform dataframe

def toIntSafe(num):
    try:
        return int(num)
    except ValueError:
        return None
    
def toFloatSafe(num):
    try:
        return float(num)
    except ValueError:
        return None    
    
def toTimeStampSafe(data):
    try:
        if '/' in data:
            return datetime.strptime(data, "%Y/%m/%d %H:%M:%S") 
        elif '-' in data:
            return datetime.strptime(data, "%Y-%m-%d %H:%M:%S") 
    except ValueError:
        return None
    
def get_time_of_day(x):
    x = toTimeStampSafe(x)
    if x:
        hour = x.hour
        if hour >= 6 and hour < 12:
            return 'Morning'
        if hour >= 12 and hour < 16:
            return 'Afternoon'
        if hour >= 16 and hour < 20:
            return 'Evening'
        if hour >= 20 or hour < 6:
            return 'Night'
    else:
        return None
    
def get_day_of_week(x):
    x = toTimeStampSafe(x)
    if x:
        return x.isoweekday()
    else:
        return None
    
def get_month(x):
    x = toTimeStampSafe(x)
    if x:
        return x.month
    else:
        return None

timefunction = udf(lambda x: get_time_of_day(x))
weekfunction = udf(lambda x: get_day_of_week(x), IntegerType())
monthfunction = udf(lambda x: get_month(x), IntegerType())

In [23]:
status_df = status_df.withColumn('bike_util', \
        ((col('docks_available') * 1.0) /(col('bikes_available') + col('docks_available'))))

status = status_df.select('station_id', 'bike_util', timefunction('time').alias('day_part'), \
                 weekfunction('time').alias('day_of_week'), monthfunction('time').alias('month')).cache()

In [24]:
status.show(5)

+----------+------------------+---------+-----------+-----+
|station_id|         bike_util| day_part|day_of_week|month|
+----------+------------------+---------+-----------+-----+
|         2|0.9259259259259259|Afternoon|          4|    8|
|         2|0.9259259259259259|Afternoon|          4|    8|
|         2|0.9259259259259259|Afternoon|          4|    8|
|         2|0.9259259259259259|Afternoon|          4|    8|
|         2|0.9259259259259259|Afternoon|          4|    8|
+----------+------------------+---------+-----------+-----+
only showing top 5 rows



In [25]:
status.count()

71984434

## Sanity Check

In [26]:
status.printSchema()

root
 |-- station_id: integer (nullable = true)
 |-- bike_util: double (nullable = true)
 |-- day_part: string (nullable = true)
 |-- day_of_week: integer (nullable = true)
 |-- month: integer (nullable = true)



In [27]:
status.select('day_of_week').distinct().count()

7

In [31]:
status.select('month').distinct().count()

12

In [32]:
status.select('day_part').distinct().count()

4

## Processing Data

In [35]:
bike_status_final = status.groupBy('station_id', 'month', 'day_part', 'day_of_week')\
                        .agg(F.variance('bike_util').alias('var_util_perc'),
                             F.mean('bike_util').alias('avg_util_perc'))
bike_status_final.show(5)

+----------+-----+--------+-----------+--------------------+------------------+
|station_id|month|day_part|day_of_week|       var_util_perc|     avg_util_perc|
+----------+-----+--------+-----------+--------------------+------------------+
|         5|   12| Evening|          6| 0.01589297288090089|0.6118421052631583|
|        34|   12| Evening|          1|0.021263267644319888| 0.573987154150196|
|        38|   12| Evening|          7|0.005679260136055975| 0.548456790123464|
|        41|   11| Evening|          7|0.005722599626027153|0.5439814814814791|
|        45|    8|   Night|          5|0.023555367191730767|0.7002754820936453|
+----------+-----+--------+-----------+--------------------+------------------+
only showing top 5 rows



In [36]:
bike_status_final.cache()
bike_status_final.count()

23520

## Running K Means

### Converting categorical columns to numerical columns

In [37]:
def implement_string_indexer(cols, df):
    for c in cols:
        si = StringIndexer(inputCol=c, outputCol=c+'_si')
        sm = si.fit(df)
        df = sm.transform(df).drop(c)
        df = df.withColumnRenamed(c + '_si', c)
        return df

cols = ['day_part']

final_df = implement_string_indexer(cols, bike_status_final)

In [38]:
final_df.cache()
final_df.count()
final_df.show(5)

+----------+-----+-----------+--------------------+------------------+--------+
|station_id|month|day_of_week|       var_util_perc|     avg_util_perc|day_part|
+----------+-----+-----------+--------------------+------------------+--------+
|         5|   12|          6| 0.01589297288090089|0.6118421052631583|     0.0|
|        34|   12|          1|0.021263267644319888| 0.573987154150196|     0.0|
|        38|   12|          7|0.005679260136055975| 0.548456790123464|     0.0|
|        41|   11|          7|0.005722599626027153|0.5439814814814791|     0.0|
|        45|    8|          5|0.023555367191730767|0.7002754820936453|     1.0|
+----------+-----+-----------+--------------------+------------------+--------+
only showing top 5 rows



### Removing station_id from input data

In [39]:
station_ids = final_df.select('station_id').collect()
station_ids = [x.asDict()['station_id'] for x in station_ids]
df_without_sid = final_df.drop('station_id')

### Converting input_data to required_format for K Means i.e getting features

In [40]:
input_cols = ['day_of_week', 'avg_util_perc', 'month', 
              'day_part', 'var_util_perc']

va = VectorAssembler(inputCols= input_cols, outputCol= 'features')
df_transformed = va.transform(df_without_sid).select('features')
df_transformed.show(5)

+--------------------+
|            features|
+--------------------+
|[6.0,0.6118421052...|
|[1.0,0.5739871541...|
|[7.0,0.5484567901...|
|[7.0,0.5439814814...|
|[5.0,0.7002754820...|
+--------------------+
only showing top 5 rows



In [108]:
df_transformed.show(5, False)

+------------------------------------------------------+
|features                                              |
+------------------------------------------------------+
|[6.0,0.6118421052631583,12.0,0.0,0.01589297288090089] |
|[1.0,0.573987154150196,12.0,0.0,0.021263267644319888] |
|[7.0,0.548456790123464,12.0,0.0,0.005679260136055975] |
|[7.0,0.5439814814814791,11.0,0.0,0.005722599626027153]|
|[5.0,0.7002754820936453,8.0,1.0,0.023555367191730767] |
+------------------------------------------------------+
only showing top 5 rows



In [41]:
df_transformed.cache()
df_transformed.count()

23520

### Normalising Data

In [70]:
from pyspark.ml.feature import StandardScaler

scaler = StandardScaler(inputCol="features", outputCol="scaledFeatures",
                        withStd=True, withMean=True)

# Compute summary statistics by fitting the StandardScaler
scalerModel = scaler.fit(df_transformed)

# Normalize each feature to have unit standard deviation.
scaledData = scalerModel.transform(df_transformed).select('scaledFeatures').\
                withColumnRenamed('scaledFeatures', 'features')
    
scaledData.cache()
scaledData.count()

23520

### Implementing K-Means

In [84]:
min_wssse = 1e10
final_data = scaledData

for clusters in range(2,4):

    # Trains a k-means model.
    kmeans = KMeans().setK(clusters).setSeed(1)
    model = kmeans.fit(final_data)

    # Evaluate clustering by computing Within Set Sum of Squared Errors.
    wssse = model.computeCost(final_data)
    if wssse <= min_wssse:
        min_wssse = wssse
        best_n_clusters = clusters

In [85]:
print("Best Within Set Sum of Squared Errors = " + str(min_wssse))
print("Best number of clusters = " + str(best_n_clusters))

kmeans = KMeans().setK(best_n_clusters).setSeed(1)
model = kmeans.fit(final_data)

# Shows the result.
centers = model.clusterCenters()
print("Cluster Centers: ")
for center in centers:
    print(center)

Best Within Set Sum of Squared Errors = 86945.6121498
Best number of clusters = 3
Cluster Centers: 
[ 0.13371221 -0.20714931  0.91652861  0.07345454 -0.39622161]
[-0.2390639  -0.06985938  0.00163782 -0.2316089   1.44021768]
[ 0.00832689  0.23807379 -0.87807141  0.06174463 -0.44188773]


In [107]:
for center in centers:
    print(center*scalerModel.mean + scalerModel.std)

[  2.53489137e+00  -1.45322870e-02   9.40956186e+00   1.22823957e+00
   5.35692946e-03]
[ 1.04378693  0.05774723  3.46277178  0.7706444   0.05748734]
[ 2.03335007  0.21986591 -2.25533824  1.2106747   0.00406062]


### Finding Clusters

In [86]:
prediction = model.transform(final_data).select('prediction').collect()
predicted_cluster = [x.asDict()['prediction'] for x in prediction]

In [87]:
from collections import Counter
Counter(predicted_cluster)

Counter({0: 8910, 1: 5296, 2: 9314})

In [88]:
station_id_rd =  sc.parallelize(zip(station_ids, predicted_cluster))

station_id_schema = StructType([StructField('station_id', IntegerType()),
                                StructField('cluster', IntegerType())])

station_id_df = sqlContext.createDataFrame(station_id_rd, station_id_schema)


station_final_cluster = sqlContext.createDataFrame(station_id_df.toPandas().groupby('station_id').\
                                                   agg(lambda x:x.value_counts().index[0]).reset_index())

# station_final_cluster = sqlContext.createDataFrame(station_id_df.toPandas().groupby('station_id').\
#                                                    median().reset_index())

In [89]:
Counter([x.asDict()['cluster'] for x in station_final_cluster.select('cluster').collect()])

Counter({0: 15, 1: 20, 2: 35})

In [104]:
final_clusters = station_final_cluster.groupBy('cluster').\
                    agg(F.sort_array(F.collect_list('station_id')).alias('stations'))

final_clusters.show(3, False)

+-------+---------------------------------------------------------------------------------------------------------------------------------------+
|cluster|stations                                                                                                                               |
+-------+---------------------------------------------------------------------------------------------------------------------------------------+
|0      |[3, 7, 8, 11, 12, 23, 25, 26, 30, 35, 36, 38, 46, 63, 84]                                                                              |
|1      |[39, 48, 50, 54, 55, 57, 60, 61, 64, 65, 66, 68, 69, 70, 71, 72, 73, 74, 76, 77]                                                       |
|2      |[2, 4, 5, 6, 9, 10, 13, 14, 16, 21, 22, 24, 27, 28, 29, 31, 32, 33, 34, 37, 41, 42, 45, 47, 49, 51, 56, 58, 59, 62, 67, 75, 80, 82, 83]|
+-------+-------------------------------------------------------------------------------------------------------------------

In [123]:
station_final_cluster.collect()

[Row(station_id=2, cluster=2),
 Row(station_id=3, cluster=0),
 Row(station_id=4, cluster=2),
 Row(station_id=5, cluster=2),
 Row(station_id=6, cluster=2),
 Row(station_id=7, cluster=0),
 Row(station_id=8, cluster=0),
 Row(station_id=9, cluster=2),
 Row(station_id=10, cluster=2),
 Row(station_id=11, cluster=0),
 Row(station_id=12, cluster=0),
 Row(station_id=13, cluster=2),
 Row(station_id=14, cluster=2),
 Row(station_id=16, cluster=2),
 Row(station_id=21, cluster=2),
 Row(station_id=22, cluster=2),
 Row(station_id=23, cluster=0),
 Row(station_id=24, cluster=2),
 Row(station_id=25, cluster=0),
 Row(station_id=26, cluster=0),
 Row(station_id=27, cluster=2),
 Row(station_id=28, cluster=2),
 Row(station_id=29, cluster=2),
 Row(station_id=30, cluster=0),
 Row(station_id=31, cluster=2),
 Row(station_id=32, cluster=2),
 Row(station_id=33, cluster=2),
 Row(station_id=34, cluster=2),
 Row(station_id=35, cluster=0),
 Row(station_id=36, cluster=0),
 Row(station_id=37, cluster=2),
 Row(station_id=

In [105]:
final_clusters.toPandas().to_csv(INPUT_DATA + 'final_clusters.csv', index = False)

In [106]:
!cat {INPUT_DATA}final_clusters.csv

cluster,stations
0,"[3, 7, 8, 11, 12, 23, 25, 26, 30, 35, 36, 38, 46, 63, 84]"
1,"[39, 48, 50, 54, 55, 57, 60, 61, 64, 65, 66, 68, 69, 70, 71, 72, 73, 74, 76, 77]"
2,"[2, 4, 5, 6, 9, 10, 13, 14, 16, 21, 22, 24, 27, 28, 29, 31, 32, 33, 34, 37, 41, 42, 45, 47, 49, 51, 56, 58, 59, 62, 67, 75, 80, 82, 83]"


In [128]:
station_cluster = dict([(x['station_id'], x['cluster']) for x in station_final_cluster.collect()])

In [134]:
get_cluster = udf(lambda x: station_cluster[x])

status_cluster = status.withColumn('cluster', get_cluster(final_df['station_id']))

status_cluster = status_cluster.drop('month', 'day_of_week', 'day_part', 'station_id')

status_cluster.show(3)

In [142]:
status_cluster.cache()
status_cluster.count()

71984434

In [143]:
status_cluster.show(3)

+------------------+-------+
|         bike_util|cluster|
+------------------+-------+
|0.9259259259259259|      2|
|0.9259259259259259|      2|
|0.9259259259259259|      2|
+------------------+-------+
only showing top 3 rows



In [147]:
status_cluster.groupBy('cluster').agg(F.variance('bike_util').alias('var_util_perc'),
                             F.mean('bike_util').alias('avg_util_perc')).show()

+-------+--------------------+------------------+
|cluster|       var_util_perc|     avg_util_perc|
+-------+--------------------+------------------+
|      0|0.027781789432157228|0.4870160515236175|
|      1| 0.05256571108005761|0.4930798261639537|
|      2|0.030443411074540416|0.5579522346854525|
+-------+--------------------+------------------+

