In [1]:
from pyspark.sql.types import *
from datetime import datetime

INPUT_DATA = '../data/sf-bay-area-bike-share/'

In [2]:
!head -5 {INPUT_DATA}status.csv

station_id,bikes_available,docks_available,time
2,2,25,2013/08/29 12:06:01
2,2,25,2013/08/29 12:07:01
2,2,25,2013/08/29 12:08:01
2,2,25,2013/08/29 12:09:01


In [3]:
!head -5 {INPUT_DATA}station.csv

id,name,lat,long,dock_count,city,installation_date
2,San Jose Diridon Caltrain Station,37.329732,-121.90178200000001,27,San Jose,8/6/2013
3,San Jose Civic Center,37.330698,-121.888979,15,San Jose,8/5/2013
4,Santa Clara at Almaden,37.333988,-121.894902,11,San Jose,8/6/2013
5,Adobe on Almaden,37.331415,-121.8932,19,San Jose,8/5/2013


In [4]:
# Read the raw CSV lines. The raw text is deliberately NOT cached: it is only
# used to peel off the header and to feed the parse step below, and caching it
# under a name that is immediately rebound would leave a cached RDD that can
# never be unpersisted.
bike_status_lines = sc.textFile(INPUT_DATA + 'status.csv')
header = bike_status_lines.first()
# Drop every line equal to the header, split the rest on commas and cache the
# parsed rows (lists of strings) for the cells that follow.
bike_status = bike_status_lines.filter(lambda line: line != header).map(lambda line: line.split(',')).cache()

In [5]:
# Count the parsed rows (header excluded); as the first action on the cached
# RDD this also populates the cache.
bike_status.count()

71984434

In [6]:
# Grab the last field (the `time` column) of the first parsed row ...
test_ts = bike_status.take(1)[-1][-1]
# ... and check that the format string used by the parsing helpers accepts it.
dt = datetime.strptime(test_ts, "%Y/%m/%d %H:%M:%S")
dt

datetime.datetime(2013, 8, 29, 12, 6, 1)

In [7]:
def toIntSafe(num):
    """Convert ``num`` to ``int`` without raising.

    Args:
        num: value to convert, normally a string field from a CSV row.

    Returns:
        The integer value, or ``None`` when ``num`` cannot be converted:
        malformed text raises ``ValueError`` and a missing value such as
        ``None`` raises ``TypeError``; both are mapped to ``None``.
    """
    try:
        return int(num)
    except (TypeError, ValueError):
        return None
    
def toFloatSafe(num):
    """Convert ``num`` to ``float`` without raising.

    Args:
        num: value to convert, normally a string field from a CSV row.

    Returns:
        The float value, or ``None`` when ``num`` cannot be converted:
        malformed text raises ``ValueError`` and a missing value such as
        ``None`` raises ``TypeError``; both are mapped to ``None``.
    """
    try:
        return float(num)
    except (TypeError, ValueError):
        return None
    
def toTimeStampSafe(data, fmt="%Y/%m/%d %H:%M:%S"):
    """Parse a timestamp string without raising.

    Args:
        data: timestamp text, e.g. ``"2013/08/29 12:06:01"``.
        fmt: ``strptime`` format to parse with. Defaults to the format this
            function was originally hard-coded to, so existing callers are
            unaffected.

    Returns:
        A ``datetime``, or ``None`` when ``data`` does not match ``fmt``
        (``ValueError``) or is not a string at all (``TypeError``).

    NOTE(review): any timestamp written in a different layout is mapped to
    ``None`` here and the row is then discarded downstream. Confirm that
    status.csv uses a single timestamp format throughout.
    """
    try:
        return datetime.strptime(data, fmt)
    except (TypeError, ValueError):
        return None
    
def convertData(data):
    """Turn one split CSV row into typed values.

    Args:
        data: sequence of at least four string fields, in the order
            station_id, bikes_available, docks_available, time.

    Returns:
        A 4-tuple ``(station_id, bikes_available, docks_available, time)``
        where the first three went through ``toIntSafe`` and the last through
        ``toTimeStampSafe``; any field that fails to convert is ``None``.
    """
    station_id = toIntSafe(data[0])
    bikes_available = toIntSafe(data[1])
    docks_available = toIntSafe(data[2])
    observed_at = toTimeStampSafe(data[3])
    return (station_id, bikes_available, docks_available, observed_at)

def preprocess_data(data):
    """Derive the feature tuple for one typed status row.

    Args:
        data: ``(station_id, bikes_available, docks_available, timestamp)`` as
            produced by ``convertData``; any element may be ``None`` if its
            conversion failed.

    Returns:
        ``(station_id, utilisation, year, month, day, hour, minute, weekday)``
        where ``utilisation`` is ``docks_available / (bikes_available +
        docks_available)`` (a 0-1 fraction, despite the "percentage" column
        name used later) and ``weekday`` is the ISO weekday (1 = Monday).
        Returns ``None`` for rows that cannot be processed: a missing station
        id, a missing count, a missing timestamp, or a station reporting zero
        bikes and zero docks.
    """
    try:
        # station_id feeds a non-nullable schema column, so reject it here
        # rather than let DataFrame creation fail later.
        if data[0] is None:
            return None
        return (data[0], (data[2]*1.0)/(data[1] + data[2]), data[3].year,
            data[3].month, data[3].day, data[3].hour, data[3].minute, data[3].isoweekday())
    # TypeError: a count is None (arithmetic on None); AttributeError: the
    # timestamp is None; ZeroDivisionError: bikes + docks == 0.
    except (AttributeError, TypeError, ValueError, ZeroDivisionError):
        return None

# Type each row, derive its features, discard rows that could not be processed
# (preprocess_data returns None for those) and cache the result.
bike_status_processed = (
    bike_status
    .map(convertData)
    .map(preprocess_data)
    .filter(lambda row: row is not None)
    .cache()
)

In [8]:
# Number of rows that survived parsing; this action also fills the cache.
# NOTE(review): compare with the raw row count shown earlier in the notebook.
# Rows whose timestamp or counts fail to parse are dropped silently, so a large
# gap may indicate a second timestamp format in status.csv -- confirm.
bike_status_processed.count()

16994602

In [9]:
# The string rows are no longer needed now that bike_status_processed has been
# cached and materialised by the count above; release their cache.
bike_status.unpersist()

PythonRDD[3] at RDD at PythonRDD.scala:48

In [10]:
# Peek at a few processed tuples:
# (station_id, utilisation, year, month, day, hour, minute, iso_weekday)
bike_status_processed.take(3)

[(2, 0.9259259259259259, 2013, 8, 29, 12, 6, 4),
 (2, 0.9259259259259259, 2013, 8, 29, 12, 7, 4),
 (2, 0.9259259259259259, 2013, 8, 29, 12, 8, 4)]

In [11]:
# Schema for the processed tuples: every column is non-nullable; all are
# integers except the utilisation ratio, which is stored as a 32-bit float.
bike_status_schema = StructType([
    StructField(column_name, column_type, False)
    for column_name, column_type in [
        ('station_id', IntegerType()),
        ('bikes_utilised_percentage', FloatType()),
        ('year', IntegerType()),
        ('month', IntegerType()),
        ('day', IntegerType()),
        ('hour', IntegerType()),
        ('minute', IntegerType()),
        ('day_of_week', IntegerType()),
    ]
])

In [12]:
# Wrap the processed RDD in a DataFrame using the explicit schema and mark it
# for caching; show() only evaluates enough rows to print the preview.
bike_status_df = sqlContext.createDataFrame(bike_status_processed, bike_status_schema).cache()
bike_status_df.show(5)

+----------+-------------------------+----+-----+---+----+------+-----------+
|station_id|bikes_utilised_percentage|year|month|day|hour|minute|day_of_week|
+----------+-------------------------+----+-----+---+----+------+-----------+
|         2|                0.9259259|2013|    8| 29|  12|     6|          4|
|         2|                0.9259259|2013|    8| 29|  12|     7|          4|
|         2|                0.9259259|2013|    8| 29|  12|     8|          4|
|         2|                0.9259259|2013|    8| 29|  12|     9|          4|
|         2|                0.9259259|2013|    8| 29|  12|    10|          4|
+----------+-------------------------+----+-----+---+----+------+-----------+
only showing top 5 rows



In [13]:
# Full pass over the DataFrame: confirms no rows were lost in the conversion
# and materialises the DataFrame cache.
bike_status_df.count()

16994602

In [14]:
# Confirm the column types and nullability match bike_status_schema.
bike_status_df.printSchema()

root
 |-- station_id: integer (nullable = false)
 |-- bikes_utilised_percentage: float (nullable = false)
 |-- year: integer (nullable = false)
 |-- month: integer (nullable = false)
 |-- day: integer (nullable = false)
 |-- hour: integer (nullable = false)
 |-- minute: integer (nullable = false)
 |-- day_of_week: integer (nullable = false)



In [15]:
# The DataFrame is cached and materialised, so the RDD it was built from can
# be evicted.
bike_status_processed.unpersist()

PythonRDD[6] at RDD at PythonRDD.scala:48

In [16]:
# Sanity check: day_of_week comes from datetime.isoweekday(), so the distinct
# values should be a subset of 1 (Monday) .. 7 (Sunday).
bike_status_df.select('day_of_week').distinct().collect()

[Row(day_of_week=1),
 Row(day_of_week=6),
 Row(day_of_week=3),
 Row(day_of_week=5),
 Row(day_of_week=4),
 Row(day_of_week=7),
 Row(day_of_week=2)]

In [17]:
# Sanity check: list the distinct hour values present (expected within 0..23).
bike_status_df.select('hour').distinct().collect()

[Row(hour=12),
 Row(hour=22),
 Row(hour=1),
 Row(hour=13),
 Row(hour=16),
 Row(hour=6),
 Row(hour=3),
 Row(hour=20),
 Row(hour=5),
 Row(hour=19),
 Row(hour=15),
 Row(hour=17),
 Row(hour=9),
 Row(hour=4),
 Row(hour=8),
 Row(hour=23),
 Row(hour=7),
 Row(hour=10),
 Row(hour=21),
 Row(hour=11),
 Row(hour=14),
 Row(hour=2),
 Row(hour=0),
 Row(hour=18)]

In [50]:
from pyspark.sql import functions as F

# Bucket the hour of day into four parts:
#   night     20:00-05:59
#   morning   06:00-11:59
#   afternoon 12:00-15:59
#   evening   everything else (16:00-19:59)
hour_col = F.col('hour')
day_part_col = (
    F.when((hour_col >= 20) | (hour_col < 6), 'night')
    .when((hour_col >= 6) & (hour_col < 12), 'morning')
    .when((hour_col >= 12) & (hour_col < 16), 'afternoon')
    .otherwise('evening')
)

# ISO weekdays 1-5 are Monday-Friday; flag them with 1, weekends with 0.
is_weekday_col = F.when(F.col('day_of_week') <= 5, 1).otherwise(0)

bike_status_period = (
    bike_status_df
    .withColumn('day_part', day_part_col)
    .withColumn('isWeekday', is_weekday_col)
    .cache()
)
bike_status_period.show(5)

+----------+-------------------------+----+-----+---+----+------+-----------+---------+---------+
|station_id|bikes_utilised_percentage|year|month|day|hour|minute|day_of_week| day_part|isWeekday|
+----------+-------------------------+----+-----+---+----+------+-----------+---------+---------+
|         2|                0.9259259|2013|    8| 29|  12|     6|          4|afternoon|        1|
|         2|                0.9259259|2013|    8| 29|  12|     7|          4|afternoon|        1|
|         2|                0.9259259|2013|    8| 29|  12|     8|          4|afternoon|        1|
|         2|                0.9259259|2013|    8| 29|  12|     9|          4|afternoon|        1|
|         2|                0.9259259|2013|    8| 29|  12|    10|          4|afternoon|        1|
+----------+-------------------------+----+-----+---+----+------+-----------+---------+---------+
only showing top 5 rows



In [51]:
# Sanity check: all four day_part labels should appear.
bike_status_period.select('day_part').distinct().collect()

[Row(day_part=u'afternoon'),
 Row(day_part=u'night'),
 Row(day_part=u'morning'),
 Row(day_part=u'evening')]

In [52]:
# Adding columns must not change the row count; this also fills the cache.
bike_status_period.count()

16994602

In [53]:
# Keep a handle on the wider, already-cached frame so its cache can be released
# once the trimmed frame has been materialised; rebinding the name without this
# would leave the old cached data unreachable for unpersist().
bike_status_period_wide = bike_status_period
bike_status_period = bike_status_period_wide.drop('year', 'hour', 'minute', 'day', 'isWeekday').cache()
# count() both materialises the new cache and serves as the row-count check.
trimmed_row_count = bike_status_period.count()
bike_status_period_wide.unpersist()
trimmed_row_count

16994602

In [57]:
# Average utilisation per (station, month, part of day, weekday). The
# aggregate column is auto-named 'avg(bikes_utilised_percentage)'.
grouping_keys = ['station_id', 'month', 'day_part', 'day_of_week']
bike_status_grouped = bike_status_period.groupBy(*grouping_keys)
bike_status_final = bike_status_grouped.avg('bikes_utilised_percentage')
bike_status_final.show(5)

+----------+-----+--------+-----------+------------------------------+
|station_id|month|day_part|day_of_week|avg(bikes_utilised_percentage)|
+----------+-----+--------+-----------+------------------------------+
|         5|   10| evening|          4|            0.5196670549274289|
|         6|   12| morning|          6|            0.6242195955995057|
|         7|    9| morning|          1|            0.5324195619312885|
|         8|   12| morning|          2|           0.44103705240620505|
|        10|    9| morning|          6|            0.5929139699094639|
+----------+-----+--------+-----------+------------------------------+
only showing top 5 rows



In [65]:
# Give the aggregate column a friendlier name, cache the (small) aggregated
# frame and materialise it with a count.
bike_status_final = (
    bike_status_final
    .withColumnRenamed('avg(bikes_utilised_percentage)', 'avg_util_perc')
    .cache()
)
bike_status_final.count()

11720

In [66]:
# daily_avg = bike_status_period.select('station_id', 'day_part', 'isWeekday', 'bikes_utilised_percentage').\
#                                    groupBy('station_id', 'day_part', 'isWeekday').\
#     mean('bikes_utilised_percentage').cache()

# daily_avg.show(5)
# daily_avg = daily_avg.withColumnRenamed('avg(bikes_utilised_percentage)', 'avg_bike_util')

In [67]:
# station = sc.textFile(INPUT_DATA + 'station.csv')
# header = station.first()
# station = station.filter(lambda x: x != header).map(lambda x: x.split(',')).map(lambda x:
#                                                                                 (int(x[0]), float(x[2]),
#                                                                                  float(x[3]), x[5]))

# station_schema = StructType([StructField('station_id', IntegerType(), False),
#                             StructField('latitude', DoubleType(), False),
#                             StructField('longitude', DoubleType(), False),
#                             StructField('city', StringType(), False),])

# station_df = sqlContext.createDataFrame(station, station_schema)

# df_daily_avg = daily_avg.join(station_df, on='station_id')
# df_daily_avg.show(5)

# df_daily_avg.toPandas().to_csv(INPUT_DATA + 'daily_avg_with_city.csv', index = False)

# !head {INPUT_DATA}daily_avg_with_city.csv

In [68]:
from pyspark.ml.feature import StringIndexer, VectorAssembler
from pyspark.ml.clustering import KMeans

def implement_string_indexer(cols, df):
    """Replace each listed string column with its StringIndexer index.

    For every name in ``cols`` a ``StringIndexer`` is fitted on ``df``, the
    original column is dropped and the indexed column takes over its name, so
    the indexed column ends up as the last column of the frame.

    Args:
        cols: iterable of column names to index.
        df: DataFrame containing those columns.

    Returns:
        The DataFrame with every column in ``cols`` replaced by its numeric
        index. If ``cols`` is empty, ``df`` is returned unchanged.
    """
    for c in cols:
        si = StringIndexer(inputCol=c, outputCol=c+'_si')
        sm = si.fit(df)
        df = sm.transform(df).drop(c)
        df = df.withColumnRenamed(c + '_si', c)
    # Return only after ALL columns are processed; returning inside the loop
    # would stop after the first column and yield None for an empty list.
    return df

# day_part is the only string-valued column left; replace it with a numeric
# index so it can go into the feature vector.
cols = ['day_part']

final_df = implement_string_indexer(cols, bike_status_final)

In [69]:
# Cache the indexed frame, materialise it with a count, then preview it.
# Only the last expression (show) produces cell output.
final_df.cache()
final_df.count()
final_df.show(5)

+----------+-----+-----------+-------------------+--------+
|station_id|month|day_of_week|      avg_util_perc|day_part|
+----------+-----+-----------+-------------------+--------+
|         5|   10|          4| 0.5196670549274289|     2.0|
|         6|   12|          6| 0.6242195955995057|     3.0|
|         7|    9|          1| 0.5324195619312885|     3.0|
|         8|   12|          2|0.44103705240620505|     3.0|
|        10|    9|          6| 0.5929139699094639|     3.0|
+----------+-----+-----------+-------------------+--------+
only showing top 5 rows



In [71]:
# Columns packed, in this order, into the k-means feature vector.
# NOTE(review): station_id is an identifier and the features are on very
# different scales. The cluster centres printed at the end of the notebook
# differ almost only in the station_id coordinate, which suggests it dominates
# the distance. Consider dropping or encoding it and scaling the features.
input_cols = ['station_id', 'avg_util_perc', 'month', 
              'day_of_week', 'day_part']

# Assemble the columns into a single 'features' vector column and keep only it.
va = VectorAssembler(inputCols= input_cols, outputCol= 'features')
df_transformed = va.transform(final_df).select('features')
df_transformed.show(5)

+--------------------+
|            features|
+--------------------+
|[5.0,0.5196670549...|
|[6.0,0.6242195955...|
|[7.0,0.5324195619...|
|[8.0,0.4410370524...|
|[10.0,0.592913969...|
+--------------------+
only showing top 5 rows



In [72]:
# k-means is fitted on this frame several times below, so cache it and
# materialise the cache with a count.
df_transformed.cache()
df_transformed.count()

11720

In [84]:
# Candidate cluster counts to evaluate.
K_CANDIDATES = range(2, 6)

# Start from +inf rather than an arbitrary large number, so the first candidate
# always becomes the current best and best_n_clusters is always defined.
min_wssse = float('inf')
best_n_clusters = None
# WSSSE for every k tried, kept so the whole curve can be inspected afterwards.
wssse_by_k = {}

for clusters in K_CANDIDATES:

    # Trains a k-means model (fixed seed for reproducibility).
    kmeans = KMeans().setK(clusters).setSeed(1)
    model = kmeans.fit(df_transformed)

    # Evaluate clustering by computing Within Set Sum of Squared Errors.
    wssse = model.computeCost(df_transformed)
    wssse_by_k[clusters] = wssse
    if wssse <= min_wssse:
        min_wssse = wssse
        best_n_clusters = clusters

# NOTE(review): WSSSE generally keeps shrinking as k grows, so picking the
# minimum tends to select the largest candidate (5 in the recorded run).
# Inspect wssse_by_k for an elbow, or use another criterion, before treating
# best_n_clusters as meaningful.

In [85]:
# Report the outcome of the search over k.
print("Best Within Set Sum of Squared Errors = " + str(min_wssse))
print("Best number of clusters = " + str(best_n_clusters))

# Refit with the selected k: `model` left over from the search loop belongs to
# the last k tried, not necessarily the best one. Same seed as in the search.
kmeans = KMeans().setK(best_n_clusters).setSeed(1)
model = kmeans.fit(df_transformed)

# Shows the result.
# Each centre is printed in feature order:
# station_id, avg_util_perc, month, day_of_week, day_part.
centers = model.clusterCenters()
print("Cluster Centers: ")
for center in centers:
    print(center)

Best Within Set Sum of Squared Errors = 485012.12266
Best number of clusters = 5
Cluster Centers: 
[ 28.22810358   0.48059937   7.28712871   4.05788271   1.49200305]
[ 72.63795551   0.52525977   7.14623758   4.05868434   1.4921912 ]
[ 8.57142857  0.49021524  7.53072626  4.06703911  1.49162011]
[ 60.           0.5406415    7.53072626   4.06703911   1.49162011]
[ 44.41666667   0.516114     7.53072626   4.06703911   1.49162011]
