In [1]:
from pyspark.sql.types import *
from datetime import datetime

INPUT_DATA = '../data/sf-bay-area-bike-share/'

In [2]:
!head {INPUT_DATA}status.csv

station_id,bikes_available,docks_available,time
2,2,25,2013/08/29 12:06:01
2,2,25,2013/08/29 12:07:01
2,2,25,2013/08/29 12:08:01
2,2,25,2013/08/29 12:09:01
2,2,25,2013/08/29 12:10:01
2,2,25,2013/08/29 12:11:01
2,2,25,2013/08/29 12:12:01
2,2,25,2013/08/29 12:13:01
2,2,25,2013/08/29 12:15:01


In [3]:
!head {INPUT_DATA}station.csv

id,name,lat,long,dock_count,city,installation_date
2,San Jose Diridon Caltrain Station,37.329732,-121.90178200000001,27,San Jose,8/6/2013
3,San Jose Civic Center,37.330698,-121.888979,15,San Jose,8/5/2013
4,Santa Clara at Almaden,37.333988,-121.894902,11,San Jose,8/6/2013
5,Adobe on Almaden,37.331415,-121.8932,19,San Jose,8/5/2013
6,San Pedro Square,37.336721000000004,-121.894074,15,San Jose,8/7/2013
7,Paseo de San Antonio,37.333798,-121.88694299999999,15,San Jose,8/7/2013
8,San Salvador at 1st,37.330165,-121.88583100000001,15,San Jose,8/5/2013
9,Japantown,37.348742,-121.89471499999999,15,San Jose,8/5/2013
10,San Jose City Hall,37.337391,-121.886995,15,San Jose,8/6/2013


In [4]:
bike_status = sc.textFile(INPUT_DATA + 'status.csv').cache()
header = bike_status.first()
bike_status = bike_status.filter(lambda x: x != header).map(lambda x: x.split(',')).cache()

[[u'2', u'2', u'25', u'2013/08/29 12:06:01'],
 [u'2', u'2', u'25', u'2013/08/29 12:07:01'],
 [u'2', u'2', u'25', u'2013/08/29 12:08:01']]

In [5]:
bike_status.count()

71984434

In [13]:
test_ts = bike_status.take(1)[-1][-1]
dt = datetime.strptime(test_ts, "%Y/%m/%d %H:%M:%S")
dt

datetime.datetime(2013, 8, 29, 12, 6, 1)

In [16]:
def toIntSafe(num):
    try:
        return int(num)
    except ValueError:
        return None
    
def toFloatSafe(num):
    try:
        return float(num)
    except ValueError:
        return None    
    
def toTimeStampSafe(data):
    try:
        return datetime.strptime(data, "%Y/%m/%d %H:%M:%S") 
    except ValueError:
        return None
    
def convertData(data):
    return (toIntSafe(data[0]),
            toIntSafe(data[1]),
            toIntSafe(data[2]),
            toTimeStampSafe(data[3]))

def preprocess_data(data):
    try:
        return (data[0], (data[2]*1.0)/(data[1] + data[2]), data[3].year,
            data[3].month, data[3].day, data[3].hour, data[3].minute, data[3].isoweekday())
    except (AttributeError, ValueError, ZeroDivisionError):
        return None

bike_status_processed = bike_status.map(lambda x:
                                        convertData(x)).map(lambda x:preprocess_data(x)).filter(lambda x: x != None).cache()

In [17]:
bike_status_processed.count()

16994602

In [18]:
bike_status.unpersist()

PythonRDD[3] at RDD at PythonRDD.scala:48

In [19]:
bike_status_processed.take(3)

[(2, 0.9259259259259259, 2013, 8, 29, 12, 6, 4),
 (2, 0.9259259259259259, 2013, 8, 29, 12, 7, 4),
 (2, 0.9259259259259259, 2013, 8, 29, 12, 8, 4)]

In [20]:
bike_status_schema = StructType([StructField('station_id', IntegerType(),False),
                                StructField('bikes_utilised_percentage', FloatType(),False),
                                StructField('year', IntegerType(), False),
                                StructField('month', IntegerType(), False),
                                StructField('day', IntegerType(), False),
                                StructField('hour', IntegerType(), False),
                                StructField('minute', IntegerType(), False),
                                StructField('day_of_week', IntegerType(), False)])

In [21]:
bike_status_df = sqlContext.createDataFrame(bike_status_processed, bike_status_schema).cache()
bike_status_df.show(5)

+----------+-------------------------+----+-----+---+----+------+-----------+
|station_id|bikes_utilised_percentage|year|month|day|hour|minute|day_of_week|
+----------+-------------------------+----+-----+---+----+------+-----------+
|         2|                0.9259259|2013|    8| 29|  12|     6|          4|
|         2|                0.9259259|2013|    8| 29|  12|     7|          4|
|         2|                0.9259259|2013|    8| 29|  12|     8|          4|
|         2|                0.9259259|2013|    8| 29|  12|     9|          4|
|         2|                0.9259259|2013|    8| 29|  12|    10|          4|
+----------+-------------------------+----+-----+---+----+------+-----------+
only showing top 5 rows



In [22]:
bike_status_df.count()

16994602

In [23]:
bike_status_df.printSchema()

root
 |-- station_id: integer (nullable = false)
 |-- bikes_utilised_percentage: float (nullable = false)
 |-- year: integer (nullable = false)
 |-- month: integer (nullable = false)
 |-- day: integer (nullable = false)
 |-- hour: integer (nullable = false)
 |-- minute: integer (nullable = false)
 |-- day_of_week: integer (nullable = false)



In [24]:
bike_status_processed.unpersist()

PythonRDD[15] at RDD at PythonRDD.scala:48

In [34]:
from pyspark.sql import functions as F

bike_status_period = bike_status_df.withColumn('day_part',
                                               F.when((bike_status_df.hour > 20) | (bike_status_df.hour < 6), 'night').\
                                               when((bike_status_df.hour >= 6) | (bike_status_df.hour < 12), 'morning').\
                                               when((bike_status_df.hour >= 12) | (bike_status_df.hour < 16), 'afternoon').\
                                               otherwise('evening'))

bike_status_period = bike_status_period.withColumn('isWeekday', F.when(daily_avg.day_of_week < 5, 1).otherwise(0)).cache()
bike_status_period.show(5)

+----------+-------------------------+----+-----+---+----+------+-----------+--------+---------+
|station_id|bikes_utilised_percentage|year|month|day|hour|minute|day_of_week|day_part|isWeekday|
+----------+-------------------------+----+-----+---+----+------+-----------+--------+---------+
|         2|                0.9259259|2013|    8| 29|  12|     6|          4| morning|        1|
|         2|                0.9259259|2013|    8| 29|  12|     7|          4| morning|        1|
|         2|                0.9259259|2013|    8| 29|  12|     8|          4| morning|        1|
|         2|                0.9259259|2013|    8| 29|  12|     9|          4| morning|        1|
|         2|                0.9259259|2013|    8| 29|  12|    10|          4| morning|        1|
+----------+-------------------------+----+-----+---+----+------+-----------+--------+---------+
only showing top 5 rows



In [35]:
bike_status_period.count()

16994602

In [36]:
daily_avg = bike_status_period.select('station_id', 'day_part', 'isWeekday', 'bikes_utilised_percentage').\
                                   groupBy('station_id', 'day_part', 'isWeekday').\
    mean('bikes_utilised_percentage').cache()

In [37]:
daily_avg.show(5)

+----------+--------+---------+------------------------------+
|station_id|day_part|isWeekday|avg(bikes_utilised_percentage)|
+----------+--------+---------+------------------------------+
|         2| morning|        1|            0.5386181238037825|
|        65| morning|        1|           0.47457618634477855|
|        27| morning|        1|           0.46505270561616696|
|        12|   night|        1|            0.5201876582364764|
|        31| morning|        0|           0.49243916947135946|
+----------+--------+---------+------------------------------+
only showing top 5 rows



In [54]:
daily_avg = daily_avg.withColumnRenamed('avg(bikes_utilised_percentage)', 'avg_bike_util')

In [55]:
!head {INPUT_DATA}/station.csv

id,name,lat,long,dock_count,city,installation_date
2,San Jose Diridon Caltrain Station,37.329732,-121.90178200000001,27,San Jose,8/6/2013
3,San Jose Civic Center,37.330698,-121.888979,15,San Jose,8/5/2013
4,Santa Clara at Almaden,37.333988,-121.894902,11,San Jose,8/6/2013
5,Adobe on Almaden,37.331415,-121.8932,19,San Jose,8/5/2013
6,San Pedro Square,37.336721000000004,-121.894074,15,San Jose,8/7/2013
7,Paseo de San Antonio,37.333798,-121.88694299999999,15,San Jose,8/7/2013
8,San Salvador at 1st,37.330165,-121.88583100000001,15,San Jose,8/5/2013
9,Japantown,37.348742,-121.89471499999999,15,San Jose,8/5/2013
10,San Jose City Hall,37.337391,-121.886995,15,San Jose,8/6/2013


In [56]:
station = sc.textFile(INPUT_DATA + 'station.csv')
header = station.first()
station = station.filter(lambda x: x != header).map(lambda x: x.split(',')).map(lambda x:
                                                                                (int(x[0]), float(x[2]), float(x[3])))

station_schema = StructType([StructField('station_id', IntegerType(), False),
                            StructField('latitude', DoubleType(), False),
                            StructField('longitude', DoubleType(), False)])

station_df = sqlContext.createDataFrame(station, station_schema)

In [57]:
df_daily_avg = daily_avg.join(station_df, on='station_id')
df_daily_avg.show(5)

+----------+--------+---------+-------------------+---------+-------------------+
|station_id|day_part|isWeekday|      avg_bike_util| latitude|          longitude|
+----------+--------+---------+-------------------+---------+-------------------+
|         2| morning|        0| 0.5062563260739318|37.329732|-121.90178200000001|
|         2|   night|        1|0.49013946846399314|37.329732|-121.90178200000001|
|         2|   night|        0|0.49348935233878916|37.329732|-121.90178200000001|
|         2| morning|        1| 0.5386181238037825|37.329732|-121.90178200000001|
|         3|   night|        1|0.46943447933314636|37.330698|        -121.888979|
+----------+--------+---------+-------------------+---------+-------------------+
only showing top 5 rows



In [58]:
df_daily_avg.toPandas().to_csv(INPUT_DATA + 'daily_avg.csv', index = False)

In [59]:
!head {INPUT_DATA}daily_avg.csv

station_id,day_part,isWeekday,avg_bike_util,latitude,longitude
2,morning,0,0.506256326074,37.329732,-121.901782
2,night,1,0.490139468464,37.329732,-121.901782
2,night,0,0.493489352339,37.329732,-121.901782
2,morning,1,0.538618123804,37.329732,-121.901782
3,night,1,0.469434479333,37.330698,-121.888979
3,morning,0,0.462371542447,37.330698,-121.888979
3,night,0,0.469105954345,37.330698,-121.888979
3,morning,1,0.472111697251,37.330698,-121.888979
4,morning,1,0.511852826496,37.333988,-121.894902
