In [2]:
# Display the SparkContext. `sc` is not created in any visible cell —
# presumably it is injected by the PySpark kernel/shell; confirm.
sc

In [3]:
# SQL entry point used by the createDataFrame calls further down.
# NOTE(review): SQLContext is only imported in the following cell — presumably
# the PySpark shell pre-imports it; confirm this cell runs on a fresh kernel.
sqlContext = SQLContext(sc)

In [4]:
from pyspark.sql import Row
from pyspark.sql.types import *
from pyspark.sql import SQLContext
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.classification import DecisionTreeClassifier
from pyspark.ml.evaluation import MulticlassClassificationEvaluator
from pyspark.sql.functions import *
from datetime import datetime
from pyspark.ml.feature import StringIndexer

In [5]:
from mpl_toolkits.basemap import Basemap

In [6]:
def _read_csv_rdd(path, header_first_field):
    """Read a CSV file as an RDD of comma-split string lists.

    Returns a (header, rows) pair: `header` is the first record of the file and
    `rows` is the RDD with every record whose first field equals
    `header_first_field` filtered out. Lines are split on every comma, so
    quoted fields containing commas are not handled.
    """
    records = sc.textFile(path).map(lambda line: line.split(','))
    header = records.first()
    rows = records.filter(lambda fields: fields[0] != header_first_field)  # remove header
    return header, rows


trip_cols, trips = _read_csv_rdd('sf-bay-area-bike-share/trip.csv', 'id')

status_cols, status = _read_csv_rdd('sf-bay-area-bike-share/status.csv', u'station_id')

station_cols, station = _read_csv_rdd('sf-bay-area-bike-share/station.csv', 'id')

In [7]:
# Show the first data record of each RDD to confirm the header rows are gone.
print(trips.first())
print(status.first())
print(station.first())

[u'4576', u'63', u'8/29/2013 14:13', u'South Van Ness at Market', u'66', u'8/29/2013 14:14', u'South Van Ness at Market', u'66', u'520', u'Subscriber', u'94127']
[u'2', u'2', u'25', u'2013/08/29 12:06:01']
[u'2', u'San Jose Diridon Caltrain Station', u'37.329732', u'-121.90178200000001', u'27', u'San Jose', u'8/6/2013']


In [64]:
def toIntSafe(inval):
    """Parse `inval` as an int, returning None when it cannot be parsed.

    Args:
        inval: value to convert, normally a string field from a CSV record.

    Returns:
        The integer value, or None for malformed text (e.g. "" or "abc") and
        for non-parsable types such as None.
    """
    try:
        return int(inval)
    except (TypeError, ValueError):
        # TypeError covers None / non-numeric objects; ValueError covers bad text.
        return None
    
def toFloatSafe(inval):
    """Parse `inval` as a float, returning None when it cannot be parsed.

    Args:
        inval: value to convert, normally a string field from a CSV record.

    Returns:
        The float value, or None for malformed text and for non-parsable
        types such as None.
    """
    try:
        return float(inval)
    except (TypeError, ValueError):
        # TypeError covers None / non-numeric objects; ValueError covers bad text.
        return None

def trip_toTimeStampSafe(data):
    """Parse a trip timestamp of the form "8/29/2013 14:13".

    Args:
        data: timestamp text in "%m/%d/%Y %H:%M" format.

    Returns:
        A datetime, or None when `data` does not match the format or is not a
        string (e.g. None).
    """
    try:
        return datetime.strptime(data, "%m/%d/%Y %H:%M")
    except (TypeError, ValueError):
        # strptime raises TypeError for non-string input, ValueError for bad text.
        return None
    
def status_toTimeStampSafe(data):
    """Parse a status timestamp of the form "2013/08/29 12:06:01".

    Args:
        data: timestamp text in "%Y/%m/%d %H:%M:%S" format.

    Returns:
        A datetime, or None when `data` does not match the format or is not a
        string (e.g. None).
    """
    try:
        return datetime.strptime(data, "%Y/%m/%d %H:%M:%S")
    except (TypeError, ValueError):
        # strptime raises TypeError for non-string input, ValueError for bad text.
        return None

def station_toTimeStampSafe(data):
    """Parse a station installation date of the form "8/6/2013".

    Args:
        data: date text in "%m/%d/%Y" format.

    Returns:
        A datetime at midnight of that day, or None when `data` does not match
        the format or is not a string (e.g. None).
    """
    try:
        return datetime.strptime(data, "%m/%d/%Y")
    except (TypeError, ValueError):
        # strptime raises TypeError for non-string input, ValueError for bad text.
        return None

def trip_toRow(x):
    """Build a positional Row from one comma-split trip record.

    `x` is indexed from 0 to 10; values are emitted in the column order of
    tripSchema. Integer and timestamp fields go through the *Safe parsers, so
    unparsable values become None; the remaining fields stay as strings.
    """
    to_int = toIntSafe
    to_ts = trip_toTimeStampSafe
    return Row(
        to_int(x[0]),   # id
        to_int(x[1]),   # duration
        to_ts(x[2]),    # start_date
        x[3],           # start_station_name
        to_int(x[4]),   # start_station_id
        to_ts(x[5]),    # end_date
        x[6],           # end_station_name
        to_int(x[7]),   # end_station_id
        to_int(x[8]),   # bike_id
        x[9],           # subscription_type
        x[10],          # zip_code
    )

def status_toRow(x):
    """Build a positional Row from one comma-split status record.

    Values are emitted in the column order of statusSchema: station_id,
    bikes_available, docks_available, time, bikes_utilised_percentage.

    The last value is docks_available / (bikes_available + docks_available),
    i.e. a fraction in [0, 1] rather than a 0-100 percentage. It is None when
    either count cannot be parsed or when both counts are zero, so a single
    bad record no longer aborts the Spark job with ValueError or
    ZeroDivisionError.
    """
    bikes = toFloatSafe(x[1])
    docks = toFloatSafe(x[2])
    if bikes is None or docks is None or (bikes + docks) == 0:
        utilised = None
    else:
        utilised = docks / (bikes + docks)
    return Row(toIntSafe(x[0]), toIntSafe(x[1]), toIntSafe(x[2]),
               status_toTimeStampSafe(x[3]), utilised)
                         
def station_toRow(x):
    """Build a positional Row from one comma-split station record.

    `x` is indexed from 0 to 6; values are emitted in the column order of
    stationSchema. Numeric and date fields go through the *Safe parsers, so
    unparsable values become None.
    """
    return Row(
        toIntSafe(x[0]),                # id
        x[1],                           # name
        toFloatSafe(x[2]),              # lat
        toFloatSafe(x[3]),              # long
        toIntSafe(x[4]),                # dock_count
        x[5],                           # city
        station_toTimeStampSafe(x[6]),  # installation_date
    )

In [83]:
# Trip schema: field order must match the positional Row built by trip_toRow.
tripSchema = StructType([
    StructField("id", IntegerType(), nullable=True),
    StructField("duration", IntegerType(), nullable=True),
    StructField("start_date", TimestampType(), nullable=True),
    StructField("start_station_name", StringType(), nullable=True),
    StructField("start_station_id", IntegerType(), nullable=True),
    StructField("end_date", TimestampType(), nullable=True),
    StructField("end_station_name", StringType(), nullable=True),
    StructField("end_station_id", IntegerType(), nullable=True),
    StructField("bike_id", IntegerType(), nullable=True),
    StructField("subscription_type", StringType(), nullable=True),
    StructField("zip_code", StringType(), nullable=True),
])
trip_rowRDD = trips.map(trip_toRow)
trip_df = sqlContext.createDataFrame(trip_rowRDD, tripSchema)


# Status schema: field order must match the positional Row built by status_toRow.
statusSchema = StructType([
    StructField("station_id", IntegerType(), nullable=True),
    StructField("bikes_available", IntegerType(), nullable=True),
    StructField("docks_available", IntegerType(), nullable=True),
    StructField("time", TimestampType(), nullable=True),
    StructField("bikes_utilised_percentage", FloatType(), nullable=True),
])
status_rowRDD = status.map(status_toRow)
status_df = sqlContext.createDataFrame(status_rowRDD, statusSchema)

# Station schema: field order must match the positional Row built by station_toRow.
# lat/long are declared as FloatType, so the displayed coordinates carry fewer
# digits than the raw CSV text.
stationSchema = StructType([
    StructField("id", IntegerType(), nullable=True),
    StructField("name", StringType(), nullable=True),
    StructField("lat", FloatType(), nullable=True),
    StructField("long", FloatType(), nullable=True),
    StructField("dock_count", IntegerType(), nullable=True),
    StructField("city", StringType(), nullable=True),
    StructField("installation_date", DateType(), nullable=True),
])

station_rowRDD = station.map(station_toRow)
station_df = sqlContext.createDataFrame(station_rowRDD, stationSchema)

In [84]:
# Preview the first 10 rows of the status DataFrame.
status_df.show(10)

+----------+---------------+---------------+-------------------+-------------------------+
|station_id|bikes_available|docks_available|               time|bikes_utilised_percentage|
+----------+---------------+---------------+-------------------+-------------------------+
|         2|              2|             25|2013-08-29 12:06:01|                0.9259259|
|         2|              2|             25|2013-08-29 12:07:01|                0.9259259|
|         2|              2|             25|2013-08-29 12:08:01|                0.9259259|
|         2|              2|             25|2013-08-29 12:09:01|                0.9259259|
|         2|              2|             25|2013-08-29 12:10:01|                0.9259259|
|         2|              2|             25|2013-08-29 12:11:01|                0.9259259|
|         2|              2|             25|2013-08-29 12:12:01|                0.9259259|
|         2|              2|             25|2013-08-29 12:13:01|                0.9259259|

Morning: [06:00-10:00)
Noon: [10:00-14:00)
Evening: [14:00-18:00)
Night: [18:00-22:00)
Hours outside 06:00-22:00 are not assigned to any period.

In [85]:
def time_of_day(x):
    """Map a timestamp to a period-of-day code.

    Args:
        x: object with an integer `hour` attribute (e.g. a datetime), or None.

    Returns:
        0 for [6, 10), 1 for [10, 14), 2 for [14, 18), 3 for [18, 22);
        None for any other hour, and None when `x` itself is None (the `time`
        column is nullable, so unparsable timestamps reach this function).
    """
    if x is None:
        return None
    hour = x.hour
    if 6 <= hour < 10:
        return 0
    if 10 <= hour < 14:
        return 1
    if 14 <= hour < 18:
        return 2
    if 18 <= hour < 22:
        return 3
    # Hours in [22, 24) and [0, 6) belong to no period.
    return None
    
# Wrap time_of_day as a Spark UDF. The return type is declared explicitly:
# without it udf() defaults to StringType and the period code is stored as a
# string instead of an integer.
timefunction = udf(time_of_day, IntegerType())
# Status rows plus a `time_of_day` period code; cached because later cells
# reuse this DataFrame.
bike_status_period = status_df.select('*', timefunction('time').alias('time_of_day')).cache()

In [86]:
# Preview the status rows together with the derived time_of_day column.
bike_status_period.show(5)

+----------+---------------+---------------+-------------------+-------------------------+-----------+
|station_id|bikes_available|docks_available|               time|bikes_utilised_percentage|time_of_day|
+----------+---------------+---------------+-------------------+-------------------------+-----------+
|         2|              2|             25|2013-08-29 12:06:01|                0.9259259|          1|
|         2|              2|             25|2013-08-29 12:07:01|                0.9259259|          1|
|         2|              2|             25|2013-08-29 12:08:01|                0.9259259|          1|
|         2|              2|             25|2013-08-29 12:09:01|                0.9259259|          1|
|         2|              2|             25|2013-08-29 12:10:01|                0.9259259|          1|
+----------+---------------+---------------+-------------------+-------------------------+-----------+
only showing top 5 rows



In [None]:
# Mean utilisation per (station, period of day), taken over every status row
# in the data set; cached for reuse in the cells below.
daily_avg = (
    bike_status_period
    .select('station_id', 'bikes_utilised_percentage', 'time_of_day')
    .groupBy('station_id', 'time_of_day')
    .mean('bikes_utilised_percentage')
    .cache()
)

In [None]:
# Preview the per-station, per-period averages.
daily_avg.show(5)

In [None]:
# Keep only the station key and its coordinates for joining onto the averages.
station_filtered = station_df.select('id', 'lat', 'long')

In [None]:
# Attach station coordinates to the averages. The key is called `station_id`
# in daily_avg and `id` in station_filtered, so the join needs an explicit
# condition; joining on the single name 'id' fails because daily_avg has no
# such column.
daily_avg.join(station_filtered,
               daily_avg['station_id'] == station_filtered['id'],
               'outer').show(10)

In [79]:
# Preview the first 5 rows of the station DataFrame.
station_df.show(5)

+---+--------------------+---------+-----------+----------+--------+-----------------+
| id|                name|      lat|       long|dock_count|    city|installation_date|
+---+--------------------+---------+-----------+----------+--------+-----------------+
|  2|San Jose Diridon ...| 37.32973| -121.90178|        27|San Jose|       2013-08-06|
|  3|San Jose Civic Ce...|37.330696| -121.88898|        15|San Jose|       2013-08-05|
|  4|Santa Clara at Al...| 37.33399|-121.894905|        11|San Jose|       2013-08-06|
|  5|    Adobe on Almaden|37.331413|  -121.8932|        19|San Jose|       2013-08-05|
|  6|    San Pedro Square| 37.33672| -121.89407|        15|San Jose|       2013-08-07|
+---+--------------------+---------+-----------+----------+--------+-----------------+
only showing top 5 rows



In [None]:
# Bounding box in the order plotHeatmap indexes it:
# (lat_min, lon_min, lat_max, lon_max).
bbox = (37.638179, -122.517901, 37.836884, -122.258863)

# plotHeatmap requires `bbox`; it was previously omitted from the call.
# NOTE(review): `jobs15` is not defined in any visible cell, and plotHeatmap is
# defined in the cell below this one — confirm the data source and run order.
plotHeatmap(jobs15, bbox=bbox, plot_type='scatter', fig_kwargs={'figsize':[20,20]},
            bmap_kwargs={'epsg':'26943','resolution':'h'},
            plot_kwargs={'cmap':'gist_heat_r','s':4,'edgecolor':'none'})

In [None]:
def plotHeatmap(data, bbox, plot_type, fig_kwargs, bmap_kwargs, plot_kwargs, cbar_kwargs=None):
    """Draw `data` on a Basemap as a scatter or hexbin plot with a colorbar.

    Args:
        data: object exposing `.values`, used as the colour values.
        bbox: 4-sequence indexed as (lat_min, lon_min, lat_max, lon_max) —
            bbox[1]/bbox[0] are the lower-left lon/lat and bbox[3]/bbox[2] the
            upper-right lon/lat handed to Basemap.
        plot_type: 'scatter' or 'hexbin'.
        fig_kwargs: keyword arguments for plt.subplots.
        bmap_kwargs: extra keyword arguments for the Basemap constructor.
        plot_kwargs: keyword arguments for bmap.scatter / bmap.hexbin.
        cbar_kwargs: optional keyword arguments for bmap.colorbar; defaults to
            none. (Previously read from an undefined name, raising NameError.)

    Returns:
        (bmap, fig, ax)

    Raises:
        ValueError: if plot_type is neither 'scatter' nor 'hexbin'.
    """
    # Fail early with a clear message instead of hitting an unbound `plot` below.
    if plot_type not in ('scatter', 'hexbin'):
        raise ValueError("plot_type must be 'scatter' or 'hexbin', got %r" % (plot_type,))
    if cbar_kwargs is None:
        cbar_kwargs = {}

    # NOTE(review): `plt` is not imported in the visible cells — confirm that
    # matplotlib.pyplot is available under this name.
    fig, ax = plt.subplots(**fig_kwargs)
    bmap = Basemap(llcrnrlon=bbox[1], llcrnrlat=bbox[0],
                   urcrnrlon=bbox[3], urcrnrlat=bbox[2],
                   ax=ax, **bmap_kwargs)
    bmap.drawcoastlines()
    bmap.drawmapboundary()
    # NOTE(review): this passes the literal strings 'long' and 'lat' to the
    # projection; presumably the longitude/latitude values belonging to `data`
    # were intended — confirm where the coordinates should come from.
    x, y = bmap('long', 'lat')
    if plot_type == 'scatter':
        plot = bmap.scatter(x, y, c=data.values, **plot_kwargs)
    else:
        plot = bmap.hexbin(x, y, C=data.values, **plot_kwargs)

    bmap.colorbar(plot, **cbar_kwargs)
    return bmap, fig, ax

In [None]:
# Count how often each value of field 9 (subscription_type) occurs in the trips.
pairs = trips.map(lambda fields: (fields[9], 1))
pairs.reduceByKey(lambda total, count: total + count).collect()

In [None]:
# Sanity checks: both lists are expected to be empty.
# Trips whose end_station_id (field 7) is not purely digits.
print(trips.filter(lambda x: not x[7].isdigit()).take(5))
# Trips whose end_date (field 5) does not parse. The helper previously called
# here, is_trip_date, is not defined anywhere; trip_toTimeStampSafe returns
# None for exactly the unparsable values.
print(trips.filter(lambda x: trip_toTimeStampSafe(x[5]) is None).take(5))