In [1]:
from pyspark import SparkContext
from pyspark import SparkContext, SparkConf
from pyspark.sql import Row
from pyspark.sql.types import *
from pyspark.sql import SQLContext
from pyspark.sql.functions import *
from datetime import datetime

def toIntSafe(inval):
  """Convert ``inval`` to an ``int``, or return ``None`` if that is impossible.

  Args:
    inval: Value to convert, normally a string field from a split CSV line.

  Returns:
    The parsed integer, or ``None`` when ``inval`` is not a valid integer
    literal (e.g. ``''`` or ``'abc'``) or is not convertible at all
    (e.g. ``None``).
  """
  try:
    return int(inval)
  except (ValueError, TypeError):
    # ValueError: text that is not an integer literal.
    # TypeError: input such as None that int() cannot accept.
    return None

def toTimeSafe_trip(inval):
  """Parse a trip timestamp written as ``month/day/year hour:minute``.

  Args:
    inval: Text to parse, e.g. ``'8/29/2013 14:13'``.

  Returns:
    A ``datetime`` on success, or ``None`` when ``inval`` does not match the
    format or is not a string (e.g. ``None``).
  """
  try:
    return datetime.strptime(inval, "%m/%d/%Y %H:%M")
  except (ValueError, TypeError):
    # strptime raises ValueError for malformed text and TypeError for
    # non-string input; both mean there is no usable timestamp.
    return None

def toTimeSafe_status(inval):
  """Parse a status timestamp written as ``year/month/day hour:minute:second``.

  Args:
    inval: Text to parse, e.g. ``'2013/08/29 12:06:01'``.

  Returns:
    A ``datetime`` on success, or ``None`` when ``inval`` does not match the
    format or is not a string (e.g. ``None``).
  """
  try:
    return datetime.strptime(inval, "%Y/%m/%d %H:%M:%S")
  except (ValueError, TypeError):
    # strptime raises ValueError for malformed text and TypeError for
    # non-string input; both mean there is no usable timestamp.
    return None

def toTimeSafe_weather(inval):
  """Parse a weather date written as ``month/day/year``.

  Args:
    inval: Text to parse, e.g. ``'8/29/2013'``.

  Returns:
    A ``datetime`` at midnight of that day on success, or ``None`` when
    ``inval`` does not match the format or is not a string (e.g. ``None``).
  """
  try:
    return datetime.strptime(inval, "%m/%d/%Y")
  except (ValueError, TypeError):
    # strptime raises ValueError for malformed text and TypeError for
    # non-string input; both mean there is no usable date.
    return None

def toTimeSafe_station(inval):
  """Parse a station installation date written as ``month/day/year``.

  Args:
    inval: Text to parse, e.g. ``'8/23/2013'``.

  Returns:
    A ``datetime`` at midnight of that day on success, or ``None`` when
    ``inval`` does not match the format or is not a string (e.g. ``None``).
  """
  try:
    return datetime.strptime(inval, "%m/%d/%Y")
  except (ValueError, TypeError):
    # strptime raises ValueError for malformed text and TypeError for
    # non-string input; both mean there is no usable date.
    return None

def toLongSafe(inval):
  """Convert ``inval`` to a ``float``, or return ``None`` if that is impossible.

  Despite the name, the result is a floating-point number, not an integer.

  Args:
    inval: Value to convert, normally a string field from a split CSV line.

  Returns:
    The parsed float, or ``None`` when ``inval`` is not a valid numeric
    literal (e.g. ``''`` or ``'T'``) or is not convertible at all
    (e.g. ``None``).
  """
  try:
    return float(inval)
  except (ValueError, TypeError):
    # ValueError: text that is not a number.
    # TypeError: input such as None that float() cannot accept.
    return None

# Reuse the SparkContext already running in this kernel, or start one.
sc = SparkContext.getOrCreate()
# Later cells call sqlContext.createDataFrame(...). Create it explicitly so the
# notebook does not depend on a kernel that happens to predefine `sqlContext`.
sqlContext = SQLContext(sc)

In [2]:
# Sanity check: parse one sample timestamp in the trip format
# (month/day/year hour:minute) and display the resulting datetime.
toTimeSafe_trip(u'8/29/2013 14:13')

datetime.datetime(2013, 8, 29, 14, 13)

In [3]:
# Each CSV is read as text (second argument: minimum number of partitions) and
# every line is split on commas into a list of string fields.
# NOTE(review): a plain split(',') does not understand quoted fields that
# contain commas -- confirm none of these files use them.
#
# Only the header-free RDD is cached: caching the unfiltered RDD as well would
# keep a second copy of the same data in memory once the filtered one is
# materialised.
trip = sc.textFile('Data/trip.csv', 8).map(lambda x : x.split(','))
trip_cols = trip.first()  # header row = column names
trip = trip.filter(lambda x: x[0] != 'id').cache()

status = sc.textFile('Data/status.csv', 8).map(lambda x : x.split(','))
status_cols = status.first()
status = status.filter(lambda x: x[0] != u'station_id').cache()

# The weather header row has to be dropped like the others; otherwise it is
# turned into a junk record when the DataFrame is built.
weather = sc.textFile('Data/weather.csv', 8).map(lambda x : x.split(','))
weather_cols = weather.first()
weather = weather.filter(lambda x: x[0] != u'date').cache()

station = sc.textFile('Data/station.csv', 8).map(lambda x : x.split(','))
station_cols = station.first()
station = station.filter(lambda x: x[0] != 'id').cache()

In [4]:
# Number of trip records after the header row was filtered out.
trip.count()

669959

In [5]:
# Schema for the parsed trip records, in CSV column order. Every column is
# declared nullable; the safe converters return None for unparseable values.
tripSchema = StructType([
    StructField(col_name, col_type, True)
    for col_name, col_type in [
        ("id", IntegerType()),
        ("duration", IntegerType()),
        ("start_date", TimestampType()),
        ("start_station_name", StringType()),
        ("start_station_id", IntegerType()),
        ("end_date", TimestampType()),
        ("end_station_name", StringType()),
        ("end_station_id", IntegerType()),
        ("bike_id", IntegerType()),
        ("subscription_type", StringType()),
        ("zip_code", StringType()),
    ]
])

# Turn each split CSV line into a positional Row whose values line up with
# tripSchema: numeric ids/durations via toIntSafe, dates via toTimeSafe_trip,
# and names / subscription type / zip code kept as raw strings.
trip_rows = trip.map(
    lambda x: Row(
        toIntSafe(x[0]),
        toIntSafe(x[1]),
        toTimeSafe_trip(x[2]),
        x[3],
        toIntSafe(x[4]),
        toTimeSafe_trip(x[5]),
        x[6],
        toIntSafe(x[7]),
        toIntSafe(x[8]),
        x[9],
        x[10],
    )
)
tripDF = sqlContext.createDataFrame(trip_rows, tripSchema)


# Schema for the parsed status records, in CSV column order.
# bikes_utilised_percentage is derived, not read from the file: it holds
# docks_available / (bikes_available + docks_available), i.e. a fraction.
statusSchema = StructType([
    StructField('station_id', StringType(), True),
    StructField('bikes_available', IntegerType(), True),
    StructField('docks_available', IntegerType(), True),
    StructField('time', TimestampType(), True),
    StructField("bikes_utilised_percentage", FloatType(), True)
])

def _dock_share(bikes_text, docks_text):
  """Return docks / (bikes + docks) as a float, or None when it is undefined.

  The share is undefined when either field cannot be parsed as a number or
  when both counts are zero. Returning None keeps one bad record from raising
  ValueError / ZeroDivisionError inside the Spark job.
  """
  bikes = toLongSafe(bikes_text)
  docks = toLongSafe(docks_text)
  if bikes is None or docks is None:
    return None
  total = bikes + docks
  if total == 0:
    return None
  return docks / total

statusDF = sqlContext.createDataFrame(status.map(lambda x: Row(x[0],
                                                               toIntSafe(x[1]),
                                                               toIntSafe(x[2]),
                                                               toTimeSafe_status(x[3]),
                                                               _dock_share(x[1], x[2]))), statusSchema)

# Schema for the parsed weather records, in CSV column order. All measurements
# are floats; 'events' and 'zip_code' are text.
weatherSchema = StructType([
    StructField('date', TimestampType(), True),
    StructField('max_temperature_f', FloatType(), True),
    StructField('mean_temperature_f', FloatType(), True),
    StructField('min_temperature_f', FloatType(), True),
    StructField('max_dew_point_f', FloatType(), True),
    StructField('mean_dew_point_f', FloatType(), True),
    StructField('min_dew_point_f', FloatType(), True),
    StructField('max_humidity', FloatType(), True),
    StructField('mean_humidity', FloatType(), True),
    StructField('min_humidity', FloatType(), True),
    StructField('max_sea_level_pressure_inches', FloatType(), True),
    StructField('mean_sea_level_pressure_inches', FloatType(), True),
    StructField('min_sea_level_pressure_inches', FloatType(), True),
    StructField('max_visibility_miles', FloatType(), True),
    StructField('mean_visibility_miles', FloatType(), True),
    StructField('min_visibility_miles', FloatType(), True),
    StructField('max_wind_Speed_mph', FloatType(), True),
    StructField('mean_wind_speed_mph', FloatType(), True),
    StructField('max_gust_speed_mph', FloatType(), True),
    StructField('precipitation_inches', FloatType(), True),
    StructField('cloud_cover', FloatType(), True),
    StructField('events', StringType(), True),
    StructField('wind_dir_degrees', FloatType(), True),
    StructField('zip_code', StringType(), True)
])

def _weather_row(x):
  """Build a positional Row matching weatherSchema from one split CSV line.

  Fields 1-20 and 22 are numeric and go through toLongSafe. Fields 21
  ('events') and 23 ('zip_code') are StringType in the schema, so they are
  kept as text: toLongSafe would turn every event description into None and
  every zip code into a float.
  """
  values = [toTimeSafe_weather(x[0])]
  values.extend(toLongSafe(x[i]) for i in range(1, 21))
  values.append(x[21])              # events: free text
  values.append(toLongSafe(x[22]))  # wind_dir_degrees
  values.append(x[23])              # zip_code: identifier, not a number
  return Row(*values)

# Drop a header row if one is still present (no-op when it has already been
# removed) so the column names do not show up as a data record.
weatherDF = sqlContext.createDataFrame(
    weather.filter(lambda x: x[0] != u'date').map(_weather_row),
    weatherSchema)



# Schema for the parsed station records, in CSV column order; every column is
# declared nullable.
stationSchema = StructType([
    StructField(col_name, col_type, True)
    for col_name, col_type in [
        ('id', StringType()),
        ('name', StringType()),
        ('lat', FloatType()),
        ('long', FloatType()),
        ('dock_count', IntegerType()),
        ('city', StringType()),
        ('installation_date', TimestampType()),
    ]
])

# Turn each split CSV line into a positional Row matching stationSchema:
# id / name / city stay as text, coordinates become floats, the dock count an
# int, and the installation date a datetime (None when unparseable).
station_rows = station.map(
    lambda x: Row(
        x[0],
        x[1],
        toLongSafe(x[2]),
        toLongSafe(x[3]),
        toIntSafe(x[4]),
        x[5],
        toTimeSafe_station(x[6]),
    )
)
stationDF = sqlContext.createDataFrame(station_rows, stationSchema)

# Mark all four DataFrames for caching so the queries below reuse the parsed
# data instead of re-reading and re-parsing the CSV files each time.
tripDF, weatherDF, statusDF, stationDF = (
    tripDF.cache(),
    weatherDF.cache(),
    statusDF.cache(),
    stationDF.cache(),
)

In [6]:
# Row count of the trip DataFrame; should equal the trip RDD count shown earlier.
tripDF.count()

669959

In [7]:
# Per (start station, end station) pair: number of trips plus the average,
# longest and shortest duration, most travelled pairs first.
# count / avg / max / min here are the Spark column functions brought in by
# the star import of pyspark.sql.functions, not the Python builtins.
trip_counts = (
    tripDF
    .select('start_station_name', 'end_station_name', 'duration')
    .groupBy('start_station_name', 'end_station_name')
    .agg(
        count(tripDF['*']),
        avg(tripDF['duration']),
        max(tripDF['duration']),
        min(tripDF['duration']),
    )
    .orderBy(count(tripDF['*']), ascending=False)
    .cache()
)

In [8]:
# Display the ten station pairs with the most trips.
trip_counts.show(10)

+--------------------+--------------------+--------+------------------+-------------+-------------+
|  start_station_name|    end_station_name|count(1)|     avg(duration)|max(duration)|min(duration)|
+--------------------+--------------------+--------+------------------+-------------+-------------+
|San Francisco Cal...|     Townsend at 7th|    6216| 317.3016409266409|        91522|          144|
|Harry Bridges Pla...|Embarcadero at Sa...|    6164|1264.3875730045424|        71180|          219|
|     Townsend at 7th|San Francisco Cal...|    5041|285.19738147193016|        40606|          132|
|     2nd at Townsend|Harry Bridges Pla...|    4839| 583.0818350898946|        71188|          294|
|Harry Bridges Pla...|     2nd at Townsend|    4357|  668.652513197154|        37907|          325|
|Embarcadero at Sa...|   Steuart at Market|    4269| 542.5350199109862|        24885|          228|
|Embarcadero at Fo...|San Francisco Cal...|    3967| 716.9868918578271|        87612|          345|


In [9]:
# Average trip duration per (start station, end station) pair, longest first.
trip_durations = (
    tripDF
    .select('start_station_name', 'end_station_name', 'duration')
    .groupBy('start_station_name', 'end_station_name')
    .avg('duration')
    .orderBy('avg(duration)', ascending=False)
    .cache()
)
trip_durations.show()

+--------------------+--------------------+------------------+
|  start_station_name|    end_station_name|     avg(duration)|
+--------------------+--------------------+------------------+
|Redwood City Medi...|SJSU - San Salvad...|          229914.0|
|Castro Street and...|       Howard at 2nd|          179212.5|
|Santa Clara Count...|California Ave Ca...|          169308.0|
|South Van Ness at...|       2nd at Folsom|156461.03603603604|
|Mountain View Cal...|San Jose Diridon ...|          101207.5|
|    Adobe on Almaden|Rengstorff Avenue...|           84633.0|
|San Jose Diridon ...|Mountain View Cal...|           83205.0|
|San Francisco Cit...|  San Jose City Hall|           80183.0|
|Castro Street and...|San Jose Diridon ...|           78068.5|
|           Japantown|Cowper at University|           77725.5|
|Palo Alto Caltrai...|San Francisco Cal...|           76833.0|
|San Jose Diridon ...|Cowper at University|           75844.0|
|San Antonio Shopp...|Castro Street and...| 73043.11538

In [17]:
# Longest single trip per (start station, end station) pair, longest first.
trip_durations_max = (
    tripDF
    .select('start_station_name', 'end_station_name', 'duration')
    .groupBy('start_station_name', 'end_station_name')
    .max('duration')
    .orderBy('max(duration)', ascending=False)
    .cache()
)
trip_durations_max.show()

+--------------------+--------------------+-------------+
|  start_station_name|    end_station_name|max(duration)|
+--------------------+--------------------+-------------+
|South Van Ness at...|       2nd at Folsom|     17270400|
|   Market at Sansome|Yerba Buena Cente...|      2137000|
|San Antonio Shopp...|Castro Street and...|      1852590|
|University and Em...|University and Em...|      1133540|
|Redwood City Calt...|Stanford in Redwo...|       720454|
|Harry Bridges Pla...|Civic Center BART...|       716480|
|Arena Green / SAP...|    Adobe on Almaden|       715339|
|Palo Alto Caltrai...|California Ave Ca...|       688899|
|San Jose Civic Ce...|SJSU 4th at San C...|       655939|
|South Van Ness at...|     Clay at Battery|       644771|
|San Jose Diridon ...|SJSU 4th at San C...|       619322|
|University and Em...|San Antonio Shopp...|       611240|
|San Jose Civic Ce...|San Jose Civic Ce...|       602338|
|California Ave Ca...|Palo Alto Caltrai...|       597517|
|    San Pedro

In [10]:
# Number of trips per subscription type.
subscription_types = (
    tripDF
    .select('subscription_type')
    .groupBy('subscription_type')
    .count()
    .cache()
)
subscription_types.show()

+-----------------+------+
|subscription_type| count|
+-----------------+------+
|       Subscriber|566746|
|         Customer|103213|
+-----------------+------+



In [11]:
# Average trip duration per subscription type.
# NOTE(review): this rebinds `subscription_types`, which held the per-type
# trip counts in the previous cell.
subscription_types = (
    tripDF
    .select('subscription_type', 'duration')
    .groupBy('subscription_type')
    .avg('duration')
    .cache()
)
subscription_types.show()

+-----------------+-----------------+
|subscription_type|    avg(duration)|
+-----------------+-----------------+
|       Subscriber|590.0488561013223|
|         Customer|3951.761328514819|
+-----------------+-----------------+



In [12]:
# Average duration and trip count per (start station, end station,
# subscription type), longest average first. The count column shows how many
# trips each average is based on.
trip_durations_by_sub = (
    tripDF
    .select('start_station_name', 'end_station_name', 'duration', 'subscription_type')
    .groupBy('start_station_name', 'end_station_name', 'subscription_type')
    .agg(
        avg(tripDF['duration']),
        count(tripDF['*']),
    )
    .orderBy('avg(duration)', ascending=False)
    .cache()
)
trip_durations_by_sub.show()

+--------------------+--------------------+-----------------+------------------+--------+
|  start_station_name|    end_station_name|subscription_type|     avg(duration)|count(1)|
+--------------------+--------------------+-----------------+------------------+--------+
|South Van Ness at...|       2nd at Folsom|         Customer|1570936.0909090908|      11|
|Redwood City Medi...|SJSU - San Salvad...|       Subscriber|          229914.0|       1|
|Castro Street and...|       Howard at 2nd|         Customer|          179212.5|       2|
|Santa Clara Count...|California Ave Ca...|         Customer|          169308.0|       1|
|San Antonio Shopp...|Castro Street and...|       Subscriber|143587.46153846153|      13|
|University and Em...|San Antonio Shopp...|         Customer|          104252.5|       6|
|Mountain View Cal...|San Jose Diridon ...|         Customer|          101207.5|       2|
|Redwood City Publ...|    Broadway at Main|         Customer|           85258.0|       1|
|    Adobe

In [13]:
# Average duration per (start station, end station, subscription type),
# shortest average first (orderBy defaults to ascending).
# NOTE(review): this rebinds `trip_durations_by_sub` from the previous cell.
trip_durations_by_sub = (
    tripDF
    .select('start_station_name', 'end_station_name', 'duration', 'subscription_type')
    .groupBy('start_station_name', 'end_station_name', 'subscription_type')
    .avg('duration')
    .orderBy('avg(duration)')
    .cache()
)
trip_durations_by_sub.show()

+--------------------+--------------------+-----------------+------------------+
|  start_station_name|    end_station_name|subscription_type|     avg(duration)|
+--------------------+--------------------+-----------------+------------------+
|Redwood City Medi...|    Broadway at Main|       Subscriber|             116.8|
|    Broadway at Main|Redwood City Medi...|       Subscriber|124.85714285714286|
|     Beale at Market|Temporary Transba...|       Subscriber|138.51313485113835|
|     Spear at Folsom|Embarcadero at Br...|       Subscriber| 165.3045112781955|
|Broadway St at Ba...|    Davis at Jackson|       Subscriber|173.33522727272728|
|Commercial at Mon...|     Clay at Battery|       Subscriber|177.49107142857142|
|Temporary Transba...|       Howard at 2nd|       Subscriber|184.04651162790697|
|    Davis at Jackson|Broadway St at Ba...|       Subscriber|187.57333333333332|
|Powell at Post (U...|      Post at Kearny|       Subscriber| 187.6577540106952|
|Redwood City Calt...|San Ma

In [14]:
# Number of trips made by each bike, most used bikes first.
bike_usage = (
    tripDF
    .select('bike_id')
    .groupBy('bike_id')
    .count()
    .orderBy('count', ascending=False)
    .cache()
)
bike_usage.show()

+-------+-----+
|bike_id|count|
+-------+-----+
|    392| 2061|
|    489| 1975|
|    558| 1955|
|    267| 1951|
|    631| 1948|
|    518| 1942|
|    532| 1933|
|    592| 1932|
|    395| 1927|
|    540| 1926|
|    368| 1926|
|    421| 1922|
|    491| 1921|
|    524| 1915|
|    327| 1911|
|    503| 1909|
|    334| 1908|
|    507| 1906|
|    366| 1904|
|    389| 1904|
+-------+-----+
only showing top 20 rows



In [15]:
# Stations with their installation dates, most recently installed first.
station_dates = (
    stationDF
    .select('name', 'installation_date')
    .orderBy('installation_date', ascending=False)
    .cache()
)
station_dates.show()

+--------------------+-------------------+
|                name|  installation_date|
+--------------------+-------------------+
|         Ryland Park|2014-04-09 00:00:00|
|          Mezes Park|2014-02-20 00:00:00|
|Broadway St at Ba...|2014-01-22 00:00:00|
|San Antonio Shopp...|2013-12-31 00:00:00|
|Castro Street and...|2013-12-31 00:00:00|
|Santa Clara Count...|2013-12-31 00:00:00|
|  Powell Street BART|2013-08-25 00:00:00|
|   Steuart at Market|2013-08-25 00:00:00|
|Mechanics Plaza (...|2013-08-25 00:00:00|
|       Market at 4th|2013-08-25 00:00:00|
|   Market at Sansome|2013-08-25 00:00:00|
|South Van Ness at...|2013-08-23 00:00:00|
|      Market at 10th|2013-08-23 00:00:00|
|Yerba Buena Cente...|2013-08-23 00:00:00|
|San Francisco Cal...|2013-08-23 00:00:00|
|San Francisco Cal...|2013-08-23 00:00:00|
|Powell at Post (U...|2013-08-23 00:00:00|
|Civic Center BART...|2013-08-23 00:00:00|
|     2nd at Townsend|2013-08-22 00:00:00|
|       2nd at Folsom|2013-08-22 00:00:00|
+----------