In [1]:
# Display the SparkContext to confirm the Spark session is up.
# NOTE(review): `sc` is not defined in any visible cell — presumably injected
# by the PySpark kernel/shell; confirm before running on a plain Python kernel.
sc

In [2]:
from pyspark.sql import Row
from pyspark.sql.types import *
from pyspark.sql import SQLContext
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.classification import DecisionTreeClassifier
from pyspark.ml.evaluation import MulticlassClassificationEvaluator
from pyspark.sql.functions import *
from datetime import datetime
from pyspark.ml.feature import StringIndexer

In [3]:
# Wrap the existing SparkContext in a SQLContext.
# NOTE(review): the visible cells read data through `spark`, not `sqlContext` —
# confirm whether this object is still needed.
sqlContext = SQLContext(sc)

In [9]:
def _read_mongo(uri):
    """Load the MongoDB collection addressed by ``uri`` into a Spark DataFrame."""
    reader = spark.read.format("com.mongodb.spark.sql.DefaultSource")
    return reader.option("uri", uri).load()


# Station status snapshots (bikes/docks available per station and time).
status_df = _read_mongo("mongodb://54.245.37.88:27017/bikeshare.status")
status_df.printSchema()

# Individual trips.
trip_df = _read_mongo("mongodb://54.245.37.88:27017/bikeshare.trip")
trip_df.printSchema()

# Station metadata (id, city, coordinates, ...).
station_df = _read_mongo("mongodb://54.245.37.88:27017/bikeshare.station")
station_df.printSchema()

root
 |-- _id: struct (nullable = true)
 |    |-- oid: string (nullable = true)
 |-- bikes_available: integer (nullable = true)
 |-- docks_available: integer (nullable = true)
 |-- station_id: integer (nullable = true)
 |-- time: string (nullable = true)



In [40]:
# Helper functions used to derive DataFrame columns (safe casts, time-of-day bucket, weekday flag)

def toIntSafe(num):
    """Convert ``num`` to ``int``, returning ``None`` when that is impossible.

    Args:
        num: Value to convert (e.g. a numeric string or a number).

    Returns:
        The integer value, or ``None`` if ``num`` is ``None``, has an
        unsupported type, is not a valid integer literal, or is infinite.
    """
    try:
        return int(num)
    except (TypeError, ValueError, OverflowError):
        # TypeError: None / non-numeric objects (null cells).
        # ValueError: malformed strings such as "abc" or "".
        # OverflowError: float infinity.
        return None
    
def toFloatSafe(num):
    """Convert ``num`` to ``float``, returning ``None`` when that is impossible.

    Args:
        num: Value to convert (e.g. a numeric string or a number).

    Returns:
        The float value, or ``None`` if ``num`` is ``None``, has an
        unsupported type, is not a valid float literal, or is an integer too
        large to be represented as a float.
    """
    try:
        return float(num)
    except (TypeError, ValueError, OverflowError):
        # TypeError: None / non-numeric objects (null cells).
        # ValueError: malformed strings such as "abc" or "".
        # OverflowError: integers beyond the float range.
        return None
    
def toTimeStampSafe(data):
    """Parse a ``YYYY/MM/DD HH:MM:SS`` string into a ``datetime``.

    Args:
        data: Timestamp text in the format ``%Y/%m/%d %H:%M:%S``.

    Returns:
        The parsed ``datetime``, or ``None`` if ``data`` is ``None``, is not
        a string, or does not match the expected format.
    """
    try:
        return datetime.strptime(data, "%Y/%m/%d %H:%M:%S")
    except (TypeError, ValueError):
        # TypeError: null / non-string input; ValueError: format mismatch.
        return None
    
def time_of_day(x):
    """Bucket a timestamp string into one of four parts of the day.

    Args:
        x: Timestamp text accepted by ``toTimeStampSafe``.

    Returns:
        0 for 06:00-09:59, 1 for 10:00-13:59, 2 for 14:00-19:59,
        3 for 20:00-05:59, or ``None`` when the timestamp cannot be parsed.
    """
    parsed = toTimeStampSafe(x)
    if not parsed:
        return None

    hour = parsed.hour
    if 6 <= hour < 10:
        return 0
    if 10 <= hour < 14:
        return 1
    if 14 <= hour < 20:
        return 2
    # Only the overnight hours (20-23 and 0-5) remain at this point.
    return 3
    
def isWeekday(x):
    """Flag whether a timestamp string falls on a weekday (Monday-Friday).

    Args:
        x: Timestamp text accepted by ``toTimeStampSafe``.

    Returns:
        1 for Monday through Friday, 0 for Saturday or Sunday, or ``None``
        when the timestamp cannot be parsed.
    """
    parsed = toTimeStampSafe(x)
    if not parsed:
        return None
    # isoweekday() numbers Monday as 1 and Sunday as 7, so weekdays are 1..5.
    # The inclusive bound matters: `< 5` would count Friday as a weekend day.
    return 1 if parsed.isoweekday() <= 5 else 0
    
# Spark UDF wrappers around the helpers above. The return type is declared
# explicitly: without it `udf` defaults to StringType, which would turn the
# integer codes (and None) into string columns.
timefunction = udf(time_of_day, IntegerType())
weekfunction = udf(isWeekday, IntegerType())

In [41]:
# bike_util = empty docks / (bikes + empty docks) for each status row.
status_df = status_df.withColumn(
    'bike_util',
    (col('docks_available') * 1.0) / (col('bikes_available') + col('docks_available')),
)

# Keep only the columns needed for aggregation, deriving the time-of-day
# bucket and weekday flag from the raw `time` string; cache for reuse.
status = (
    status_df
    .select(
        'station_id',
        'bike_util',
        timefunction('time').alias('time_of_day'),
        weekfunction('time').alias('is_weekday'),
    )
    .cache()
)


In [None]:
# Mean utilisation per (station, time-of-day bucket, weekday flag).
# Author's note: this groupBy takes forever to run.
# NOTE(review): presumably because this is the first action on `status`, so the
# source read and the Python UDFs are all evaluated here — confirm.
daily_avg = (
    status
    .groupBy('station_id', 'time_of_day', 'is_weekday')
    .mean('bike_util')
    .cache()
)
daily_avg.show(5)

In [None]:
# Drop the cached `status` DataFrame now that `daily_avg` has been derived from it.
status.unpersist()

In [30]:
# Join station metadata onto the per-station utilisation averages.
# The key column must be built with col('id'): a bare 'id' is a Python str,
# which has no .alias() method and raises AttributeError.
station = station_df.select(col('id').alias('station_id'), 'city', 'lat', 'long').cache()
df_daily_avg = daily_avg.join(station, on='station_id')

+---+--------+------------------+-------------------+
| id|    city|               lat|               long|
+---+--------+------------------+-------------------+
|  2|San Jose|         37.329732|-121.90178200000001|
|  3|San Jose|         37.330698|        -121.888979|
|  4|San Jose|         37.333988|        -121.894902|
|  5|San Jose|         37.331415|          -121.8932|
|  6|San Jose|37.336721000000004|        -121.894074|
+---+--------+------------------+-------------------+
only showing top 5 rows



root
 |-- _id: struct (nullable = true)
 |    |-- oid: string (nullable = true)
 |-- city: string (nullable = true)
 |-- dock_count: integer (nullable = true)
 |-- id: integer (nullable = true)
 |-- installation_date: string (nullable = true)
 |-- lat: double (nullable = true)
 |-- long: double (nullable = true)
 |-- name: string (nullable = true)



root
 |-- _id: struct (nullable = true)
 |    |-- oid: string (nullable = true)
 |-- bike_id: integer (nullable = true)
 |-- duration: integer (nullable = true)
 |-- end_date: string (nullable = true)
 |-- end_station_id: integer (nullable = true)
 |-- end_station_name: string (nullable = true)
 |-- id: integer (nullable = true)
 |-- start_date: string (nullable = true)
 |-- start_station_id: integer (nullable = true)
 |-- start_station_name: string (nullable = true)
 |-- subscription_type: string (nullable = true)
 |-- zip_code: string (nullable = true)



In [6]:
# Convert the whole trip DataFrame to pandas and preview the first rows.
# NOTE(review): toPandas() collects every row to the driver — confirm the trip
# collection fits in driver memory, or limit it first if only a preview is needed.
pddf = trip_df.toPandas()
pddf.head()

Unnamed: 0,_id,bike_id,duration,end_date,end_station_id,end_station_name,id,start_date,start_station_id,start_station_name,subscription_type,zip_code
0,"(5a5fdd105b417fe9768b076f,)",520,63,8/29/2013 14:14,66,South Van Ness at Market,4576,8/29/2013 14:13,66,South Van Ness at Market,Subscriber,94127
1,"(5a5fdd105b417fe9768b0770,)",661,70,8/29/2013 14:43,10,San Jose City Hall,4607,8/29/2013 14:42,10,San Jose City Hall,Subscriber,95138
2,"(5a5fdd105b417fe9768b0771,)",48,71,8/29/2013 10:17,27,Mountain View City Hall,4130,8/29/2013 10:16,27,Mountain View City Hall,Subscriber,97214
3,"(5a5fdd105b417fe9768b0772,)",26,77,8/29/2013 11:30,10,San Jose City Hall,4251,8/29/2013 11:29,10,San Jose City Hall,Subscriber,95060
4,"(5a5fdd105b417fe9768b0773,)",319,83,8/29/2013 12:04,67,Market at 10th,4299,8/29/2013 12:02,66,South Van Ness at Market,Subscriber,94103


In [10]:
# NOTE(review): this import sits mid-notebook; consider moving it to the import cell at the top.
import pandas as pd
# Load Data/daily_avg.csv (path relative to the working directory) into pandas and preview it.
df = pd.read_csv('Data/daily_avg.csv')
df.head()

Unnamed: 0,station_id,day_part,isWeekday,avg_bike_util,latitude,longitude
0,2,morning,0,0.506256,37.329732,-121.901782
1,2,night,1,0.490139,37.329732,-121.901782
2,2,night,0,0.493489,37.329732,-121.901782
3,2,morning,1,0.538618,37.329732,-121.901782
4,3,night,1,0.469434,37.330698,-121.888979
