### Citibike Trip Data

In [2]:
import pandas as pd

In [2]:
# Data file for May 2016
df = pd.read_csv("201605-citibike-tripdata.csv")

In [3]:
df.tail(5)

Unnamed: 0,tripduration,starttime,stoptime,start station id,start station name,start station latitude,start station longitude,end station id,end station name,end station latitude,end station longitude,bikeid,usertype,birth year,gender
1212275,309,5/31/2016 23:59:21,6/1/2016 00:04:31,397,Fulton St & Clermont Ave,40.684157,-73.969223,3049,Cambridge Pl & Gates Ave,40.68488,-73.96304,20805,Subscriber,1987.0,2
1212276,985,5/31/2016 23:59:21,6/1/2016 00:15:46,531,Forsyth St & Broome St,40.718939,-73.992663,402,Broadway & E 22 St,40.740343,-73.989551,22010,Subscriber,1966.0,1
1212277,674,5/31/2016 23:59:35,6/1/2016 00:10:50,128,MacDougal St & Prince St,40.727103,-74.002971,276,Duane St & Greenwich St,40.717488,-74.010455,25086,Subscriber,1998.0,1
1212278,533,5/31/2016 23:59:37,6/1/2016 00:08:31,281,Grand Army Plaza & Central Park S,40.764397,-73.973715,477,W 41 St & 8 Ave,40.756405,-73.990026,14903,Subscriber,1970.0,1
1212279,457,5/31/2016 23:59:43,6/1/2016 00:07:20,477,W 41 St & 8 Ave,40.756405,-73.990026,457,Broadway & W 58 St,40.766953,-73.981693,15362,Subscriber,1975.0,1


### Bike Availability Data

In [10]:
import json
from urllib.request import urlopen

response = urlopen("https://feeds.citibikenyc.com/stations/stations.json")

j = response.read().decode('utf-8')
s = json.loads(j)
station_json = s['stationBeanList']
station_df = pd.DataFrame(station_json)

In [9]:
#station_df.tail(5)
station_df[station_df['stationName'].str.startswith('E 72')]

Unnamed: 0,altitude,availableBikes,availableDocks,city,id,landMark,lastCommunicationTime,latitude,location,longitude,postalCode,stAddress1,stAddress2,stationName,statusKey,statusValue,testStation,totalDocks
401,,2,40,,3139,,2016-11-16 11:34:01 AM,40.771183,,-73.964094,,E 72 St & Park Ave,,E 72 St & Park Ave,1,In Service,False,43
416,,19,19,,3156,,2016-11-16 11:35:43 AM,40.766638,,-73.953483,,E 72 St & York Ave,,E 72 St & York Ave,1,In Service,False,39


In [12]:
station_df = station_df[station_df["statusValue"] == "In Service"]
p72_df = station_df[station_df["stationName"] == "E 72 St & Park Ave"]
p72_df.head()

Unnamed: 0,altitude,availableBikes,availableDocks,city,id,landMark,lastCommunicationTime,latitude,location,longitude,postalCode,stAddress1,stAddress2,stationName,statusKey,statusValue,testStation,totalDocks
401,,2,40,,3139,,2016-11-16 11:34:01 AM,40.771183,,-73.964094,,E 72 St & Park Ave,,E 72 St & Park Ave,1,In Service,False,43


In [11]:
# Write table to file
import datetime

fname = "citibike_" + datetime.datetime.strftime(datetime.datetime.now(),"%Y%m%d%H%M%S") + ".json"
with open(fname,"w") as f:
    f.write(j)

### Mapping Bike Availability

In [13]:
# Courtesy of:
# http://www.gal-systems.com/2011/07/convert-coordinates-between-web.html

import math

def lon_to_web_mercator(lon):

    num = lon * 0.017453292519943295;
    x = 6378137.0 * num;
    mercatorX = x;
    return mercatorX

def lat_to_web_mercator(lat):

    a = lat * 0.017453292519943295;
    mercatorY = 3189068.5 * math.log((1.0 + math.sin(a)) / (1.0 - math.sin(a)));

    return mercatorY

In [15]:
X = station_df['longitude'].map(lon_to_web_mercator)
Y = station_df['latitude'].map(lat_to_web_mercator)

In [16]:
from bokeh.plotting import figure, output_notebook, show

output_notebook()

x_range=(-8255000,-8225000)
y_range=(4963000,4983000)

def base_plot(tools='pan,wheel_zoom,reset,hover',plot_width=900, plot_height=600, **plot_args):
    p = figure(tools=tools, plot_width=plot_width, plot_height=plot_height,
        x_range=x_range, y_range=y_range, outline_line_color=None,
        min_border=0, min_border_left=0, min_border_right=0,
        min_border_top=0, min_border_bottom=0, **plot_args)
    
    p.axis.visible = True
    p.xgrid.grid_line_color = None
    p.ygrid.grid_line_color = None
    return p
    
options = dict(line_color=None, fill_color='blue', size=5)

from IPython.core.display import HTML, display
display(HTML("<style>.container { width:90% !important; }</style>"))

In [17]:
from bokeh.tile_providers import STAMEN_TONER
output_notebook()

p = base_plot()
p.add_tile(STAMEN_TONER)
#p.circle(mercatorX, mercatorY)
p.circle(X,Y,size=station_df['availableBikes'])
show(p)

### Weather Data

We use forecast.io.  Sign up for an API key good for 1000 calls per day.  This is more than adequate for polling every 5 minutes

In [18]:
weather_raw = urlopen("https://api.forecast.io/forecast/62b3f08596594954c4856e61610736ee/40.71,-74.01")
weather_json = json.loads(weather_raw.read().decode('utf-8'))

In [19]:
weather_minutely = weather_json['minutely']['data']
minutely_df = pd.DataFrame(weather_minutely)
minutely_df.head(5)

Unnamed: 0,precipIntensity,precipProbability,time
0,0,0,1479315120
1,0,0,1479315180
2,0,0,1479315240
3,0,0,1479315300
4,0,0,1479315360


In [20]:
weather_hourly = weather_json['hourly']['data']
hourly_df = pd.DataFrame(weather_hourly)
hourly_df.head(10)

Unnamed: 0,apparentTemperature,cloudCover,dewPoint,humidity,icon,ozone,precipIntensity,precipProbability,pressure,summary,temperature,time,visibility,windBearing,windSpeed
0,52.6,0.04,38.55,0.59,clear-day,275.6,0,0,1008.38,Clear,52.6,1479312000,9.66,262,5.09
1,57.22,0.05,41.59,0.56,clear-day,273.35,0,0,1008.39,Clear,57.22,1479315600,9.78,248,6.34
2,58.96,0.08,42.0,0.53,clear-day,271.58,0,0,1008.38,Clear,58.96,1479319200,9.86,245,6.88
3,60.51,0.12,42.88,0.52,clear-day,270.86,0,0,1008.37,Clear,60.51,1479322800,9.92,243,6.76
4,60.84,0.12,43.5,0.53,clear-day,270.63,0,0,1008.41,Clear,60.84,1479326400,9.96,240,6.42
5,59.54,0.17,43.96,0.56,clear-day,270.82,0,0,1008.64,Clear,59.54,1479330000,9.98,238,5.78
6,57.18,0.33,43.7,0.61,partly-cloudy-night,271.66,0,0,1009.31,Partly Cloudy,57.18,1479333600,10.0,234,6.15
7,55.37,0.37,43.51,0.64,partly-cloudy-night,272.92,0,0,1010.09,Partly Cloudy,55.37,1479337200,9.67,244,6.03
8,53.87,0.35,42.95,0.66,partly-cloudy-night,273.95,0,0,1010.78,Partly Cloudy,53.87,1479340800,9.5,262,6.07
9,52.48,0.36,42.68,0.69,partly-cloudy-night,274.46,0,0,1011.27,Partly Cloudy,52.48,1479344400,9.39,271,5.97


In [21]:
weather_daily = weather_json['daily']['data']
daily_df = pd.DataFrame(weather_daily)
daily_df.head(7)

Unnamed: 0,apparentTemperatureMax,apparentTemperatureMaxTime,apparentTemperatureMin,apparentTemperatureMinTime,cloudCover,dewPoint,humidity,icon,moonPhase,ozone,...,sunriseTime,sunsetTime,temperatureMax,temperatureMaxTime,temperatureMin,temperatureMinTime,time,visibility,windBearing,windSpeed
0,60.84,1479326400,40.69,1479286800,0.24,41.42,0.7,partly-cloudy-night,0.58,271.86,...,1479296760,1479332321,60.84,1479326400,44.52,1479290400,1479272400,9.72,260,5.91
1,60.93,1479412800,43.05,1479380400,0.04,43.16,0.72,clear-day,0.62,263.57,...,1479383231,1479418676,60.93,1479412800,46.44,1479380400,1479358800,9.75,315,6.72
2,63.02,1479495600,39.38,1479466800,0.01,41.76,0.7,clear-day,0.65,242.94,...,1479469701,1479505032,63.02,1479495600,43.05,1479466800,1479445200,10.0,322,4.28
3,65.66,1479585600,39.92,1479614400,0.31,43.81,0.71,wind,0.69,244.02,...,1479556171,1479591391,65.66,1479585600,45.02,1479553200,1479531600,10.0,270,6.12
4,41.1,1479672000,29.69,1479639600,0.24,34.58,0.7,rain,0.72,295.54,...,1479642641,1479677751,48.96,1479672000,40.23,1479636000,1479618000,,287,25.77
5,38.44,1479762000,23.41,1479726000,0.01,27.29,0.63,snow,0.76,291.15,...,1479729110,1479764113,45.66,1479762000,34.7,1479729600,1479704400,,301,18.77
6,44.32,1479852000,25.41,1479812400,0.4,27.97,0.61,partly-cloudy-night,0.79,269.3,...,1479815578,1479850478,47.41,1479852000,34.07,1479812400,1479790800,,283,8.29


In [22]:
# this is what we'd want to dump to a file

weather_currently = weather_json['currently']
print(json.dumps(weather_currently, indent=4))

{
    "temperature": 56.64,
    "summary": "Clear",
    "humidity": 0.56,
    "visibility": 9.76,
    "precipProbability": 0,
    "precipIntensity": 0,
    "nearestStormDistance": 55,
    "ozone": 273.63,
    "windBearing": 249,
    "windSpeed": 6.17,
    "pressure": 1008.39,
    "nearestStormBearing": 11,
    "time": 1479315150,
    "icon": "clear-day",
    "cloudCover": 0.05,
    "apparentTemperature": 56.64,
    "dewPoint": 41.21
}


### Batch processing of weather data

In [23]:
# once we have a bunch of files, we can stitch them together into a matrix

import os

weather_dir = "/Users/zcarwile/Documents/content/sales_engineering_demos/citibike/weatherData/"

i = 1
for file in os.listdir(weather_dir):
    
    timestamp = file.replace("weather_","").replace(".txt","")
    year = int(timestamp[0:4])
    month = int(timestamp[4:6])
    day = int(timestamp[6:8])
    hour = int(timestamp[8:10])
    minute =  int(timestamp[10:12])

    with open(weather_dir + file,"r") as f:
    
        weather_currently = eval(f.read())
    
        if i == 1:
            cols = ['TS_year','TS_month','TS_day','TS_hour','TS_minute']
            for key, value in weather_currently.items():
                cols.append(key)
            weather_df = pd.DataFrame(columns=cols)

        row = [year,month,day,hour,minute]
        for col in cols:
            if "TS_" not in col:
                row.append(weather_currently[col])
        weather_df.loc[i] = row          

        i = i + 1

weather_df.head()

FileNotFoundError: [Errno 2] No such file or directory: '/Users/zcarwile/Documents/content/sales_engineering_demos/citibike/weatherData/'