In [35]:
import pandas as pd
import os
import json
from shapely.geometry import Point
import geopandas as gp
import numpy as np
import urllib
import matplotlib.pyplot as plt
%matplotlib inline

# Citibike Stations

In [36]:
url = "https://feeds.citibikenyc.com/stations/stations.json"
response = urllib.urlopen(url)
stations = json.loads(response.read())
    
# https://feeds.citibikenyc.com/stations/stations.json

In [37]:
st = map(lambda x: (x['id'],x['latitude'], x['longitude'], x['stAddress1']),stations['stationBeanList'])

In [38]:
cb_stations = pd.DataFrame(st, columns=['id', 'latitude', 'longitude', 'station_address'])

In [39]:
cb_stations.duplicated().sum()

0

In [40]:
cb_stations.head()

Unnamed: 0,id,latitude,longitude,station_address
0,72,40.767272,-73.993929,W 52 St & 11 Ave
1,79,40.719116,-74.006667,Franklin St & W Broadway
2,82,40.711174,-74.000165,St James Pl & Pearl St
3,83,40.683826,-73.976323,Atlantic Ave & Fort Greene Pl
4,116,40.741776,-74.001497,W 17 St & 8 Ave


# Taxi zones

In [41]:
tzones = gp.read_file("../../Data/taxi_zones/taxi_zones_updated.shp")

In [42]:
tzones.to_crs(epsg=4326, inplace=True)

# Merge all

In [43]:
def merge(stations, tzones):
    crs = {'init': 'epsg:4326', 'no_defs': True}
    geometry = [Point(xy) for xy in zip(stations["longitude"], stations["latitude"])]
    stations = stations.drop(["longitude", "latitude"], axis=1)
    stations = gp.GeoDataFrame(stations, crs=crs, geometry=geometry)
    
    print "Spatially joining cb_stations and taxi zones"
    stations_geo = gp.sjoin(tzones, stations) 
    stations_geo.drop(['borough', u'OBJECTID', 'Shape_Area', 'Shape_Leng',
                    u'geometry', 'index_right', 'zone'], axis = 1, inplace = True)  
    stations_geo.reset_index(inplace=True, drop=True)
    
    return stations_geo

In [14]:
def data_aggregator(data, stations_geo):
    data = data[['starttime', 'start station id']]
    data["starttime"] = pd.to_datetime(data["starttime"])

    data = pd.merge(data, stations_geo, left_on='start station id', right_on= 'id', how='left')
    data.dropna(inplace=True)
    data_agg = data.set_index('starttime').groupby([pd.TimeGrouper('D'),'LocationID']).agg({'start station id':'count'})
    data_agg.reset_index('LocationID', inplace=True)
    data_agg.columns = ['locationid', 'count']
    return data_agg

In [15]:
def citibike(years, months):
    
    '''Agrregate citibike data by taxi zones'''

#     year = ['2013', '2014', '2015', '2016']
#     months = ['07' ,'08', '09', '10', '11', '12']
#     months = ['01' ,'02', '03', '04', '05', '06', '07' ,'08', '09', '10', '11', '12']
    stations_geo = merge(cb_stations, tzones)
    
    citibike = pd.DataFrame()
    for y in years:
        for m in months:
            
            print "Reading Data/Citibike/" + y + m + "-citibike-tripdata.csv"
            data = pd.read_csv("../../Data/Citibike/" + y + m + "-citibike-tripdata.csv")
#             print "Cleaning columns"
#             data = clean_columns(data, y, m)
            print "Aggregating data"
            data_agg = data_aggregator(data, stations_geo)
            data_agg.to_csv("../../Data/Aggregated/individual_files/Citibike"+ y + "-" + m + ".csv")
            citibike = citibike.append(data_agg)
    
    return citibike

In [16]:
cb1 = citibike(['2013'], ['07', '08', '09', '10', '11', '12'])

Spatially joining cb_stations and taxi zones
Reading Data/Citibike/201307-citibike-tripdata.csv
Aggregating data


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  app.launch_new_instance()


Reading Data/Citibike/201308-citibike-tripdata.csv
Aggregating data
Reading Data/Citibike/201309-citibike-tripdata.csv
Aggregating data
Reading Data/Citibike/201310-citibike-tripdata.csv
Aggregating data
Reading Data/Citibike/201311-citibike-tripdata.csv
Aggregating data
Reading Data/Citibike/201312-citibike-tripdata.csv
Aggregating data


In [17]:
cb2 = citibike(years=['2014'], months = ['01' ,'02', '03', '04', '05', '06', '07' ,'08', '09', '10', '11', '12'])

Spatially joining cb_stations and taxi zones
Reading Data/Citibike/201401-citibike-tripdata.csv
Aggregating data


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  app.launch_new_instance()


Reading Data/Citibike/201402-citibike-tripdata.csv
Aggregating data
Reading Data/Citibike/201403-citibike-tripdata.csv
Aggregating data
Reading Data/Citibike/201404-citibike-tripdata.csv
Aggregating data
Reading Data/Citibike/201405-citibike-tripdata.csv
Aggregating data
Reading Data/Citibike/201406-citibike-tripdata.csv
Aggregating data
Reading Data/Citibike/201407-citibike-tripdata.csv
Aggregating data
Reading Data/Citibike/201408-citibike-tripdata.csv
Aggregating data
Reading Data/Citibike/201409-citibike-tripdata.csv
Aggregating data
Reading Data/Citibike/201410-citibike-tripdata.csv
Aggregating data
Reading Data/Citibike/201411-citibike-tripdata.csv
Aggregating data
Reading Data/Citibike/201412-citibike-tripdata.csv
Aggregating data


In [27]:
cb3 = citibike(years=['2015'], months = ['01' ,'02', '03', '04','06', '07' ,'08', '09', '10', '11', '12'])

Spatially joining cb_stations and taxi zones
Reading Data/Citibike/201501-citibike-tripdata.csv
Aggregating data


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  app.launch_new_instance()


Reading Data/Citibike/201502-citibike-tripdata.csv
Aggregating data
Reading Data/Citibike/201503-citibike-tripdata.csv
Aggregating data
Reading Data/Citibike/201504-citibike-tripdata.csv
Aggregating data
Reading Data/Citibike/201506-citibike-tripdata.csv
Aggregating data
Reading Data/Citibike/201507-citibike-tripdata.csv
Aggregating data
Reading Data/Citibike/201508-citibike-tripdata.csv
Aggregating data
Reading Data/Citibike/201509-citibike-tripdata.csv
Aggregating data
Reading Data/Citibike/201510-citibike-tripdata.csv
Aggregating data
Reading Data/Citibike/201511-citibike-tripdata.csv
Aggregating data
Reading Data/Citibike/201512-citibike-tripdata.csv
Aggregating data


In [28]:
cb4 = citibike(years=['2016'], months = ['01' ,'02', '03', '04', '05', '06', '07' ,'08', '09', '10', '11', '12'])

In [29]:
d = pd.read_csv("../../Data/Citibike/201505-citibike-tripdata.zip")

In [46]:
# data = d[['starttime', 'start station id']]
# data["starttime"] = pd.to_datetime(data["starttime"])

# data = pd.merge(data, stations_geo, left_on='start station id', right_on= 'id', how='left')
# data.dropna(inplace=True)
data_agg = data.set_index('starttime').groupby([pd.TimeGrouper('D'),'LocationID']).agg({'start station id':'count'})
data_agg.reset_index('LocationID', inplace=True)
data_agg.columns = ['locationid', 'count']


In [48]:
data_agg.to_csv("../../Data/Aggregated/individual_files/Citibike"+ '2015' + "-" + '05' + ".csv")

In [44]:
stations_geo = merge(cb_stations, tzones)

Spatially joining cb_stations and taxi zones
