In [1]:
import pandas as pd
import geopandas as gp
import datetime
import dateutil
from sets import Set
import csv
import matplotlib.pylab as plt
from shapely.geometry import Point
%matplotlib inline



# Uber data is downloaded from - 
 https://github.com/fivethirtyeight/uber-tlc-foil-response

# Taxi zones geocoded data

In [2]:
# Load the taxi-zone polygons and reproject them to EPSG:4326 (lon/lat).
crs = {'init': 'epsg:4326'}
tzones = gp.read_file("../Data/taxi_zones/taxi_zones_updated.shp").to_crs(epsg=4326)
# Rebuild the GeoDataFrame with the CRS given explicitly as a dict.
# NOTE(review): presumably so it compares equal to the CRS dict assigned to
# the Uber point frames before the spatial join - confirm.
tzones = gp.GeoDataFrame(tzones, crs=crs)

In [3]:
# Preview the first three taxi-zone rows.
tzones.head(3)

Unnamed: 0,LocationID,OBJECTID,Shape_Area,Shape_Leng,borough,geometry,zone
0,1,1,0.000782,0.116357,EWR,POLYGON ((-74.18445300000005 40.69499600003324...,Newark Airport
1,2,2,0.004866,0.43347,Queens,(POLYGON ((-73.82337597260654 40.6389870472100...,Jamaica Bay
2,3,3,0.000314,0.084341,Bronx,POLYGON ((-73.84792614099977 40.87134223403326...,Allerton/Pelham Gardens


In [4]:
# Number of distinct taxi-zone IDs in the shapefile.
tzones['LocationID'].unique().size

263

In [5]:
# Lookup table of zone ID -> zone name (geometry and other columns left out).
# NOTE(review): `zones` is not referenced again in the cells visible here.
zones = tzones[['LocationID', 'zone']]

# Uber 2014 data

In [6]:
# Raw Uber pickup files for April-September 2014, one CSV per month.
uber14_paths = [
    "../Data/Uber/uber-raw-data-apr14.csv",
    '../Data/Uber/uber-raw-data-may14.csv',
    '../Data/Uber/uber-raw-data-jun14.csv',
    '../Data/Uber/uber-raw-data-jul14.csv',
    '../Data/Uber/uber-raw-data-aug14.csv',
    '../Data/Uber/uber-raw-data-sep14.csv',
]

# Read every month the same way, parsing the pickup timestamp column as datetimes.
apr14, may14, jun14, july14, aug14, sep14 = [
    pd.read_csv(csv_path, parse_dates=['Date/Time'])
    for csv_path in uber14_paths
]

# Uber 14 clearing up

In [7]:
crs = {'init': 'epsg:4326'}


def _pickups_to_points(trips, crs=crs):
    """Turn a raw Uber pickup table into a GeoDataFrame of pickup points.

    Parameters
    ----------
    trips : pandas.DataFrame
        Raw monthly pickups with `Lon` and `Lat` columns.
    crs : dict
        Coordinate reference system assigned to the resulting frame.

    Returns
    -------
    geopandas.GeoDataFrame
        `trips` without the `Lon`/`Lat` columns, plus a `geometry` column
        holding one `Point(lon, lat)` per row.
    """
    # Build the points before dropping the coordinate columns they come from.
    geometry = [Point(xy) for xy in zip(trips.Lon, trips.Lat)]
    return gp.GeoDataFrame(trips.drop(['Lon', 'Lat'], axis=1),
                           crs=crs, geometry=geometry)


# Same conversion for every month (previously six copy-pasted stanzas).
apr14 = _pickups_to_points(apr14)
may14 = _pickups_to_points(may14)
jun14 = _pickups_to_points(jun14)
july14 = _pickups_to_points(july14)
aug14 = _pickups_to_points(aug14)
sep14 = _pickups_to_points(sep14)

Merge Uber and taxi zones

In [8]:
# Spatially join the taxi zones (left) with each month's pickup points (right),
# so every joined row carries the zone attributes of a pickup.
apr14, may14, jun14, july14, aug14, sep14 = [
    gp.sjoin(tzones, month_points)
    for month_points in (apr14, may14, jun14, july14, aug14, sep14)
]

In [28]:
def _daily_counts_by_zone(trips):
    """Count pickups per calendar day and taxi zone.

    Parameters
    ----------
    trips : pandas.DataFrame
        Zone-joined pickups with `Date/Time` (datetime), `LocationID` and
        `Base` columns.

    Returns
    -------
    pandas.DataFrame
        Indexed by day (`Date/Time`), with columns `LocationID` and `counts`,
        where `counts` is the number of non-null `Base` values in that
        day/zone group.
    """
    daily = (
        trips.set_index('Date/Time')
        # pd.Grouper(freq='D') replaces pd.TimeGrouper('D'), which was
        # deprecated in pandas 0.21 and removed in later releases.
        .groupby([pd.Grouper(freq='D'), 'LocationID'])
        .agg({'Base': 'count'})
        # Keep the day as the index; move the zone ID back into a column.
        .reset_index('LocationID')
    )
    daily.columns = ['LocationID', 'counts']
    return daily


# Same aggregation for every month (previously six copy-pasted stanzas).
apr14 = _daily_counts_by_zone(apr14)
may14 = _daily_counts_by_zone(may14)
jun14 = _daily_counts_by_zone(jun14)
july14 = _daily_counts_by_zone(july14)
aug14 = _daily_counts_by_zone(aug14)
sep14 = _daily_counts_by_zone(sep14)

In [30]:
# Stack the six monthly tables (April-September 2014) into a single frame.
monthly_counts_14 = [apr14, may14, jun14, july14, aug14, sep14]
uber_14 = pd.concat(monthly_counts_14)

In [32]:
# Persist the 2014 table; the frame's index is written as the first CSV column.
uber_14.to_csv("../Data/Aggregated/individual_files/uber14.csv")

# Uber 2015 data

In [33]:
# Load the January-June 2015 Uber file, parsing the pickup timestamp column as datetimes.
uber_15_path = "../Data/Uber/uber-raw-data-janjune-15.csv"
uber_15 = pd.read_csv(uber_15_path, parse_dates=['Pickup_date'])

In [34]:
# Preview the first three 2015 rows.
uber_15.head(3)

Unnamed: 0,Dispatching_base_num,Pickup_date,Affiliated_base_num,locationID
0,B02617,2015-05-17 09:47:00,B02617,141
1,B02617,2015-05-17 09:47:00,B02617,65
2,B02617,2015-05-17 09:47:00,B02617,100


# Merge taxi zones and uber15 data

In [35]:
# Attach zone attributes to each 2015 trip by matching the trip's `locationID`
# to the zones' `LocationID`. A left join keeps every trip; trips without a
# matching zone get NaN in the zone columns.
uber_15_geo = uber_15.merge(
    tzones,
    how='left',
    left_on='locationID',
    right_on='LocationID',
)

In [36]:
# Discard the shape metadata and base-number columns.
unused_columns = [
    'OBJECTID',
    'Shape_Area',
    'Shape_Leng',
    'Dispatching_base_num',
    'Affiliated_base_num',
]
uber_15_geo = uber_15_geo.drop(unused_columns, axis=1)

In [37]:
# Preview the first three merged rows.
uber_15_geo.head(3)

Unnamed: 0,Pickup_date,locationID,LocationID,borough,geometry,zone
0,2015-05-17 09:47:00,141,141.0,Manhattan,POLYGON ((-73.96177668399996 40.75987971603325...,Lenox Hill West
1,2015-05-17 09:47:00,65,65.0,Brooklyn,POLYGON ((-73.98712491499988 40.70063448003331...,Downtown Brooklyn/MetroTech
2,2015-05-17 09:47:00,100,100.0,Manhattan,POLYGON ((-73.98729377099981 40.75045160903321...,Garment District


In [38]:
# Remove every row that has a missing value in any remaining column
# (this includes trips whose location ID matched no taxi zone in the left join).
uber_15_geo = uber_15_geo.dropna()

In [39]:
# Count 2015 pickups per calendar day and taxi zone; the count is the number
# of non-null `borough` values in each day/zone group.
# pd.Grouper(freq='D') replaces pd.TimeGrouper('D'), which was deprecated in
# pandas 0.21 and removed in later releases.
uber_zones = (
    uber_15_geo.set_index('Pickup_date')
    .groupby([pd.Grouper(freq='D'), 'LocationID'])
    .agg({'borough': 'count'})
)

In [41]:
# Move the zone ID out of the index into a regular column; the day stays as the index.
uber_zones = uber_zones.reset_index('LocationID')

In [43]:
# Rename the two columns positionally: zone ID, then the per-day pickup count.
uber_zones.columns = ['LocationID', 'counts']

In [45]:
# Persist the 2015 table; the frame's index is written as the first CSV column.
# NOTE(review): LocationID is still float at this point (the int cast happens
# in a later cell), so this file stores IDs as 1.0, 3.0, ... - confirm that is
# acceptable for consumers of this file.
uber_zones.to_csv("../Data/Aggregated/individual_files/Uber15.csv")

In [48]:
# Preview the 2015 daily counts.
uber_zones.head()

Unnamed: 0_level_0,LocationID,counts
Pickup_date,Unnamed: 1_level_1,Unnamed: 2_level_1
2015-01-01,1.0,1
2015-01-01,3.0,15
2015-01-01,4.0,480
2015-01-01,5.0,1
2015-01-01,6.0,2


In [49]:
# Preview the 2014 daily counts for comparison with the 2015 table.
uber_14.head()

Unnamed: 0_level_0,LocationID,counts
Date/Time,Unnamed: 1_level_1,Unnamed: 2_level_1
2014-04-01,1,115
2014-04-01,3,1
2014-04-01,4,81
2014-04-01,7,24
2014-04-01,10,6


In [53]:
# Turn the date index of both tables into an ordinary column ...
uber_14 = uber_14.reset_index()
uber_zones = uber_zones.reset_index()

# ... and give both tables the same column names so they can be stacked.
shared_columns = ['Date', 'LocationID', 'counts']
uber_14.columns = list(shared_columns)
uber_zones.columns = list(shared_columns)

In [54]:
# Check the 2014 table after flattening the index and renaming the columns.
uber_14.head()

Unnamed: 0,Date,LocationID,counts
0,2014-04-01,1,115
1,2014-04-01,3,1
2,2014-04-01,4,81
3,2014-04-01,7,24
4,2014-04-01,10,6


In [55]:
# Check the 2015 table after flattening the index and renaming the columns.
uber_zones.head()

Unnamed: 0,Date,LocationID,counts
0,2015-01-01,1.0,1
1,2015-01-01,3.0,15
2,2015-01-01,4.0,480
3,2015-01-01,5.0,1
4,2015-01-01,6.0,2


In [57]:
# Cast the 2015 zone IDs from float to integer so they match the 2014 table.
# A vectorised astype replaces the per-element `apply(lambda x: int(x))`;
# the resulting int64 values are the same.
uber_zones['LocationID'] = uber_zones['LocationID'].astype('int64')

In [58]:
# Confirm that LocationID is now displayed as an integer.
uber_zones.head()

Unnamed: 0,Date,LocationID,counts
0,2015-01-01,1,1
1,2015-01-01,3,15
2,2015-01-01,4,480
3,2015-01-01,5,1
4,2015-01-01,6,2


In [59]:
# Stack the 2014 and 2015 daily counts into one table.
# Both inputs carry their own 0..n row index, so without ignore_index=True
# the combined frame would contain duplicate index labels; renumber instead.
uber = pd.concat([uber_14, uber_zones], ignore_index=True)

In [61]:
# Persist the combined 2014 + 2015 table; the row index is written as the first CSV column.
uber.to_csv("../Data/Aggregated/uber.csv")