In [1]:
import pandas as pd
import geopandas as gp
from fiona.crs import from_epsg
import shapely
import matplotlib.pylab as plt
import csv
import dateutil
import os
from shapely.geometry import Point
%matplotlib inline

In [2]:
def clean_columns(data, year):
    """Reduce a raw yellow-taxi trip frame to the five columns used downstream.

    Header names are stripped of surrounding whitespace and lower-cased, the
    columns that are not needed for the given year's file layout are dropped,
    and the five remaining columns are renamed *positionally* to ``date``,
    ``dropoff_datetime``, ``trip_distance``, ``pickup_longitude`` and
    ``pickup_latitude``.

    Parameters
    ----------
    data : pandas.DataFrame
        Raw trip records. The frame is modified in place.
    year : str or int
        Year whose file layout ``data`` follows: 2014, 2015 or 2016.

    Returns
    -------
    pandas.DataFrame
        The same ``data`` object that was passed in.

    Raises
    ------
    ValueError
        If ``year`` is not a supported layout (raised before ``data`` is
        touched), or if the frame does not have exactly five columns left
        after the drop.
    KeyError
        Raised by ``DataFrame.drop`` when a column expected for the layout
        is missing from ``data``.
    """
    # Columns discarded in every supported layout.
    shared_unused = ['passenger_count', 'store_and_fwd_flag',
                     'dropoff_longitude', 'dropoff_latitude', 'payment_type',
                     'fare_amount', 'mta_tax', 'tip_amount', 'tolls_amount',
                     'total_amount']
    unused_2014 = ['vendor_id', 'rate_code', 'surcharge'] + shared_unused
    # The 2015 and 2016 layouts use the same header names.
    unused_2015_2016 = (['vendorid', 'ratecodeid', 'extra',
                         'improvement_surcharge'] + shared_unused)
    unused_by_year = {'2014': unused_2014,
                      '2015': unused_2015_2016,
                      '2016': unused_2015_2016}

    # Validate first: an unknown year used to fall through every branch and
    # reach the positional rename below with nothing dropped.
    year_key = str(year)
    if year_key not in unused_by_year:
        raise ValueError("Unsupported year %r; expected one of %s"
                         % (year, sorted(unused_by_year)))

    # A list (not map()) so the assignment behaves the same on Python 2 and 3.
    data.columns = [name.strip().lower() for name in data.columns]
    data.drop(unused_by_year[year_key], axis=1, inplace=True)

    final_names = ['date', 'dropoff_datetime', 'trip_distance',
                   'pickup_longitude', 'pickup_latitude']
    if len(data.columns) != len(final_names):
        raise ValueError("Expected %d columns after dropping unused ones, "
                         "found %d: %s" % (len(final_names),
                                           len(data.columns),
                                           list(data.columns)))
    # Positional rename: relies on the raw file keeping pickup time, dropoff
    # time, distance, longitude, latitude in that order.
    data.columns = final_names

    return data

In [3]:
def data_aggregator(data, tzone):
    """Aggregate cleaned trip records into daily per-zone pickup statistics.

    Trips are filtered on distance, each pickup coordinate is turned into a
    point, the points are spatially joined to the taxi zones, and the result
    is grouped by calendar day and ``LocationID``.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame with the columns ``date``, ``dropoff_datetime``,
        ``trip_distance``, ``pickup_longitude`` and ``pickup_latitude``
        (the layout produced by ``clean_columns``). Its ``date`` column is
        converted to datetime in place.
    tzone : geopandas.GeoDataFrame
        Taxi zone polygons. Must carry the columns ``LocationID``,
        ``borough``, ``OBJECTID``, ``Shape_Area``, ``Shape_Leng`` and
        ``zone``. NOTE(review): the pickup points are tagged EPSG:4326, so
        the zones are presumably expected in that CRS too - confirm.

    Returns
    -------
    pandas.DataFrame
        Indexed by the daily ``date`` bucket, with the columns
        ``locationid``, ``trip_distance`` (mean per day and zone) and
        ``pickup_count`` (number of joined trips per day and zone).
    """
    data["date"] = pd.to_datetime(data["date"])
    # Keep only trips whose distance lies in (0, 30].
    data = data[(data['trip_distance'] > 0) & (data['trip_distance'] <= 30)]

    crs = {'init': 'epsg:4326', 'no_defs': True}
    geometry = [Point(xy) for xy in zip(data["pickup_longitude"], data["pickup_latitude"])]
    data = data.drop(["pickup_longitude", "pickup_latitude"], axis=1)
    data = gp.GeoDataFrame(data, crs=crs, geometry=geometry)

    print("Spatially joining data and taxi zones")
    # Join against the zones passed in by the caller; the previous code read
    # a notebook-global `tzones` and ignored this argument.
    data_geo = gp.sjoin(tzone, data)
    data_geo.drop(['borough', u'OBJECTID', 'Shape_Area', 'Shape_Leng',
                   u'geometry', 'index_right', 'zone'], axis = 1, inplace = True)

    # pd.Grouper(freq='D') is the supported spelling of the deprecated
    # pd.TimeGrouper('D'); it buckets the datetime index by calendar day.
    daily_by_zone = data_geo.set_index('date').groupby([pd.Grouper(freq='D'), 'LocationID'])
    data_agg = daily_by_zone.agg({'trip_distance': 'mean',
                                  'dropoff_datetime': 'count'})
    data_agg.reset_index('LocationID', inplace=True)

    # Rename and order by name: the column order produced by a dict-based
    # agg is not guaranteed on Python 2, so a positional rename could swap
    # the mean distance and the pickup count.
    data_agg = data_agg.rename(columns={'LocationID': 'locationid',
                                        'dropoff_datetime': 'pickup_count'})
    data_agg = data_agg[['locationid', 'trip_distance', 'pickup_count']]
    print("Done")
    return data_agg

In [4]:
# Read data by month
def yellow_taxi(tzones, years=('2015',), months=('10', '11', '12')):
    """Clean and aggregate the monthly yellow-taxi files, one CSV per month.

    For every (year, month) pair the raw file
    ``../../Data/Yellow/yellow_tripdata_<year>-<month>.csv`` is read, passed
    through ``clean_columns`` and ``data_aggregator``, and the aggregate is
    written to ``../../Data/Aggregated/yellow_<year>-<month>.csv``.

    Parameters
    ----------
    tzones : geopandas.GeoDataFrame
        Taxi zones forwarded to ``data_aggregator``.
    years : iterable of str, optional
        Years to process, as they appear in the file names. Defaults to
        ``('2015',)``, the value that was previously hard-coded.
    months : iterable of str, optional
        Two-digit months to process. Defaults to ``('10', '11', '12')``,
        the value that was previously hard-coded.

    Returns
    -------
    list
        The aggregated frames, in processing order (years outer, months
        inner).
    """
    out_dir = "../../Data/Aggregated"
    # to_csv does not create missing directories; make sure the target
    # exists before the expensive per-month work starts.
    if not os.path.isdir(out_dir):
        os.makedirs(out_dir)

    yellow = []
    for y in years:
        for m in months:
            period = y + "-" + m

            print("Reading Data/Yellow/yellow_tripdata_" + period + ".csv")
            data = pd.read_csv("../../Data/Yellow/yellow_tripdata_" + period + ".csv")
            print("Cleaning columns")
            data = clean_columns(data, y)
            print("Aggregating data")
            data_agg = data_aggregator(data, tzones)
            data_agg.to_csv(out_dir + "/yellow_" + period + ".csv")
            yellow.append(data_agg)

    return yellow

# Taxi zones geocoded data

In [5]:
# Load the taxi zone shapefile; this `tzones` frame is what gets passed to yellow_taxi below.
tzones = gp.read_file("../../Data/taxi_zones/taxi_zones_updated.shp")

In [6]:
# Reproject the zones to EPSG:4326 in place; data_aggregator tags the pickup points with that same CRS.
tzones.to_crs(epsg=4326, inplace=True)

In [7]:
# Preview the first three zone records.
tzones.head(3)

Unnamed: 0,LocationID,OBJECTID,Shape_Area,Shape_Leng,borough,geometry,zone
0,1,1,0.000782,0.116357,EWR,POLYGON ((-74.18445300000005 40.69499600003324...,Newark Airport
1,2,2,0.004866,0.43347,Queens,(POLYGON ((-73.82337597260654 40.6389870472100...,Jamaica Bay
2,3,3,0.000314,0.084341,Bronx,POLYGON ((-73.84792614099977 40.87134223403326...,Allerton/Pelham Gardens


In [None]:
# Count the distinct LocationID values in the zone table.
len(tzones.LocationID.unique())

263

# Run Data

In [None]:
# Run the pipeline: yellow_taxi writes one aggregated CSV per month and returns the aggregated frames as a list.
data = yellow_taxi(tzones)

Reading Data/Yellow/yellow_tripdata_2015-10.csv
Cleaning columns
Aggregating data


In [2]:
# Scratch check: read only the first 15 rows of the raw May 2015 file, e.g. to look at its column names.
pd.read_csv("../../Data/Yellow/yellow_tripdata_2015-05.csv", nrows=15)

Unnamed: 0,VendorID,tpep_pickup_datetime,tpep_dropoff_datetime,passenger_count,trip_distance,pickup_longitude,pickup_latitude,RateCodeID,store_and_fwd_flag,dropoff_longitude,dropoff_latitude,payment_type,fare_amount,extra,mta_tax,tip_amount,tolls_amount,improvement_surcharge,total_amount
0,1,2015-05-05 23:37:40,2015-05-05 23:45:41,1,2.0,-74.001678,40.739311,1,N,-73.978294,40.75211,2,8.5,0.5,0.5,0.0,0.0,0.3,9.8
1,2,2015-05-05 23:37:40,2015-05-05 23:40:36,1,0.54,-73.93084,40.744789,1,N,-73.937515,40.749359,2,4.5,0.5,0.5,0.0,0.0,0.3,5.8
2,2,2015-05-05 23:37:40,2015-05-05 23:44:03,3,2.1,-74.001411,40.731087,1,N,-73.981674,40.758282,2,8.0,0.5,0.5,0.0,0.0,0.3,9.3
3,2,2015-05-05 23:37:40,2015-05-06 00:14:01,6,10.93,-73.970673,40.75856,1,N,-73.933762,40.670544,1,36.0,0.5,0.5,9.32,0.0,0.3,46.62
4,2,2015-05-05 23:37:40,2015-05-05 23:46:03,5,0.93,-73.986732,40.755878,1,N,-73.990959,40.749981,1,7.0,0.5,0.5,2.49,0.0,0.3,10.79
5,1,2015-05-05 23:37:41,2015-05-05 23:50:34,3,2.4,-73.989326,40.756599,1,N,-73.979584,40.735363,2,11.0,0.5,0.5,0.0,0.0,0.3,12.3
6,1,2015-05-05 23:37:42,2015-05-05 23:41:51,1,1.3,-73.955544,40.776722,1,N,-73.941338,40.78812,2,6.0,0.5,0.5,0.0,0.0,0.3,7.3
7,2,2015-05-05 23:37:42,2015-05-05 23:53:25,1,3.7,-73.99221,40.729187,1,N,-73.960396,40.775631,1,15.0,0.5,0.5,3.26,0.0,0.3,19.56
8,2,2015-05-05 23:37:42,2015-05-05 23:52:53,2,7.84,-73.98774,40.74086,1,N,-73.873169,40.723091,1,23.5,0.5,0.5,6.07,5.54,0.3,36.41
9,2,2015-05-05 23:37:42,2015-05-05 23:48:25,3,2.36,-73.987999,40.743996,1,N,-73.998528,40.717003,2,10.0,0.5,0.5,0.0,0.0,0.3,11.3
