In [1]:
import pandas as pd
import geopandas as gp
from fiona.crs import from_epsg
import shapely
import matplotlib.pylab as plt
import csv
import dateutil
import os
from shapely.geometry import Point
%matplotlib inline

In [2]:
def clean_columns(data, year):
    """Reduce a raw yellow-cab trip table to the five columns used downstream.

    The header is normalised (surrounding whitespace stripped, lower-cased),
    the columns that are not needed for the given year's schema are dropped,
    and the five remaining columns are renamed *by position* to
    ``date, dropoff_datetime, trip_distance, pickup_longitude, pickup_latitude``.

    Parameters
    ----------
    data : pandas.DataFrame
        Raw trip records. The frame is modified in place.
    year : str or int
        Year of the file, one of 2013-2016; it selects which column set is
        dropped.

    Returns
    -------
    pandas.DataFrame
        The same ``data`` object, after cleaning.

    Raises
    ------
    ValueError
        If ``year`` is not a supported year (raised before ``data`` is
        touched), or if the number of columns left after dropping is not five.
    KeyError
        If one of the columns expected for ``year`` is missing from ``data``.
    """
    # 2013 and 2014 share one schema; 2015 and 2016 share another.
    drop_2013_2014 = ['vendor_id', 'passenger_count', 'rate_code', 'store_and_fwd_flag',
                      'dropoff_longitude', 'dropoff_latitude', 'payment_type', 'fare_amount',
                      'surcharge', 'mta_tax', 'tip_amount', 'tolls_amount', 'total_amount']
    drop_2015_2016 = ['vendorid', 'passenger_count', 'ratecodeid', 'store_and_fwd_flag',
                      'dropoff_longitude', 'dropoff_latitude', 'payment_type', 'fare_amount',
                      'extra', 'mta_tax', 'tip_amount', 'tolls_amount',
                      'improvement_surcharge', 'total_amount']
    drop_by_year = {'2013': drop_2013_2014, '2014': drop_2013_2014,
                    '2015': drop_2015_2016, '2016': drop_2015_2016}

    # Validate first so an unsupported year never leaves the caller's frame
    # half-cleaned (header rewritten, nothing dropped).
    year_key = str(year)
    if year_key not in drop_by_year:
        raise ValueError("Unsupported year %r; expected one of %s"
                         % (year, ', '.join(sorted(drop_by_year))))

    # A list (not a lazy ``map``) keeps the assignment valid on Python 3 as well.
    data.columns = [name.strip().lower() for name in data.columns]
    data.drop(drop_by_year[year_key], axis=1, inplace=True)

    # Positional rename: presumably the first surviving column is the pickup
    # timestamp (its source name differs between schemas) -- TODO confirm.
    data.columns = ['date', 'dropoff_datetime', 'trip_distance',
                    'pickup_longitude', 'pickup_latitude']

    return data

In [3]:
def data_aggregator(data, tzone):
    """Aggregate cleaned trip records per day and taxi zone.

    Trips whose ``trip_distance`` is not in the interval (0, 30] are
    discarded, each remaining pickup coordinate is turned into a point
    (CRS EPSG:4326) and spatially joined to the zones, and the joined rows are
    grouped by calendar day and ``LocationID``.

    Parameters
    ----------
    data : pandas.DataFrame
        Output of ``clean_columns``: columns ``date``, ``dropoff_datetime``,
        ``trip_distance``, ``pickup_longitude`` and ``pickup_latitude``.
        Its ``date`` column is converted to datetime in place.
    tzone : geopandas.GeoDataFrame
        Taxi-zone polygons with a ``LocationID`` column, expected to be in
        EPSG:4326 like the pickup points built here.

    Returns
    -------
    pandas.DataFrame
        Indexed by day (``date``), with columns ``locationid``,
        ``trip_distance`` (mean per group) and ``pickup_count`` (number of
        non-null ``dropoff_datetime`` values per group).
    """
    data["date"] = pd.to_datetime(data["date"])
    data = data[(data['trip_distance'] <= 30) & (data['trip_distance'] > 0)]

    crs = {'init': 'epsg:4326', 'no_defs': True}
    geometry = [Point(xy) for xy in zip(data["pickup_longitude"], data["pickup_latitude"])]
    data = data.drop(["pickup_longitude", "pickup_latitude"], axis=1)
    data = gp.GeoDataFrame(data, crs=crs, geometry=geometry)

    print("Spatially joining data and taxi zones")
    # Join against the zones passed in by the caller, not a notebook-level
    # global, so the function also works on a fresh kernel or with other zones.
    data_geo = gp.sjoin(tzone, data)
    data_geo = data_geo.drop(['borough', u'OBJECTID', 'Shape_Area', 'Shape_Leng',
                              u'geometry', 'index_right', 'zone'], axis=1)

    # pd.Grouper(freq='D') is the supported spelling of the deprecated pd.TimeGrouper('D').
    data_agg = (data_geo.set_index('date')
                        .groupby([pd.Grouper(freq='D'), 'LocationID'])
                        .agg({'trip_distance': 'mean', 'dropoff_datetime': 'count'}))
    data_agg = data_agg.reset_index('LocationID')

    # Rename and order by column *name*: the column order produced by a
    # dict-based agg is not guaranteed, so a positional rename could swap the
    # mean distance with the pickup count.
    data_agg = data_agg.rename(columns={'LocationID': 'locationid',
                                        'dropoff_datetime': 'pickup_count'})
    data_agg = data_agg[['locationid', 'trip_distance', 'pickup_count']]
    print("Done")
    return data_agg

In [4]:
# Read, clean and aggregate the yellow-cab data one month at a time
def yellow_taxi(tzones, years=('2013',), months=('04', '05', '06')):
    """Aggregate monthly yellow-cab CSV files and save one result per month.

    For every year/month pair the file
    ``../../Data/Yellow/yellow_tripdata_<year>-<month>.csv`` is read, cleaned
    with ``clean_columns``, aggregated with ``data_aggregator`` and written to
    ``../../Data/Aggregated/yellow_<year>-<month>.csv``.

    Parameters
    ----------
    tzones : geopandas.GeoDataFrame
        Taxi zones, forwarded to ``data_aggregator``.
    years : iterable of str or int, optional
        Years to process. Defaults to ``('2013',)``.
    months : iterable of str, optional
        Two-digit month strings as used in the file names. Defaults to
        ``('04', '05', '06')``.

    Returns
    -------
    list of pandas.DataFrame
        The aggregated frames, in processing order.
    """
    source_dir = "../../Data/Yellow"
    target_dir = "../../Data/Aggregated"

    if not os.path.isdir(target_dir):
        print("Creating Folder Data/Aggregated")
        # os.makedirs is portable, creates missing parents and raises on
        # failure instead of silently returning a shell exit status.
        os.makedirs(target_dir)

    yellow = []
    for y in years:
        y = str(y)
        for m in months:
            stem = y + "-" + m + ".csv"

            print("Reading Data/Yellow/yellow_tripdata_" + stem)
            data = pd.read_csv(source_dir + "/yellow_tripdata_" + stem)
            print("Cleaning columns")
            data = clean_columns(data, y)
            print("Aggregating data")
            data_agg = data_aggregator(data, tzones)
            data_agg.to_csv(target_dir + "/yellow_" + stem)
            yellow.append(data_agg)

    return yellow

# Taxi zone boundaries (geospatial data)

In [5]:
# Load the taxi-zone polygons from the project shapefile.
tzones = gp.read_file("../../Data/taxi_zones/taxi_zones_updated.shp")

In [6]:
# Reproject the zones in place to EPSG:4326, the CRS that data_aggregator
# assigns to the pickup points before the spatial join.
tzones.to_crs(epsg=4326, inplace=True)

In [7]:
# Preview the first three zone records.
tzones.head(3)

Unnamed: 0,LocationID,OBJECTID,Shape_Area,Shape_Leng,borough,geometry,zone
0,1,1,0.000782,0.116357,EWR,POLYGON ((-74.18445300000005 40.69499600003324...,Newark Airport
1,2,2,0.004866,0.43347,Queens,(POLYGON ((-73.82337597260654 40.6389870472100...,Jamaica Bay
2,3,3,0.000314,0.084341,Bronx,POLYGON ((-73.84792614099977 40.87134223403326...,Allerton/Pelham Gardens


In [8]:
# Number of distinct LocationID values among the zones.
len(tzones.LocationID.unique())

263

# Run the aggregation

In [9]:
# Process every configured month; yellow_taxi writes one aggregated CSV per
# month and returns the aggregated frames as a list.
data = yellow_taxi(tzones)

Reading Data/Yellow/yellow_tripdata_2013-04.csv


  if self.run_code(code, result):


Cleaning columns
Aggregating data
Spatially joining data and taxi zones
Done
Reading Data/Yellow/yellow_tripdata_2013-05.csv
Cleaning columns
Aggregating data
Spatially joining data and taxi zones
Done
Reading Data/Yellow/yellow_tripdata_2013-06.csv
Cleaning columns
Aggregating data
Spatially joining data and taxi zones
Done
