In [1]:
import pandas as pd
import geopandas as gp
from fiona.crs import from_epsg
import shapely
import matplotlib.pylab as plt
import csv
import dateutil
import os
from shapely.geometry import Point
%matplotlib inline

In [2]:
# Goal: produce files for each year whose rows have the form:
## <neighborhood/borough of pickup>,<distance>,<ended in Manhattan (0/1)>,<number of trips>

In [3]:
def clean_columns(data, year):
    """Reduce a raw yellow-taxi trip table to the five columns used downstream.

    Column headers are stripped of surrounding whitespace and lower-cased,
    the columns listed for the given year's schema are dropped, and the
    remaining five columns are renamed positionally to
    ``date, dropoff_datetime, trip_distance, pickup_longitude, pickup_latitude``.

    Parameters
    ----------
    data : pandas.DataFrame
        Raw trip records. The frame is modified in place.
    year : str or int
        Year of the source file. 2013/2014 share one drop list and
        2015/2016 share another; any other year drops nothing.

    Returns
    -------
    pandas.DataFrame
        The same ``data`` object, reduced to the five renamed columns.

    Raises
    ------
    ValueError
        If the frame does not have exactly five columns after dropping
        (for example because the year is not one of the supported ones).
        pandas itself raises when a column in the drop list is missing.
    """
    final_names = ['date', 'dropoff_datetime', 'trip_distance',
                   'pickup_longitude', 'pickup_latitude']

    # Schema used by the 2013 and 2014 files.
    legacy_drop = ['vendor_id', 'passenger_count', 'rate_code', 'store_and_fwd_flag',
                   'dropoff_longitude', 'dropoff_latitude', 'payment_type', 'fare_amount',
                   'surcharge', 'mta_tax', 'tip_amount',
                   'tolls_amount', 'total_amount']
    # Schema used by the 2015 and 2016 files.
    current_drop = ['vendorid', 'passenger_count', 'ratecodeid', 'store_and_fwd_flag',
                    'dropoff_longitude', 'dropoff_latitude', 'payment_type',
                    'fare_amount', 'extra', 'mta_tax', 'tip_amount', 'tolls_amount',
                    'improvement_surcharge', 'total_amount']
    drop_by_year = {'2013': legacy_drop, '2014': legacy_drop,
                    '2015': current_drop, '2016': current_drop}

    # A concrete list (not a lazy map object) works on Python 2 and 3 alike.
    data.columns = [name.strip().lower() for name in data.columns]

    # str() lets callers pass the year as either '2015' or 2015.
    unwanted = drop_by_year.get(str(year))
    if unwanted is not None:
        data.drop(unwanted, axis=1, inplace=True)

    # The rename below is positional, so fail loudly if the shape is off.
    if len(data.columns) != len(final_names):
        raise ValueError(
            "clean_columns: expected %d columns after cleaning for year %r, found %d: %s"
            % (len(final_names), year, len(data.columns), list(data.columns)))

    data.columns = final_names

    return data

In [4]:
def segregate(data):
    """Count trips per distance bin for four hard-coded groups of taxi zones.

    Trip distances are cut into right-closed bins ``(0, 1], (1, 2], ...,
    (24, 25], (25, inf)`` labelled ``0 .. 25``. Rows whose distance falls
    outside every bin (a distance of exactly 0, negative or missing) get no
    bin and are not counted.

    Parameters
    ----------
    data : pandas.DataFrame
        Must provide ``LocationID`` and ``trip_distance`` columns.
        The frame is not modified.

    Returns
    -------
    pandas.DataFrame
        Columns ``trip_cat``, ``num_rides``, ``neighbourhood``: one row per
        bin label and neighbourhood group (bins without trips have a count
        of 0), groups stacked in the order upper Manhattan, lower Manhattan,
        downtown Brooklyn, Bay Ridge.
    """
    # (neighbourhood label, taxi-zone LocationIDs belonging to it)
    zone_groups = [
        ('upper_manhattan', [24, 151, 238, 75, 236, 263, 262, 239, 143, 142, 237, 141, 140]),
        ('lower_manhattan', [4, 79, 113, 114, 249, 158, 125, 211, 144, 148, 232, 13, 231,
                             45, 209, 87, 88, 12, 261]),
        ('Downtown_Bk', [17, 25, 33, 49, 40, 34, 52, 54, 65, 106, 97, 66, 181, 189, 190, 195]),
        ('Bayridge_bk', [11, 14, 22, 26, 67, 111, 227, 228]),
    ]

    # An open-ended last edge keeps the bin edges strictly increasing even
    # when no trip is longer than 25 (or the frame is empty); using the
    # observed maximum as the last edge makes pd.cut fail in those cases.
    edges = list(range(0, 26)) + [float('inf')]
    labels = edges[:-1]
    trip_cat = pd.cut(data['trip_distance'], edges, labels=labels)

    frames = []
    for neighbourhood, zone_ids in zone_groups:
        # Use the raw boolean array so no index alignment is involved
        # (a spatially joined frame can carry a non-unique index).
        in_group = data['LocationID'].isin(zone_ids).values
        # value_counts on categorical data reports every category, so bins
        # without any trip show up with a count of 0; sort=False keeps the
        # bins in label order.
        counts = trip_cat[in_group].value_counts(sort=False)
        frames.append(pd.DataFrame({'trip_cat': counts.index,
                                    'num_rides': counts.values,
                                    'neighbourhood': neighbourhood},
                                   columns=['trip_cat', 'num_rides', 'neighbourhood']))

    return pd.concat(frames, axis=0, ignore_index=True)

In [5]:
def data_aggregator(data, tzone):
    """Assign each trip to a taxi zone and aggregate trip counts by distance bin.

    Pickup coordinates are turned into point geometries, spatially joined
    against the zone polygons, and the joined ``LocationID`` /
    ``trip_distance`` pairs are handed to ``segregate``.

    Parameters
    ----------
    data : pandas.DataFrame
        Trip records with ``pickup_longitude``, ``pickup_latitude`` and
        ``trip_distance`` columns. The frame itself is not modified.
    tzone : geopandas.GeoDataFrame
        Taxi-zone polygons with a ``LocationID`` column.
        Presumably expected in EPSG:4326, the CRS assigned to the pickup
        points below -- verify against the caller.

    Returns
    -------
    pandas.DataFrame
        The aggregated table returned by ``segregate``.
    """
    crs = {'init': 'epsg:4326', 'no_defs': True}
    pickup_points = [Point(lon_lat)
                     for lon_lat in zip(data["pickup_longitude"], data["pickup_latitude"])]
    trips = data.drop(["pickup_longitude", "pickup_latitude"], axis=1)
    trips = gp.GeoDataFrame(trips, crs=crs, geometry=pickup_points)

    print("Spatially joining data and taxi zones")
    # Join against the zones that were passed in; the argument used to be
    # ignored in favour of a module-level ``tzones`` variable.
    joined = gp.sjoin(tzone, trips)
    joined = joined[['LocationID', 'trip_distance']]

    print("Segregating")
    data_agg = segregate(joined)

    return data_agg

In [6]:
# Read data by month
def yellow_taxi(tzones, years=('2015',), months=('12',)):
    """Aggregate yellow-taxi trip files month by month.

    For every year/month combination the file
    ``../../Data/Yellow/yellow_tripdata_<year>-<month>.csv`` is read, cleaned
    with ``clean_columns``, aggregated with ``data_aggregator`` and written to
    ``../../Data/Dist_agg/yellow_<year>-<month>.csv``.

    Parameters
    ----------
    tzones : geopandas.GeoDataFrame
        Taxi-zone polygons, passed through to ``data_aggregator``.
    years : iterable of str, optional
        Years to process, e.g. ``('2014', '2015')``. Defaults to ``('2015',)``.
    months : iterable of str, optional
        Two-digit month strings, e.g. ``('10', '11', '12')``.
        Defaults to ``('12',)``.

    Returns
    -------
    list of pandas.DataFrame
        One aggregated table per processed file, in processing order.
    """
    out_dir = "../../Data/Dist_agg"
    if not os.path.isdir(out_dir):
        print("Creating Folder Data/Dist_agg")
        # os.makedirs needs no shell, is portable and also creates any
        # missing parent directories.
        os.makedirs(out_dir)

    yellow = []
    for y in years:
        for m in months:
            stamp = y + "-" + m

            print("Reading Data/Yellow/yellow_tripdata_" + stamp + ".csv")
            data = pd.read_csv("../../Data/Yellow/yellow_tripdata_" + stamp + ".csv")
            print("Cleaning columns")
            data = clean_columns(data, y)
            print("Aggregating data")
            data_agg = data_aggregator(data, tzones)
            data_agg.to_csv("../../Data/Dist_agg/yellow_" + stamp + ".csv")
            print('Done')
            yellow.append(data_agg)

    return yellow

In [7]:
# Load the taxi-zone polygons and reproject them to EPSG:4326, the CRS that
# data_aggregator assigns to the pickup points.
tzones = gp.read_file("../../Data/taxi_zones/taxi_zones_updated.shp").to_crs(epsg=4326)

In [8]:
# Run the yellow-taxi pipeline; `data` holds the list of aggregated frames it returns.
data = yellow_taxi(tzones)

Reading Data/Yellow/yellow_tripdata_2015-12.csv
Cleaning columns
Aggregating data
Spatially joining data and taxi zones
Segregating
Done


In [11]:
def clean_columns_green(data, year, month):
    """Reduce a raw green-taxi trip table to the five columns used downstream.

    Column headers are stripped of surrounding whitespace and lower-cased.
    Files from July 2015 onwards (and all of 2016) are used as read. For
    every other file the index is first moved back into the columns and the
    header is re-applied to the leading columns. Columns not needed for the
    year's schema are then dropped and the remaining five are renamed
    positionally to
    ``date, dropoff_datetime, pickup_longitude, pickup_latitude, trip_distance``.

    Parameters
    ----------
    data : pandas.DataFrame
        Raw trip records. Its column labels are normalized in place; for
        files from July 2015 onwards the drop and rename also happen in place.
    year : str or int
        Year of the source file (2013-2016 are supported).
    month : str or int
        Month of the source file; ``7`` and ``'07'`` are equivalent.

    Returns
    -------
    pandas.DataFrame
        A frame with the five renamed columns. For the older, realigned
        files this is a new object, not ``data`` itself.

    Raises
    ------
    ValueError
        If the frame does not have exactly five columns after dropping
        (for example because the year is not supported).
        pandas itself raises when a column in the drop list is missing.
    """
    final_names = ['date', 'dropoff_datetime', 'pickup_longitude', 'pickup_latitude',
                   'trip_distance']
    second_half = ('07', '08', '09', '10', '11', '12')

    # Accept 2015 / 7 as well as '2015' / '07'.
    year = str(year)
    month = str(month).zfill(2)

    # A concrete list (not a lazy map object) works on Python 2 and 3 alike.
    data.columns = [name.strip().lower() for name in data.columns]

    unwanted = ['vendorid', 'store_and_fwd_flag', 'ratecodeid', 'dropoff_longitude',
                'dropoff_latitude', 'passenger_count', 'fare_amount', 'extra',
                'mta_tax', 'tip_amount', 'tolls_amount', 'ehail_fee',
                'improvement_surcharge', 'total_amount', 'payment_type',
                'trip_type']

    reads_cleanly = (year == '2016') or (year == '2015' and month in second_half)
    if not reads_cleanly:
        # Shift every value one or more columns to the right by turning the
        # index back into data, then keep as many leading columns as there
        # are header names and re-apply the header.
        # NOTE(review): presumably the older files carry extra trailing
        # delimiters, so read_csv consumed the leading field(s) as the index
        # -- confirm. A frame that was parsed correctly would be misaligned
        # by this step.
        header = data.columns
        data = data.reset_index()  # new frame: the caller's index is left alone
        data = data.iloc[:, :len(header)].copy()
        data.columns = header

        if year in ('2013', '2014'):
            # These files have no improvement_surcharge column.
            unwanted = [name for name in unwanted if name != 'improvement_surcharge']
        elif year != '2015':
            # Unknown schema: drop nothing; the length check below reports it.
            unwanted = []

    data.drop(unwanted, axis=1, inplace=True)

    # The rename below is positional, so fail loudly if the shape is off.
    if len(data.columns) != len(final_names):
        raise ValueError(
            "clean_columns_green: expected %d columns after cleaning for %s-%s, found %d: %s"
            % (len(final_names), year, month, len(data.columns), list(data.columns)))

    data.columns = final_names

    return data

In [12]:
# Read data by month
def green_taxi(tzones, years=('2014', '2015'), months=('10', '11', '12')):
    """Aggregate green-taxi trip files month by month.

    For every year/month combination the file
    ``../../Data/Green/green_tripdata_<year>-<month>.csv`` is read, cleaned
    with ``clean_columns_green``, aggregated with ``data_aggregator`` and
    written to ``../../Data/Dist_agg/green_<year>-<month>.csv``.

    Parameters
    ----------
    tzones : geopandas.GeoDataFrame
        Taxi-zone polygons, passed through to ``data_aggregator``.
    years : iterable of str, optional
        Years to process. Defaults to ``('2014', '2015')``.
    months : iterable of str, optional
        Two-digit month strings. Defaults to ``('10', '11', '12')``.

    Returns
    -------
    list of pandas.DataFrame
        One aggregated table per processed file, in processing order.
    """
    out_dir = "../../Data/Dist_agg"
    if not os.path.isdir(out_dir):
        print("Creating Folder Data/Dist_agg")
        # os.makedirs needs no shell, is portable and also creates any
        # missing parent directories.
        os.makedirs(out_dir)

    green = []
    for y in years:
        for m in months:
            stamp = y + "-" + m

            print("Reading Data/Green/green_tripdata_" + stamp + ".csv")
            data = pd.read_csv("../../Data/Green/green_tripdata_" + stamp + ".csv")
            print("Cleaning columns")
            data = clean_columns_green(data, y, m)
            print("Aggregating data")
            data_agg = data_aggregator(data, tzones)
            data_agg.to_csv("../../Data/Dist_agg/green_" + stamp + ".csv")
            print('Done')
            green.append(data_agg)

    return green

In [13]:
# Run the green-taxi pipeline. The result is not bound to a name, so the
# notebook echoes the returned list of aggregated frames as the cell output.
green_taxi(tzones)

Reading Data/Green/green_tripdata_2014-10.csv
Cleaning columns
Aggregating data
Spatially joining data and taxi zones
Segregating
Done
Reading Data/Green/green_tripdata_2014-11.csv
Cleaning columns
Aggregating data
Spatially joining data and taxi zones
Segregating
Done
Reading Data/Green/green_tripdata_2014-12.csv
Cleaning columns
Aggregating data
Spatially joining data and taxi zones
Segregating
Done
Reading Data/Green/green_tripdata_2015-10.csv
Cleaning columns
Aggregating data
Spatially joining data and taxi zones
Segregating
Done
Reading Data/Green/green_tripdata_2015-11.csv
Cleaning columns
Aggregating data
Spatially joining data and taxi zones
Segregating
Done
Reading Data/Green/green_tripdata_2015-12.csv
Cleaning columns
Aggregating data
Spatially joining data and taxi zones
Segregating
Done


[    trip_cat  num_rides    neighbourhood
 0          0        136  upper_manhattan
 1          1        170  upper_manhattan
 2          2        104  upper_manhattan
 3          3         44  upper_manhattan
 4          4         24  upper_manhattan
 5          5         16  upper_manhattan
 6          6         12  upper_manhattan
 7          7          8  upper_manhattan
 8          8          4  upper_manhattan
 9          9          4  upper_manhattan
 10        10          2  upper_manhattan
 11        11          0  upper_manhattan
 12        12          1  upper_manhattan
 13        13          0  upper_manhattan
 14        14          0  upper_manhattan
 15        15          0  upper_manhattan
 16        16          0  upper_manhattan
 17        17          1  upper_manhattan
 18        18          1  upper_manhattan
 19        19          0  upper_manhattan
 20        20          0  upper_manhattan
 21        21          0  upper_manhattan
 22        22          0  upper_ma