In [1]:
import numpy as np
import pandas as pd
from sodapy import Socrata

In [2]:
# Socrata client for the NYC Open Data portal. The second argument is None,
# i.e. no app token is supplied.
# JSON endpoint of the dataset queried below:
# https://data.cityofnewyork.us/resource/uacg-pexx.json
NYC_OPEN_DATA_DOMAIN = "data.cityofnewyork.us"
client = Socrata(NYC_OPEN_DATA_DOMAIN, None)



In [3]:
# Download the 2016 trip records (dataset uacg-pexx) picked up in location zone 138.
# Per the original note, location zones are only used in the last six months of
# the year, so filtering on a zone implicitly restricts the result to that period.
# The row limit is set far above the number of matching rows so that a single
# call returns everything.
DATASET_ID = "uacg-pexx"
PICKUP_ZONE_FILTER = "PULocationID = '138'"
MAX_ROWS = 10000000
results = client.get(DATASET_ID, where=PICKUP_ZONE_FILTER, limit=MAX_ROWS)

In [4]:
# Build one DataFrame row per record returned by the API. No type conversion
# happens here; passenger_count is cast to int in a later cell.
data = pd.DataFrame.from_records(data=results)

In [5]:
# (rows, columns) of the frame built from the API response.
data.shape

(1529683, 17)

In [6]:
# Keep a reference to the column Index so the available fields can be listed.
columns = data.columns

In [7]:
# Display the field names present in the download.
columns

Index(['dolocationid', 'extra', 'fare_amount', 'improvement_surcharge',
       'mta_tax', 'passenger_count', 'payment_type', 'pulocationid',
       'ratecodeid', 'store_and_fwd_flag', 'tip_amount', 'tolls_amount',
       'total_amount', 'tpep_dropoff_datetime', 'tpep_pickup_datetime',
       'trip_distance', 'vendorid'],
      dtype='object')

In [8]:
# Discard the fare, payment and vendor fields; the remaining columns describe
# where/when the trip happened and how many passengers it carried.
unused_trip_columns = [
    'extra', 'fare_amount', 'improvement_surcharge', 'mta_tax', 'payment_type',
    'ratecodeid', 'store_and_fwd_flag', 'tip_amount', 'tolls_amount', 'vendorid',
]
data = data.drop(unused_trip_columns, axis=1)

In [9]:
# Preview the columns that remain after the fare/payment fields were dropped.
data.head()

Unnamed: 0,dolocationid,passenger_count,pulocationid,total_amount,tpep_dropoff_datetime,tpep_pickup_datetime,trip_distance
0,48,1,138,45.34,2016-10-18T17:14:12.000,2016-10-18T16:30:11.000,11.23
1,231,1,138,53.76,2016-11-15T10:49:58.000,2016-11-15T09:42:00.000,11.82
2,236,1,138,44.84,2016-09-23T14:41:31.000,2016-09-23T13:55:20.000,8.9
3,236,1,138,35.76,2016-12-02T19:14:59.000,2016-12-02T18:44:53.000,8.95
4,264,1,138,16.34,2016-09-20T21:07:03.000,2016-09-20T21:07:03.000,0.0


In [10]:
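# Optional checkpoint (disabled): uncomment to save the trimmed trip records to disk
# before the remaining trip-detail columns are dropped in the next cell.
# data.to_csv('../../raw_data/full_taxi_2016Jul-Dec.csv')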

In [11]:
# Only the pickup timestamp and the passenger count are needed for the hourly
# aggregation, so the remaining trip-detail columns are removed.
trip_detail_columns = [
    'dolocationid',
    'pulocationid',
    'total_amount',
    'tpep_dropoff_datetime',
    'trip_distance',
]
data = data.drop(trip_detail_columns, axis=1)

In [12]:
# Check that only passenger_count and tpep_pickup_datetime are left.
data.head()

Unnamed: 0,passenger_count,tpep_pickup_datetime
0,1,2016-10-18T16:30:11.000
1,1,2016-11-15T09:42:00.000
2,1,2016-09-23T13:55:20.000
3,1,2016-12-02T18:44:53.000
4,1,2016-09-20T21:07:03.000


In [13]:
# Move the pickup timestamp from a column into the index; the time-based
# resampling further down operates on the index.
pickup_time_column = 'tpep_pickup_datetime'
data = data.set_index(pickup_time_column)

In [14]:
# The index still holds the timestamp strings from the API (e.g.
# '2016-10-18T16:30:11.000'); convert it to a DatetimeIndex so the frame can
# be resampled by time.
data.index = pd.DatetimeIndex(data.index)

In [15]:
# Cast passenger_count to integers so it can be summed numerically during the
# hourly aggregation.
data['passenger_count'] = data['passenger_count'].astype(int)

In [16]:
# Every row is a single trip, so tag each one with a constant 1; this column
# is what gets counted per hour in the resampling step.
data = data.assign(num_pickups=1)

In [17]:
# Check the per-trip frame: datetime index plus passenger_count and num_pickups.
data.head()

Unnamed: 0_level_0,passenger_count,num_pickups
tpep_pickup_datetime,Unnamed: 1_level_1,Unnamed: 2_level_1
2016-10-18 16:30:11,1,1
2016-11-15 09:42:00,1,1
2016-09-23 13:55:20,1,1
2016-12-02 18:44:53,1,1
2016-09-20 21:07:03,1,1


In [18]:
# Collapse the per-trip rows into hourly bins keyed on the pickup timestamp:
#   num_pickups     -> number of trips that started in the hour
#   passenger_count -> total passengers over those trips
# The hourly frequency is spelled with the lowercase alias 'h': the uppercase
# 'H' spelling is deprecated in pandas 2.2+ and emits a FutureWarning there.
# NOTE(review): lowercase aliases appear to be accepted by earlier pandas
# releases as well -- confirm against the pandas version pinned for this project.
data = data.resample('h').agg({'num_pickups': 'count', 'passenger_count': 'sum'})

In [19]:
# After aggregation the column holds an hourly passenger total, so give it a
# name that says so.
column_renames = {'passenger_count': 'num_passengers'}
data = data.rename(columns=column_renames)

In [20]:
# Preview the hourly table: one row per hour with pickup and passenger totals.
data.head()

Unnamed: 0_level_0,num_pickups,num_passengers
tpep_pickup_datetime,Unnamed: 1_level_1,Unnamed: 2_level_1
2016-07-01 00:00:00,293,529.0
2016-07-01 01:00:00,8,13.0
2016-07-01 02:00:00,1,2.0
2016-07-01 03:00:00,1,1.0
2016-07-01 04:00:00,9,9.0


In [21]:
# Pin the column order of the output table explicitly rather than relying on
# whatever order the aggregation step produced.
column_order = [
    'num_pickups',
    'num_passengers',
]
data = data.reindex(columns=column_order)

In [22]:
# Write the hourly pickup/passenger table to the clean-data folder.
clean_csv_path = '../clean_data/2016Jul-Dec_clean.csv'
data.to_csv(clean_csv_path)