# Green taxi pipeline

In [1]:
from pathlib import Path
import pandas as pd
from sqlalchemy import create_engine

In [2]:
green_data = Path("../data/taxi_ingest_data/green_tripdata_2019-01.csv.gz")
green_data.exists()

True

In [3]:
df = pd.read_csv(green_data, nrows=100)
df.columns

Index(['VendorID', 'lpep_pickup_datetime', 'lpep_dropoff_datetime',
       'store_and_fwd_flag', 'RatecodeID', 'PULocationID', 'DOLocationID',
       'passenger_count', 'trip_distance', 'fare_amount', 'extra', 'mta_tax',
       'tip_amount', 'tolls_amount', 'ehail_fee', 'improvement_surcharge',
       'total_amount', 'payment_type', 'trip_type', 'congestion_surcharge'],
      dtype='object')

In [3]:
engine = create_engine('postgresql://root:root@localhost:5432/ny_taxi')

In [11]:
print(pd.io.sql.get_schema(df, name='green_taxi_data', con=engine))


CREATE TABLE green_taxi_data (
	"VendorID" BIGINT, 
	lpep_pickup_datetime TEXT, 
	lpep_dropoff_datetime TEXT, 
	store_and_fwd_flag TEXT, 
	"RatecodeID" BIGINT, 
	"PULocationID" BIGINT, 
	"DOLocationID" BIGINT, 
	passenger_count BIGINT, 
	trip_distance FLOAT(53), 
	fare_amount FLOAT(53), 
	extra FLOAT(53), 
	mta_tax FLOAT(53), 
	tip_amount FLOAT(53), 
	tolls_amount FLOAT(53), 
	ehail_fee FLOAT(53), 
	improvement_surcharge FLOAT(53), 
	total_amount FLOAT(53), 
	payment_type BIGINT, 
	trip_type BIGINT, 
	congestion_surcharge FLOAT(53)
)




In [12]:
df_iter = pd.read_csv(green_data, iterator=True, chunksize=100000)
df = next(df_iter)
len(df)

100000

In [13]:
df.lpep_dropoff_datetime = pd.to_datetime(arg=df.lpep_dropoff_datetime)
df.lpep_pickup_datetime = pd.to_datetime(arg=df.lpep_pickup_datetime)

Schema should now specify our datetime columns as `TIMESTAMP`

In [14]:
print(pd.io.sql.get_schema(df, name='green_taxi_data', con=engine))


CREATE TABLE green_taxi_data (
	"VendorID" BIGINT, 
	lpep_pickup_datetime TIMESTAMP WITHOUT TIME ZONE, 
	lpep_dropoff_datetime TIMESTAMP WITHOUT TIME ZONE, 
	store_and_fwd_flag TEXT, 
	"RatecodeID" BIGINT, 
	"PULocationID" BIGINT, 
	"DOLocationID" BIGINT, 
	passenger_count BIGINT, 
	trip_distance FLOAT(53), 
	fare_amount FLOAT(53), 
	extra FLOAT(53), 
	mta_tax FLOAT(53), 
	tip_amount FLOAT(53), 
	tolls_amount FLOAT(53), 
	ehail_fee FLOAT(53), 
	improvement_surcharge FLOAT(53), 
	total_amount FLOAT(53), 
	payment_type BIGINT, 
	trip_type BIGINT, 
	congestion_surcharge FLOAT(53)
)




In [15]:
df.head(n=0).to_sql(name='green_taxi_data', con=engine, if_exists='replace')

0

In [16]:
%time df.to_sql(name='green_taxi_data', con=engine, if_exists='append')

CPU times: user 5.2 s, sys: 74.6 ms, total: 5.28 s
Wall time: 9.62 s


1000

In [9]:
from time import time
1226398

In [17]:
while df_iter:
    t_start = time()
    df = next(df_iter)
    # convert to datetime
    df.lpep_dropoff_datetime = pd.to_datetime(arg=df.lpep_dropoff_datetime)
    df.lpep_pickup_datetime = pd.to_datetime(arg=df.lpep_pickup_datetime)
    # push
    df.to_sql(name='green_taxi_data', con=engine, if_exists='append')
    t_end = time()
    print(f'chunk inserted; time taken: {t_end - t_start:.3f}')

chunk inserted; time taken: 10.036
chunk inserted; time taken: 10.209
chunk inserted; time taken: 9.936
chunk inserted; time taken: 9.855
chunk inserted; time taken: 9.860
chunk inserted; time taken: 3.043


StopIteration: 

In [2]:
zones_csv = Path("../data/taxi_ingest_data/taxi_zone_lookup.csv")
df_zones = pd.read_csv(zones_csv)
df_zones.head()

Unnamed: 0,LocationID,Borough,Zone,service_zone
0,1,EWR,Newark Airport,EWR
1,2,Queens,Jamaica Bay,Boro Zone
2,3,Bronx,Allerton/Pelham Gardens,Boro Zone
3,4,Manhattan,Alphabet City,Yellow Zone
4,5,Staten Island,Arden Heights,Boro Zone


In [4]:
print(pd.io.sql.get_schema(df_zones, name='zones', con=engine))


CREATE TABLE zones (
	"LocationID" BIGINT, 
	"Borough" TEXT, 
	"Zone" TEXT, 
	service_zone TEXT
)




In [5]:
df_zones.to_sql(name='zones', con=engine, if_exists='replace')

265

In [27]:
"Astoria" in df_zones.Zone.values

True

In [4]:
datetimes = [col for col in df.columns if "datetime" in col]
datetimes

['lpep_pickup_datetime', 'lpep_dropoff_datetime']

In [6]:
df[datetimes].dtypes

lpep_pickup_datetime     object
lpep_dropoff_datetime    object
dtype: object

In [8]:
for col in datetimes:
    df[col] = pd.to_datetime(df[col])

df[datetimes].dtypes

lpep_pickup_datetime     datetime64[ns]
lpep_dropoff_datetime    datetime64[ns]
dtype: object

In [10]:
df[df["passenger_count"] > 2]

Unnamed: 0,VendorID,lpep_pickup_datetime,lpep_dropoff_datetime,store_and_fwd_flag,RatecodeID,PULocationID,DOLocationID,passenger_count,trip_distance,fare_amount,extra,mta_tax,tip_amount,tolls_amount,ehail_fee,improvement_surcharge,total_amount,payment_type,trip_type,congestion_surcharge
0,2,2018-12-21 15:17:29,2018-12-21 15:18:57,N,1,264,264,5,0.0,3.0,0.5,0.5,0.0,0.0,,0.3,4.3,2,1,
21,2,2019-01-01 00:30:28,2019-01-01 00:41:18,N,1,256,112,5,2.18,9.5,0.5,0.5,3.24,0.0,,0.3,14.04,1,1,
22,2,2019-01-01 00:26:59,2019-01-01 01:01:33,N,1,66,68,5,6.66,25.5,0.5,0.5,0.0,0.0,,0.3,26.8,2,1,
31,2,2019-01-01 00:08:07,2019-01-01 00:18:36,N,1,182,182,3,1.36,8.5,0.5,0.5,0.0,0.0,,0.3,9.8,2,1,
33,2,2019-01-01 00:29:46,2019-01-01 00:41:09,N,1,181,52,6,1.52,9.0,0.5,0.5,2.06,0.0,,0.3,12.36,1,1,
40,2,2019-01-01 00:31:28,2019-01-01 00:57:27,N,5,255,91,3,10.98,30.0,0.0,0.0,6.0,0.0,,0.0,36.0,1,2,
46,1,2019-01-01 00:18:13,2019-01-01 00:32:22,N,1,145,146,4,2.5,11.0,0.5,0.5,0.0,0.0,,0.3,12.3,2,1,
49,2,2019-01-01 00:39:04,2019-01-01 00:52:06,N,1,260,95,4,3.14,13.0,0.5,0.5,0.0,0.0,,0.3,14.3,2,1,
67,2,2019-01-01 00:39:51,2019-01-01 00:43:05,N,1,260,83,3,0.75,4.5,0.5,0.5,0.0,0.0,,0.3,5.8,2,1,
75,2,2019-01-01 00:38:22,2019-01-01 00:50:32,N,1,25,80,5,4.66,15.5,0.5,0.5,3.36,0.0,,0.3,20.16,1,1,
