In [1]:
import pandas as pd

In [2]:
pd.__version__

'2.1.3'

In [3]:
df = pd.read_csv('yellow_tripdata_2021-01.csv', nrows=100)

In [4]:
df.tpep_pickup_datetime = pd.to_datetime(df.tpep_pickup_datetime)
df.tpep_dropoff_datetime = pd.to_datetime(df.tpep_dropoff_datetime)

In [5]:
from sqlalchemy import create_engine

In [6]:
engine = create_engine('postgresql://root:root@localhost:5432/ny_taxi')

In [7]:
print(pd.io.sql.get_schema(df, name='yellow_taxi_data', con=engine))


CREATE TABLE yellow_taxi_data (
	"VendorID" BIGINT, 
	tpep_pickup_datetime TIMESTAMP WITHOUT TIME ZONE, 
	tpep_dropoff_datetime TIMESTAMP WITHOUT TIME ZONE, 
	passenger_count BIGINT, 
	trip_distance FLOAT(53), 
	"RatecodeID" BIGINT, 
	store_and_fwd_flag TEXT, 
	"PULocationID" BIGINT, 
	"DOLocationID" BIGINT, 
	payment_type BIGINT, 
	fare_amount FLOAT(53), 
	extra FLOAT(53), 
	mta_tax FLOAT(53), 
	tip_amount FLOAT(53), 
	tolls_amount FLOAT(53), 
	improvement_surcharge FLOAT(53), 
	total_amount FLOAT(53), 
	congestion_surcharge FLOAT(53)
)




In [29]:
df_iter = pd.read_csv('yellow_tripdata_2021-01.csv', iterator=True, chunksize=100_000)

In [30]:
df = next(df_iter)

In [31]:
len(df)

10000

In [32]:
df.tpep_pickup_datetime = pd.to_datetime(df.tpep_pickup_datetime)
df.tpep_dropoff_datetime = pd.to_datetime(df.tpep_dropoff_datetime)

In [33]:
df

Unnamed: 0,VendorID,tpep_pickup_datetime,tpep_dropoff_datetime,passenger_count,trip_distance,RatecodeID,store_and_fwd_flag,PULocationID,DOLocationID,payment_type,fare_amount,extra,mta_tax,tip_amount,tolls_amount,improvement_surcharge,total_amount,congestion_surcharge
0,1,2021-01-01 00:30:10,2021-01-01 00:36:12,1,2.10,1,N,142,43,2,8.0,3.0,0.5,0.00,0.0,0.3,11.80,2.5
1,1,2021-01-01 00:51:20,2021-01-01 00:52:19,1,0.20,1,N,238,151,2,3.0,0.5,0.5,0.00,0.0,0.3,4.30,0.0
2,1,2021-01-01 00:43:30,2021-01-01 01:11:06,1,14.70,1,N,132,165,1,42.0,0.5,0.5,8.65,0.0,0.3,51.95,0.0
3,1,2021-01-01 00:15:48,2021-01-01 00:31:01,0,10.60,1,N,138,132,1,29.0,0.5,0.5,6.05,0.0,0.3,36.35,0.0
4,2,2021-01-01 00:31:49,2021-01-01 00:48:21,1,4.94,1,N,68,33,1,16.5,0.5,0.5,4.06,0.0,0.3,24.36,2.5
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9995,2,2021-01-01 13:55:14,2021-01-01 13:59:31,1,0.79,1,N,142,239,1,5.5,0.0,0.5,2.64,0.0,0.3,11.44,2.5
9996,2,2021-01-01 13:37:37,2021-01-01 13:44:01,1,0.75,1,N,161,43,1,6.0,0.0,0.5,1.86,0.0,0.3,11.16,2.5
9997,2,2021-01-01 13:48:52,2021-01-01 14:05:13,1,3.68,1,N,151,140,2,14.5,0.0,0.5,0.00,0.0,0.3,17.80,2.5
9998,1,2021-01-01 13:06:50,2021-01-01 13:10:10,1,1.30,1,Y,237,263,1,5.5,2.5,0.5,1.75,0.0,0.3,10.55,2.5


In [34]:
df.head(n=0).to_sql(name='yellow_taxi_data', con=engine, if_exists='replace')

0

In [35]:
%time df.to_sql(name='yellow_taxi_data', con=engine, if_exists='append')

CPU times: total: 156 ms
Wall time: 1.45 s


1000

In [36]:
from time import time

In [37]:
while True: 
    t_start = time()

    df = next(df_iter)

    df.tpep_pickup_datetime = pd.to_datetime(df.tpep_pickup_datetime)
    df.tpep_dropoff_datetime = pd.to_datetime(df.tpep_dropoff_datetime)
    
    df.to_sql(name='yellow_taxi_data', con=engine, if_exists='append')

    t_end = time()

    print('inserted another chunk, took %.3f second' % (t_end - t_start))

inserted another chunk, took 1.300 second
inserted another chunk, took 1.350 second
inserted another chunk, took 1.130 second
inserted another chunk, took 1.436 second
inserted another chunk, took 1.264 second
inserted another chunk, took 1.440 second
inserted another chunk, took 1.370 second
inserted another chunk, took 1.592 second
inserted another chunk, took 1.925 second
inserted another chunk, took 1.583 second
inserted another chunk, took 2.400 second
inserted another chunk, took 1.543 second
inserted another chunk, took 1.657 second
inserted another chunk, took 1.589 second
inserted another chunk, took 1.618 second
inserted another chunk, took 1.494 second
inserted another chunk, took 2.705 second
inserted another chunk, took 1.928 second
inserted another chunk, took 1.464 second
inserted another chunk, took 1.354 second
inserted another chunk, took 1.417 second
inserted another chunk, took 1.809 second
inserted another chunk, took 1.359 second
inserted another chunk, took 1.341

StopIteration: 

In [41]:
!wget https://s3.amazonaws.com/nyc-tlc/misc/taxi+_zone_lookup.csv

--2024-01-29 22:07:29--  https://s3.amazonaws.com/nyc-tlc/misc/taxi+_zone_lookup.csv
Resolving s3.amazonaws.com (s3.amazonaws.com)... 52.217.171.168, 52.216.250.70, 54.231.196.112, ...
Connecting to s3.amazonaws.com (s3.amazonaws.com)|52.217.171.168|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 12322 (12K) [application/octet-stream]
Saving to: 'taxi+_zone_lookup.csv'

     0K .......... ..                                         100%  838K=0.01s

2024-01-29 22:07:30 (838 KB/s) - 'taxi+_zone_lookup.csv' saved [12322/12322]



In [42]:
df_zones = pd.read_csv('taxi+_zone_lookup.csv')

In [43]:
df_zones.head()

Unnamed: 0,LocationID,Borough,Zone,service_zone
0,1,EWR,Newark Airport,EWR
1,2,Queens,Jamaica Bay,Boro Zone
2,3,Bronx,Allerton/Pelham Gardens,Boro Zone
3,4,Manhattan,Alphabet City,Yellow Zone
4,5,Staten Island,Arden Heights,Boro Zone


In [44]:
df_zones.to_sql(name='zones', con=engine, if_exists='replace')

265

In [45]:
engine.dispose()

In [46]:
trips_url="https://github.com/DataTalksClub/nyc-tlc-data/releases/download/green/green_tripdata_2019-09.csv.gz"
zones_url="https://s3.amazonaws.com/nyc-tlc/misc/taxi+_zone_lookup.csv"

python ingest_data.py \
    --user=root \
    --password=mypass \
    --host=localhost \
    --port=5432 \
    --db=taxi_dataset \
    --trips_table_name="taxi_trips" \
    --zones_table_name="zones" \
    --trips_data_url=${trips_url} \
    --zones_data_url=${zones_url}

SyntaxError: invalid syntax (1279224368.py, line 4)

In [None]:
python ingest_data.py --user=root --password=mypass --host=localhost --port=5432 --db=taxi_dataset --trips_table_name="taxi_trips" --zones_table_name="zones" --trips_data_url="https://github.com/DataTalksClub/nyc-tlc-data/releases/download/green/green_tripdata_2019-09.csv.gz" --zones_data_url="https://s3.amazonaws.com/nyc-tlc/misc/taxi+_zone_lookup.csv"

In [None]:
python ingest_data.py --user=root --password=root --host=localhost --port=5432 --db=ny_taxi --table_name=green_taxi_trips --url="https://github.com/DataTalksClub/nyc-tlc-data/releases/download/green/green_tripdata_2019-09.csv.gz"