In [None]:
import pandas as pd
import pyarrow.parquet as pq
import datetime

```
pip install sqlalchemy psycopg2-binary 
```

In [None]:
from sqlalchemy import create_engine

In [None]:
# Create the SQLAlchemy engine used by every database cell in this notebook.
# The connection URL is read from the NY_TAXI_DB_URL environment variable so that
# real credentials never have to be committed with the notebook; when the
# variable is unset, the local development database URL is used instead.
import os

engine = create_engine(
    os.environ.get('NY_TAXI_DB_URL', 'postgresql://root:root@localhost:5432/ny_taxi')
)

In [None]:
# Smoke-test the database connection. Opening it inside a `with` block releases
# the connection as soon as the check is done; a bare `engine.connect()` would
# leave it checked out because nothing ever closes it.
with engine.connect():
    print("Connected to database: %s" % engine.url.database)

In [None]:
# List every table that lives outside the `pg_catalog` and `information_schema`
# schemas, i.e. the tables created by this notebook or by other users.
query = """
SELECT *
FROM pg_catalog.pg_tables
WHERE schemaname != 'pg_catalog' AND 
    schemaname != 'information_schema';
"""

# Run the query through the engine; the resulting DataFrame is the cell output.
pd.read_sql(sql=query, con=engine)

In [None]:
# Preview the green-taxi CSV: load only its first 100 rows and show the top of
# the frame.
# (An earlier experiment read 'yellow_tripdata_2019-09.parquet' with
# pq.read_table(...) and converted it with .to_pandas(); it is not executed.)
df = pd.read_csv(
    'green_tripdata_2019-09.csv',
    nrows=100,
)

df.head()


In [None]:
# Load the green-taxi trip CSV into the Postgres table `green_tripdata_trip`,
# 100,000 rows at a time so the whole file never has to be held in memory.
from time import time

# Log the wall-clock time at which the load started.
print("Started: %s" % datetime.datetime.now())

# With `chunksize` set, read_csv returns a chunk reader rather than a DataFrame.
# Using the reader as a context manager guarantees the CSV file handle is closed
# even when an insert fails part-way through the loop.
with pd.read_csv('green_tripdata_2019-09.csv', iterator=True, chunksize=100000, low_memory=False) as df_iter:
    for batch in df_iter:
        t_start = time()

        df = batch
        # Convert the pickup/drop-off columns to pandas datetimes before writing.
        df.lpep_pickup_datetime = pd.to_datetime(df.lpep_pickup_datetime)
        df.lpep_dropoff_datetime = pd.to_datetime(df.lpep_dropoff_datetime)
        # "append" adds rows to the table (creating it if it is missing), so
        # re-running this cell inserts the same rows again.
        df.to_sql(name='green_tripdata_trip', con=engine, if_exists="append")

        t_end = time()
        print("New chunk inserted: %.3f elapsed" % (t_end - t_start))

print("Ended: %s" % datetime.datetime.now())

In [None]:
# Load the taxi zone lookup CSV into the Postgres table `taxi_zone`, in chunks
# of up to 100,000 rows.
# Source file: wget https://s3.amazonaws.com/nyc-tlc/misc/taxi+_zone_lookup.csv

from time import time

# Log the wall-clock time at which the load started.
print("Started: %s" % datetime.datetime.now())

# With `chunksize` set, read_csv returns a chunk reader rather than a DataFrame.
# Using the reader as a context manager guarantees the CSV file handle is closed
# even when an insert fails part-way through the loop.
with pd.read_csv('taxi+_zone_lookup.csv', iterator=True, chunksize=100000, low_memory=False) as df_iter:
    for batch in df_iter:
        t_start = time()

        df = batch
        # "append" adds rows to the table (creating it if it is missing), so
        # re-running this cell inserts the same rows again.
        df.to_sql(name='taxi_zone', con=engine, if_exists="append")

        t_end = time()
        print("New chunk inserted: %.3f elapsed" % (t_end - t_start))

print("Ended: %s" % datetime.datetime.now())

In [None]:
# Stream the green-taxi parquet file into the Postgres table
# `green_tripdata_trip`, one record batch (at most 100,000 rows) at a time.
from time import time

# Log the wall-clock time at which the load started.
print("Started: %s" % datetime.datetime.now())

parquet_file = pq.ParquetFile('green_tripdata_2019-09.parquet')
for batch in parquet_file.iter_batches(batch_size=100000):
    t_start = time()

    # Materialise the batch as a DataFrame and make sure both timestamp
    # columns are pandas datetimes before they are written.
    df = batch.to_pandas()
    for column in ('lpep_pickup_datetime', 'lpep_dropoff_datetime'):
        df[column] = pd.to_datetime(df[column])

    # Append the batch to the table through the notebook's engine.
    df.to_sql(name='green_tripdata_trip', con=engine, if_exists="append")

    print("New chunk inserted: %.3f elapsed" % (time() - t_start))

print("Ended: %s" % datetime.datetime.now())