# General Information
This script downloads data from the web and loads it into our database.

We work with the NYC taxi data and clean it here (it is already very clean, so cleaning just means defining the column types).
After cleaning, we want to load it into the database. However, the dataset has more than 1.3 million rows, and
pushing 1.3 million rows into the database in a single operation is neither practical nor sensible.

Instead, I read the data in chunks, so that pandas returns an iterator rather than a single DataFrame.
I can then loop over that iterator and push the data into the database chunk by chunk.


In [1]:
import pandas as pd


# Location of the gzipped CSV: a release asset of the
# DataTalksClub/nyc-tlc-data repository on GitHub.
prefix = 'https://github.com/DataTalksClub/nyc-tlc-data/releases/download/yellow/'
filename = 'yellow_tripdata_2021-01.csv.gz'
url = f"{prefix}{filename}"

# Explicit column types for the CSV. "Int64" (capital I) is pandas' nullable
# integer dtype, which can hold missing values; "string" is the dedicated
# pandas string dtype.
dtypes = dict(
    VendorID="Int64",
    passenger_count="Int64",
    trip_distance="float64",
    RatecodeID="Int64",
    store_and_fwd_flag="string",
    PULocationID="Int64",
    DOLocationID="Int64",
    payment_type="Int64",
    fare_amount="float64",
    extra="float64",
    mta_tax="float64",
    tip_amount="float64",
    tolls_amount="float64",
    improvement_surcharge="float64",
    total_amount="float64",
    congestion_surcharge="float64",
)

# Timestamp columns are parsed into datetimes rather than typed via `dtypes`.
parse_dates = ["tpep_pickup_datetime", "tpep_dropoff_datetime"]

# Load the whole file (no nrows limit) so the shape printed below reflects the
# full dataset. pandas reads the gzipped file directly, so no manual
# decompression step is needed.
df = pd.read_csv(url, dtype=dtypes, parse_dates=parse_dates)

# Show the first rows, the resulting column types and the overall shape,
# each separated by a divider line.
divider = "-------------"
print(df.head(), df.dtypes, df.shape, sep=f"\n{divider}\n")


   VendorID tpep_pickup_datetime tpep_dropoff_datetime  passenger_count  \
0         1  2021-01-01 00:30:10   2021-01-01 00:36:12                1   
1         1  2021-01-01 00:51:20   2021-01-01 00:52:19                1   
2         1  2021-01-01 00:43:30   2021-01-01 01:11:06                1   
3         1  2021-01-01 00:15:48   2021-01-01 00:31:01                0   
4         2  2021-01-01 00:31:49   2021-01-01 00:48:21                1   

   trip_distance  RatecodeID store_and_fwd_flag  PULocationID  DOLocationID  \
0           2.10           1                  N           142            43   
1           0.20           1                  N           238           151   
2          14.70           1                  N           132           165   
3          10.60           1                  N           138           132   
4           4.94           1                  N            68            33   

   payment_type  fare_amount  extra  mta_tax  tip_amount  tolls_amount  \


In [2]:
from sqlalchemy import create_engine

# SQLAlchemy connection URL, in the form dialect://user:password@host:port/database.
connection_url = "postgresql://root:root@localhost:5432/ny_taxi"
engine = create_engine(connection_url)

In [3]:
# Create the target table from a zero-row slice of the DataFrame: only the
# column definitions are written, and if_exists="replace" drops any table of
# the same name first. The call is kept as the cell's last expression so the
# notebook still displays its return value.
header_only = df.head(0)
header_only.to_sql(name="yellow_taxi_data", con=engine, if_exists="replace")

0

In [4]:
# Render the CREATE TABLE statement pandas derives for this DataFrame on the
# connected database, then print it for inspection.
create_table_sql = pd.io.sql.get_schema(df, name="yellow_taxi_data", con=engine)
print(create_table_sql)


CREATE TABLE yellow_taxi_data (
	"VendorID" BIGINT, 
	tpep_pickup_datetime TIMESTAMP WITHOUT TIME ZONE, 
	tpep_dropoff_datetime TIMESTAMP WITHOUT TIME ZONE, 
	passenger_count BIGINT, 
	trip_distance FLOAT(53), 
	"RatecodeID" BIGINT, 
	store_and_fwd_flag TEXT, 
	"PULocationID" BIGINT, 
	"DOLocationID" BIGINT, 
	payment_type BIGINT, 
	fare_amount FLOAT(53), 
	extra FLOAT(53), 
	mta_tax FLOAT(53), 
	tip_amount FLOAT(53), 
	tolls_amount FLOAT(53), 
	improvement_surcharge FLOAT(53), 
	total_amount FLOAT(53), 
	congestion_surcharge FLOAT(53)
)




In [5]:
# Re-open the CSV as a chunked reader: instead of one DataFrame, pandas yields
# successive DataFrames of up to 100,000 rows each, using the same column
# types and date parsing as the full read.
reader_options = dict(
    dtype=dtypes,
    parse_dates=parse_dates,
    iterator=True,
    chunksize=100_000,
)
df_iter = pd.read_csv(url, **reader_options)

In [6]:
from tqdm.auto import tqdm

In [7]:

# Stream the CSV into the database one pandas chunk at a time, with a tqdm
# progress indicator wrapped around the chunk iterator.
for chunk_no, chunk in enumerate(tqdm(df_iter)):
    if chunk_no == 0:
        # Before the first insert, write a zero-row frame with "replace" so the
        # table is dropped and recreated empty; every chunk is then appended.
        chunk.head(0).to_sql(name="yellow_taxi_data", con=engine, if_exists="replace")
        print("Table created")

    # chunksize=2000 makes to_sql write this chunk in batches of 2000 rows.
    chunk.to_sql(name="yellow_taxi_data", con=engine, if_exists="append", chunksize=2000)
    print("inserted: {0}".format(len(chunk)))

0it [00:00, ?it/s]

Table created
inserted: 100000
inserted: 100000
inserted: 100000
inserted: 100000
inserted: 100000
inserted: 100000
inserted: 100000
inserted: 100000
inserted: 100000
inserted: 100000
inserted: 100000
inserted: 100000
inserted: 100000
inserted: 69765
