## Run Docker with Postgres

```
docker run -it \
    -e POSTGRES_USER="root" \
    -e POSTGRES_PASSWORD="root" \
    -e POSTGRES_DB="ny_taxi" \
    -p 5433:5432 \
    postgres:13
```

## Download data

In [1]:
%%bash

wget https://github.com/DataTalksClub/nyc-tlc-data/releases/download/green/green_tripdata_2019-01.csv.gz \
    -O green_tripdata_2019-01.csv.gz

wget https://s3.amazonaws.com/nyc-tlc/misc/taxi+_zone_lookup.csv \
    -O taxi+_zone_lookup.csv

--2023-01-26 15:48:59--  https://github.com/DataTalksClub/nyc-tlc-data/releases/download/green/green_tripdata_2019-01.csv.gz
Resolving github.com (github.com)... 140.82.121.3
Connecting to github.com (github.com)|140.82.121.3|:443... connected.
HTTP request sent, awaiting response... 302 Found
Location: https://objects.githubusercontent.com/github-production-release-asset-2e65be/513814948/d3904232-1a2b-431b-803d-0ee802cd14fc?X-Amz-Algorithm=AWS4-HMAC-SHA256&X-Amz-Credential=AKIAIWNJYAX4CSVEH53A%2F20230126%2Fus-east-1%2Fs3%2Faws4_request&X-Amz-Date=20230126T154817Z&X-Amz-Expires=300&X-Amz-Signature=540a02ec0a0223bb65d3cb21306369c9856238295ac8f439f39e7a3253cd1df3&X-Amz-SignedHeaders=host&actor_id=0&key_id=0&repo_id=513814948&response-content-disposition=attachment%3B%20filename%3Dgreen_tripdata_2019-01.csv.gz&response-content-type=application%2Foctet-stream [following]
--2023-01-26 15:48:59--  https://objects.githubusercontent.com/github-production-release-asset-2e65be/513814948/d3904232

In [2]:
import pandas as pd

from sqlalchemy import create_engine

from tqdm import tqdm

## Check connection

In [3]:
engine = create_engine('postgresql://root:root@localhost:5433/ny_taxi')
engine

Engine(postgresql://root:***@localhost:5433/ny_taxi)

In [4]:
engine.connect()

<sqlalchemy.engine.base.Connection at 0x7f8a3fdd2ac0>

In [5]:
query = """
SELECT *
FROM pg_catalog.pg_tables
WHERE schemaname != 'pg_catalog' AND 
    schemaname != 'information_schema';
"""

pd.read_sql(query, con=engine)

Unnamed: 0,schemaname,tablename,tableowner,tablespace,hasindexes,hasrules,hastriggers,rowsecurity


## Upload tables

### Taxi data

In [6]:
df = pd.read_csv('green_tripdata_2019-01.csv.gz', parse_dates=[
    'lpep_pickup_datetime',
    'lpep_dropoff_datetime',
])

df.head()

Unnamed: 0,VendorID,lpep_pickup_datetime,lpep_dropoff_datetime,store_and_fwd_flag,RatecodeID,PULocationID,DOLocationID,passenger_count,trip_distance,fare_amount,extra,mta_tax,tip_amount,tolls_amount,ehail_fee,improvement_surcharge,total_amount,payment_type,trip_type,congestion_surcharge
0,2,2018-12-21 15:17:29,2018-12-21 15:18:57,N,1,264,264,5,0.0,3.0,0.5,0.5,0.0,0.0,,0.3,4.3,2,1,
1,2,2019-01-01 00:10:16,2019-01-01 00:16:32,N,1,97,49,2,0.86,6.0,0.5,0.5,0.0,0.0,,0.3,7.3,2,1,
2,2,2019-01-01 00:27:11,2019-01-01 00:31:38,N,1,49,189,2,0.66,4.5,0.5,0.5,0.0,0.0,,0.3,5.8,1,1,
3,2,2019-01-01 00:46:20,2019-01-01 01:04:54,N,1,189,17,2,2.68,13.5,0.5,0.5,2.96,0.0,,0.3,19.71,1,1,
4,2,2019-01-01 00:19:06,2019-01-01 00:39:43,N,1,82,258,1,4.53,18.0,0.5,0.5,0.0,0.0,,0.3,19.3,2,1,


In [7]:
df.shape

(630918, 20)

In [8]:
df.head(n=0).to_sql(name='taxi_data', con=engine, if_exists='replace')

0

In [9]:
df_iter = pd.read_csv('green_tripdata_2019-01.csv.gz', parse_dates=[
    'lpep_pickup_datetime',
    'lpep_dropoff_datetime',
], iterator=True, chunksize=100_000)

for chunk in tqdm(df_iter):
    chunk.to_sql(name='taxi_data', con=engine, if_exists='append')

7it [00:46,  6.71s/it]


In [10]:
sql_query = """
select count(*) from taxi_data
"""

pd.read_sql(sql_query, con=engine)

Unnamed: 0,count
0,630918


### Zones

In [11]:
df_zones = pd.read_csv('taxi+_zone_lookup.csv')
df_zones.head()

Unnamed: 0,LocationID,Borough,Zone,service_zone
0,1,EWR,Newark Airport,EWR
1,2,Queens,Jamaica Bay,Boro Zone
2,3,Bronx,Allerton/Pelham Gardens,Boro Zone
3,4,Manhattan,Alphabet City,Yellow Zone
4,5,Staten Island,Arden Heights,Boro Zone


In [12]:
df_zones.shape

(265, 4)

In [13]:
df_zones.to_sql(name='zones', con=engine, if_exists='replace')

265

### Sanity check

In [14]:
query = """
SELECT *
FROM pg_catalog.pg_tables
WHERE schemaname != 'pg_catalog' AND 
    schemaname != 'information_schema';
"""

pd.read_sql(query, con=engine)

Unnamed: 0,schemaname,tablename,tableowner,tablespace,hasindexes,hasrules,hastriggers,rowsecurity
0,public,taxi_data,root,,True,False,False,False
1,public,zones,root,,True,False,False,False


## Tasks

### Question 1. Knowing docker tags

In [15]:
!docker build --help | grep "Write the image ID to the file"

      --iidfile string          Write the image ID to the file


### Question 2. Understanding docker first run

You should run:

```
docker run -it python:3.9 /bin/bash
pip list # inside docker
```

The answer will be:

```
Package    Version
---------- -------
pip        22.0.4
setuptools 58.1.0
wheel      0.38.4
WARNING: You are using pip version 22.0.4; however, version 22.3.1 is available.
You should consider upgrading via the '/usr/local/bin/python -m pip install --upgrade pip' command.
```

### Question 3. Count records

In [16]:
sql_query = '''
select count(*)
from taxi_data
where
    true
    and lpep_pickup_datetime::TIMESTAMP::DATE = date '2019-01-15'
    and lpep_dropoff_datetime::TIMESTAMP::DATE = date '2019-01-15'
limit 10
'''

pd.read_sql(sql_query, con=engine)

Unnamed: 0,count
0,20530


### Question 4. Largest trip for each day

In [17]:
sql_query = '''
select lpep_pickup_datetime::TIMESTAMP::DATE 
from taxi_data
where trip_distance = ( select max(trip_distance) from taxi_data )
limit 10
'''

pd.read_sql(sql_query, con=engine)

Unnamed: 0,lpep_pickup_datetime
0,2019-01-15


### Question 5. The number of passengers

In [18]:
sql_query = '''
select passenger_count, count(*)
from taxi_data
where lpep_pickup_datetime::TIMESTAMP::DATE = date '2019-01-01'
group by passenger_count
having passenger_count in (2, 3)
'''

pd.read_sql(sql_query, con=engine)

Unnamed: 0,passenger_count,count
0,2,1282
1,3,254


### Question 6. Largest tip

In [19]:
sql_query = '''
with db_astoria as (
    select *
    from taxi_data
    where "PULocationID" = ( select "LocationID" from zones where "Zone" = 'Astoria' )
)

select "Zone" from (
    select "DOLocationID"
    from db_astoria
    where tip_amount = ( select max(tip_amount) from db_astoria )
) as da
join zones on "DOLocationID" = "LocationID"
'''

pd.read_sql(sql_query, con=engine)

Unnamed: 0,Zone
0,Long Island City/Queens Plaza
