In [1]:
import numpy as np
import pandas as pd
from datetime import datetime
from bs4 import BeautifulSoup

In [2]:
# 2019 Yellow Taxi Trip Data
def extract_data(url, query, page_size=1000, progress_every=400000):
    """Download every row matching ``query`` by paging through the endpoint.

    Pages are requested with ``$limit``/``$offset`` until a page comes back
    with fewer than ``page_size`` rows.

    Args:
        url: Endpoint URL; it is concatenated directly with ``query``.
        query: Query string appended to ``url``. It is used as given, so it
            must already be URL-encoded.
        page_size: Number of rows requested per page.
        progress_every: A progress line is printed whenever the running
            offset is a multiple of this value. A falsy value disables it.

    Returns:
        A DataFrame holding all downloaded pages, with columns sorted by
        name. Each page keeps its own row index, so index labels repeat
        from page to page. An empty DataFrame is returned when no rows
        were downloaded.
    """
    new_url = url + query
    # Offset paging is only reliable when the server returns rows in a stable
    # order; the Socrata paging docs recommend at least `$order=:id`. Leave the
    # query alone when the caller already specified an ordering.
    if "$order" not in query:
        new_url += "&$order=:id"

    # Collect pages in a list and concatenate once at the end: DataFrame.append
    # no longer exists in pandas 2.x, and appending page by page re-copies the
    # accumulated frame on every iteration.
    pages = []
    offset = 0
    while True:
        cur_page = "{}&$limit={}&$offset={}".format(new_url, page_size, offset)
        cur_data = pd.read_json(cur_page)
        if len(cur_data) > 0:
            pages.append(cur_data)
        # A short (or empty) page means the result set is exhausted.
        if len(cur_data) < page_size:
            break

        offset += page_size
        if progress_every and offset % progress_every == 0:
            print("{} records have been extracted at {}.".format(offset, datetime.now().time()))

    if not pages:
        return pd.DataFrame()

    # Sort the columns explicitly so the layout is alphabetical regardless of
    # the field order in the JSON payload.
    return pd.concat(pages, sort=False).sort_index(axis=1)

In [3]:
def save_to_csv(df, fileName="./data/myData.csv"):
    """Write ``df`` to ``fileName`` as CSV (row index included).

    Args:
        df: DataFrame to write.
        fileName: Destination passed to ``DataFrame.to_csv``. When it is a
            filesystem path, its parent directory is created if missing.
    """
    import os  # local import so this cell does not depend on the import cell

    # Only path-like targets have a parent directory; anything else (e.g. an
    # open buffer) is handed to to_csv untouched.
    if isinstance(fileName, (str, os.PathLike)):
        parent = os.path.dirname(os.fspath(fileName))
        if parent:
            # Without this, to_csv raises when the directory is missing and the
            # freshly downloaded data is never written.
            os.makedirs(parent, exist_ok=True)

    df.to_csv(fileName)

In [12]:
# JSON endpoint the trip records are downloaded from.
url = "https://data.cityofnewyork.us/resource/2upf-qytp.json?"
# Filter: pickup datetime between '2019-01-01' and '2019-04-01', pickup location ID <= 50.
# Spaces are percent-encoded so the filter can be appended to the URL as-is.
query = (
    "$where=tpep_pickup_datetime between '2019-01-01' and '2019-04-01' and PULocationID<=50"
).replace(" ", "%20")
print(url + query)  # first page

# Bracket the download with wall-clock timestamps, then persist the result.
print("Start time: ", datetime.now().time())
df = extract_data(url, query)
print("End time: ", datetime.now().time())

save_to_csv(df)

https://data.cityofnewyork.us/resource/2upf-qytp.json?$where=tpep_pickup_datetime%20between%20'2019-01-01'%20and%20'2019-04-01'%20and%20PULocationID<=50
Start time:  20:00:53.743212
400000 records have been extracted at 20:10:57.089287.
800000 records have been extracted at 20:28:15.578542.
1200000 records have been extracted at 20:53:10.297837.
1600000 records have been extracted at 21:40:16.641509.
End time:  22:12:51.809263


In [4]:
# Report the frame's dimensions, then preview its first rows.
n_rows, n_cols = df.shape
print("Number of rows: ", n_rows)
print("Number of columns: ", n_cols)

df.head()

NameError: name 'df' is not defined

In [5]:
def read_from_csv(path):
    """Load the CSV at ``path`` into a DataFrame, using its first column as the row index."""
    return pd.read_csv(path, index_col=0)

In [9]:
# Load the trip data back from the CSV file written after the download.
path = "./data/myData.csv"
df = read_from_csv(path=path)

df.columns

  mask |= (ar1 == a)


Index(['congestion_surcharge', 'dolocationid', 'extra', 'fare_amount',
       'improvement_surcharge', 'mta_tax', 'passenger_count', 'payment_type',
       'pulocationid', 'ratecodeid', 'store_and_fwd_flag', 'tip_amount',
       'tolls_amount', 'total_amount', 'tpep_dropoff_datetime',
       'tpep_pickup_datetime', 'trip_distance', 'vendorid'],
      dtype='object')

In [10]:
# Drop the columns that are not needed.
# errors="ignore" keeps the cell re-runnable: on a second run the columns are
# already gone and a plain drop would raise KeyError.
df = df.drop(columns=["congestion_surcharge", "store_and_fwd_flag"], errors="ignore")

print(len(df.columns))
print(df.columns)
df.dtypes

16
Index(['dolocationid', 'extra', 'fare_amount', 'improvement_surcharge',
       'mta_tax', 'passenger_count', 'payment_type', 'pulocationid',
       'ratecodeid', 'tip_amount', 'tolls_amount', 'total_amount',
       'tpep_dropoff_datetime', 'tpep_pickup_datetime', 'trip_distance',
       'vendorid'],
      dtype='object')


dolocationid               int64
extra                    float64
fare_amount              float64
improvement_surcharge    float64
mta_tax                  float64
passenger_count            int64
payment_type               int64
pulocationid               int64
ratecodeid                 int64
tip_amount               float64
tolls_amount             float64
total_amount             float64
tpep_dropoff_datetime     object
tpep_pickup_datetime      object
trip_distance            float64
vendorid                   int64
dtype: object

In [11]:
# Parse the pickup and dropoff columns (object dtype after the CSV load) into timestamps.
for col in ("tpep_pickup_datetime", "tpep_dropoff_datetime"):
    df[col] = pd.to_datetime(df[col])

df.dtypes

dolocationid                      int64
extra                           float64
fare_amount                     float64
improvement_surcharge           float64
mta_tax                         float64
passenger_count                   int64
payment_type                      int64
pulocationid                      int64
ratecodeid                        int64
tip_amount                      float64
tolls_amount                    float64
total_amount                    float64
tpep_dropoff_datetime    datetime64[ns]
tpep_pickup_datetime     datetime64[ns]
trip_distance                   float64
vendorid                          int64
dtype: object

In [12]:
# Trip duration = dropoff timestamp minus pickup timestamp, stored as a new "duration" column.
df["duration"] = df["tpep_dropoff_datetime"].sub(df["tpep_pickup_datetime"])
df.head(20)

Unnamed: 0,dolocationid,extra,fare_amount,improvement_surcharge,mta_tax,passenger_count,payment_type,pulocationid,ratecodeid,tip_amount,tolls_amount,total_amount,tpep_dropoff_datetime,tpep_pickup_datetime,trip_distance,vendorid,duration
0,1,0.0,20.0,0.3,0.5,1,1,1,5,1.0,0.0,21.8,2019-01-01 02:19:32,2019-01-01 02:19:27,0.0,2,00:00:05
1,1,0.0,85.0,0.3,0.0,1,1,1,5,2.0,0.0,87.3,2019-01-01 05:52:53,2019-01-01 05:52:45,0.0,2,00:00:08
2,1,0.0,80.0,0.3,0.0,1,1,1,5,0.0,0.0,80.3,2019-01-01 06:23:23,2019-01-01 06:22:24,0.0,2,00:00:59
3,1,0.0,80.0,0.3,0.0,2,1,1,5,16.06,0.0,96.36,2019-01-01 08:13:01,2019-01-01 08:12:51,0.0,2,00:00:10
4,1,0.0,160.0,0.3,0.5,2,1,1,5,0.0,0.0,160.8,2019-01-01 08:14:53,2019-01-01 08:14:48,0.0,2,00:00:05
5,1,0.0,2.5,0.3,0.5,3,2,1,1,0.0,0.0,3.3,2019-01-01 10:10:33,2019-01-01 10:10:30,16.9,1,00:00:03
6,1,0.0,0.0,0.3,0.0,3,3,1,5,0.0,16.26,16.56,2019-01-01 10:11:32,2019-01-01 10:10:52,16.9,1,00:00:40
7,1,0.0,0.0,0.3,0.0,3,3,1,5,0.0,16.26,16.56,2019-01-01 10:13:13,2019-01-01 10:12:05,16.9,1,00:01:08
8,1,0.0,120.0,0.3,0.0,3,2,1,5,0.0,16.26,136.56,2019-01-01 10:14:35,2019-01-01 10:13:44,16.9,1,00:00:51
9,1,0.0,90.0,0.3,0.0,3,1,1,5,18.06,0.0,108.36,2019-01-01 12:16:23,2019-01-01 12:16:18,0.0,2,00:00:05
