In [1]:
import numpy as np
import pandas as pd
from datetime import datetime
from bs4 import BeautifulSoup

## New York City Yellow Taxi in the First Quarter of 2019.

In [2]:
# 2019 Yellow Taxi Trip Data
def extract_data(url, query, page_size=1000, report_every=400000):
    """Download every record matching ``query`` from a paged JSON endpoint.

    Pages are requested by appending ``&$limit=<page_size>&$offset=<n>`` to
    ``url + query`` and parsed with ``pandas.read_json``. Fetching stops at
    the first page that holds fewer than ``page_size`` rows.

    Parameters
    ----------
    url : str
        Endpoint URL that the query string is appended to.
    query : str
        Query string appended verbatim to ``url``.
    page_size : int, default 1000
        Number of rows requested per page; must be at least 1.
    report_every : int or None, default 400000
        Print a progress line whenever the running offset is a multiple of
        this value. ``None`` or ``0`` disables progress output.

    Returns
    -------
    pandas.DataFrame
        All pages stacked vertically. Row labels are kept as delivered per
        page (the index is not reset) and columns are in sorted order. An
        empty DataFrame is returned when the first page has no rows.

    Raises
    ------
    ValueError
        If ``page_size`` is smaller than 1.
    """
    if page_size < 1:
        raise ValueError("page_size must be at least 1")

    base_url = url + query
    limit_clause = "&$limit=" + str(page_size) + "&$offset="

    # Collect pages in a list and concatenate once at the end: growing a
    # DataFrame inside the loop copies all previous rows on every iteration,
    # and DataFrame.append no longer exists in pandas >= 2.0.
    pages = []
    offset = 0
    page = pd.read_json(base_url + limit_clause + str(offset))
    while len(page) >= page_size:
        pages.append(page)
        offset += page_size
        page = pd.read_json(base_url + limit_clause + str(offset))

        if report_every and offset % report_every == 0:
            print("{} records have been extracted at {}.".format(offset, datetime.now().time()))

    # The last page is short (possibly empty); keep it only if it has rows.
    if len(page) > 0:
        pages.append(page)

    if not pages:
        return pd.DataFrame()

    combined = pd.concat(pages)
    # Pages may not share the same column order, so return the union of
    # columns in sorted order regardless of what each page delivered.
    return combined[sorted(combined.columns)]

In [3]:
def save_to_csv(df, fileName="./data/myData.csv"):
    """Write ``df`` to ``fileName`` as CSV, creating the target folder if needed.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to persist; its index is written as the first column.
    fileName : str or path-like, default "./data/myData.csv"
        Destination path. Other objects (for example an open buffer) are
        handed to ``DataFrame.to_csv`` untouched.
    """
    import os  # local import: the notebook's import cell does not provide os

    # Without this a fresh checkout that lacks the ./data folder fails on write.
    if isinstance(fileName, (str, os.PathLike)):
        parent_dir = os.path.dirname(fileName)
        if parent_dir:
            os.makedirs(parent_dir, exist_ok=True)

    df.to_csv(fileName)

In [4]:
def get_data():
    """Download the first-quarter 2019 trips for pickup zones up to 50 and save them.

    Prints the request URL plus the wall-clock start and end times, fetches
    all pages through ``extract_data`` and writes the result with
    ``save_to_csv`` to that function's default path.
    """
    endpoint = "https://data.cityofnewyork.us/resource/2upf-qytp.json?"
    # Filter: pickup between 2019-01-01 and 2019-04-01, pickup location <= 50.
    where_clause = "$where=tpep_pickup_datetime between '2019-01-01' and '2019-04-01' and PULocationID<=50"
    # Only the spaces are percent-encoded; every other character is sent as written.
    encoded_query = "%20".join(where_clause.split(" "))
    print(endpoint + encoded_query)  # first page

    print("Start time: ", datetime.now().time())
    trips = extract_data(endpoint, encoded_query)
    print("End time: ", datetime.now().time())

    save_to_csv(trips)

In [5]:
def read_from_csv(path):
    """Load the CSV at ``path`` into a DataFrame, using its first column as the index."""
    return pd.read_csv(path, index_col=0)

## Data Description and Exploration

| **Attribute** | **Taxi Information** | 
|----------|:-------------|
| VendorID | A code indicating the TPEP provider that provided the record. (1 = Creative Mobile Technologies, LLC; 2 = VeriFone Inc.)|
| tpep_pickup_datetime | The date and time when the meter was engaged.|
| tpep_dropoff_datetime | The date and time when the meter was disengaged.|
| Passenger_count| The number of passengers in the vehicle.|
| Trip_distance | The elapsed trip distance in miles reported by the taximeter.|
| PULocationID | TLC Taxi Zone in which the taximeter was engaged|
| DOLocationID | TLC Taxi Zone in which the taximeter was disengaged|
| RateCodeID | The final rate code in effect at the end of the trip. (1 = Standard rate, 2 = JFK, 3 = Newark, 4 = Nassau or Westchester, 5 = Negotiated fare, 6 = Group ride)|
| Payment_type | A numeric code signifying how the passenger paid for the trip. (1 = Credit card, 2 = Cash, 3 = No charge, 4 = Dispute, 5 = Unknown, 6 = Voided trip)|
| Fare_amount | The time-and-distance fare calculated by the meter.|
| Extra | Miscellaneous extras and surcharges. Currently, this only includes the \$0.50 and \$1.00 rush hour and overnight charges.|
| MTA_tax | \$0.50 MTA tax that is automatically triggered based on the metered rate in use.|
| Improvement_surcharge | \$0.30 improvement surcharge assessed on hailed trips at the flag drop.|
| Tip_amount | This field is automatically populated for credit card tips. Cash tips are not included.|
| Tolls_amount | Total amount of all tolls paid in trip.|
| Total_amount | The total amount charged to passengers. Does not include cash tips.|




### Data Cleaning

In [6]:
def drop_useless_columns(df):
    """Return a copy of ``df`` without the two columns the analysis does not use.

    Raises ``KeyError`` if either column is absent from ``df``.
    """
    unused = ["congestion_surcharge", "store_and_fwd_flag"]
    trimmed = df.drop(unused, axis="columns")
    return trimmed

In [7]:
def convert_to_timestamp(df):
    """Parse the pickup and dropoff columns into pandas datetimes.

    The conversion is written back into ``df`` itself, and that same object
    is returned.
    """
    for column in ("tpep_pickup_datetime", "tpep_dropoff_datetime"):
        df[column] = pd.to_datetime(df[column])
    return df

In [8]:
def create_duration_colum(df):
    """Add a ``duration`` column holding dropoff time minus pickup time.

    The column is added to ``df`` itself, and that same object is returned.
    """
    pickup = df["tpep_pickup_datetime"]
    dropoff = df["tpep_dropoff_datetime"]
    df["duration"] = dropoff - pickup
    return df

In [9]:
# Load the saved extract and run the three preparation steps in order.
path = "./data/myData.csv"
df = (
    read_from_csv(path)
    .pipe(drop_useless_columns)
    .pipe(convert_to_timestamp)
    .pipe(create_duration_colum)
)

df.columns

  mask |= (ar1 == a)


Index(['dolocationid', 'extra', 'fare_amount', 'improvement_surcharge',
       'mta_tax', 'passenger_count', 'payment_type', 'pulocationid',
       'ratecodeid', 'tip_amount', 'tolls_amount', 'total_amount',
       'tpep_dropoff_datetime', 'tpep_pickup_datetime', 'trip_distance',
       'vendorid', 'duration'],
      dtype='object')

In [10]:
# Preview the first ten rows of the prepared frame.
df.head(10)

Unnamed: 0,dolocationid,extra,fare_amount,improvement_surcharge,mta_tax,passenger_count,payment_type,pulocationid,ratecodeid,tip_amount,tolls_amount,total_amount,tpep_dropoff_datetime,tpep_pickup_datetime,trip_distance,vendorid,duration
0,1,0.0,20.0,0.3,0.5,1,1,1,5,1.0,0.0,21.8,2019-01-01 02:19:32,2019-01-01 02:19:27,0.0,2,00:00:05
1,1,0.0,85.0,0.3,0.0,1,1,1,5,2.0,0.0,87.3,2019-01-01 05:52:53,2019-01-01 05:52:45,0.0,2,00:00:08
2,1,0.0,80.0,0.3,0.0,1,1,1,5,0.0,0.0,80.3,2019-01-01 06:23:23,2019-01-01 06:22:24,0.0,2,00:00:59
3,1,0.0,80.0,0.3,0.0,2,1,1,5,16.06,0.0,96.36,2019-01-01 08:13:01,2019-01-01 08:12:51,0.0,2,00:00:10
4,1,0.0,160.0,0.3,0.5,2,1,1,5,0.0,0.0,160.8,2019-01-01 08:14:53,2019-01-01 08:14:48,0.0,2,00:00:05
5,1,0.0,2.5,0.3,0.5,3,2,1,1,0.0,0.0,3.3,2019-01-01 10:10:33,2019-01-01 10:10:30,16.9,1,00:00:03
6,1,0.0,0.0,0.3,0.0,3,3,1,5,0.0,16.26,16.56,2019-01-01 10:11:32,2019-01-01 10:10:52,16.9,1,00:00:40
7,1,0.0,0.0,0.3,0.0,3,3,1,5,0.0,16.26,16.56,2019-01-01 10:13:13,2019-01-01 10:12:05,16.9,1,00:01:08
8,1,0.0,120.0,0.3,0.0,3,2,1,5,0.0,16.26,136.56,2019-01-01 10:14:35,2019-01-01 10:13:44,16.9,1,00:00:51
9,1,0.0,90.0,0.3,0.0,3,1,1,5,18.06,0.0,108.36,2019-01-01 12:16:23,2019-01-01 12:16:18,0.0,2,00:00:05


In [11]:
# Row count before the outlier filter is applied.
len(df)

1838913

In [12]:
def drop_outliers(df):
    """Keep only rows whose total amount and trip distance are both above zero.

    Returns a filtered frame; ``df`` itself is not modified.
    """
    # NOTE(review): the filter is on total_amount, so rows with a zero
    # fare_amount but a positive total are kept. Confirm this is intended.
    positive_total = df['total_amount'] > 0.0
    positive_distance = df['trip_distance'] > 0.0
    return df[positive_total & positive_distance]

In [13]:
# Apply the outlier filter and show how many rows remain.
# Note: this rebinds df, so the unfiltered frame is no longer available after this cell.
df = drop_outliers(df)
len(df)

1822817

In [14]:
# Preview the first ten rows that survived the outlier filter.
df.head(10)

Unnamed: 0,dolocationid,extra,fare_amount,improvement_surcharge,mta_tax,passenger_count,payment_type,pulocationid,ratecodeid,tip_amount,tolls_amount,total_amount,tpep_dropoff_datetime,tpep_pickup_datetime,trip_distance,vendorid,duration
5,1,0.0,2.5,0.3,0.5,3,2,1,1,0.0,0.0,3.3,2019-01-01 10:10:33,2019-01-01 10:10:30,16.9,1,00:00:03
6,1,0.0,0.0,0.3,0.0,3,3,1,5,0.0,16.26,16.56,2019-01-01 10:11:32,2019-01-01 10:10:52,16.9,1,00:00:40
7,1,0.0,0.0,0.3,0.0,3,3,1,5,0.0,16.26,16.56,2019-01-01 10:13:13,2019-01-01 10:12:05,16.9,1,00:01:08
8,1,0.0,120.0,0.3,0.0,3,2,1,5,0.0,16.26,136.56,2019-01-01 10:14:35,2019-01-01 10:13:44,16.9,1,00:00:51
10,1,0.0,135.0,0.3,0.0,4,1,1,5,33.05,30.0,198.35,2019-01-01 12:49:31,2019-01-01 12:49:12,19.3,1,00:00:19
14,132,0.0,106.0,0.3,0.5,3,2,1,1,0.0,11.52,118.32,2019-01-01 16:16:49,2019-01-01 15:23:56,41.28,2,00:52:53
19,1,0.0,142.06,0.3,0.0,1,2,1,5,0.0,0.0,142.36,2019-01-02 18:52:06,2019-01-01 19:47:34,0.11,2,23:04:32
22,1,0.0,30.0,0.3,0.0,1,1,1,5,0.0,0.0,30.3,2019-01-02 02:51:37,2019-01-02 02:48:51,1.27,2,00:02:46
23,265,0.0,15.0,0.3,0.0,1,2,1,5,0.0,0.0,15.3,2019-01-02 03:03:41,2019-01-02 03:03:36,12.65,2,00:00:05
28,1,0.0,0.0,0.3,0.0,2,4,1,5,0.0,0.0,0.3,2019-01-02 13:21:55,2019-01-02 13:20:28,17.7,1,00:01:27


In [15]:
# Persist the filtered frame next to the raw extract.
save_to_csv(df, "./data/processed_myData.csv")