In [13]:
import os
import pandas as pd
from urllib.request import urlretrieve

In [24]:
# Base URL under which the trip record files are published.
DOWNLOAD_ROOT = 'https://s3.amazonaws.com/nyc-tlc/trip+data/'

# File-name template shared by the remote URL and the local path. The
# '(vehicle_type)', '(YYYY)' and '(MM)' placeholders are substituted with
# str.replace by the fetch/load helpers.
_TRIPDATA_FILENAME = '(vehicle_type)_tripdata_(YYYY)-(MM).csv'

# Remote location template and local destination template for one file.
TRIPDATA_URL = f'{DOWNLOAD_ROOT}{_TRIPDATA_FILENAME}'
TRIPDATA_PATH = os.path.join('datasets', _TRIPDATA_FILENAME)

# Vehicle types processed by default when the caller does not pass a list.
VEHICLE_TYPES = [
    'yellow',
    'green',
    'fhv',
    'fhvhv',
]

In [34]:
# fetch data
def fetch_tripdata(year_month_pairs:list, vehicle_types:list=VEHICLE_TYPES, tripdata_url=TRIPDATA_URL, tripdata_path=TRIPDATA_PATH):
    '''
    download tripdata from https://www1.nyc.gov/site/tlc/about/tlc-trip-record-data.page

    A file is only downloaded when nothing exists yet at its local path.
    The destination directory is created on demand, and each download is
    written to a temporary '<path>.part' file that is moved into place only
    after it completes, so an interrupted download is never mistaken for a
    finished one on the next run.

    Params:
    year_month_pairs - list of 'YYYY-MM' strings ex ['2020-05']
    vehicle_types - list of vehicle type data to download ['yellow']
    tripdata_url - url template for data download got from webpage; the
                   '(vehicle_type)', '(YYYY)' and '(MM)' placeholders are substituted
    tripdata_path - path template (same placeholders) to place downloaded files

    Returns:
    None; progress is reported with print

    Raises:
    IndexError if an entry of year_month_pairs contains no '-'.
    Any error raised by urlretrieve or the filesystem calls is propagated
    after the partial file has been removed.
    '''
    print('Fetching tripdata ......')
    for year_month_pair in year_month_pairs:
        # split once; only the first two '-' separated fields are used
        parts_ = year_month_pair.split('-')
        year_, month_ = parts_[0], parts_[1]
        for vehicle_type in vehicle_types:
            # replace formatting for vehicle_type, year, month
            url_ = tripdata_url.replace('(vehicle_type)', vehicle_type).replace('(YYYY)', year_).replace('(MM)', month_)
            path_ = tripdata_path.replace('(vehicle_type)', vehicle_type).replace('(YYYY)', year_).replace('(MM)', month_)
            if not os.path.isfile(path_):
                print(f'{path_} not downloaded')
                print(f'Downloading from {url_}')
                # urlretrieve does not create missing directories; an empty
                # dirname means the current directory, which needs no creation
                dir_ = os.path.dirname(path_)
                if dir_:
                    os.makedirs(dir_, exist_ok=True)
                partial_ = path_ + '.part'
                try:
                    urlretrieve(url_, partial_)
                    # only a completed download is given the final name
                    os.replace(partial_, path_)
                except BaseException:
                    # BaseException so a kernel interrupt (KeyboardInterrupt)
                    # also cleans up the truncated file before propagating
                    if os.path.exists(partial_):
                        os.remove(partial_)
                    raise
            else:
                print(f'{path_} downloaded already')
            print('######################################')
    print('################ DONE #################')

In [35]:
fetch_tripdata(['2019-06', '2019-05'])

Fetching tripdata ......
datasets\yellow_tripdata_2019-06.csv downloaded already
######################################
datasets\green_tripdata_2019-06.csv downloaded already
######################################
datasets\fhv_tripdata_2019-06.csv downloaded already
######################################
datasets\fhvhv_tripdata_2019-06.csv downloaded already
######################################
datasets\yellow_tripdata_2019-05.csv downloaded already
######################################
datasets\green_tripdata_2019-05.csv downloaded already
######################################
datasets\fhv_tripdata_2019-05.csv downloaded already
######################################
datasets\fhvhv_tripdata_2019-05.csv downloaded already
######################################
################ DONE #################


In [38]:
# load data
def load_tripdata(year_month_pairs:list, vehicle_types:list=VEHICLE_TYPES, tripdata_path=TRIPDATA_PATH):
    '''
    read tripdata csv files from disk into pandas DataFrames

    Params:
    year_month_pairs - list of 'YYYY-MM' strings ex ['2020-05']
    vehicle_types - list of vehicle type data to load ['yellow']
    tripdata_path - path template of the csv files; the '(vehicle_type)',
                    '(YYYY)' and '(MM)' placeholders are substituted

    Returns:
    dict keyed by '<year_month_pair>-<vehicle_type>' whose values are the
    DataFrames returned by pd.read_csv for the matching file
    '''
    print('Loading tripdata ......')
    frames = {}
    for period in year_month_pairs:
        # only the first two '-' separated fields are used
        fields = period.split('-')
        year, month = fields[0], fields[1]
        for kind in vehicle_types:
            csv_path = tripdata_path.replace('(vehicle_type)', kind)
            csv_path = csv_path.replace('(YYYY)', year)
            csv_path = csv_path.replace('(MM)', month)
            key = f'{period}-{kind}'
            frames[key] = pd.read_csv(csv_path)
    print('################ DONE #################')
    return frames

In [39]:
dfs = load_tripdata(['2019-05'])

Loading tripdata ......
################ DONE #################
