This script sets up the data directory structure for the project and downloads the raw [subway, bus and streetcar](https://open.toronto.ca/catalogue/?search=Delay&sort=score%20desc) delay data from the City of Toronto Open Data Portal.

# Import libraries, make directories, download data

In [None]:
# Import libraries
import urllib.request
import os, sys
import json
import pandas as pd

In [None]:
# Mount Google Drive at /content/gdrive so the project folder stored on Drive can be read and written
# NOTE(review): force_remount=False presumably reuses an existing mount - confirm in the google.colab docs
from google.colab import drive
drive.mount('/content/gdrive',force_remount=False)

Mounted at /content/gdrive


In [None]:
# These two functions are used to create the directory structure
def try_make_dir(path):
  """ Try to make a directory at the given path; do nothing if it already exists.

  Missing parent directories are created as well, so the call also works
  when the folder above `path` has not been created yet.

  ARG:
    path (str): path of the directory to make
  """
  try:
    # makedirs (unlike mkdir) also creates any missing parent directories
    os.makedirs(path)
    print(f'{path} added to project')
  except FileExistsError:
    # Something already exists at this path: nothing to create, nothing to report
    pass
    
def create_dir_structure(master_path):
  """ Create the directory structure for the project

  Layout created under master_path:
    data/, src/
    data/<step>/         for step in raw, intermediate, final
    data/<step>/<mode>/  for mode in bus, subway, streetcar

  ARG:
    master_path (str): path to the root folder
  RETURN:
    dict: maps each folder key to its path. Keys are 'data', 'src', the
    step names ('raw', 'intermediate', 'final') and '<step>/<mode>'
    (e.g. 'raw/bus'). """

  # Make sure the root folder itself exists before adding sub-folders to it
  os.makedirs(master_path, exist_ok=True)

  paths = {}

  # Loop over to create main data and src folder directories
  for folder in ['data', 'src']:
    paths[folder] = os.path.join(master_path, folder)
    try_make_dir(paths[folder])

  # Loop over data directory to create a folder for each processing step
  for data_folder in ['raw', 'intermediate', 'final']:
    paths[data_folder] = os.path.join(paths['data'], data_folder)
    try_make_dir(paths[data_folder])

    # Inside each processing step, make one folder per mode of transportation
    for mode_folder in ['bus', 'subway', 'streetcar']:
      # The dictionary key always uses '/', independent of the OS path separator
      mode_key = data_folder + '/' + mode_folder
      paths[mode_key] = os.path.join(paths[data_folder], mode_folder)
      try_make_dir(paths[mode_key])

  return paths

In [None]:
# These two functions are used to download the data
def get_raw_url(master_url, param_list, mode):
  """ Get the raw url for all files of a particular mode of transportation
  ARG:
    master_url (str): url of the API endpoint that describes a data package
    param_list (dict): request parameters keyed by mode, e.g. {'bus': {'id': ...}}
    mode (str): mode to get the urls of; must be a key of param_list
  RETURN:
    pandas.DataFrame: one row per resource of the package, with columns
    'item' (resource name) and 'url' (download url). Empty if the package
    lists no resources.
  RAISES:
    KeyError: if mode is not in param_list or the response lacks the
    expected 'result' / 'resources' / 'name' / 'url' fields """
  # Get the id parameter to send to master_url
  param = param_list[mode]

  # Send the parameters as a JSON body; the context manager closes the
  # connection as soon as the response has been read
  payload = bytes(json.dumps(param), encoding="utf-8")
  with urllib.request.urlopen(master_url, data=payload) as response:
    package = json.loads(response.read())

  # Collect (name, url) of every resource, then build the table in one go
  records = [(pkg['name'], pkg['url']) for pkg in package['result']['resources']]
  return pd.DataFrame(records, columns=['item', 'url'])

def download_ttc_data(master_url, param_list, mode, target_folder):
  """ Download all TTC data regarding a specified mode

  Each file is saved as '<item>.xlsx' in target_folder. Downloading is
  best-effort: a file that cannot be fetched or written is reported and
  skipped, and the remaining files are still attempted.

  ARG:
    master_url (str): url of the API endpoint that describes a data package
    param_list (dict): request parameters keyed by mode, e.g. {'bus': {'id': ...}}
    mode (str): mode you want to download the data of
    target_folder (str): existing folder where you want to save the data """
  # Get raw url for each file
  urls = get_raw_url(master_url, param_list, mode)

  # Download each file to the target_folder
  for item, url in zip(urls.item, urls.url):
    destination = os.path.join(target_folder, f'{item}.xlsx')
    try:
      urllib.request.urlretrieve(url=url, filename=destination)
    except (OSError, ValueError) as err:
      # OSError covers network failures (URLError/HTTPError are subclasses)
      # and file-system errors; ValueError covers malformed urls.
      print(f'Failed to download {item}: {err}')
      continue

    # Only reached when the file was actually retrieved
    print(f'Downloaded {item}.')

In [None]:
# Initialize local variables (master_path should be your Colab path or your local directory path)
master_path = '/content/gdrive/My Drive/ColabNtb/ttc_delay'
master_url = "https://ckan0.cf.opendata.inter.prod-toronto.ca/api/3/action/package_show"

# Request parameters (package id) for each mode of transportation
param_list = {
  'subway': {"id": "996cfe8d-fb35-40ce-b569-698d51fc683b"},
  'streetcar': {"id": "b68cb71b-44a7-4394-97e2-5d2f41462a5d"},
  'bus': {"id": "e271cdae-8788-4980-96ce-6a5c95bc6618"},
}

# Build the folder tree first so every download target folder exists
paths = create_dir_structure(master_path)

# Fetch the raw files of each mode into its own raw/<mode> folder
for mode in ('bus', 'subway', 'streetcar'):
  download_ttc_data(master_url, param_list, mode, paths[f'raw/{mode}'])

/content/gdrive/My Drive/ColabNtb/ttc_delay/data added to project
/content/gdrive/My Drive/ColabNtb/ttc_delay/src added to project
/content/gdrive/My Drive/ColabNtb/ttc_delay/data/raw added to project
/content/gdrive/My Drive/ColabNtb/ttc_delay/data/raw/bus added to project
/content/gdrive/My Drive/ColabNtb/ttc_delay/data/raw/subway added to project
/content/gdrive/My Drive/ColabNtb/ttc_delay/data/raw/streetcar added to project
/content/gdrive/My Drive/ColabNtb/ttc_delay/data/intermediate added to project
/content/gdrive/My Drive/ColabNtb/ttc_delay/data/intermediate/bus added to project
/content/gdrive/My Drive/ColabNtb/ttc_delay/data/intermediate/subway added to project
/content/gdrive/My Drive/ColabNtb/ttc_delay/data/intermediate/streetcar added to project
/content/gdrive/My Drive/ColabNtb/ttc_delay/data/final added to project
/content/gdrive/My Drive/ColabNtb/ttc_delay/data/final/bus added to project
/content/gdrive/My Drive/ColabNtb/ttc_delay/data/final/subway added to project
/con

In [None]:
# Persist the paths dictionary as paths.json inside the data folder
with open(os.path.join(paths['data'], 'paths.json'), 'w') as json_file:
  json.dump(paths, json_file)