# Black Taxis

In [None]:
import io
import pandas as pd
import requests
from bs4 import BeautifulSoup


if 'data_loader' not in globals():
    from mage_ai.data_preparation.decorators import data_loader
if 'test' not in globals():
    from mage_ai.data_preparation.decorators import test


@data_loader
def load_data_from_api(*args, **kwargs):
    """
    Load high-volume for-hire vehicle (``fhvhv_tripdata``) trips inside Manhattan.

    Scrapes the NYC TLC trip-record page for parquet links whose URL mentions
    one of the years 2020-2023, keeps the ``fhvhv_tripdata`` ones, reads each
    file and retains only the rows whose pickup year is in 2020-2023 and whose
    pickup and drop-off location ids are both in ``manhattan_zones``.

    Returns:
        pandas.DataFrame with the columns ``pickup_datetime``,
        ``PULocationID`` and ``DOLocationID`` for all files, concatenated.

    Raises:
        requests.exceptions.HTTPError: the TLC page answered with an error status.
        requests.exceptions.Timeout: the TLC page did not answer in time.
        ValueError: no matching parquet link was found on the page.
    """
    url = "https://www.nyc.gov/site/tlc/about/tlc-trip-record-data.page"
    # A timeout keeps the block from hanging forever on a stalled connection;
    # raise_for_status() prevents scraping an HTTP error page for links.
    response = requests.get(url, timeout=60)
    response.raise_for_status()

    # Parsed content of the page
    soup = BeautifulSoup(response.content, "html.parser")

    # Years of interest: as strings to match against URLs, as ints to match
    # against the pickup timestamps.
    selected_years = ['2020', '2021', '2022', '2023']
    years = [2020, 2021, 2022, 2023]

    # Walk every anchor and collect the parquet links of the selected years
    high_volume_fhv = []
    for anchor in soup.find_all('a', href=True):
        # Strip stray whitespace so a padded href is not silently skipped
        # (NOTE(review): defensive; confirm whether the page actually pads hrefs).
        href = anchor['href'].strip()
        if not href.endswith('.parquet'):
            continue
        if not any(year in href for year in selected_years):
            continue
        print(href)
        # Keep only the links that contain 'fhvhv_tripdata'
        if 'fhvhv_tripdata' in href:
            high_volume_fhv.append(href)

    if not high_volume_fhv:
        # Fail with a clear message instead of pd.concat's "No objects to concatenate"
        raise ValueError(f'No fhvhv_tripdata parquet links found at {url}')

    high_volume_fhv = sorted(high_volume_fhv, reverse=True)

    # Location ids of the Manhattan zones
    manhattan_zones = [  4,  12,  13,  24,  41,  42,  43,  45,  48,  50,  68,  74,  75,
        79,  87,  88,  90, 100, 103, 104, 105, 107, 113, 114, 116, 120,
       125, 127, 128, 137, 140, 141, 142, 143, 144, 148, 151, 152, 153,
       158, 161, 162, 163, 164, 166, 170, 186, 194, 202, 209, 211, 224,
       229, 230, 231, 232, 233, 234, 236, 237, 238, 239, 243, 244, 246,
       249, 261, 262, 263]

    wanted_columns = ['pickup_datetime', 'PULocationID', 'DOLocationID']

    # One filtered dataframe per parquet file
    dataframes = []
    for parquet_file in high_volume_fhv:
        # Ask the reader for the needed columns only, so the other columns of
        # the file are never loaded into the dataframe.
        df = pd.read_parquet(parquet_file, columns=wanted_columns)

        in_selected_years = df['pickup_datetime'].dt.year.isin(years)
        # Keep only trips that both start and end in a Manhattan zone
        inside_manhattan = (
            df['PULocationID'].isin(manhattan_zones)
            & df['DOLocationID'].isin(manhattan_zones)
        )
        dataframes.append(df[in_selected_years & inside_manhattan].reset_index(drop=True))

    # Concatenate all per-file dataframes into a single one
    full_df = pd.concat(dataframes, ignore_index=True)

    # NOTE: code that built an 'idblack' key column from the pickup timestamp
    # was present here but disabled; the output therefore has no id column.
    return full_df

@test
def test_output(output, *args) -> None:
    """
    Check that the block produced an output, i.e. that ``output`` is not None.
    """
    has_output = output is not None
    assert has_output, 'The output is undefined'

# Yellow Taxis

In [None]:
import io
import pandas as pd
import requests
from bs4 import BeautifulSoup


if 'data_loader' not in globals():
    from mage_ai.data_preparation.decorators import data_loader
if 'test' not in globals():
    from mage_ai.data_preparation.decorators import test


@data_loader
def load_data_from_api(*args, **kwargs):
    """
    Load yellow taxi (``yellow_tripdata``) trips inside Manhattan.

    Scrapes the NYC TLC trip-record page for parquet links whose URL mentions
    one of the years 2020-2023, keeps the ``yellow_tripdata`` ones, reads each
    file and retains only the rows whose pickup year is in 2020-2023 and whose
    pickup and drop-off location ids are both in ``manhattan_zones``.

    Returns:
        pandas.DataFrame with the columns ``tpep_pickup_datetime``,
        ``PULocationID`` and ``DOLocationID`` for all files, concatenated.

    Raises:
        requests.exceptions.HTTPError: the TLC page answered with an error status.
        requests.exceptions.Timeout: the TLC page did not answer in time.
        ValueError: no matching parquet link was found on the page.
    """
    url = "https://www.nyc.gov/site/tlc/about/tlc-trip-record-data.page"
    # A timeout keeps the block from hanging forever on a stalled connection;
    # raise_for_status() prevents scraping an HTTP error page for links.
    response = requests.get(url, timeout=60)
    response.raise_for_status()

    # Parsed content of the page
    soup = BeautifulSoup(response.content, "html.parser")

    # Years of interest: as strings to match against URLs, as ints to match
    # against the pickup timestamps.
    selected_years = ['2020', '2021', '2022', '2023']
    years = [2020, 2021, 2022, 2023]

    # Walk every anchor and collect the parquet links of the selected years
    yellow_taxis = []
    for anchor in soup.find_all('a', href=True):
        # Strip stray whitespace so a padded href is not silently skipped
        # (NOTE(review): defensive; confirm whether the page actually pads hrefs).
        href = anchor['href'].strip()
        if not href.endswith('.parquet'):
            continue
        if not any(year in href for year in selected_years):
            continue
        print(href)
        # Keep only the links that contain 'yellow_tripdata'
        if 'yellow_tripdata' in href:
            yellow_taxis.append(href)

    if not yellow_taxis:
        # Fail with a clear message instead of pd.concat's "No objects to concatenate"
        raise ValueError(f'No yellow_tripdata parquet links found at {url}')

    yellow_taxis = sorted(yellow_taxis, reverse=True)

    # Location ids of the Manhattan zones
    manhattan_zones = [  4,  12,  13,  24,  41,  42,  43,  45,  48,  50,  68,  74,  75,
        79,  87,  88,  90, 100, 103, 104, 105, 107, 113, 114, 116, 120,
       125, 127, 128, 137, 140, 141, 142, 143, 144, 148, 151, 152, 153,
       158, 161, 162, 163, 164, 166, 170, 186, 194, 202, 209, 211, 224,
       229, 230, 231, 232, 233, 234, 236, 237, 238, 239, 243, 244, 246,
       249, 261, 262, 263]

    wanted_columns = ['tpep_pickup_datetime', 'PULocationID', 'DOLocationID']

    # One filtered dataframe per parquet file
    dataframes = []
    for parquet_file in yellow_taxis:
        # Ask the reader for the needed columns only, so the other columns of
        # the file are never loaded into the dataframe.
        df = pd.read_parquet(parquet_file, columns=wanted_columns)

        in_selected_years = df['tpep_pickup_datetime'].dt.year.isin(years)
        # Keep only trips that both start and end in a Manhattan zone
        inside_manhattan = (
            df['PULocationID'].isin(manhattan_zones)
            & df['DOLocationID'].isin(manhattan_zones)
        )
        dataframes.append(df[in_selected_years & inside_manhattan].reset_index(drop=True))

    # Concatenate all per-file dataframes into a single one
    full_df = pd.concat(dataframes, ignore_index=True)

    return full_df

@test
def test_output(output, *args) -> None:
    """
    Check that the block produced an output, i.e. that ``output`` is not None.
    """
    has_output = output is not None
    assert has_output, 'The output is undefined'

# Grey Taxis

In [2]:
# Preview of the year labels 2017-2019 rendered as strings.
[str(year) for year in range(2017, 2020)]

['2017', '2018', '2019']

In [None]:
import io
import pandas as pd
import requests
from bs4 import BeautifulSoup

if 'data_loader' not in globals():
    from mage_ai.data_preparation.decorators import data_loader
if 'test' not in globals():
    from mage_ai.data_preparation.decorators import test


@data_loader
def load_data_from_api(*args, **kwargs):
    """
    Load for-hire vehicle (``fhv_tripdata``) trips inside Manhattan.

    Scrapes the NYC TLC trip-record page for parquet links whose URL mentions
    one of the years 2017-2019, keeps the ``fhv_tripdata`` ones, reads each
    file and retains only the rows whose pickup year is in 2017-2019 and whose
    pickup and drop-off location ids are both in ``manhattan_zones``.

    An ``idgrey`` column is added, formed by the prefix ``grey`` followed by
    the pickup timestamp as ``YYYYmmddHHMMSS``. Trips picked up in the same
    second share the same ``idgrey`` value.

    Returns:
        pandas.DataFrame with the columns ``idgrey``, ``pickup_datetime``,
        ``PUlocationID`` and ``DOlocationID`` for all files, concatenated.

    Raises:
        requests.exceptions.HTTPError: the TLC page answered with an error status.
        requests.exceptions.Timeout: the TLC page did not answer in time.
        ValueError: no matching parquet link was found on the page.
    """
    url = "https://www.nyc.gov/site/tlc/about/tlc-trip-record-data.page"
    # A timeout keeps the block from hanging forever on a stalled connection;
    # raise_for_status() prevents scraping an HTTP error page for links.
    response = requests.get(url, timeout=60)
    response.raise_for_status()

    # Parsed content of the page
    soup = BeautifulSoup(response.content, "html.parser")

    # Years of interest: as ints to match against the pickup timestamps, as
    # strings to match against URLs.
    years = list(range(2017, 2020))
    selected_years = [str(year) for year in years]

    # Walk every anchor and collect the parquet links of the selected years
    for_hire_vehicles = []
    for anchor in soup.find_all('a', href=True):
        # Strip stray whitespace so a padded href is not silently skipped
        # (NOTE(review): defensive; confirm whether the page actually pads hrefs).
        href = anchor['href'].strip()
        if not href.endswith('.parquet'):
            continue
        if not any(year in href for year in selected_years):
            continue
        print(href)
        # Keep only the links that contain 'fhv_tripdata'
        # ('fhvhv_tripdata' does not contain that substring, so it is excluded)
        if 'fhv_tripdata' in href:
            for_hire_vehicles.append(href)

    if not for_hire_vehicles:
        # Fail with a clear message instead of pd.concat's "No objects to concatenate"
        raise ValueError(f'No fhv_tripdata parquet links found at {url}')

    for_hire_vehicles = sorted(for_hire_vehicles, reverse=True)

    # Location ids of the Manhattan zones
    manhattan_zones = [  4,  12,  13,  24,  41,  42,  43,  45,  48,  50,  68,  74,  75,
        79,  87,  88,  90, 100, 103, 104, 105, 107, 113, 114, 116, 120,
       125, 127, 128, 137, 140, 141, 142, 143, 144, 148, 151, 152, 153,
       158, 161, 162, 163, 164, 166, 170, 186, 194, 202, 209, 211, 224,
       229, 230, 231, 232, 233, 234, 236, 237, 238, 239, 243, 244, 246,
       249, 261, 262, 263]

    # Note the lower-case 'l' in the location columns of this dataset
    wanted_columns = ['pickup_datetime', 'PUlocationID', 'DOlocationID']

    # One filtered dataframe per parquet file
    dataframes = []
    for parquet_file in for_hire_vehicles:
        # Ask the reader for the needed columns only, so the other columns of
        # the file are never loaded into the dataframe. The result is a fresh
        # frame, so the assignment below does not write into a slice of another
        # dataframe (which can trigger SettingWithCopyWarning).
        df = pd.read_parquet(parquet_file, columns=wanted_columns)
        df['pickup_datetime'] = pd.to_datetime(df['pickup_datetime'])

        in_selected_years = df['pickup_datetime'].dt.year.isin(years)
        # Keep only trips that both start and end in a Manhattan zone
        inside_manhattan = (
            df['PUlocationID'].isin(manhattan_zones)
            & df['DOlocationID'].isin(manhattan_zones)
        )
        dataframes.append(df[in_selected_years & inside_manhattan].reset_index(drop=True))

    # Concatenate all per-file dataframes into a single one
    full_df = pd.concat(dataframes, ignore_index=True)

    # 'grey' + YYYYmmddHHMMSS. The format contains digits only, so no
    # separator clean-up is needed afterwards.
    full_df['idgrey'] = 'grey' + full_df['pickup_datetime'].dt.strftime('%Y%m%d%H%M%S')

    # Put the id column first
    full_df = full_df[['idgrey', 'pickup_datetime', 'PUlocationID', 'DOlocationID']]

    return full_df

@test
def test_output(output, *args) -> None:
    """
    Check that the block produced an output, i.e. that ``output`` is not None.
    """
    has_output = output is not None
    assert has_output, 'The output is undefined'

# Green Taxis

In [None]:
import io
import pandas as pd
import requests
from bs4 import BeautifulSoup


if 'data_loader' not in globals():
    from mage_ai.data_preparation.decorators import data_loader
if 'test' not in globals():
    from mage_ai.data_preparation.decorators import test


@data_loader
def load_data_from_api(*args, **kwargs):
    """
    Load green taxi (``green_tripdata``) trips inside Manhattan.

    Scrapes the NYC TLC trip-record page for parquet links whose URL mentions
    one of the years 2020-2023, keeps the ``green_tripdata`` ones, reads each
    file and retains only the rows whose pickup year is in 2020-2023 and whose
    pickup and drop-off location ids are both in ``manhattan_zones``.

    An ``idgreen`` column is added, formed by the prefix ``green`` followed by
    the pickup timestamp as ``YYYYmmddHHMMSS``. Trips picked up in the same
    second share the same ``idgreen`` value.

    Returns:
        pandas.DataFrame with the columns ``idgreen``,
        ``lpep_pickup_datetime``, ``PULocationID`` and ``DOLocationID`` for
        all files, concatenated.

    Raises:
        requests.exceptions.HTTPError: the TLC page answered with an error status.
        requests.exceptions.Timeout: the TLC page did not answer in time.
        ValueError: no matching parquet link was found on the page.
    """
    url = "https://www.nyc.gov/site/tlc/about/tlc-trip-record-data.page"
    # A timeout keeps the block from hanging forever on a stalled connection;
    # raise_for_status() prevents scraping an HTTP error page for links.
    response = requests.get(url, timeout=60)
    response.raise_for_status()

    # Parsed content of the page
    soup = BeautifulSoup(response.content, "html.parser")

    # Years of interest: as strings to match against URLs, as ints to match
    # against the pickup timestamps.
    selected_years = ['2020', '2021', '2022', '2023']
    years = [2020, 2021, 2022, 2023]

    # Walk every anchor and collect the parquet links of the selected years
    green_taxis = []
    for anchor in soup.find_all('a', href=True):
        # Strip stray whitespace so a padded href is not silently skipped
        # (NOTE(review): defensive; confirm whether the page actually pads hrefs).
        href = anchor['href'].strip()
        if not href.endswith('.parquet'):
            continue
        if not any(year in href for year in selected_years):
            continue
        print(href)
        # Keep only the links that contain 'green_tripdata'
        if 'green_tripdata' in href:
            green_taxis.append(href)

    if not green_taxis:
        # Fail with a clear message instead of pd.concat's "No objects to concatenate"
        raise ValueError(f'No green_tripdata parquet links found at {url}')

    green_taxis = sorted(green_taxis, reverse=True)

    # Location ids of the Manhattan zones
    manhattan_zones = [  4,  12,  13,  24,  41,  42,  43,  45,  48,  50,  68,  74,  75,
        79,  87,  88,  90, 100, 103, 104, 105, 107, 113, 114, 116, 120,
       125, 127, 128, 137, 140, 141, 142, 143, 144, 148, 151, 152, 153,
       158, 161, 162, 163, 164, 166, 170, 186, 194, 202, 209, 211, 224,
       229, 230, 231, 232, 233, 234, 236, 237, 238, 239, 243, 244, 246,
       249, 261, 262, 263]

    wanted_columns = ['lpep_pickup_datetime', 'PULocationID', 'DOLocationID']

    # One filtered dataframe per parquet file
    dataframes = []
    for parquet_file in green_taxis:
        # Ask the reader for the needed columns only, so the other columns of
        # the file are never loaded into the dataframe.
        df = pd.read_parquet(parquet_file, columns=wanted_columns)

        in_selected_years = df['lpep_pickup_datetime'].dt.year.isin(years)
        # Keep only trips that both start and end in a Manhattan zone
        inside_manhattan = (
            df['PULocationID'].isin(manhattan_zones)
            & df['DOLocationID'].isin(manhattan_zones)
        )
        dataframes.append(df[in_selected_years & inside_manhattan].reset_index(drop=True))

    # Concatenate all per-file dataframes into a single one
    full_df = pd.concat(dataframes, ignore_index=True)

    # 'green' + YYYYmmddHHMMSS. The format contains digits only, so no
    # separator clean-up is needed afterwards.
    full_df['idgreen'] = 'green' + full_df['lpep_pickup_datetime'].dt.strftime('%Y%m%d%H%M%S')

    # Put the id column first
    full_df = full_df[['idgreen', 'lpep_pickup_datetime', 'PULocationID', 'DOLocationID']]

    return full_df

@test
def test_output(output, *args) -> None:
    """
    Check that the block produced an output, i.e. that ``output`` is not None.
    """
    has_output = output is not None
    assert has_output, 'The output is undefined'

---

In [None]:
from mage_ai.settings.repo import get_repo_path
from mage_ai.io.bigquery import BigQuery
from mage_ai.io.config import ConfigFileLoader
from pandas import DataFrame
from os import path
import io
import pandas as pd
import requests
from bs4 import BeautifulSoup

if 'data_exporter' not in globals():
    from mage_ai.data_preparation.decorators import data_exporter


@data_exporter
def export_data_to_big_query(df: DataFrame, **kwargs) -> None:
    """
    Export ``df`` to the ``black_taxis`` table in BigQuery.

    The connection settings come from the 'default' profile of the
    'io_config.yaml' file located in the repository path. If the table
    already exists, the export is requested with the 'replace' policy.

    Docs: https://docs.mage.ai/design/data-loading#bigquery
    """
    target_table = 'black_taxis'
    destination = f'nyc-taxis-project.new_york_transport_project.{target_table}'

    # Build the BigQuery client from the repository's IO configuration
    settings_file = path.join(get_repo_path(), 'io_config.yaml')
    settings = ConfigFileLoader(settings_file, 'default')
    warehouse = BigQuery.with_config(settings)

    warehouse.export(
        df,
        destination,
        if_exists='replace',  # Resolution policy when the table already exists
    )