In [7]:
# some exploratory data analysis
from datetime import datetime
from dateutil.relativedelta import relativedelta
import os

import pandas as pd
# set pandas to display up to 100 columns
pd.set_option('display.max_columns', 100)

import requests


In [None]:
    """
    1. get the data from S3
    2. transform the weather data (weather_data)
    3. transform the taxi data (taxi_trips)
    4. update payment type master table (payment_type_master)
    5. update company master table (company_master)
    6. update the ids in the taxi_trips using the latest master tables
    7. upload weather data to S3 (weather_data)
    8. upload taxi data to S3 (taxi_trips)
    9. upload the new payment and company master tables (payment_type_master, company_master)
    """

#### 3. transform the taxi data (taxi_trips)

In [8]:
# Fetch the taxi trips of ONE calendar day: the date exactly two months before
# today. The query below spans 00:00:00-23:59:59 of that single date.
current_datetime = datetime.now()    # wall-clock time of this run
formatted_datetime = (current_datetime - relativedelta(months=2)).strftime('%Y-%m-%d')

# SoQL query against the Chicago taxi-trips dataset; at most 30000 rows are returned.
url = (
    'https://data.cityofchicago.org/resource/wrvz-psew.json'
    f'?$where=trip_start_timestamp >= "{formatted_datetime}T00:00:00"'
    f' AND trip_start_timestamp <= "{formatted_datetime}T23:59:59"'
    '&$limit=30000'
)

# os.environ.get reads the variable from the process environment and returns
# None when it is not set.
# NOTE(review): presumably the project's .env file is loaded into the
# environment before this notebook runs - confirm.
api_token = os.environ.get("CHICHAGO_API_TOKEN")
# Only send the token header when a token is actually available.
headers = {'X-App-Token': api_token} if api_token else {}

# `headers` must be passed by keyword: the second positional argument of
# requests.get is `params`, so requests.get(url, headers) would append the
# token to the query string instead of sending it as an HTTP header.
response = requests.get(url, headers=headers, timeout=30)
# Fail here with a clear HTTP error instead of building a DataFrame from an
# error payload further down.
response.raise_for_status()

data = response.json()


In [9]:
# Create a DataFrame from the decoded JSON response held in `data`.
# NOTE(review): presumably `data` is a list with one dict per trip - confirm
# the request succeeded before relying on the columns below.
taxi_trips = pd.DataFrame(data)

In [11]:
# inspect which columns came back before deciding what to drop or rename
taxi_trips.columns

Index(['trip_id', 'taxi_id', 'trip_start_timestamp', 'trip_end_timestamp',
       'trip_seconds', 'trip_miles', 'pickup_census_tract',
       'dropoff_census_tract', 'pickup_community_area',
       'dropoff_community_area', 'fare', 'tips', 'tolls', 'extras',
       'trip_total', 'payment_type', 'company', 'pickup_centroid_latitude',
       'pickup_centroid_longitude', 'pickup_centroid_location',
       'dropoff_centroid_latitude', 'dropoff_centroid_longitude',
       'dropoff_centroid_location'],
      dtype='object')

In [12]:
# Clean the raw trips in one non-mutating chain so the cell can be re-run
# safely (the previous in-place drops raised KeyError on a second run).
#
# The order is important: the census-tract columns contain lots of NaN values,
# so they (and the centroid location columns) are dropped first; dropna() then
# only removes rows that have NaN in the remaining columns.
taxi_trips = (
    taxi_trips
    .drop(columns=['pickup_census_tract', 'dropoff_census_tract',
                   'pickup_centroid_location', 'dropoff_centroid_location'],
          errors='ignore')    # tolerate columns that are already gone / absent
    .dropna()
    .rename(columns={"pickup_community_area": "pickup_community_area_id",
                     "dropoff_community_area": "dropoff_community_area_id"})
    # Parse the start timestamp and floor it to the full hour. Lowercase 'h' is
    # used because the uppercase 'H' alias is deprecated in recent pandas.
    # NOTE(review): the hourly floor is presumably for joining with hourly
    # weather data - confirm.
    .assign(trip_start_timestamp=lambda trips: pd.to_datetime(
        trips['trip_start_timestamp']).dt.floor('h'))
)

#### taxi_trips transformations function

In [16]:
def taxi_trips_transformations(taxi_trips: pd.DataFrame) -> pd.DataFrame:
    """ Perform transformation with the taxi data

    Args:
        taxi_trips (pd.DataFrame): 
            The DataFrame holding the daily taxi trips.

    Returns:
        pd.DataFrame:
            The cleaned, DataFrame holding the daily taxi trips.
    """
    # the order is important, because these two columns contain lots of NaN values
    # first drop them, then the rows having NaN in the other columns only
    taxi_trips.drop(['pickup_census_tract','dropoff_census_tract',
                     'pickup_centroid_location', 'dropoff_centroid_location'],
                    axis=1, inplace=True)
    taxi_trips.dropna(inplace=True)
    taxi_trips.rename(columns={"pickup_community_area": "pickup_community_area_id",
                            "dropoff_community_area": "dropoff_community_area_id"},
                            inplace=True)
    taxi_trips['trip_start_timestamp'] = pd.to_datetime(taxi_trips['trip_start_timestamp']).dt.floor('H')