In [3]:
# some exploratory data analysis
from datetime import datetime
from dateutil.relativedelta import relativedelta
import os

import pandas as pd
# let pandas display up to 100 columns
pd.set_option('display.max_columns', 100)

import requests


In [None]:
    """
    1. get the data from S3
    2. transform the weather data (weather_data)
    3. transform the taxi data (taxi_trips)
    4. update payment type master table (payment_type_master)
    5. update company master table (company_master)
    6. update the ids in the taxi_trips using the latest master tables
    7. upload weather data to S3 (weather_data)
    8. upload taxi data to S3 (taxi_trips)
    9. upload the new payment and company master tables (payment_type_master, company_master)
    """

#### 3. transform the taxi data (taxi_trips)

In [4]:
# Fetch the taxi trips of ONE single day: the calendar day exactly two months
# before today. The query below is bounded to that day (00:00:00 - 23:59:59)
# and capped at 30000 rows.
current_datetime = datetime.now()    # current local date and time
# the target day as a 'YYYY-MM-DD' string
formatted_datetime = (current_datetime - relativedelta(months=2)).strftime('%Y-%m-%d')

url = f'https://data.cityofchicago.org/resource/wrvz-psew.json?$where=trip_start_timestamp >= "{formatted_datetime}T00:00:00" AND trip_start_timestamp <= "{formatted_datetime}T23:59:59"&$limit=30000'

# os.environ.get reads the variable from the environment of the running process
# and returns None when it is not set.
# NOTE(review): a .env file is only picked up if something loads it into the
# environment first (e.g. python-dotenv) - confirm how this project does that.
# NOTE(review): the variable name is spelled "CHICHAGO" - confirm it matches the
# name used where the token is configured before renaming it.
api_token = os.environ.get("CHICHAGO_API_TOKEN")

# Only send the token header when a token is actually configured; without it
# the request is sent anonymously, exactly as before.
headers = {'X-App-Token': api_token} if api_token else {}

# headers MUST be passed by keyword: the second positional argument of
# requests.get is `params` (query string), not `headers`.
response = requests.get(url, headers=headers, timeout=60)

# fail loudly on HTTP errors instead of parsing an error payload as trip data
response.raise_for_status()

data = response.json()


In [12]:
# build a DataFrame from `data`, the parsed JSON body of the API response
# (presumably a list with one record per trip - verify against the API)
taxi_trips = pd.DataFrame(data)

In [13]:
# inspect which columns the DataFrame built from the API response contains
taxi_trips.columns

Index(['trip_id', 'taxi_id', 'trip_start_timestamp', 'trip_end_timestamp',
       'trip_seconds', 'trip_miles', 'pickup_census_tract',
       'dropoff_census_tract', 'pickup_community_area',
       'dropoff_community_area', 'fare', 'tips', 'tolls', 'extras',
       'trip_total', 'payment_type', 'company', 'pickup_centroid_latitude',
       'pickup_centroid_longitude', 'pickup_centroid_location',
       'dropoff_centroid_latitude', 'dropoff_centroid_longitude',
       'dropoff_centroid_location'],
      dtype='object')

In [7]:
# Exploratory preview of the cleaning steps.
# The result is stored in a separate DataFrame and `taxi_trips` itself is left
# untouched, so this cell can be re-run safely and the
# taxi_trips_transformations() call further down still receives the raw columns
# (dropping them here in place made that later call fail with a KeyError).
#
# The order is important, because the census tract columns contain lots of NaN
# values: first drop those columns, then drop the rows that still have NaN in
# any of the remaining columns.
taxi_trips_preview = (
    taxi_trips
    .drop(columns=['pickup_census_tract', 'dropoff_census_tract',
                   'pickup_centroid_location', 'dropoff_centroid_location'])
    .dropna()
    .rename(columns={"pickup_community_area": "pickup_community_area_id",
                     "dropoff_community_area": "dropoff_community_area_id"})
    # round the trip start down to the full hour
    # ('h' replaces the deprecated 'H' frequency alias)
    .assign(trip_start_timestamp=lambda trips: pd.to_datetime(trips['trip_start_timestamp']).dt.floor('h'))
)
#taxi_trips['datetime_for_weather'] = taxi_trips['trip_start_timestamp'].dt.floor('H')

#### taxi_trips transformations function

In [8]:
def taxi_trips_transformations(taxi_trips: pd.DataFrame) -> pd.DataFrame:
    """ Perform the cleaning transformations on the taxi data.

    Steps, in this order:
        1. drop the census tract and centroid location columns,
        2. drop every row that still has a missing value,
        3. rename the community area columns to ``*_community_area_id``,
        4. convert ``trip_start_timestamp`` to datetime, rounded down to the hour.

    The DataFrame is modified IN PLACE (callers that ignore the return value
    keep working) and the same object is also returned. Calling the function
    again on an already cleaned DataFrame is harmless.

    Args:
        taxi_trips (pd.DataFrame):
            The DataFrame holding the daily taxi trips.

    Returns:
        pd.DataFrame:
            The cleaned DataFrame holding the daily taxi trips
            (the very same object that was passed in).

    Raises:
        TypeError: If ``taxi_trips`` is not a Pandas DataFrame.
    """

    # error handling - can be extended with some other ideas
    if not isinstance(taxi_trips, pd.DataFrame):
        raise TypeError('taxi_trips is not a valid Pandas DataFrame.')

    # The order is important, because these columns contain lots of NaN values:
    # first drop them, then the rows having NaN in the other columns only.
    # errors='ignore' keeps the function usable when the columns are already gone.
    taxi_trips.drop(columns=['pickup_census_tract', 'dropoff_census_tract',
                             'pickup_centroid_location', 'dropoff_centroid_location'],
                    errors='ignore', inplace=True)
    taxi_trips.dropna(inplace=True)
    taxi_trips.rename(columns={"pickup_community_area": "pickup_community_area_id",
                               "dropoff_community_area": "dropoff_community_area_id"},
                      inplace=True)
    # round the trip start down to the full hour
    # ('h' replaces the deprecated 'H' frequency alias)
    taxi_trips['trip_start_timestamp'] = pd.to_datetime(taxi_trips['trip_start_timestamp']).dt.floor('h')

    return taxi_trips

In [14]:
# run the cleaning on the loaded trips; the function modifies taxi_trips in place
taxi_trips_transformations(taxi_trips)

#### company update codes

In [15]:
# Build the company master table: one row per distinct company name, in the
# order the companies first appear in taxi_trips.
company_master = (
    taxi_trips['company']
    .drop_duplicates()
    .reset_index(drop=True)
    .to_frame()
)

# ids conventionally start from 1, even though the row indices start from 0;
# the id column is placed in front of the company name
company_master.insert(0, 'company_id', range(1, len(company_master) + 1))

company_master

Unnamed: 0,company_id,company
0,1,5 Star Taxi
1,2,Blue Ribbon Taxi Association
2,3,Taxi Affiliation Services
3,4,City Service
4,5,Choice Taxi Association
5,6,Flash Cab
6,7,Taxicab Insurance Agency Llc
7,8,Star North Taxi Management Llc
8,9,Globe Taxi
9,10,"Taxicab Insurance Agency, LLC"


In [16]:
# how to extend the company_master table if a new company appears?
# hand-made sample of incoming company names to experiment with
new_company_data = [
    {'company': company_name}
    for company_name in ('Petani Cab Corp', 'X', 'Y')
]
new_company_mapping = pd.DataFrame(new_company_data)

In [18]:
# highest company_id currently used in the master table;
# ids for new companies are numbered upwards from this value
company_max_id = company_master['company_id'].max()

In [23]:
# Find the companies of the latest data that the master table does not know yet.
master_company_values = company_master['company'].values
incoming_company_values = new_company_mapping['company'].values

# version 1: explicit loop
list_of_new_companies = []
for company in incoming_company_values:          # every company of the latest list
    already_in_master = company in master_company_values
    if not already_in_master:                    # missing from our master table
        list_of_new_companies.append(company)    # so it is a new company

# version 2: the same as a list comprehension
list_of_new_companies_one_line = [
    company_name
    for company_name in incoming_company_values
    if company_name not in master_company_values
]
list_of_new_companies_one_line

['X', 'Y']

In [24]:
# Give the new companies consecutive ids that continue right after the
# highest id already present in the master table.
first_new_company_id = company_max_id + 1
number_of_new_companies = len(list_of_new_companies_one_line)

new_companies_df = pd.DataFrame({
    'company_id': range(first_new_company_id, first_new_company_id + number_of_new_companies),
    'company': list_of_new_companies_one_line,
})
new_companies_df

Unnamed: 0,company_id,company
0,31,X
1,32,Y


In [25]:
# append the new companies below the existing master rows;
# ignore_index=True renumbers the row index of the combined table from 0
updated_company_master = pd.concat([company_master, new_companies_df], ignore_index=True)
# show the last rows to check that the new companies were appended
updated_company_master.tail()

Unnamed: 0,company_id,company
27,28,6574 - Babylon Express Inc.
28,29,5167 - 71969 5167 Taxi Inc
29,30,Metro Jet Taxi A.
30,31,X
31,32,Y


In [None]:
def update_company_master(taxi_trips: pd.DataFrame, company_master: pd.DataFrame) -> pd.DataFrame:
    """ Extend the company master with new companies if there are any of them.

    A company is new when its name occurs in ``taxi_trips['company']`` but not
    in ``company_master['company']``. Every new company is added exactly once,
    no matter how many trips it has, and gets the next free ``company_id``
    (ids start from 1 when the master table is empty). Missing company names
    (NaN/None) are ignored. Neither input DataFrame is modified.

    Args:
        taxi_trips (pd.DataFrame):
            DataFrame holding the daily taxi trips.
        company_master (pd.DataFrame):
            DataFrame holding the company_master data.

    Returns:
        pd.DataFrame:
            The updated company master data with a fresh 0..n-1 row index.
            If no new company appeared, it holds the same rows as the original one.
    """
    # One entry per company in first-appearance order. taxi_trips has one row
    # per trip, so without de-duplication a new company would be appended once
    # for every trip it made, each time with a different id.
    trip_companies = taxi_trips['company'].dropna().drop_duplicates()
    already_known = trip_companies.isin(company_master['company'])
    list_of_new_companies = trip_companies[~already_known].tolist()

    if not list_of_new_companies:
        # nothing to add - same content as the original master table
        return company_master.reset_index(drop=True)

    # max() of an empty column is NaN, so an empty master needs its own start value
    if company_master.empty:
        next_company_id = 1
    else:
        next_company_id = int(company_master['company_id'].max()) + 1

    new_companies_df = pd.DataFrame({
        'company_id': range(next_company_id, next_company_id + len(list_of_new_companies)),
        'company': list_of_new_companies,
    })

    # leave an empty master out of the concat so that it cannot influence the
    # dtypes of the result
    frames_to_stack = [frame for frame in (company_master, new_companies_df) if not frame.empty]
    updated_company_master = pd.concat(frames_to_stack, ignore_index=True)

    return updated_company_master