In [2]:
# some exploratory data analysis
from datetime import datetime
from dateutil.relativedelta import relativedelta
import os

import pandas as pd
# let pandas display up to 100 columns
pd.set_option('display.max_columns', 100)

import requests


In [3]:
    """
    1. get the data from S3
    2. transform the weather data (weather_data)
    3. transform the taxi data (taxi_trips) - DONE
    4. update payment type master table (payment_type_master) - DONE
    5. update company master table (company_master) - DONE
    6. update the ids in the taxi_trips using the latest master tables
    7. upload weather data to S3 (weather_data)
    8. upload taxi data to S3 (taxi_trips)
    9. upload the new payment and company master tables (payment_type_master, company_master)
    """

'\n1. get the data from S3\n2. transform the weather data (weather_data)\n3. transform the taxi data (taxi_trips)\n4. update payment type master table (payment_type_master)\n5. update company master table (company_master)\n6. update the ids in the taxi_trips using the latest master tables\n7. upload weather data fo S3 (weather_data)\n8. upload taxi data fo S3 (taxi_trips)\n9. upload the new payment and company master tables (payment_type_master, company_master)\n'

#### 3. transform the taxi data (taxi_trips)

In [4]:
# Fetch the taxi trips of ONE day: the calendar day exactly two months before
# today (the URL filter below spans 00:00:00-23:59:59 of that single date).
current_datetime = datetime.now()    # current date
# the target day, formatted as 'YYYY-MM-DD'
formatted_datetime = (current_datetime - relativedelta(months=2)).strftime('%Y-%m-%d')

url = f'https://data.cityofchicago.org/resource/wrvz-psew.json?$where=trip_start_timestamp >= "{formatted_datetime}T00:00:00" AND trip_start_timestamp <= "{formatted_datetime}T23:59:59"&$limit=30000'

# os.environ.get only reads the environment of the running process; a .env file
# is picked up only if something has loaded it into the environment beforehand.
# NOTE(review): the variable name is spelled "CHICHAGO" - confirm it matches the
# name that is actually configured.
api_token = os.environ.get("CHICHAGO_API_TOKEN")
# send the app token header only when a token is really available
headers = {'X-App-Token': api_token} if api_token else {}

# `headers` has to be passed by keyword: the second positional argument of
# requests.get is `params`, so requests.get(url, headers) would put the token
# into the query string instead of the request headers.
response = requests.get(url, headers=headers, timeout=60)
# fail loudly on an HTTP error instead of building a DataFrame from an error payload
response.raise_for_status()

data = response.json()


In [13]:
# create a dataframe from the taxi trip data
taxi_trips = pd.DataFrame(data)

In [14]:
taxi_trips.columns

Index(['trip_id', 'taxi_id', 'trip_start_timestamp', 'trip_end_timestamp',
       'trip_seconds', 'trip_miles', 'pickup_community_area',
       'dropoff_community_area', 'fare', 'tips', 'tolls', 'extras',
       'trip_total', 'payment_type', 'company', 'pickup_centroid_latitude',
       'pickup_centroid_longitude', 'pickup_centroid_location',
       'dropoff_centroid_latitude', 'dropoff_centroid_longitude',
       'dropoff_centroid_location', 'pickup_census_tract',
       'dropoff_census_tract'],
      dtype='object')

In [15]:
# # the order is important, because these two columns contain lots of NaN values
# # first drop them, then the rows having NaN in the other columns only
# taxi_trips.drop(['pickup_census_tract', 'dropoff_census_tract'], axis=1, inplace=True)
# taxi_trips.drop(['pickup_centroid_location', 'dropoff_centroid_location'], axis=1, inplace=True)
# taxi_trips.dropna(inplace=True)
# taxi_trips.rename(columns={"pickup_community_area": "pickup_community_area_id",
#                            "dropoff_community_area": "dropoff_community_area_id"},
#                            inplace=True)
# taxi_trips['trip_start_timestamp'] = pd.to_datetime(taxi_trips['trip_start_timestamp']).dt.floor('H')
# #taxi_trips['datetime_for_weather'] = taxi_trips['trip_start_timestamp'].dt.floor('H')

#### taxi_trips transformations function

In [16]:
def taxi_trips_transformations(taxi_trips: pd.DataFrame) -> pd.DataFrame:
    """Clean the raw taxi trips and return the cleaned DataFrame.

    The passed DataFrame is modified in place and the very same object is
    returned, so both ``taxi_trips_transformations(df)`` and
    ``df = taxi_trips_transformations(df)`` leave ``df`` cleaned.

    Steps, in this order:
        1. drop the census tract and centroid location columns,
        2. drop every row that still contains a missing value,
        3. rename the community area columns to ``*_id``,
        4. parse ``trip_start_timestamp`` and floor it to the full hour.

    Args:
        taxi_trips (pd.DataFrame):
            The DataFrame holding the daily taxi trips.

    Returns:
        pd.DataFrame:
            The cleaned DataFrame holding the daily taxi trips
            (the same object that was passed in).

    Raises:
        TypeError: If ``taxi_trips`` is not a pandas DataFrame.
    """

    # error handling - can be extended with some other ideas
    if not isinstance(taxi_trips, pd.DataFrame):
        raise TypeError('taxi_trips is a not a valid Pandas DataFrame.')

    # The order is important: these columns contain lots of NaN values, so they
    # are removed first; only afterwards are rows with NaN in the remaining
    # columns dropped. errors='ignore' tolerates input where some of these
    # columns are not present at all.
    taxi_trips.drop(columns=['pickup_census_tract', 'dropoff_census_tract',
                             'pickup_centroid_location', 'dropoff_centroid_location'],
                    inplace=True, errors='ignore')
    taxi_trips.dropna(inplace=True)
    taxi_trips.rename(columns={"pickup_community_area": "pickup_community_area_id",
                               "dropoff_community_area": "dropoff_community_area_id"},
                      inplace=True)
    # lowercase 'h' is the hourly alias; the uppercase 'H' spelling is deprecated
    # in recent pandas versions
    taxi_trips['trip_start_timestamp'] = pd.to_datetime(taxi_trips['trip_start_timestamp']).dt.floor('h')

    # the original implementation forgot to return the result
    return taxi_trips

In [17]:
taxi_trips_transformations(taxi_trips)

#### company update codes

In [18]:
# Build the company master table: one row per distinct company name found in
# the trips, kept in order of first appearance.
company_master = (
    taxi_trips['company']
    .drop_duplicates()
    .reset_index(drop=True)
    .to_frame()
)
# ids conventionally start at 1, even though the index starts at 0
company_master.insert(0, 'company_id', range(1, len(company_master) + 1))

company_master

Unnamed: 0,company_id,company
0,1,Taxi Affiliation Services
1,2,Flash Cab
2,3,City Service
3,4,Chicago Independents
4,5,Sun Taxi
5,6,Taxicab Insurance Agency Llc
6,7,Star North Taxi Management Llc
7,8,Choice Taxi Association
8,9,Globe Taxi
9,10,Chicago City Taxi Association


In [19]:
# How to extend the company_master table when a company shows up that is not in it yet?
# Sample "incoming" company names used to try out the update logic below.
new_company_data = [
    {'company': 'Petani Cab Corp'},
    {'company': 'X'},
    {'company': 'Y'}
]
new_company_mapping = pd.DataFrame(new_company_data)

In [20]:
company_max_id = company_master['company_id'].max()

In [21]:
# get the list of new companies
list_of_new_companies = []

for company in new_company_mapping['company'].values:   # go through the list of the latest list of companies
    if company not in company_master['company'].values: # find if any is missing from our master table
        list_of_new_companies.append(company)           # add it to the list of new companies

# the same in one line
list_of_new_companies_one_line = [company for company in new_company_mapping['company'].values
                                  if company not in company_master['company'].values]
list_of_new_companies_one_line

['X', 'Y']

In [22]:
# The new companies get consecutive ids that continue right after the highest
# id already present in the master table.
new_companies_df = pd.DataFrame({'company': list_of_new_companies_one_line})
new_companies_df.insert(
    0,
    'company_id',
    range(company_max_id + 1, company_max_id + 1 + len(new_companies_df)),
)
new_companies_df

Unnamed: 0,company_id,company
0,32,X
1,33,Y


In [23]:
updated_company_master = pd.concat([company_master, new_companies_df], ignore_index=True)
updated_company_master.tail()

Unnamed: 0,company_id,company
28,29,2733 - 74600 Benny Jona
29,30,5167 - 71969 5167 Taxi Inc
30,31,Petani Cab Corp
31,32,X
32,33,Y


In [24]:
def update_company_master(taxi_trips: pd.DataFrame, company_master: pd.DataFrame) -> pd.DataFrame:
    """ Extend the company master with new companies if there are any of them.

    Every company name that occurs in ``taxi_trips`` but not in
    ``company_master`` is appended exactly once, in order of first appearance,
    with ids continuing after the current maximum id. Missing (NaN) company
    names in the trips are ignored.

    Args:
        taxi_trips (pd.DataFrame):
            DataFrame holding the daily taxi trips (needs a 'company' column).
        company_master (pd.DataFrame):
            DataFrame holding the company_master data
            ('company_id' and 'company' columns).

    Returns:
        pd.DataFrame:
            The updated company master data as a new DataFrame with a fresh
            0..n-1 index. If no new company appeared its content equals the
            original master.
    """
    # set membership is O(1) per lookup instead of scanning the whole column each time
    known_companies = set(company_master['company'])

    # de-duplicate first: the trips contain one row per trip, so the same company
    # occurs many times and must not be added to the master more than once
    candidates = taxi_trips['company'].dropna().drop_duplicates()
    new_companies = [company for company in candidates if company not in known_companies]

    if not new_companies:
        return company_master.reset_index(drop=True)

    # max() of an empty column is NaN, which cannot be used in range(); start at 1
    next_id = 1 if company_master.empty else int(company_master['company_id'].max()) + 1
    new_companies_df = pd.DataFrame({
        'company_id': range(next_id, next_id + len(new_companies)),
        'company': new_companies,
    })

    return pd.concat([company_master, new_companies_df], ignore_index=True)

In [25]:
# Minimal stand-in for the trips DataFrame, used to try out update_company_master:
# 'Petani Cab Corp', 'X' and 'Y' play the role of incoming company names.
# NOTE(review): update_company_master only reads the 'company' column, so the
# 'company_id' column of this frame is not used by it.
taxi_trips_company_only = pd.DataFrame({
    'company_id': range(1,4),
    'company':     ['Petani Cab Corp', 'X', 'Y'] 
})
taxi_trips_company_only

Unnamed: 0,company_id,company
0,1,Petani Cab Corp
1,2,X
2,3,Y


In [None]:
updated_company_master = update_company_master(taxi_trips=taxi_trips_company_only, company_master=company_master)
updated_company_master

#### payment type update codes

In [34]:
# Build the payment type master table: one row per distinct payment type found
# in the trips, kept in order of first appearance.
payment_type_master = (
    taxi_trips['payment_type']
    .drop_duplicates()
    .reset_index(drop=True)
    .to_frame()
)
# ids conventionally start at 1, even though the index starts at 0
payment_type_master.insert(0, 'payment_type_id', range(1, len(payment_type_master) + 1))

# Minimal stand-in for the trips DataFrame, used to try out the master update.
taxi_trips_payment_type_only = pd.DataFrame(
    data={
        'payment_type_id': range(1, 4),
        'payment_type': ['Credit Card', 'X', 'Y'],
    }
)
payment_type_master    # evaluated but not rendered: only the last expression of a cell is displayed
taxi_trips_payment_type_only

Unnamed: 0,payment_type_id,payment_type
0,1,Credit Card
1,2,X
2,3,Y


In [28]:
def update_payment_type_master(taxi_trips: pd.DataFrame, payment_type_master: pd.DataFrame) -> pd.DataFrame:
    """ Extend the payment_type master with new payment types if there are any of them.

    Every payment type that occurs in ``taxi_trips`` but not in
    ``payment_type_master`` is appended exactly once, in order of first
    appearance, with ids continuing after the current maximum id. Missing (NaN)
    payment types in the trips are ignored.

    Args:
        taxi_trips (pd.DataFrame):
            DataFrame holding the daily taxi trips (needs a 'payment_type' column).
        payment_type_master (pd.DataFrame):
            DataFrame holding the payment_type_master data
            ('payment_type_id' and 'payment_type' columns).

    Returns:
        pd.DataFrame:
            The updated payment_type master data as a new DataFrame with a fresh
            0..n-1 index. If no new payment type appeared its content equals the
            original master.
    """
    # set membership is O(1) per lookup instead of scanning the whole column each time
    known_payment_types = set(payment_type_master['payment_type'])

    # de-duplicate first: the trips contain one row per trip, so the same payment
    # type occurs many times and must not be added to the master more than once
    candidates = taxi_trips['payment_type'].dropna().drop_duplicates()
    new_payment_types = [payment_type for payment_type in candidates
                         if payment_type not in known_payment_types]

    if not new_payment_types:
        return payment_type_master.reset_index(drop=True)

    # max() of an empty column is NaN, which cannot be used in range(); start at 1
    if payment_type_master.empty:
        next_id = 1
    else:
        next_id = int(payment_type_master['payment_type_id'].max()) + 1
    new_payment_types_df = pd.DataFrame({
        'payment_type_id': range(next_id, next_id + len(new_payment_types)),
        'payment_type': new_payment_types,
    })

    return pd.concat([payment_type_master, new_payment_types_df], ignore_index=True)

In [35]:
taxi_trips['payment_type']

0        Credit Card
1               Cash
2             Mobile
3               Cash
4        Credit Card
            ...     
21489    Credit Card
21490    Credit Card
21492           Cash
21493    Credit Card
21495         Mobile
Name: payment_type, Length: 19573, dtype: object

In [36]:
updated_payment_type_master = update_payment_type_master(taxi_trips=taxi_trips_payment_type_only, payment_type_master=payment_type_master)
updated_payment_type_master

Unnamed: 0,payment_type_id,payment_type
0,1,Credit Card
1,2,Cash
2,3,Mobile
3,4,Prcard
4,5,Unknown
5,6,Dispute
6,7,No Charge
7,8,X
8,9,Y


#### combine the update functions into a single, more generic one

In [37]:
def update_master(taxi_trips: pd.DataFrame, master: pd.DataFrame, id_column: str,
                  value_column: str) -> pd.DataFrame:
    """ Extend the master with new types if there are any of them.

    Every value that occurs in ``taxi_trips[value_column]`` but not in
    ``master[value_column]`` is appended exactly once, in order of first
    appearance, with ids continuing after the current maximum of
    ``master[id_column]``. Missing (NaN) values in the trips are ignored.

    Args:
        taxi_trips (pd.DataFrame):
            DataFrame holding the daily taxi trips.
        master (pd.DataFrame):
            DataFrame holding the master data.
        id_column (str):
            The id column of the master DataFrame.
        value_column (str):
            The value column of the master and taxi_trips DataFrame.

    Returns:
        pd.DataFrame:
            The updated master data as a new DataFrame with a fresh 0..n-1
            index. If no new type appeared its content equals the original
            master.
    """
    # set membership is O(1) per lookup instead of scanning the whole column each time
    known_values = set(master[value_column])

    # de-duplicate first: the trips contain one row per trip, so the same value
    # occurs many times and must not be added to the master more than once
    candidates = taxi_trips[value_column].dropna().drop_duplicates()
    new_values = [value for value in candidates if value not in known_values]

    if not new_values:
        return master.reset_index(drop=True)

    # max() of an empty column is NaN, which cannot be used in range(); start at 1
    next_id = 1 if master.empty else int(master[id_column].max()) + 1
    # ids belong in the id column and the new values in the value column
    # (the previous version had these two swapped)
    new_values_df = pd.DataFrame({
        id_column: range(next_id, next_id + len(new_values)),
        value_column: new_values,
    })

    return pd.concat([master, new_values_df], ignore_index=True)

In [38]:
test_payment_type_master = update_master(taxi_trips=taxi_trips_payment_type_only,
                                         master=payment_type_master,
                                         id_column='payment_type_id',
                                         value_column='payment_type'
                                         )

In [None]:
test_payment_type_master

In [41]:
test_company_type_master = update_master(taxi_trips=taxi_trips_company_only,
                                         master=company_master,
                                         id_column='company_id',
                                         value_column='company'
                                         )

In [42]:
test_company_type_master

Unnamed: 0,company_id,company
0,1,Taxi Affiliation Services
1,2,Flash Cab
2,3,City Service
3,4,Chicago Independents
4,5,Sun Taxi
5,6,Taxicab Insurance Agency Llc
6,7,Star North Taxi Management Llc
7,8,Choice Taxi Association
8,9,Globe Taxi
9,10,Chicago City Taxi Association
