In [2]:
# some exploratory data analysis
from datetime import datetime
from dateutil.relativedelta import relativedelta
import os

import pandas as pd
# set pandas to show 100 the columns
pd.set_option('display.max_columns', 100)

import requests


In [3]:
# get the taxi data of the last full month (T-1 months') data
current_datetime = datetime.now()    # current date
# get the data two months before as a formatted string
formatted_datetime = (current_datetime - relativedelta(months=2)).strftime('%Y-%m-%d')

url = f'https://data.cityofchicago.org/resource/wrvz-psew.json?$where=trip_start_timestamp >= "{formatted_datetime}T00:00:00" AND trip_start_timestamp <= "{formatted_datetime}T23:59:59"&$limit=30000'

# the os.environ.get looks for the specified variable in the .env file of the root
# folder of the project
headers = {'X-App-Token': os.environ.get("CHICHAGO_API_TOKEN")}

# response = requests.get(url, headers)    # in case an error comes leave the headers parameter
response = requests.get(url)

data = response.json()

#type(response)    # we can check the type: "requests.models.Response"

In [4]:
# create a dataframe from the taxi trip data
taxi_trips = pd.DataFrame(data)
taxi_trips.head()

Unnamed: 0,trip_id,taxi_id,trip_start_timestamp,trip_end_timestamp,trip_seconds,trip_miles,pickup_community_area,dropoff_community_area,fare,tips,tolls,extras,trip_total,payment_type,company,pickup_centroid_latitude,pickup_centroid_longitude,pickup_centroid_location,dropoff_centroid_latitude,dropoff_centroid_longitude,dropoff_centroid_location,pickup_census_tract,dropoff_census_tract
0,ad6bac72a079e03c8313cc17ace346a1e5c8bb09,314f48fcaf3556a8f1c5a32b497057bb479a7afded63d2...,2023-11-18T23:45:00.000,2023-11-19T00:00:00.000,372,1.95,7,6.0,10.0,2.79,0,0.0,12.79,Mobile,5 Star Taxi,41.922686284,-87.649488729,"{'type': 'Point', 'coordinates': [-87.64948872...",41.944226601,-87.655998182,"{'type': 'Point', 'coordinates': [-87.65599818...",,
1,87d4a76503f84861ab22916f4615359e0e8c76e9,d79d3e19a1d5f6cb0cd4449d5579412262a2f1b182799d...,2023-11-18T23:45:00.000,2023-11-19T00:00:00.000,840,8.63,32,77.0,23.75,0.0,0,0.0,23.75,Cash,5 Star Taxi,41.878865584,-87.625192142,"{'type': 'Point', 'coordinates': [-87.62519214...",41.9867118,-87.663416405,"{'type': 'Point', 'coordinates': [-87.66341640...",,
2,047ea36120067180d105b3e618f68deedaa57c90,ae7a61c41decb6f41d165aba54911ea50c4fbf9f418142...,2023-11-18T23:45:00.000,2023-11-19T00:00:00.000,540,0.2,8,6.0,11.5,2.0,0,0.0,13.5,Credit Card,Taxi Affiliation Services,41.899602111,-87.633308037,"{'type': 'Point', 'coordinates': [-87.63330803...",41.944226601,-87.655998182,"{'type': 'Point', 'coordinates': [-87.65599818...",,
3,047fa8505afc93e44c70b7c3b8ddab0511545372,db757f6c1157d9f81e266396132cd641837c189b803c52...,2023-11-18T23:45:00.000,2023-11-19T00:15:00.000,1733,21.33,76,,52.25,0.0,0,33.5,86.25,Credit Card,Flash Cab,41.97907082,-87.903039661,"{'type': 'Point', 'coordinates': [-87.90303966...",,,,17031980000.0,
4,05c6ea9ded5f665d13ccb79257bedd1be5544563,b2a007b1410c7208af92ef3a97b87c0af4c3e7b49c2b76...,2023-11-18T23:45:00.000,2023-11-19T00:00:00.000,791,7.63,8,77.0,22.09,4.97,0,0.0,27.06,Mobile,Flash Cab,41.899602111,-87.633308037,"{'type': 'Point', 'coordinates': [-87.63330803...",41.9867118,-87.663416405,"{'type': 'Point', 'coordinates': [-87.66341640...",,


In [5]:
taxi_trips.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 12556 entries, 0 to 12555
Data columns (total 23 columns):
 #   Column                      Non-Null Count  Dtype 
---  ------                      --------------  ----- 
 0   trip_id                     12556 non-null  object
 1   taxi_id                     12556 non-null  object
 2   trip_start_timestamp        12556 non-null  object
 3   trip_end_timestamp          12556 non-null  object
 4   trip_seconds                12554 non-null  object
 5   trip_miles                  12556 non-null  object
 6   pickup_community_area       12207 non-null  object
 7   dropoff_community_area      11426 non-null  object
 8   fare                        12513 non-null  object
 9   tips                        12513 non-null  object
 10  tolls                       12513 non-null  object
 11  extras                      12513 non-null  object
 12  trip_total                  12513 non-null  object
 13  payment_type                12556 non-null  ob

In [6]:
taxi_trips.describe()

Unnamed: 0,trip_id,taxi_id,trip_start_timestamp,trip_end_timestamp,trip_seconds,trip_miles,pickup_community_area,dropoff_community_area,fare,tips,tolls,extras,trip_total,payment_type,company,pickup_centroid_latitude,pickup_centroid_longitude,pickup_centroid_location,dropoff_centroid_latitude,dropoff_centroid_longitude,dropoff_centroid_location,pickup_census_tract,dropoff_census_tract
count,12556,12556,12556,12556,12554,12556,12207,11426,12513,12513,12513,12513,12513.0,12556,12556,12207.0,12207.0,12207,11438.0,11438.0,11438,3562,3421
unique,12556,1922,96,112,3153,2102,76,77,1218,955,19,141,2108.0,7,29,127.0,127.0,127,145.0,145.0,145,51,69
top,ad6bac72a079e03c8313cc17ace346a1e5c8bb09,9492b268e840fcd19b554ae0d61ab86a48eee56b7fba98...,2023-11-18T13:30:00.000,2023-11-18T13:45:00.000,0,0,8,8,9,0,0,0,3.25,Credit Card,Flash Cab,41.899602111,-87.633308037,"{'type': 'Point', 'coordinates': [-87.63330803...",41.899602111,-87.633308037,"{'type': 'Point', 'coordinates': [-87.63330803...",17031980000,17031980000
freq,1,33,251,253,264,1404,2907,2615,519,6429,12466,7800,255.0,4464,2783,1921.0,1921.0,1921,1436.0,1436.0,1436,726,451


In [7]:
# check rows where the fare is null, look at just 5 random samples
taxi_trips[taxi_trips['fare'].isna()].sample(5)

# in this case ask for advice from the potential users what to do

# some conclusions:
# - total trip might be erronous
# - fare data are missing from a few raws
# - the payment type and company occupies too much space
# - the centroid locations as Point coordinates are duplicates
# - the census tract data are very sparse, more than half of them are missing


Unnamed: 0,trip_id,taxi_id,trip_start_timestamp,trip_end_timestamp,trip_seconds,trip_miles,pickup_community_area,dropoff_community_area,fare,tips,tolls,extras,trip_total,payment_type,company,pickup_centroid_latitude,pickup_centroid_longitude,pickup_centroid_location,dropoff_centroid_latitude,dropoff_centroid_longitude,dropoff_centroid_location,pickup_census_tract,dropoff_census_tract
6969,b16b8f24bddc9366fde1069a5029bfd3a6fc9c80,15a05a171949e0f432775bd628d0d70a833e9de8fdf048...,2023-11-18T13:30:00.000,2023-11-18T14:30:00.000,3819,0.92,76,76,,,,,,Cash,Taxicab Insurance Agency Llc,41.97907082,-87.903039661,"{'type': 'Point', 'coordinates': [-87.90303966...",41.97907082,-87.903039661,"{'type': 'Point', 'coordinates': [-87.90303966...",17031980000.0,17031980000.0
4523,90af2014d2fc1688e4934e4967d1c7ff43a32803,9b6ede7d36f53ba56a1ec8882cc17703224ac426ba9ef9...,2023-11-18T16:30:00.000,2023-11-18T16:45:00.000,433,0.9,8,32,,,,,,Cash,Taxicab Insurance Agency Llc,41.892072635,-87.628874157,"{'type': 'Point', 'coordinates': [-87.62887415...",41.880994471,-87.632746489,"{'type': 'Point', 'coordinates': [-87.63274648...",17031081600.0,17031839100.0
3679,0d3914db37958b407aca21615b1e9cd8687a73d1,ea1095a215ac661a9a0da3f84747131c7bc3e6b71308b5...,2023-11-18T17:45:00.000,2023-11-18T17:45:00.000,119,0.95,56,57,,,,,,Cash,City Service,41.79259236,-87.769615453,"{'type': 'Point', 'coordinates': [-87.76961545...",41.810879008,-87.726363325,"{'type': 'Point', 'coordinates': [-87.72636332...",,
3686,08954817d42b566b69e2d203f0266b917986f527,f80bb6e887a18028617ef0e0861429d12762cfd858bcbc...,2023-11-18T17:45:00.000,2023-11-18T17:45:00.000,9,0.0,76,76,,,,,,Cash,City Service,41.97907082,-87.903039661,"{'type': 'Point', 'coordinates': [-87.90303966...",41.97907082,-87.903039661,"{'type': 'Point', 'coordinates': [-87.90303966...",17031980000.0,17031980000.0
9083,b1daf68190c6fd363da7d8f44569fe103f30cb30,3671b00a8d3d78e49e75a0eb9796fbfa1ab2e07b89d7ed...,2023-11-18T11:00:00.000,2023-11-18T11:15:00.000,398,1.52,32,8,,,,,,Cash,Taxicab Insurance Agency Llc,41.880994471,-87.632746489,"{'type': 'Point', 'coordinates': [-87.63274648...",41.892507781,-87.626214906,"{'type': 'Point', 'coordinates': [-87.62621490...",17031839100.0,17031081500.0


#### Transformation: deal with the NaN values

In [8]:
# the order is important, because these two columns contain lots of NaN values
# first drop them, then the rows having NaN in the other columns only
taxi_trips.drop(['pickup_census_tract', 'dropoff_census_tract'], axis=1, inplace=True)
taxi_trips.dropna(inplace=True)

#### Transformation: drop the unnecessary duplicated columns

In [9]:
taxi_trips.drop(['pickup_centroid_location', 'dropoff_centroid_location'], axis=1, inplace=True)

#### Transformation: renaming

In [10]:
taxi_trips.rename(columns={"pickup_community_area": "pickup_community_area_id",
                           "dropoff_community_area": "dropoff_community_area_id"},
                           inplace=True)

Transformation: an auxilary column to the weather data

In [11]:
taxi_trips['trip_start_timestamp'] = pd.to_datetime(taxi_trips['trip_start_timestamp'])
taxi_trips['datetime_for_weather'] = taxi_trips['trip_start_timestamp'].dt.floor('H')

#### Check joining the trips and the weather data

In [12]:
# get weather data from the open-meteo.com site, and create a dataframe out if it
url = 'https://archive-api.open-meteo.com/v1/era5'
# current date
current_datetime = datetime.now()
# get the data two months before as a formatted string
formatted_datetime = (current_datetime - relativedelta(months=2)).strftime('%Y-%m-%d')
params = {
    'latitude': 41.85,
    'longitude': -87.65,
    'start_date': formatted_datetime,
    'end_date': formatted_datetime,
    'hourly': 'temperature_2m,wind_speed_10m,rain,precipitation'
    }

response = requests.get(url, params=params)
weather_data = response.json()

# transform the JSON string to a pd dataframe
weather_data_filtered = {
    'datetime': weather_data['hourly']['time'],
    'temperature': weather_data['hourly']['temperature_2m'],
    'wind_speed': weather_data['hourly']['wind_speed_10m'],
    'rain': weather_data['hourly']['rain'],
    'precipitation': weather_data['hourly']['precipitation']
    }

#weather_data_filtered
weather_df = pd.DataFrame(weather_data_filtered)
weather_df['datetime'] = pd.to_datetime(weather_df['datetime'])
weather_df.info()
weather_df.head()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 24 entries, 0 to 23
Data columns (total 5 columns):
 #   Column         Non-Null Count  Dtype         
---  ------         --------------  -----         
 0   datetime       24 non-null     datetime64[ns]
 1   temperature    24 non-null     float64       
 2   wind_speed     24 non-null     float64       
 3   rain           24 non-null     float64       
 4   precipitation  24 non-null     float64       
dtypes: datetime64[ns](1), float64(4)
memory usage: 1.1 KB


Unnamed: 0,datetime,temperature,wind_speed,rain,precipitation
0,2023-11-18 00:00:00,4.5,9.4,0.0,0.0
1,2023-11-18 01:00:00,4.3,9.7,0.0,0.0
2,2023-11-18 02:00:00,3.5,9.9,0.0,0.0
3,2023-11-18 03:00:00,3.1,11.8,0.0,0.0
4,2023-11-18 04:00:00,2.7,12.4,0.0,0.0


In [13]:
taxi_trips_with_weather = taxi_trips.merge(weather_df, left_on='datetime_for_weather', right_on='datetime')


In [14]:
#taxi_trips_with_weather.head()

Transformation: Data type conversions

In [15]:
# we can characterize columns which are not object is the dataframe
taxi_trips.describe()

Unnamed: 0,trip_start_timestamp,datetime_for_weather
count,11312,11312
mean,2023-11-18 13:56:29.480197888,2023-11-18 13:33:47.545968640
min,2023-11-18 00:00:00,2023-11-18 00:00:00
25%,2023-11-18 10:30:00,2023-11-18 10:00:00
50%,2023-11-18 14:15:00,2023-11-18 14:00:00
75%,2023-11-18 18:30:00,2023-11-18 18:00:00
max,2023-11-18 23:45:00,2023-11-18 23:00:00


In [16]:
data_types = {
    'trip_end_timestamp': 'datetime64[ns]',
    'trip_seconds': 'int32',
    'trip_miles': 'float',
    'pickup_community_area_id': 'int8',
    'dropoff_community_area_id': 'int8',
    'fare': 'float',
    'tips': 'float',
    'tolls': 'float',
    'extras': 'float',
    'trip_total': 'float',
}
taxi_trips = taxi_trips.astype(data_types)

In [17]:
taxi_trips.info()

<class 'pandas.core.frame.DataFrame'>
Index: 11312 entries, 0 to 12555
Data columns (total 20 columns):
 #   Column                      Non-Null Count  Dtype         
---  ------                      --------------  -----         
 0   trip_id                     11312 non-null  object        
 1   taxi_id                     11312 non-null  object        
 2   trip_start_timestamp        11312 non-null  datetime64[ns]
 3   trip_end_timestamp          11312 non-null  datetime64[ns]
 4   trip_seconds                11312 non-null  int32         
 5   trip_miles                  11312 non-null  float64       
 6   pickup_community_area_id    11312 non-null  int8          
 7   dropoff_community_area_id   11312 non-null  int8          
 8   fare                        11312 non-null  float64       
 9   tips                        11312 non-null  float64       
 10  tolls                       11312 non-null  float64       
 11  extras                      11312 non-null  float64       


In [18]:
# much better, the non-object columns have some basic statistics easily
taxi_trips.describe()
# calculate the memory usage of the dataframe
taxi_trips.memory_usage(deep=True).sum()
# the data type optimized memory usage is less by a 40%

8862486

#### Sanity checks

In [19]:
taxi_trips.describe()

Unnamed: 0,trip_start_timestamp,trip_end_timestamp,trip_seconds,trip_miles,pickup_community_area_id,dropoff_community_area_id,fare,tips,tolls,extras,trip_total,datetime_for_weather
count,11312,11312,11312.0,11312.0,11312.0,11312.0,11312.0,11312.0,11312.0,11312.0,11312.0,11312
mean,2023-11-18 13:56:29.480197888,2023-11-18 14:17:25.120226048,1253.707302,5.618577,30.438826,26.757249,20.698788,2.446618,0.003228,1.263562,24.579,2023-11-18 13:33:47.545968640
min,2023-11-18 00:00:00,2023-11-18 00:00:00,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,2023-11-18 00:00:00
25%,2023-11-18 10:30:00,2023-11-18 10:45:00,476.0,0.88,8.0,8.0,8.75,0.0,0.0,0.0,10.75,2023-11-18 10:00:00
50%,2023-11-18 14:15:00,2023-11-18 14:30:00,907.0,2.64,28.0,25.0,15.0,0.0,0.0,0.0,17.82,2023-11-18 14:00:00
75%,2023-11-18 18:30:00,2023-11-18 18:45:00,1638.25,9.94,39.0,33.0,30.5,3.5,0.0,1.0,32.53,2023-11-18 18:00:00
max,2023-11-18 23:45:00,2023-11-19 15:45:00,70668.0,55.18,77.0,77.0,160.0,50.0,7.0,333.34,342.09,2023-11-18 23:00:00
std,,,1891.001844,6.16162,24.383393,22.201514,15.422265,3.54126,0.110709,4.791652,18.603607,


In [20]:
taxi_trips[taxi_trips['trip_end_timestamp'] == taxi_trips['trip_end_timestamp'].max()]

Unnamed: 0,trip_id,taxi_id,trip_start_timestamp,trip_end_timestamp,trip_seconds,trip_miles,pickup_community_area_id,dropoff_community_area_id,fare,tips,tolls,extras,trip_total,payment_type,company,pickup_centroid_latitude,pickup_centroid_longitude,dropoff_centroid_latitude,dropoff_centroid_longitude,datetime_for_weather
1653,34c481d3bfe04d77ff0924366b7843bd0dfbe76b,b58ac6538fb9dd4525ded8eb5af799c465e22d2b9899c5...,2023-11-18 20:45:00,2023-11-19 15:45:00,68827,9.59,32,76,28.0,0.0,0.0,0.0,28.0,Cash,Star North Taxi Management Llc,41.880994471,-87.632746489,41.97907082,-87.903039661,2023-11-18 20:00:00


In [21]:
taxi_trips[taxi_trips['trip_seconds'] == taxi_trips['trip_seconds'].max()]

Unnamed: 0,trip_id,taxi_id,trip_start_timestamp,trip_end_timestamp,trip_seconds,trip_miles,pickup_community_area_id,dropoff_community_area_id,fare,tips,tolls,extras,trip_total,payment_type,company,pickup_centroid_latitude,pickup_centroid_longitude,dropoff_centroid_latitude,dropoff_centroid_longitude,datetime_for_weather
4692,7ca1f6a0ee9e21e4394124d616f09ff9d03781b6,b85df3fd8cdc552a74a3d9314a93209b38870f70794370...,2023-11-18 16:15:00,2023-11-19 11:45:00,70668,10.16,6,3,41.75,0.0,0.0,0.0,41.75,Cash,Blue Ribbon Taxi Association,41.944226601,-87.655998182,41.96581197,-87.655878786,2023-11-18 16:00:00


In [22]:
taxi_trips[taxi_trips['fare'] == taxi_trips['fare'].max()]

Unnamed: 0,trip_id,taxi_id,trip_start_timestamp,trip_end_timestamp,trip_seconds,trip_miles,pickup_community_area_id,dropoff_community_area_id,fare,tips,tolls,extras,trip_total,payment_type,company,pickup_centroid_latitude,pickup_centroid_longitude,dropoff_centroid_latitude,dropoff_centroid_longitude,datetime_for_weather
8578,205648e78424c07a8c55c4da169ff07a331fe5f4,ae2b1026641fe1802c5c6e0f22f718230861e60d588b0c...,2023-11-18 11:45:00,2023-11-18 11:45:00,4,0.0,76,76,160.0,32.2,0.0,0.0,193.2,Credit Card,Taxicab Insurance Agency Llc,41.97907082,-87.903039661,41.97907082,-87.903039661,2023-11-18 11:00:00
11442,c5da4c59b487d992e6a15472fa451daaac38ae6d,8a2802b8001fa243450b71eb66e6e76b4bdc4ea50cbb19...,2023-11-18 04:45:00,2023-11-18 05:00:00,1320,0.0,8,76,160.0,0.0,0.0,0.0,160.0,Cash,Taxi Affiliation Services,41.899602111,-87.633308037,41.980264315,-87.913624596,2023-11-18 04:00:00


In [23]:
taxi_trips.nlargest(10, 'fare')

Unnamed: 0,trip_id,taxi_id,trip_start_timestamp,trip_end_timestamp,trip_seconds,trip_miles,pickup_community_area_id,dropoff_community_area_id,fare,tips,tolls,extras,trip_total,payment_type,company,pickup_centroid_latitude,pickup_centroid_longitude,dropoff_centroid_latitude,dropoff_centroid_longitude,datetime_for_weather
8578,205648e78424c07a8c55c4da169ff07a331fe5f4,ae2b1026641fe1802c5c6e0f22f718230861e60d588b0c...,2023-11-18 11:45:00,2023-11-18 11:45:00,4,0.0,76,76,160.0,32.2,0.0,0.0,193.2,Credit Card,Taxicab Insurance Agency Llc,41.97907082,-87.903039661,41.97907082,-87.903039661,2023-11-18 11:00:00
11442,c5da4c59b487d992e6a15472fa451daaac38ae6d,8a2802b8001fa243450b71eb66e6e76b4bdc4ea50cbb19...,2023-11-18 04:45:00,2023-11-18 05:00:00,1320,0.0,8,76,160.0,0.0,0.0,0.0,160.0,Cash,Taxi Affiliation Services,41.899602111,-87.633308037,41.980264315,-87.913624596,2023-11-18 04:00:00
4952,15a5cc27d2a58580f397d31afdaf6a8073a0c299,42e3ec7750e4be6e56c47bcdefe5cb86ddb0d0c65bcf4d...,2023-11-18 16:00:00,2023-11-18 21:45:00,20602,25.22,33,1,154.0,0.0,0.0,1.0,155.0,Cash,Taxicab Insurance Agency Llc,41.857183858,-87.620334624,42.009622881,-87.670166857,2023-11-18 16:00:00
6386,b49b8d169754f893b12f2d24a361c62df2393283,175a4c9a16e22bfd2fee721e6cdfdc85dfde21cccb1fdc...,2023-11-18 14:15:00,2023-11-18 16:15:00,6996,55.18,4,76,136.25,0.0,0.0,0.0,136.25,Cash,Sun Taxi,41.975170943,-87.687515515,41.980264315,-87.913624596,2023-11-18 14:00:00
10482,b16cb1b8d7421676a5393e4c0a63b0ea3f7528a6,641c9356c873f4b5fb13d4b2f70d8b4d4b7b2c98057272...,2023-11-18 08:45:00,2023-11-18 13:00:00,15120,27.9,3,4,117.5,0.0,0.0,0.0,117.5,Cash,Taxi Affiliation Services,41.96581197,-87.655878786,41.975170943,-87.687515515,2023-11-18 08:00:00
7169,e58be219ddced55a91574143d52733567a62f55b,78fb99d332dd664846f7934b7292dbb205641674541ff2...,2023-11-18 13:15:00,2023-11-18 16:15:00,10500,35.3,76,76,112.75,0.0,0.0,5.0,117.75,Cash,Taxi Affiliation Services,41.97907082,-87.903039661,41.97907082,-87.903039661,2023-11-18 13:00:00
9872,461f167eb0e870b02f0bde9f144ea2528a1dc683,71ebe150c7d317935a0843c988eea605df50b3ba7cb6bf...,2023-11-18 10:00:00,2023-11-18 13:30:00,11898,32.36,33,33,108.25,16.31,0.0,0.0,125.06,Credit Card,Flash Cab,41.849246754,-87.624135298,41.849246754,-87.624135298,2023-11-18 10:00:00
1158,98a39bd90a7eb8245f0935777e13384c1f1b9f7b,0bd1d932b3086e8afc5ffa6372c26866d85ff1abaa15db...,2023-11-18 21:30:00,2023-11-18 21:30:00,7,0.0,8,8,103.0,31.2,0.0,0.0,135.2,Credit Card,Taxicab Insurance Agency Llc,41.890922026,-87.618868355,41.890922026,-87.618868355,2023-11-18 21:00:00
618,afb0332537fc2ebbe7379f45b437007c2b4ee803,cefcf124c1b1156a94b0d3c6edd07763849c63f800638f...,2023-11-18 22:30:00,2023-11-18 23:00:00,2280,1.8,76,56,100.5,0.0,0.0,8.0,108.5,Cash,Taxi Affiliation Services,41.980264315,-87.913624596,41.79259236,-87.769615453,2023-11-18 22:00:00
4679,8fc908372cde0909fe145d3f51a350783ce37d75,555c4b09482029c17187f62aae86e4bab17fd875e37fef...,2023-11-18 16:15:00,2023-11-18 17:00:00,2710,18.26,8,76,100.0,0.0,0.0,0.0,101.0,Credit Card,Sun Taxi,41.899602111,-87.633308037,41.980264315,-87.913624596,2023-11-18 16:00:00


#### Data modelling
Create master tables for the payment types and the taxi company names

In [24]:
# master table for the payment types
taxi_trips['payment_type'].unique()
payment_type_master = taxi_trips['payment_type'].drop_duplicates().reset_index(drop=True)


payment_type_master = pd.DataFrame(
    {
        'payment_type_id': range(1, len(payment_type_master)+1),    # the id's usually starts from 1 despite indeces starting from 0
        'payment_type': payment_type_master
    }
)

payment_type_master

Unnamed: 0,payment_type_id,payment_type
0,1,Mobile
1,2,Cash
2,3,Credit Card
3,4,Prcard
4,5,Unknown
5,6,No Charge
6,7,Dispute


In [25]:
# master table for the company names
taxi_trips['company'].unique()
company_master = taxi_trips['company'].drop_duplicates().reset_index(drop=True)


company_master = pd.DataFrame(
    {
        'company_id': range(1, len(company_master)+1),    # the id's usually starts from 1 despite indeces starting from 0
        'company': company_master
    }
)

company_master

Unnamed: 0,company_id,company
0,1,5 Star Taxi
1,2,Taxi Affiliation Services
2,3,Flash Cab
3,4,Sun Taxi
4,5,Taxicab Insurance Agency Llc
5,6,City Service
6,7,Medallion Leasin
7,8,Chicago Independents
8,9,"Taxicab Insurance Agency, LLC"
9,10,Choice Taxi Association


Check the memory usage if we replace the long strings with small integers

In [26]:
print(taxi_trips.memory_usage(deep=True).sum())

taxi_trips_id = taxi_trips.merge(payment_type_master, on='payment_type')
taxi_trips_id = taxi_trips_id.merge(company_master, on='company')
taxi_trips_id.drop(['payment_type', 'company'], axis=1, inplace=True)
print(taxi_trips_id.memory_usage(deep=True).sum())

8862486
7391737


In [27]:
taxi_trips_id.sample(5)

Unnamed: 0,trip_id,taxi_id,trip_start_timestamp,trip_end_timestamp,trip_seconds,trip_miles,pickup_community_area_id,dropoff_community_area_id,fare,tips,tolls,extras,trip_total,pickup_centroid_latitude,pickup_centroid_longitude,dropoff_centroid_latitude,dropoff_centroid_longitude,datetime_for_weather,payment_type_id,company_id
8547,b5d9caa6d3fc92921969477e39458e2ef2a3bd83,6161b78168fb8a51b06c3994f62522e1d21390a969e3b1...,2023-11-18 17:30:00,2023-11-18 17:45:00,1163,3.11,8,6,10.0,0.0,0.0,0.0,10.0,41.899602111,-87.633308037,41.944226601,-87.655998182,2023-11-18 17:00:00,2,19
6877,ddf1e6842d14f11cbaaf99aaf202582fd7dc966b,ba3d352fc557396edc7443cf0ad65e8593c05a838a856a...,2023-11-18 23:30:00,2023-11-18 23:45:00,801,3.81,8,6,14.21,2.71,0.0,0.0,16.92,41.899602111,-87.633308037,41.944226601,-87.655998182,2023-11-18 23:00:00,1,6
7556,9517afe073fda7309262414fe847c14506a7a9f1,b6dc3e934508cc47b6b7f2c79e29d812448cc31261390f...,2023-11-18 20:30:00,2023-11-18 21:00:00,1507,9.8,76,15,26.5,10.0,0.0,4.0,41.0,41.980264315,-87.913624596,41.954027649,-87.763399032,2023-11-18 20:00:00,3,6
54,ecfa44c78d54b4400aa2e533eb1f4deea15c8e23,03cfe911366a3ec59f30b328348efcbcb5a55aabe3befe...,2023-11-18 19:15:00,2023-11-18 19:45:00,2425,17.47,76,76,43.5,11.6,0.0,14.0,69.6,41.97907082,-87.903039661,41.97907082,-87.903039661,2023-11-18 19:00:00,1,1
6586,d441200c6ca33c02868a2b531571c6be15f01e20,d9c5bad5f5bca88a0091b90ae6946b0afcd15fbf3d9c41...,2023-11-18 16:00:00,2023-11-18 16:00:00,52,0.01,8,8,25.0,6.38,0.0,0.0,31.88,41.890922026,-87.618868355,41.890922026,-87.618868355,2023-11-18 16:00:00,3,5


In [28]:
# write the master tables to csv files
payment_type_master.to_csv(r'..\..\csv\payment_type_master.csv', index=False)
company_master.to_csv(r'..\..\csv\company_master.csv', index=False)

In [29]:
# master table for the payment types
taxi_trips['payment_type'].unique()
payment_type_master = taxi_trips['payment_type'].drop_duplicates().reset_index(drop=True)


payment_type_master = pd.DataFrame(
    {
        'payment_type_id': range(1, len(payment_type_master)+1),    # the id's usually starts from 1 despite indeces starting from 0
        'payment_type': payment_type_master
    }
)

payment_type_master

Unnamed: 0,payment_type_id,payment_type
0,1,Mobile
1,2,Cash
2,3,Credit Card
3,4,Prcard
4,5,Unknown
5,6,No Charge
6,7,Dispute


In [30]:
# how to extend the payment_type_master table if a new type of payment appears?
new_payment_type_data = [
    {'payment_type': 'Credit Card'},
    {'payment_type': 'X'},
    {'payment_type': 'Y'}
]
new_payment_type_mapping = pd.DataFrame(new_payment_type_data)

payment_type_max_id = payment_type_master['payment_type_id'].max()

new_payment_type_mapping['payment_type_id'] =\
    range(payment_type_max_id + 1,
          payment_type_max_id + len(new_payment_type_mapping) + 1)

In [34]:
new_payment_type_mapping['payment_type'].isin(payment_type_master['payment_type'])

# filter only the payment types which are missing from the payment type master
# by default the filtering select the rows for which the evaluation is true, but
# using ~ we negate the results, so this line filters out from the actual list of
# payment types which are missing from our master table
new_payment_type_mapping[~new_payment_type_mapping['payment_type'].isin(payment_type_master['payment_type'])]
new_payment_types = new_payment_type_mapping[~new_payment_type_mapping['payment_type'].isin(payment_type_master['payment_type'])]

Unnamed: 0,payment_type,payment_type_id
1,X,9
2,Y,10
