In [40]:
# some exploratory data analysis
from datetime import datetime
from dateutil.relativedelta import relativedelta
import os

import pandas as pd
# let pandas display up to 100 columns
pd.set_option('display.max_columns', 100)

import requests


In [41]:
# Fetch one full day of taxi trips from the City of Chicago open-data API.
# NOTE(review): this comment used to say "T-1 months", but the offset below is
# two months -- presumably because the data are published with a delay; confirm.
current_datetime = datetime.now()    # current date
# the day exactly two months before today, formatted as YYYY-MM-DD
formatted_datetime = (current_datetime - relativedelta(months=2)).strftime('%Y-%m-%d')

# all trips that started on that day (00:00:00 - 23:59:59), capped at 30000 rows
url = f'https://data.cityofchicago.org/resource/wrvz-psew.json?$where=trip_start_timestamp >= "{formatted_datetime}T00:00:00" AND trip_start_timestamp <= "{formatted_datetime}T23:59:59"&$limit=30000'

# os.environ.get reads the variable from the environment of the running process;
# a .env file is only taken into account if it was loaded into the environment
# beforehand (e.g. with python-dotenv).
# NOTE(review): the variable name is spelled "CHICHAGO"; it is kept unchanged so
# that existing environments keep working.
app_token = os.environ.get("CHICHAGO_API_TOKEN")
# send the token header only when a token is actually configured
headers = {'X-App-Token': app_token} if app_token else {}

# headers must be passed by keyword: the second positional argument of
# requests.get is `params`, so `requests.get(url, headers)` would append the
# token to the query string instead of sending it as a header
response = requests.get(url, headers=headers, timeout=60)
# fail right here on an HTTP error instead of parsing an error payload as trip data
response.raise_for_status()

data = response.json()

#type(response)    # we can check the type: "requests.models.Response"

In [42]:
# create a dataframe from the parsed JSON response holding the taxi trip data
# and display its first rows
taxi_trips = pd.DataFrame(data)
taxi_trips.head()

Unnamed: 0,trip_id,taxi_id,trip_start_timestamp,trip_end_timestamp,trip_seconds,trip_miles,pickup_community_area,dropoff_community_area,fare,tips,tolls,extras,trip_total,payment_type,company,pickup_centroid_latitude,pickup_centroid_longitude,pickup_centroid_location,dropoff_centroid_latitude,dropoff_centroid_longitude,dropoff_centroid_location,pickup_census_tract,dropoff_census_tract
0,026f2241beb08f90c94e191e71434f549f20e277,07c564a10fd59fb57a493d26e4f2a39f0229ac59cdbfb5...,2023-11-17T23:45:00.000,2023-11-18T00:00:00.000,392,1.7,32,8,7.5,2,0,1.5,11.5,Credit Card,Chicago Independents,41.878865584,-87.625192142,"{'type': 'Point', 'coordinates': [-87.62519214...",41.899602111,-87.633308037,"{'type': 'Point', 'coordinates': [-87.63330803...",,
1,0350fd9b9a5698c9b73e57bf5a87fbb062fd587e,317344e0629911ce4e71936079bb16b41147a62c8ccc3d...,2023-11-17T23:45:00.000,2023-11-18T00:00:00.000,642,11.71,69,24,29.75,0,0,0.0,29.75,Prcard,City Service,41.763246799,-87.616134111,"{'type': 'Point', 'coordinates': [-87.61613411...",41.901206994,-87.676355989,"{'type': 'Point', 'coordinates': [-87.67635598...",,
2,0218d519c8683c4c224ee4c7e42c829446efd9b9,a39edfa6e2ec6209ece046221d25bdb4c4fee327de239a...,2023-11-17T23:45:00.000,2023-11-18T00:15:00.000,1528,17.13,76,8,43.0,0,0,4.0,47.5,Credit Card,Sun Taxi,41.980264315,-87.913624596,"{'type': 'Point', 'coordinates': [-87.91362459...",41.899602111,-87.633308037,"{'type': 'Point', 'coordinates': [-87.63330803...",,
3,d7c6b5f394d2105e138649dec78443bb0ea07279,4d383b5d473f9402a59c74dd17d0d1c4d7b2f2c640bd06...,2023-11-17T23:45:00.000,2023-11-18T00:00:00.000,1260,0.7,43,28,30.5,0,0,0.0,30.5,Unknown,Taxi Affiliation Services,41.761577908,-87.572781987,"{'type': 'Point', 'coordinates': [-87.57278198...",41.874005383,-87.66351755,"{'type': 'Point', 'coordinates': [-87.66351754...",,
4,932b488f79d42078b3ff58b93decf6eeb29c968e,6c2706b6e0302b7ac0e10ea44c03a98c0db2151bf5d723...,2023-11-17T23:45:00.000,2023-11-17T23:45:00.000,548,1.63,8,24,7.75,15,0,0.0,23.25,Credit Card,Sun Taxi,41.899602111,-87.633308037,"{'type': 'Point', 'coordinates': [-87.63330803...",41.901206994,-87.676355989,"{'type': 'Point', 'coordinates': [-87.67635598...",,


In [43]:
# overview of the columns: non-null counts and dtypes
taxi_trips.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 19578 entries, 0 to 19577
Data columns (total 23 columns):
 #   Column                      Non-Null Count  Dtype 
---  ------                      --------------  ----- 
 0   trip_id                     19578 non-null  object
 1   taxi_id                     19578 non-null  object
 2   trip_start_timestamp        19578 non-null  object
 3   trip_end_timestamp          19578 non-null  object
 4   trip_seconds                19573 non-null  object
 5   trip_miles                  19578 non-null  object
 6   pickup_community_area       19089 non-null  object
 7   dropoff_community_area      17814 non-null  object
 8   fare                        19529 non-null  object
 9   tips                        19529 non-null  object
 10  tolls                       19529 non-null  object
 11  extras                      19529 non-null  object
 12  trip_total                  19529 non-null  object
 13  payment_type                19578 non-null  ob

In [44]:
# summary per column; for object columns describe() reports count / unique / top / freq
taxi_trips.describe()

Unnamed: 0,trip_id,taxi_id,trip_start_timestamp,trip_end_timestamp,trip_seconds,trip_miles,pickup_community_area,dropoff_community_area,fare,tips,tolls,extras,trip_total,payment_type,company,pickup_centroid_latitude,pickup_centroid_longitude,pickup_centroid_location,dropoff_centroid_latitude,dropoff_centroid_longitude,dropoff_centroid_location,pickup_census_tract,dropoff_census_tract
count,19578,19578,19578,19578,19573,19578,19089,17814,19529,19529,19529,19529,19529.0,19578,19578,19111.0,19111.0,19111,17942.0,17942.0,17942,8778,8523
unique,19578,2425,96,110,3768,2345,77,77,1416,1131,24,169,2587.0,7,32,144.0,144.0,144,181.0,181.0,181,68,111
top,026f2241beb08f90c94e191e71434f549f20e277,d40dae7ea46d61abca67eb53b157fe9cf0b485cca6dce1...,2023-11-17T17:15:00.000,2023-11-17T17:30:00.000,0,0,8,8,9,0,0,0,3.25,Credit Card,Flash Cab,41.980264315,-87.913624596,"{'type': 'Point', 'coordinates': [-87.91362459...",41.899602111,-87.633308037,"{'type': 'Point', 'coordinates': [-87.63330803...",17031980000,17031839100
freq,1,33,407,375,352,2034,4463,4544,905,9352,19434,12681,429.0,7609,3870,2031.0,2031.0,2031,1476.0,1476.0,1476,1622,1048


In [45]:
# check the rows where the fare is missing, look at (at most) 5 random samples
missing_fare_rows = taxi_trips[taxi_trips['fare'].isna()]
# sample() raises a ValueError when asked for more rows than are available, so
# the sample size is capped; the fixed random_state keeps the displayed rows
# identical between runs on the same data
missing_fare_rows.sample(min(5, len(missing_fare_rows)), random_state=42)

# in this case ask the potential users for advice on what to do

# some conclusions:
# - the trip total might be erroneous
# - the fare data are missing from a few rows
# - the payment type and company columns occupy too much space
# - the centroid locations given as Point coordinates duplicate the latitude/longitude columns
# - the census tract data are very sparse, more than half of them are missing


Unnamed: 0,trip_id,taxi_id,trip_start_timestamp,trip_end_timestamp,trip_seconds,trip_miles,pickup_community_area,dropoff_community_area,fare,tips,tolls,extras,trip_total,payment_type,company,pickup_centroid_latitude,pickup_centroid_longitude,pickup_centroid_location,dropoff_centroid_latitude,dropoff_centroid_longitude,dropoff_centroid_location,pickup_census_tract,dropoff_census_tract
14939,229e939c9c87e2146afd030d7a4e195918dc6c9e,286e28223cab05d01037fb52fead0f36d84f4b00e159c0...,2023-11-17T10:30:00.000,2023-11-17T10:30:00.000,131,0.0,56,56,,,,,,Cash,Sun Taxi,41.785998518,-87.750934289,"{'type': 'Point', 'coordinates': [-87.75093428...",41.785998518,-87.750934289,"{'type': 'Point', 'coordinates': [-87.75093428...",17031980100.0,17031980100.0
9556,4339996d80c39478ac0d8b9da3d037cb50ad6646,3671b00a8d3d78e49e75a0eb9796fbfa1ab2e07b89d7ed...,2023-11-17T15:00:00.000,2023-11-17T15:00:00.000,161,0.0,6,6,,,,,,Cash,Taxicab Insurance Agency Llc,41.949829346,-87.64396537,"{'type': 'Point', 'coordinates': [-87.64396537...",41.949829346,-87.64396537,"{'type': 'Point', 'coordinates': [-87.64396537...",17031060900.0,17031060900.0
3031,7500190024d2e1538d2db02d62840afd89b53192,ea1095a215ac661a9a0da3f84747131c7bc3e6b71308b5...,2023-11-17T19:45:00.000,2023-11-17T20:00:00.000,1271,11.44,50,35,,,,,,Cash,City Service,41.706125752,-87.598255838,"{'type': 'Point', 'coordinates': [-87.59825583...",41.835117986,-87.618677767,"{'type': 'Point', 'coordinates': [-87.61867776...",,
13528,736fa5e2d0b8616311a17f3eea3de998a213b212,87a21e5ac34dbe38d564b74375f29cdcc5f88e81aa27d9...,2023-11-17T11:45:00.000,2023-11-17T12:15:00.000,1011,2.45,6,5,,,,,,Cash,Flash Cab,41.944226601,-87.655998182,"{'type': 'Point', 'coordinates': [-87.65599818...",41.947791586,-87.683834942,"{'type': 'Point', 'coordinates': [-87.68383494...",,
11930,ad11daf68af21f3eed8506e4f30a8d1046567346,8c76eb82f069c0731a0049cb78898f02cc5ac6990244c9...,2023-11-17T13:00:00.000,2023-11-17T13:00:00.000,17,0.0,38,38,,,,,,Cash,Sun Taxi,41.812948939,-87.617859676,"{'type': 'Point', 'coordinates': [-87.61785967...",41.812948939,-87.617859676,"{'type': 'Point', 'coordinates': [-87.61785967...",,


#### Transformation: deal with the NaN values

In [46]:
# Order matters here: the two census tract columns are mostly empty, so they are
# removed first; only afterwards are the rows dropped that still contain a NaN
# in any of the remaining columns.
taxi_trips = (
    taxi_trips
    .drop(columns=['pickup_census_tract', 'dropoff_census_tract'])
    .dropna()
)

#### Transformation: drop the unnecessary duplicated columns

In [47]:
# the Point-style location columns repeat the separate latitude/longitude columns
taxi_trips = taxi_trips.drop(columns=['pickup_centroid_location', 'dropoff_centroid_location'])

#### Transformation: renaming

In [48]:
# append the "_id" suffix to both community area columns
taxi_trips = taxi_trips.rename(
    columns={
        column_name: f"{column_name}_id"
        for column_name in ("pickup_community_area", "dropoff_community_area")
    }
)

#### Transformation: an auxiliary column for joining the weather data

In [49]:
# Parse the trip start into a real datetime, then derive the join key for the
# hourly weather data by rounding the start time down to the full hour.
# assign() evaluates its keyword arguments in order, so the second lambda already
# sees the converted trip_start_timestamp column.
# NOTE(review): newer pandas releases deprecate the upper-case 'H' alias in
# favour of 'h' -- confirm against the pandas version in use.
taxi_trips = taxi_trips.assign(
    trip_start_timestamp=lambda trips: pd.to_datetime(trips['trip_start_timestamp']),
    datetime_for_weather=lambda trips: trips['trip_start_timestamp'].dt.floor('H'),
)

#### Check joining the trips and the weather data

In [50]:
# get the hourly weather data from the open-meteo.com archive API and create a
# dataframe out of it
url = 'https://archive-api.open-meteo.com/v1/era5'
# `formatted_datetime` is the day that was used for the taxi trip query above;
# it is reused instead of being recomputed from a second datetime.now() call, so
# both requests are guaranteed to refer to the same day
params = {
    'latitude': 41.85,
    'longitude': -87.65,
    'start_date': formatted_datetime,
    'end_date': formatted_datetime,
    'hourly': 'temperature_2m,wind_speed_10m,rain,precipitation'
    }
# NOTE(review): no 'timezone' parameter is sent, so the API default applies to
# the returned hourly timestamps -- confirm that it matches the time zone of the
# taxi timestamps before relying on the join.

response = requests.get(url, params=params, timeout=60)
# fail right here on an HTTP error; otherwise an error payload would only
# surface below as a KeyError on 'hourly'
response.raise_for_status()
weather_data = response.json()

# pick the hourly series out of the parsed JSON and give them shorter names
hourly_weather = weather_data['hourly']
weather_data_filtered = {
    'datetime': hourly_weather['time'],
    'temperature': hourly_weather['temperature_2m'],
    'wind_speed': hourly_weather['wind_speed_10m'],
    'rain': hourly_weather['rain'],
    'precipitation': hourly_weather['precipitation']
    }

weather_df = pd.DataFrame(weather_data_filtered)
# the timestamps arrive as strings; convert them so they can be joined on
weather_df['datetime'] = pd.to_datetime(weather_df['datetime'])
weather_df.info()
weather_df.head()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 24 entries, 0 to 23
Data columns (total 5 columns):
 #   Column         Non-Null Count  Dtype         
---  ------         --------------  -----         
 0   datetime       24 non-null     datetime64[ns]
 1   temperature    24 non-null     float64       
 2   wind_speed     24 non-null     float64       
 3   rain           24 non-null     float64       
 4   precipitation  24 non-null     float64       
dtypes: datetime64[ns](1), float64(4)
memory usage: 1.1 KB


Unnamed: 0,datetime,temperature,wind_speed,rain,precipitation
0,2023-11-17 00:00:00,15.5,30.5,0.0,0.0
1,2023-11-17 01:00:00,15.2,30.3,0.0,0.0
2,2023-11-17 02:00:00,15.0,31.6,0.0,0.0
3,2023-11-17 03:00:00,15.2,31.4,0.0,0.0
4,2023-11-17 04:00:00,15.2,29.6,0.0,0.0


In [51]:
# attach the hourly weather record to every trip (merge defaults to an inner join)
taxi_trips_with_weather = pd.merge(
    taxi_trips,
    weather_df,
    left_on='datetime_for_weather',
    right_on='datetime',
)


In [52]:
#taxi_trips_with_weather.head()

#### Transformation: Data type conversions

In [53]:
# describe() characterizes only the columns of the dataframe that are not of
# object dtype (at this point just the two datetime columns)
taxi_trips.describe()

Unnamed: 0,trip_start_timestamp,datetime_for_weather
count,17654,17654
mean,2023-11-17 14:20:29.585362944,2023-11-17 13:58:14.777387520
min,2023-11-17 00:00:00,2023-11-17 00:00:00
25%,2023-11-17 10:45:00,2023-11-17 10:00:00
50%,2023-11-17 14:45:00,2023-11-17 14:00:00
75%,2023-11-17 18:15:00,2023-11-17 18:00:00
max,2023-11-17 23:45:00,2023-11-17 23:00:00


In [54]:
# Target dtype per column; columns that are not listed here keep their current dtype.
data_types = {
    'trip_end_timestamp': 'datetime64[ns]',
    'trip_seconds': 'int32',
    # both community area id columns share the smallest integer type
    **dict.fromkeys(['pickup_community_area_id', 'dropoff_community_area_id'], 'int8'),
    # the distance and all money amounts become floats
    **dict.fromkeys(['trip_miles', 'fare', 'tips', 'tolls', 'extras', 'trip_total'], 'float'),
}
taxi_trips = taxi_trips.astype(data_types)

In [55]:
# verify the dtypes after the conversion
taxi_trips.info()

<class 'pandas.core.frame.DataFrame'>
Index: 17654 entries, 0 to 19577
Data columns (total 20 columns):
 #   Column                      Non-Null Count  Dtype         
---  ------                      --------------  -----         
 0   trip_id                     17654 non-null  object        
 1   taxi_id                     17654 non-null  object        
 2   trip_start_timestamp        17654 non-null  datetime64[ns]
 3   trip_end_timestamp          17654 non-null  datetime64[ns]
 4   trip_seconds                17654 non-null  int32         
 5   trip_miles                  17654 non-null  float64       
 6   pickup_community_area_id    17654 non-null  int8          
 7   dropoff_community_area_id   17654 non-null  int8          
 8   fare                        17654 non-null  float64       
 9   tips                        17654 non-null  float64       
 10  tolls                       17654 non-null  float64       
 11  extras                      17654 non-null  float64       


In [56]:
# much better: the non-object columns now get some basic statistics easily
# NOTE: only the last expression of a cell is rendered, so the result of this
# describe() call is not displayed here
taxi_trips.describe()
# calculate the memory usage of the dataframe in bytes (deep=True also counts
# the contents of the object columns)
taxi_trips.memory_usage(deep=True).sum()
# according to the author, the memory usage with the optimized data types is
# about 40% lower -- the earlier figure is not shown here

13843002

#### Sanity checks

In [57]:
# summary statistics of the typed columns, to spot implausible minima / maxima
taxi_trips.describe()

Unnamed: 0,trip_start_timestamp,trip_end_timestamp,trip_seconds,trip_miles,pickup_community_area_id,dropoff_community_area_id,fare,tips,tolls,extras,trip_total,datetime_for_weather
count,17654,17654,17654.0,17654.0,17654.0,17654.0,17654.0,17654.0,17654.0,17654.0,17654.0,17654
mean,2023-11-17 14:20:29.585362944,2023-11-17 14:41:29.860655104,1260.984706,5.579093,31.7532,25.902628,20.265285,2.684106,0.011089,1.18082,24.322406,2023-11-17 13:58:14.777387520
min,2023-11-17 00:00:00,2023-11-17 00:00:00,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,2023-11-17 00:00:00
25%,2023-11-17 10:45:00,2023-11-17 11:00:00,444.0,0.82,8.0,8.0,7.5,0.0,0.0,0.0,9.75,2023-11-17 10:00:00
50%,2023-11-17 14:45:00,2023-11-17 15:00:00,840.0,2.16,32.0,28.0,12.75,1.11,0.0,0.0,15.25,2023-11-17 14:00:00
75%,2023-11-17 18:15:00,2023-11-17 18:45:00,1738.75,10.29,41.0,32.0,31.25,3.9,0.0,1.0,34.75,2023-11-17 18:00:00
max,2023-11-17 23:45:00,2023-11-18 14:30:00,76783.0,63.96,77.0,77.0,998.0,50.0,45.0,71.5,1002.0,2023-11-17 23:00:00
std,,,1768.094215,6.387105,24.480231,20.96462,17.909156,3.754781,0.509658,3.271489,21.116257,


In [58]:
# the trip(s) with the latest end timestamp
taxi_trips.loc[lambda trips: trips['trip_end_timestamp'] == trips['trip_end_timestamp'].max()]

Unnamed: 0,trip_id,taxi_id,trip_start_timestamp,trip_end_timestamp,trip_seconds,trip_miles,pickup_community_area_id,dropoff_community_area_id,fare,tips,tolls,extras,trip_total,payment_type,company,pickup_centroid_latitude,pickup_centroid_longitude,dropoff_centroid_latitude,dropoff_centroid_longitude,datetime_for_weather
5307,2aea07968fb3b9578cc94258fdb1c39e175f6294,214fcdd082c941bd6273631b0ff3cedc4bd5307f719e73...,2023-11-17 18:00:00,2023-11-18 14:30:00,73929,14.78,76,76,44.5,0.0,0.0,6.0,50.5,Cash,Chicago Independents,41.97907082,-87.903039661,41.97907082,-87.903039661,2023-11-17 18:00:00


In [59]:
# the trip(s) with the longest duration in seconds
taxi_trips.loc[lambda trips: trips['trip_seconds'] == trips['trip_seconds'].max()]

Unnamed: 0,trip_id,taxi_id,trip_start_timestamp,trip_end_timestamp,trip_seconds,trip_miles,pickup_community_area_id,dropoff_community_area_id,fare,tips,tolls,extras,trip_total,payment_type,company,pickup_centroid_latitude,pickup_centroid_longitude,dropoff_centroid_latitude,dropoff_centroid_longitude,datetime_for_weather
16321,231241bc25b14de9e3475422f78e3a5b8d08ca35,ee0a945778a6e3d37f8d6473b50fd38f0c7f24d05ead76...,2023-11-17 09:15:00,2023-11-18 06:30:00,76783,12.54,3,27,33.0,0.0,0.0,0.0,33.0,Cash,Flash Cab,41.96581197,-87.655878786,41.878914496,-87.70589713,2023-11-17 09:00:00


In [60]:
# the trip(s) with the highest fare
taxi_trips.loc[lambda trips: trips['fare'] == trips['fare'].max()]

Unnamed: 0,trip_id,taxi_id,trip_start_timestamp,trip_end_timestamp,trip_seconds,trip_miles,pickup_community_area_id,dropoff_community_area_id,fare,tips,tolls,extras,trip_total,payment_type,company,pickup_centroid_latitude,pickup_centroid_longitude,dropoff_centroid_latitude,dropoff_centroid_longitude,datetime_for_weather
9846,9e999e78ba0354b42317d60416f27fe5ac555730,c85b14c46a0a7fc5391cdfbf0a516073a276dca0cf6a98...,2023-11-17 14:45:00,2023-11-17 14:45:00,120,0.0,32,32,998.0,0.0,4.0,0.0,1002.0,Dispute,Chicago Independents,41.880994471,-87.632746489,41.87101588,-87.631406525,2023-11-17 14:00:00


In [61]:
# the 10 trips with the highest fare, largest first
taxi_trips.nlargest(10, 'fare')

Unnamed: 0,trip_id,taxi_id,trip_start_timestamp,trip_end_timestamp,trip_seconds,trip_miles,pickup_community_area_id,dropoff_community_area_id,fare,tips,tolls,extras,trip_total,payment_type,company,pickup_centroid_latitude,pickup_centroid_longitude,dropoff_centroid_latitude,dropoff_centroid_longitude,datetime_for_weather
9846,9e999e78ba0354b42317d60416f27fe5ac555730,c85b14c46a0a7fc5391cdfbf0a516073a276dca0cf6a98...,2023-11-17 14:45:00,2023-11-17 14:45:00,120,0.0,32,32,998.0,0.0,4.0,0.0,1002.0,Dispute,Chicago Independents,41.880994471,-87.632746489,41.87101588,-87.631406525,2023-11-17 14:00:00
19093,5fb88a38fbbd3279f113e1892f55b99132d42dd9,5cffb97a6fd517bd5bc14f2d55ded4698066eb285e8c02...,2023-11-17 02:00:00,2023-11-17 02:00:00,62,0.0,6,6,300.0,0.0,0.0,0.0,300.5,Credit Card,5 Star Taxi,41.944226601,-87.655998182,41.944226601,-87.655998182,2023-11-17 02:00:00
16308,2bff74adc3c18df92d5dcf705d228bb50cdebe3f,829dced7036593da33254d88a2d065c1698fc377e76c93...,2023-11-17 09:15:00,2023-11-17 14:00:00,17400,0.0,38,76,296.75,0.0,0.0,0.0,296.75,Cash,Taxi Affiliation Services,41.812948939,-87.617859676,41.980264315,-87.913624596,2023-11-17 09:00:00
13279,3684eb84e3889fd03b7c38f372849ad2aa5e7925,175a4c9a16e22bfd2fee721e6cdfdc85dfde21cccb1fdc...,2023-11-17 12:00:00,2023-11-17 16:15:00,15032,63.96,76,3,195.25,0.0,0.0,4.0,199.25,Cash,Sun Taxi,41.980264315,-87.913624596,41.96581197,-87.655878786,2023-11-17 12:00:00
17294,47bc3dfa7ccff74aa1f70bcac7bb3a94980d4d5c,42e3ec7750e4be6e56c47bcdefe5cb86ddb0d0c65bcf4d...,2023-11-17 08:15:00,2023-11-17 15:30:00,26366,30.48,28,76,189.75,0.0,0.0,0.0,189.75,Cash,Taxicab Insurance Agency Llc,41.879255084,-87.642648998,41.97907082,-87.903039661,2023-11-17 08:00:00
14909,3f35a2d3a6133ce4ad62637da7b52afc82974572,3c0161e62a7344b442d21a208d41053ee0bf1470d74cbd...,2023-11-17 10:30:00,2023-11-17 10:30:00,125,0.16,76,76,180.0,0.0,0.0,0.0,181.0,Credit Card,Sun Taxi,41.97907082,-87.903039661,41.97907082,-87.903039661,2023-11-17 10:00:00
13668,dbb37ed59eac46103aee94ddfedd221b8ab8eec8,e07a6c4140c6ab966c27183ca8096da2fa61b2f3174cf1...,2023-11-17 11:30:00,2023-11-17 15:45:00,15660,2.1,29,71,142.25,0.0,0.0,0.0,142.25,Cash,Taxi Affiliation Services,41.860190019,-87.7172201,41.744205146,-87.656305986,2023-11-17 11:00:00
13892,f6bae1a3b6b36cb79672c6598deadcba4368c105,6c6606251e8d2b1609f34d755bf884c4d972ab44b47bd7...,2023-11-17 11:30:00,2023-11-17 16:15:00,16624,25.95,28,22,127.25,0.0,0.0,0.0,127.25,Cash,Flash Cab,41.874005383,-87.66351755,41.92276062,-87.699155343,2023-11-17 11:00:00
6507,f5098cec64ebe443224d740492a471da16c4f8c0,42e3ec7750e4be6e56c47bcdefe5cb86ddb0d0c65bcf4d...,2023-11-17 17:00:00,2023-11-17 20:30:00,12668,36.46,32,8,127.0,0.0,0.0,0.0,127.0,Cash,Taxicab Insurance Agency Llc,41.884987192,-87.620992913,41.892042136,-87.63186395,2023-11-17 17:00:00
4358,fd4eb76cee0e424ff8894e0129b3b13619cbb967,e8d374b4e7bc344add5893f1a1ae3b611823439ac1caf0...,2023-11-17 18:30:00,2023-11-17 21:15:00,10500,47.6,76,8,126.5,0.0,0.0,4.0,130.5,Credit Card,Taxi Affiliation Services,41.97907082,-87.903039661,41.89503345,-87.619710672,2023-11-17 18:00:00


#### Data modelling
Create master tables for the payment types and the taxi company names

In [62]:
# Master table for the payment types: one row per distinct payment type, in
# order of first appearance, with a surrogate id that starts at 1 (ids usually
# start from 1 even though the index starts from 0).
# NOTE(review): the ids follow the order of appearance in this particular
# extract, so a different day of data may number the same values differently.
payment_type_master = (
    taxi_trips[['payment_type']]
    .drop_duplicates()
    .reset_index(drop=True)
    .assign(payment_type_id=lambda master: range(1, len(master) + 1))
    [['payment_type_id', 'payment_type']]
)

payment_type_master

Unnamed: 0,payment_type_id,payment_type
0,1,Credit Card
1,2,Prcard
2,3,Unknown
3,4,Mobile
4,5,Cash
5,6,Dispute
6,7,No Charge


In [63]:
# Master table for the company names: one row per distinct company, in order of
# first appearance, with a surrogate id that starts at 1 (ids usually start from
# 1 even though the index starts from 0).
# NOTE(review): the ids follow the order of appearance in this particular
# extract, so a different day of data may number the same values differently.
company_master = (
    taxi_trips[['company']]
    .drop_duplicates()
    .reset_index(drop=True)
    .assign(company_id=lambda master: range(1, len(master) + 1))
    [['company_id', 'company']]
)

company_master

Unnamed: 0,company_id,company
0,1,Chicago Independents
1,2,City Service
2,3,Sun Taxi
3,4,Taxi Affiliation Services
4,5,Flash Cab
5,6,5 Star Taxi
6,7,Globe Taxi
7,8,Blue Ribbon Taxi Association
8,9,Taxicab Insurance Agency Llc
9,10,Top Cab


Check the memory usage if we replace the long strings with small integers

In [64]:
# memory footprint (bytes) with the text columns still in place
print(taxi_trips.memory_usage(deep=True).sum())

# look up the ids in both master tables, then remove the text columns they replace
taxi_trips_id = (
    taxi_trips
    .merge(payment_type_master, on='payment_type')
    .merge(company_master, on='company')
    .drop(columns=['payment_type', 'company'])
)
# memory footprint (bytes) once the strings are replaced by integer ids
print(taxi_trips_id.memory_usage(deep=True).sum())

13843002
11535022


In [65]:
# spot-check (at most) 5 random rows of the id-based table; the sample size is
# capped because sample() raises a ValueError when asked for more rows than
# exist, and the fixed random_state makes the displayed rows reproducible
taxi_trips_id.sample(min(5, len(taxi_trips_id)), random_state=42)

Unnamed: 0,trip_id,taxi_id,trip_start_timestamp,trip_end_timestamp,trip_seconds,trip_miles,pickup_community_area_id,dropoff_community_area_id,fare,tips,tolls,extras,trip_total,pickup_centroid_latitude,pickup_centroid_longitude,dropoff_centroid_latitude,dropoff_centroid_longitude,datetime_for_weather,payment_type_id,company_id
13236,485fb4dc90fb418e83615f164ef13eb93e8fd6e1,b875e9e053d893ee490e723c96773ed5f81c0a2339545f...,2023-11-17 04:00:00,2023-11-17 04:00:00,360,0.65,32,8,9.0,2.22,0.0,0.0,11.22,41.878865584,-87.625192142,41.899602111,-87.633308037,2023-11-17 04:00:00,4,9
774,4123d8a659932f6cffb08e8ed584dc34772cd480,4efb4a59d8c600a9d2a8d14362769bed234568a573dc5f...,2023-11-17 09:15:00,2023-11-17 09:30:00,275,1.18,8,32,9.0,2.22,0.0,0.0,11.22,41.900265687,-87.63210922,41.880994471,-87.632746489,2023-11-17 09:00:00,4,1
15811,fa68690fbc3afab8c1a82be218c204bc7af6f52a,e284ad80755a7bb4885cc8221c0b4b61099895f85f2d16...,2023-11-17 15:00:00,2023-11-17 15:00:00,420,1.7,8,32,7.75,2.0,0.0,0.0,9.75,41.890922026,-87.618868355,41.880994471,-87.632746489,2023-11-17 15:00:00,1,13
6321,ee5b8ba3d0c88dd86823422be9d195dc8c2e510f,ff60dabe17243a25435dcaf430a8b31615374bfc6be841...,2023-11-17 02:15:00,2023-11-17 02:15:00,346,0.0,32,32,9.0,2.22,0.0,0.0,11.22,41.878865584,-87.625192142,41.878865584,-87.625192142,2023-11-17 02:00:00,4,5
3814,998e03c35bbf5fbbd9e5f919cb956a715bf262c1,303612a451e04af3d6647e3785c9609872e76f12aeda19...,2023-11-17 14:15:00,2023-11-17 14:15:00,14,0.08,22,76,3.25,0.0,0.0,0.0,3.25,41.92276062,-87.699155343,41.980264315,-87.913624596,2023-11-17 14:00:00,5,6


In [None]:
# write the master tables to csv files in the "csv" folder two levels up
# The paths are built with os.path.join: a hard-coded backslash is a path
# separator only on Windows, elsewhere it would become part of the file name.
csv_dir = os.path.join(os.pardir, os.pardir, 'csv')
payment_type_master.to_csv(os.path.join(csv_dir, 'payment_type_master.csv'), index=False)
company_master.to_csv(os.path.join(csv_dir, 'company_master.csv'), index=False)