In [5]:
# some exploratory data analysis
from datetime import datetime
from dateutil.relativedelta import relativedelta
import os

import pandas as pd
# set pandas to show 100 the columns
pd.set_option('display.max_columns', 100)

import requests


In [6]:
# get the taxi data of the last full month (T-1 months') data
current_datetime = datetime.now()    # current date
# get the data two months before as a formatted string
formatted_datetime = (current_datetime - relativedelta(months=2)).strftime('%Y-%m-%d')

url = f'https://data.cityofchicago.org/resource/wrvz-psew.json?$where=trip_start_timestamp >= "{formatted_datetime}T00:00:00" AND trip_start_timestamp <= "{formatted_datetime}T23:59:59"&$limit=30000'

# the os.environ.get looks for the specified variable in the .env file of the root
# folder of the project
headers = {'X-App-Token': os.environ.get("CHICHAGO_API_TOKEN")}

# response = requests.get(url, headers)    # in case an error comes leave the headers parameter
response = requests.get(url)

data = response.json()

#type(response)    # we can check the type: "requests.models.Response"

In [7]:
# create a dataframe from the taxi trip data
taxi_trips = pd.DataFrame(data)
taxi_trips.head()

Unnamed: 0,trip_id,taxi_id,trip_start_timestamp,trip_end_timestamp,trip_seconds,trip_miles,pickup_community_area,dropoff_community_area,fare,tips,tolls,extras,trip_total,payment_type,company,pickup_centroid_latitude,pickup_centroid_longitude,pickup_centroid_location,dropoff_centroid_latitude,dropoff_centroid_longitude,dropoff_centroid_location,pickup_census_tract,dropoff_census_tract
0,53b58ec6b718e41547ed4cb999e2463f808328e6,11bb28dc5075f790bd4529d80a571002aeb69fd4145015...,2023-11-15T23:45:00.000,2023-11-16T00:00:00.000,848,8.81,32,77,23.75,0.0,0,0,23.75,Cash,City Service,41.878865584,-87.625192142,"{'type': 'Point', 'coordinates': [-87.62519214...",41.9867118,-87.663416405,"{'type': 'Point', 'coordinates': [-87.66341640...",,
1,5848760aa67e26ebc7c56743886870d7cd779b9f,eb7536e0b280d7842bd4448d3fc93ca078630c2f39f122...,2023-11-15T23:45:00.000,2023-11-15T23:45:00.000,460,3.77,32,24,11.09,1.97,0,0,13.06,Mobile,City Service,41.878865584,-87.625192142,"{'type': 'Point', 'coordinates': [-87.62519214...",41.901206994,-87.676355989,"{'type': 'Point', 'coordinates': [-87.67635598...",,
2,59e734dccde2f3bceff4fbbaec7c289675841fbe,31d18c12a75eaeb647e36a502a0c3e548fe4e08ed6f7f4...,2023-11-15T23:45:00.000,2023-11-16T00:00:00.000,1012,9.86,22,76,27.0,0.0,0,0,27.0,Prcard,5 Star Taxi,41.92276062,-87.699155343,"{'type': 'Point', 'coordinates': [-87.69915534...",41.980264315,-87.913624596,"{'type': 'Point', 'coordinates': [-87.91362459...",,
3,62b0b08465fe81cacea05fc468fbdc2cedf4f8d6,d1a7c7e8e9cf388f9923e529d82166c5a3baf4262ec914...,2023-11-15T23:45:00.000,2023-11-16T00:00:00.000,660,2.3,8,24,9.75,0.0,0,0,9.75,Cash,Chicago Independents,41.899602111,-87.633308037,"{'type': 'Point', 'coordinates': [-87.63330803...",41.901206994,-87.676355989,"{'type': 'Point', 'coordinates': [-87.67635598...",,
4,622ddd3d365c173e72a4334d22f4a87799fe2390,6b8bec9d4caa49041a1f5d31240ff8fa2e4c571aec8921...,2023-11-15T23:45:00.000,2023-11-15T23:45:00.000,14,0.0,8,8,17.0,0.05,0,0,17.55,Credit Card,City Service,41.899602111,-87.633308037,"{'type': 'Point', 'coordinates': [-87.63330803...",41.899602111,-87.633308037,"{'type': 'Point', 'coordinates': [-87.63330803...",,


In [8]:
taxi_trips.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 20959 entries, 0 to 20958
Data columns (total 23 columns):
 #   Column                      Non-Null Count  Dtype 
---  ------                      --------------  ----- 
 0   trip_id                     20959 non-null  object
 1   taxi_id                     20959 non-null  object
 2   trip_start_timestamp        20959 non-null  object
 3   trip_end_timestamp          20958 non-null  object
 4   trip_seconds                20955 non-null  object
 5   trip_miles                  20959 non-null  object
 6   pickup_community_area       20369 non-null  object
 7   dropoff_community_area      19049 non-null  object
 8   fare                        20905 non-null  object
 9   tips                        20905 non-null  object
 10  tolls                       20905 non-null  object
 11  extras                      20905 non-null  object
 12  trip_total                  20905 non-null  object
 13  payment_type                20959 non-null  ob

In [9]:
taxi_trips.describe()

Unnamed: 0,trip_id,taxi_id,trip_start_timestamp,trip_end_timestamp,trip_seconds,trip_miles,pickup_community_area,dropoff_community_area,fare,tips,tolls,extras,trip_total,payment_type,company,pickup_centroid_latitude,pickup_centroid_longitude,pickup_centroid_location,dropoff_centroid_latitude,dropoff_centroid_longitude,dropoff_centroid_location,pickup_census_tract,dropoff_census_tract
count,20959,20959,20959,20958,20955,20959,20369,19049,20905,20905,20905,20905,20905.0,20959,20959,20374.0,20374.0,20374,19173.0,19173.0,19173,10466,10120
unique,20959,2469,96,113,3890,2409,77,77,1353,1157,24,186,2659.0,7,32,139.0,139.0,139,179.0,179.0,179,63,110
top,53b58ec6b718e41547ed4cb999e2463f808328e6,3671b00a8d3d78e49e75a0eb9796fbfa1ab2e07b89d7ed...,2023-11-15T17:00:00.000,2023-11-15T18:15:00.000,0,0,76,8,9,0,0,0,3.25,Credit Card,Flash Cab,41.97907082,-87.903039661,"{'type': 'Point', 'coordinates': [-87.90303966...",41.880994471,-87.632746489,"{'type': 'Point', 'coordinates': [-87.63274648...",17031980000,17031839100
freq,1,37,412,399,407,2207,4499,4433,852,9256,20798,13737,465.0,9127,4172,2525.0,2525.0,2525,1508.0,1508.0,1508,2525,1508


In [10]:
# check rows where the fare is null, look at just 5 random samples
taxi_trips[taxi_trips['fare'].isna()].sample(5)

# in this case ask for advice from the potential users what to do

# some conclusions:
# - total trip might be erronous
# - fare data are missing from a few raws
# - the payment type and company occupies too much space
# - the centroid locations as Point coordinates are duplicates
# - the census tract data are very sparse, more than half of them are missing


Unnamed: 0,trip_id,taxi_id,trip_start_timestamp,trip_end_timestamp,trip_seconds,trip_miles,pickup_community_area,dropoff_community_area,fare,tips,tolls,extras,trip_total,payment_type,company,pickup_centroid_latitude,pickup_centroid_longitude,pickup_centroid_location,dropoff_centroid_latitude,dropoff_centroid_longitude,dropoff_centroid_location,pickup_census_tract,dropoff_census_tract
7807,3b4f54878a5fab55ba228272de8f0b5d75378ff6,188e63748dd6e23e0c2c8b9d331fd6e1b6f6dba8693e35...,2023-11-15T16:15:00.000,2023-11-15T16:15:00.000,277,0.72,8,8,,,,,,Cash,City Service,41.892507781,-87.626214906,"{'type': 'Point', 'coordinates': [-87.62621490...",41.900265687,-87.63210922,"{'type': 'Point', 'coordinates': [-87.63210921...",17031081500.0,17031081000.0
13453,93da84fbfde7c9fe15ee2609f836e9d089702bc8,87a21e5ac34dbe38d564b74375f29cdcc5f88e81aa27d9...,2023-11-15T12:15:00.000,2023-11-15T12:30:00.000,685,4.37,6,8,,,,,,Cash,Flash Cab,41.944226601,-87.655998182,"{'type': 'Point', 'coordinates': [-87.65599818...",41.899602111,-87.633308037,"{'type': 'Point', 'coordinates': [-87.63330803...",,
14519,7f5d5204ba788b64ce819ce5140281410ca15b41,c797f1560410b9db343567ea7c8e4095f66ceb65800fa4...,2023-11-15T11:30:00.000,2023-11-15T11:45:00.000,762,0.66,6,6,,,,,,Cash,Flash Cab,41.944226601,-87.655998182,"{'type': 'Point', 'coordinates': [-87.65599818...",41.944226601,-87.655998182,"{'type': 'Point', 'coordinates': [-87.65599818...",,
19149,fe5679fe5310c2175431d826691af146f5b3d67f,3671b00a8d3d78e49e75a0eb9796fbfa1ab2e07b89d7ed...,2023-11-15T07:30:00.000,2023-11-15T07:45:00.000,197,0.41,32,32,,,,,,Cash,Taxicab Insurance Agency Llc,41.880994471,-87.632746489,"{'type': 'Point', 'coordinates': [-87.63274648...",41.880994471,-87.632746489,"{'type': 'Point', 'coordinates': [-87.63274648...",17031839100.0,17031839100.0
12910,027916e052ab3db81c214d23bf7e1fda3703fcf9,5e48748b582fe3c16baf5ac371a27773e8aef2927f3344...,2023-11-15T12:45:00.000,2023-11-15T13:00:00.000,241,0.42,32,32,,,,,,Cash,Sun Taxi,41.877406123,-87.621971652,"{'type': 'Point', 'coordinates': [-87.62197165...",41.880994471,-87.632746489,"{'type': 'Point', 'coordinates': [-87.63274648...",17031320400.0,17031839100.0


#### Transformation: deal with the NaN values

In [11]:
taxi_trips.drop(['pickup_census_tract', 'dropoff_census_tract'], axis=1, inplace=True)

#### Transformation: drop the unnecessary duplicated columns

In [12]:
taxi_trips.drop(['pickup_centroid_location', 'dropoff_centroid_location'], axis=1, inplace=True)

#### Transformation: renaming

In [13]:
taxi_trips.rename(columns={"pickup_community_area": "pickup_community_area_id",
                           "dropoff_community_area": "dropoff_community_area_id"},
                           inplace=True)

Transformation: an auxilary column to the weather data

In [14]:
taxi_trips['trip_start_timestamp'] = pd.to_datetime(taxi_trips['trip_start_timestamp'])
taxi_trips['datetime_for_weather'] = taxi_trips['trip_start_timestamp'].dt.floor('H')

#### Check joining the trips and the weather data

In [15]:
# get weather data from the open-meteo.com site, and create a dataframe out if it
url = 'https://archive-api.open-meteo.com/v1/era5'
# current date
current_datetime = datetime.now()
# get the data two months before as a formatted string
formatted_datetime = (current_datetime - relativedelta(months=2)).strftime('%Y-%m-%d')
params = {
    'latitude': 41.85,
    'longitude': -87.65,
    'start_date': formatted_datetime,
    'end_date': formatted_datetime,
    'hourly': 'temperature_2m,wind_speed_10m,rain,precipitation'
    }

response = requests.get(url, params=params)
weather_data = response.json()

# transform the JSON string to a pd dataframe
weather_data_filtered = {
    'datetime': weather_data['hourly']['time'],
    'temperature': weather_data['hourly']['temperature_2m'],
    'wind_speed': weather_data['hourly']['wind_speed_10m'],
    'rain': weather_data['hourly']['rain'],
    'precipitation': weather_data['hourly']['precipitation']
    }

#weather_data_filtered
weather_df = pd.DataFrame(weather_data_filtered)
weather_df['datetime'] = pd.to_datetime(weather_df['datetime'])
weather_df.info()
weather_df.head()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 24 entries, 0 to 23
Data columns (total 5 columns):
 #   Column         Non-Null Count  Dtype         
---  ------         --------------  -----         
 0   datetime       24 non-null     datetime64[ns]
 1   temperature    24 non-null     float64       
 2   wind_speed     24 non-null     float64       
 3   rain           24 non-null     float64       
 4   precipitation  24 non-null     float64       
dtypes: datetime64[ns](1), float64(4)
memory usage: 1.1 KB


Unnamed: 0,datetime,temperature,wind_speed,rain,precipitation
0,2023-11-15 00:00:00,7.5,15.8,0.0,0.0
1,2023-11-15 01:00:00,8.7,19.5,0.0,0.0
2,2023-11-15 02:00:00,8.5,20.6,0.0,0.0
3,2023-11-15 03:00:00,7.9,18.8,0.0,0.0
4,2023-11-15 04:00:00,6.7,15.6,0.0,0.0


In [16]:
taxi_trips_with_weather = taxi_trips.merge(weather_df, left_on='datetime_for_weather', right_on='datetime')


In [17]:
taxi_trips_with_weather.head()

Unnamed: 0,trip_id,taxi_id,trip_start_timestamp,trip_end_timestamp,trip_seconds,trip_miles,pickup_community_area_id,dropoff_community_area_id,fare,tips,tolls,extras,trip_total,payment_type,company,pickup_centroid_latitude,pickup_centroid_longitude,dropoff_centroid_latitude,dropoff_centroid_longitude,datetime_for_weather,datetime,temperature,wind_speed,rain,precipitation
0,53b58ec6b718e41547ed4cb999e2463f808328e6,11bb28dc5075f790bd4529d80a571002aeb69fd4145015...,2023-11-15 23:45:00,2023-11-16T00:00:00.000,848,8.81,32,77,23.75,0.0,0,0,23.75,Cash,City Service,41.878865584,-87.625192142,41.9867118,-87.663416405,2023-11-15 23:00:00,2023-11-15 23:00:00,10.7,12.2,0.0,0.0
1,5848760aa67e26ebc7c56743886870d7cd779b9f,eb7536e0b280d7842bd4448d3fc93ca078630c2f39f122...,2023-11-15 23:45:00,2023-11-15T23:45:00.000,460,3.77,32,24,11.09,1.97,0,0,13.06,Mobile,City Service,41.878865584,-87.625192142,41.901206994,-87.676355989,2023-11-15 23:00:00,2023-11-15 23:00:00,10.7,12.2,0.0,0.0
2,59e734dccde2f3bceff4fbbaec7c289675841fbe,31d18c12a75eaeb647e36a502a0c3e548fe4e08ed6f7f4...,2023-11-15 23:45:00,2023-11-16T00:00:00.000,1012,9.86,22,76,27.0,0.0,0,0,27.0,Prcard,5 Star Taxi,41.92276062,-87.699155343,41.980264315,-87.913624596,2023-11-15 23:00:00,2023-11-15 23:00:00,10.7,12.2,0.0,0.0
3,62b0b08465fe81cacea05fc468fbdc2cedf4f8d6,d1a7c7e8e9cf388f9923e529d82166c5a3baf4262ec914...,2023-11-15 23:45:00,2023-11-16T00:00:00.000,660,2.3,8,24,9.75,0.0,0,0,9.75,Cash,Chicago Independents,41.899602111,-87.633308037,41.901206994,-87.676355989,2023-11-15 23:00:00,2023-11-15 23:00:00,10.7,12.2,0.0,0.0
4,622ddd3d365c173e72a4334d22f4a87799fe2390,6b8bec9d4caa49041a1f5d31240ff8fa2e4c571aec8921...,2023-11-15 23:45:00,2023-11-15T23:45:00.000,14,0.0,8,8,17.0,0.05,0,0,17.55,Credit Card,City Service,41.899602111,-87.633308037,41.899602111,-87.633308037,2023-11-15 23:00:00,2023-11-15 23:00:00,10.7,12.2,0.0,0.0
