In [14]:
import pandas as pd

In [15]:
# list of manhattan zones
manhattan = [4,24,12,13,41,45,42,43,48,50,68,79,74,75,87,88,90,125,100,103,107,113,114,116,120,127,128,151,140,137
             ,141,142,152,143,144,148,153,158,161,162,163,164,170,166,186,194,202,209,211,224,229,230,231,239,232
             ,233,234,236,237,238,243,244,246,249,261,262,263]

In [16]:
# reading taxi data
taxi = pd.read_parquet("cleaned_taxi.parquet")

In [17]:
# first 5 rows
taxi.head(5)

Unnamed: 0,tpep_pickup_datetime,tpep_dropoff_datetime,passenger_count,trip_distance,PULocationID,DOLocationID
0,2022-03-31 23:21:13,2022-03-31 23:58:33,1.0,10.3,163,62
1,2022-03-31 23:07:47,2022-03-31 23:19:12,1.0,2.0,142,141
2,2022-03-31 23:14:52,2022-03-31 23:23:43,1.0,1.0,79,148
3,2022-03-31 23:30:02,2022-03-31 23:45:06,1.0,2.6,79,13
4,2022-03-31 23:48:40,2022-04-01 00:03:34,1.0,2.79,238,116


In [18]:
# converting drop off time to datetime
taxi["tpep_dropoff_datetime"] = pd.to_datetime(taxi["tpep_dropoff_datetime"])

In [19]:
# confirming date time conversion
taxi.dtypes

tpep_pickup_datetime     datetime64[ns]
tpep_dropoff_datetime    datetime64[ns]
passenger_count                 float64
trip_distance                   float64
PULocationID                      int32
DOLocationID                      int32
dtype: object

In [20]:
# dropping pickup location and time
taxi = taxi.drop(["tpep_pickup_datetime","PULocationID", "trip_distance"],1)

  taxi = taxi.drop(["tpep_pickup_datetime","PULocationID", "trip_distance"],1)


In [21]:
# first 5 rows
taxi.head(5)

Unnamed: 0,tpep_dropoff_datetime,passenger_count,DOLocationID
0,2022-03-31 23:58:33,1.0,62
1,2022-03-31 23:19:12,1.0,141
2,2022-03-31 23:23:43,1.0,148
3,2022-03-31 23:45:06,1.0,13
4,2022-04-01 00:03:34,1.0,116


In [22]:
# data frame shape
taxi.shape

(41985057, 3)

# References
- https://sparkbyexamples.com/pandas/pandas-aggregate-functions-with-examples/?expand_article=1
- https://kanoki.org/pandas-group-dataframe-by-timeinterval
- https://sparkbyexamples.com/pandas/pandas-reset-index-examples/
- https://www.geeksforgeeks.org/how-to-rename-columns-in-pandas-dataframe/
- https://datascienceparichay.com/article/filter-dataframe-on-list-of-values/

In [23]:
# grouping to get the number of passengers in each location every hour
grouped_taxi = taxi.groupby(["DOLocationID", pd.Grouper(key = "tpep_dropoff_datetime", freq = "1H")]).agg({"passenger_count" 
                                                                                                           : "sum"})


In [24]:
# reseting index of grouped data frame
grouped_taxi = grouped_taxi.reset_index()

In [25]:
# shape of grouped data frame
grouped_taxi.shape

(1170566, 3)

In [26]:
# first 5 rows
grouped_taxi.head(5)

Unnamed: 0,DOLocationID,tpep_dropoff_datetime,passenger_count
0,1,2022-02-01 06:00:00,3.0
1,1,2022-02-01 07:00:00,5.0
2,1,2022-02-01 08:00:00,5.0
3,1,2022-02-01 09:00:00,11.0
4,1,2022-02-01 10:00:00,3.0


In [27]:
# filtering for rows in Manhattan
manhattan = grouped_taxi[grouped_taxi["DOLocationID"].isin(manhattan)]

In [28]:
# shape of filtered data frame
manhattan.shape

(594529, 3)

In [29]:
# first 5 rows
manhattan.head(5)

Unnamed: 0,DOLocationID,tpep_dropoff_datetime,passenger_count
8736,4,2022-02-01 00:00:00,13.0
8737,4,2022-02-01 01:00:00,9.0
8738,4,2022-02-01 02:00:00,6.0
8739,4,2022-02-01 03:00:00,1.0
8740,4,2022-02-01 04:00:00,6.0


In [30]:
# creating month, day and hour columns
manhattan["month"] = manhattan["tpep_dropoff_datetime"].dt.month
manhattan["dayofweek"] = manhattan["tpep_dropoff_datetime"].dt.dayofweek
manhattan["hour"] = manhattan["tpep_dropoff_datetime"].dt.hour

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  manhattan["month"] = manhattan["tpep_dropoff_datetime"].dt.month
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  manhattan["dayofweek"] = manhattan["tpep_dropoff_datetime"].dt.dayofweek
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  manhattan["hour"] = manhattan["tpep_dropoff_datetime"].dt.hour


In [31]:
# first 5 rows 
manhattan.head(5)

Unnamed: 0,DOLocationID,tpep_dropoff_datetime,passenger_count,month,dayofweek,hour
8736,4,2022-02-01 00:00:00,13.0,2,1,0
8737,4,2022-02-01 01:00:00,9.0,2,1,1
8738,4,2022-02-01 02:00:00,6.0,2,1,2
8739,4,2022-02-01 03:00:00,1.0,2,1,3
8740,4,2022-02-01 04:00:00,6.0,2,1,4


In [32]:
# saving filtered data frame 
manhattan.to_csv("manhattan_hourly_dropoffs.csv")

In [33]:
# renaming datetime column in taxi dataframe
manhattan.rename(columns = {"tpep_dropoff_datetime":"time"}, inplace = True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  manhattan.rename(columns = {"tpep_dropoff_datetime":"time"}, inplace = True)


In [34]:
# first 5 rows
manhattan.head(5)

Unnamed: 0,DOLocationID,time,passenger_count,month,dayofweek,hour
8736,4,2022-02-01 00:00:00,13.0,2,1,0
8737,4,2022-02-01 01:00:00,9.0,2,1,1
8738,4,2022-02-01 02:00:00,6.0,2,1,2
8739,4,2022-02-01 03:00:00,1.0,2,1,3
8740,4,2022-02-01 04:00:00,6.0,2,1,4


# Weather Data
## References
- https://www.currentresults.com/Yearly-Weather/USA/NY/New-York-City/extreme-annual-new-york-city-low-temperature.php
- https://www.currentresults.com/Yearly-Weather/USA/NY/New-York-City/extreme-annual-new-york-city-high-temperature.php#google_vignette

In [50]:
# reading weather data 
weather = pd.read_csv("weather.csv")

In [51]:
# first 5 rows
weather.head(5)

Unnamed: 0,time,temperature_2m (°C),rain (mm),snowfall (cm),windspeed_10m (km/h),Unnamed: 5
0,2022-02-01T00:00,-5.8,0.0,0.0,10.3,
1,2022-02-01T01:00,-7.1,0.0,0.0,11.0,
2,2022-02-01T02:00,-8.6,0.0,0.0,11.5,
3,2022-02-01T03:00,-9.8,0.0,0.0,12.0,
4,2022-02-01T04:00,-10.8,0.0,0.0,12.0,


In [52]:
# data frame shape
weather.shape

(8760, 6)

In [53]:
# converting time column to date time
weather["time"] = pd.to_datetime(weather["time"])

In [54]:
# creating month, day and hour columns
weather["month"] = weather["time"].dt.month
weather["dayofweek"] = weather["time"].dt.dayofweek
weather["hour"] = weather["time"].dt.hour

## Analysing Temperature

In [55]:
max_temp = weather["temperature_2m (°C)"].max()
min_temp = weather["temperature_2m (°C)"].min()
print(f"The minimum temperature is {min_temp} and the maximum temprature is {max_temp}")

The minimum temperature is -13.8 and the maximum temprature is 36.0


- According to current weather the lowest temperature in 2022 in New York City was 14 degrees celsius and the highest temprature was 36 degrees celsius, the values in the temperature column is within the permitter range

# Merging taxi and weather 


In [56]:
# merge the rows in both data frames with the same date and time
merged_df = pd.merge(weather, manhattan, on = ["time","month","dayofweek","hour"], how = "inner")

In [57]:
# shape of merged data frame
merged_df.shape

(538852, 11)

In [58]:
# first 5 rows 
merged_df.head(5)

Unnamed: 0,time,temperature_2m (°C),rain (mm),snowfall (cm),windspeed_10m (km/h),Unnamed: 5,month,dayofweek,hour,DOLocationID,passenger_count
0,2022-02-01,-5.8,0.0,0.0,10.3,,2,1,0,4,13.0
1,2022-02-01,-5.8,0.0,0.0,10.3,,2,1,0,13,5.0
2,2022-02-01,-5.8,0.0,0.0,10.3,,2,1,0,24,14.0
3,2022-02-01,-5.8,0.0,0.0,10.3,,2,1,0,41,22.0
4,2022-02-01,-5.8,0.0,0.0,10.3,,2,1,0,42,16.0


In [59]:
# saving merged data frame as a csv file
merged_df.to_csv("taxi_weather_grouped.csv")