In [1]:
import pandas as pd
import geopandas as gpd
from shapely.geometry import Point
from shapely.geometry import MultiPolygon
from shapely import wkt


In [2]:
# list of taxi zones in manhattan 
manhattan = [4,24,12,13,41,45,42,43,48,50,68,79,74,75,87,88,90,125,100,103,107,113,114,116,120,127,128,151,140,137
             ,141,142,152,143,144,148,153,158,161,162,163,164,170,166,186,194,202,209,211,224,229,230,231,239,232
             ,233,234,236,237,238,243,244,246,249,261,262,263]

In [3]:
# reading subway data 
subway_df = pd.read_parquet("cleaned_subway.parquet")

In [4]:
# reading taxi zones data
taxi_zones = pd.read_csv("taxi_zones.csv")

In [5]:
# subway data frame shape
subway_df.shape

(1148138, 7)

In [6]:
# subway data frame first 5 rows
subway_df.head(5)

Unnamed: 0,transit_timestamp,station_complex_id,borough,ridership,latitude,longitude,Georeference
0,2023-02-05 22:00:00,R252,M,56,40.7906,-73.94748,POINT (-73.94748 40.7906)
1,2022-09-17 10:00:00,R252,M,333,40.7906,-73.94748,POINT (-73.94748 40.7906)
2,2022-05-14 21:00:00,R170,M,239,40.799446,-73.968376,POINT (-73.968376 40.799446)
3,2022-05-03 21:00:00,H007,M,470,40.730953,-73.98163,POINT (-73.98163 40.730953)
4,2023-01-28 18:00:00,H007,M,1450,40.730953,-73.98163,POINT (-73.98163 40.730953)


In [7]:
# taxi_zones data frame shape
taxi_zones.shape

(263, 7)

In [8]:
# first 5 rows of taxi zones data frame
taxi_zones.head(5)

Unnamed: 0,OBJECTID,Shape_Leng,the_geom,Shape_Area,zone,LocationID,borough
0,1,0.116357,MultiPolygon(((-74.18445299999996 40.694995999...,0.000782,Newark Airport,1,EWR
1,2,0.43347,MultiPolygon (((-73.82337597260663 40.63898704...,0.004866,Jamaica Bay,2,Queens
2,3,0.084341,MultiPolygon (((-73.84792614099985 40.87134223...,0.000314,Allerton/Pelham Gardens,3,Bronx
3,4,0.043567,MultiPolygon (((-73.97177410965318 40.72582128...,0.000112,Alphabet City,4,Manhattan
4,5,0.092146,MultiPolygon (((-74.17421738099989 40.56256808...,0.000498,Arden Heights,5,Staten Island


# Creating geopandas data frames 
## References

- https://www.kaggle.com/code/dashaa/a-geospatial-analysis-of-the-nyc-subway-in-r/report
- https://www.matecdev.com/posts/point-in-polygon.html#:~:text=for%20further%20analysis.-,How%20to%20check%20if%20a%20point%20is%20inside%20a%20polygon,a%20polygon%20contains%20a%20point
- https://stackoverflow.com/questions/66786737/dataframe-with-wkt-column-to-geopandas-geometry
- https://spatial-dev.guru/2022/09/26/creating-geodataframe-from-dataframe-with-coordinates-or-wkt/
- https://www.geeksforgeeks.org/different-ways-to-create-pandas-dataframe/


## Subway Geopandas

In [9]:
# creating a new column "coordinates" from the georeference column 
subway_df["coordinates"] = subway_df["Georeference"].apply(wkt.loads)

In [10]:
# creating a geopandas data frame from the subway data frame by changing the column coordinates to a
# the geometry data type
subway_gdf = gpd.GeoDataFrame(subway_df, geometry = "coordinates")

In [11]:
# first 5 rows of geopandas data frame 
subway_gdf.head(5)

Unnamed: 0,transit_timestamp,station_complex_id,borough,ridership,latitude,longitude,Georeference,coordinates
0,2023-02-05 22:00:00,R252,M,56,40.7906,-73.94748,POINT (-73.94748 40.7906),POINT (-73.94748 40.79060)
1,2022-09-17 10:00:00,R252,M,333,40.7906,-73.94748,POINT (-73.94748 40.7906),POINT (-73.94748 40.79060)
2,2022-05-14 21:00:00,R170,M,239,40.799446,-73.968376,POINT (-73.968376 40.799446),POINT (-73.96838 40.79945)
3,2022-05-03 21:00:00,H007,M,470,40.730953,-73.98163,POINT (-73.98163 40.730953),POINT (-73.98163 40.73095)
4,2023-01-28 18:00:00,H007,M,1450,40.730953,-73.98163,POINT (-73.98163 40.730953),POINT (-73.98163 40.73095)


## Taxi Zone Geopandas

In [12]:
# creating a new column "zone_loction" from "the_geom" column
taxi_zones["zone_location"] = taxi_zones["the_geom"].apply(wkt.loads)

In [13]:
# creating a geopandas data frame from the taxi zone data frame by converting the column zone_location to a 
# geometry data type
taxi_zones_gdf = gpd.GeoDataFrame(taxi_zones, geometry = "zone_location")

In [14]:
taxi_zones_gdf.head(5)

Unnamed: 0,OBJECTID,Shape_Leng,the_geom,Shape_Area,zone,LocationID,borough,zone_location
0,1,0.116357,MultiPolygon(((-74.18445299999996 40.694995999...,0.000782,Newark Airport,1,EWR,"MULTIPOLYGON (((-74.18445 40.69500, -74.18449 ..."
1,2,0.43347,MultiPolygon (((-73.82337597260663 40.63898704...,0.004866,Jamaica Bay,2,Queens,"MULTIPOLYGON (((-73.82338 40.63899, -73.82277 ..."
2,3,0.084341,MultiPolygon (((-73.84792614099985 40.87134223...,0.000314,Allerton/Pelham Gardens,3,Bronx,"MULTIPOLYGON (((-73.84793 40.87134, -73.84725 ..."
3,4,0.043567,MultiPolygon (((-73.97177410965318 40.72582128...,0.000112,Alphabet City,4,Manhattan,"MULTIPOLYGON (((-73.97177 40.72582, -73.97179 ..."
4,5,0.092146,MultiPolygon (((-74.17421738099989 40.56256808...,0.000498,Arden Heights,5,Staten Island,"MULTIPOLYGON (((-74.17422 40.56257, -74.17349 ..."


In [15]:
taxi_zones_gdf.shape

(263, 8)

# Spatial Join
- A spatial join will be performed to get each zone station within a taxi zone


In [16]:
# defining the crs based on research and assumptions
subway_gdf.set_crs = "EPSG:4326"
taxi_zones_gdf.set_crs = "EPSG:4326"

# spatial join
subway_zones_gdf = gpd.tools.sjoin(subway_gdf, taxi_zones_gdf, predicate = "within")

# converting geopandas data frame into a pandas data frame
subway_zones = pd.DataFrame(subway_zones_gdf)

In [17]:
subway_zones.head(5)

Unnamed: 0,transit_timestamp,station_complex_id,borough_left,ridership,latitude,longitude,Georeference,coordinates,index_right,OBJECTID,Shape_Leng,the_geom,Shape_Area,zone,LocationID,borough_right
0,2023-02-05 22:00:00,R252,M,56,40.7906,-73.94748,POINT (-73.94748 40.7906),POINT (-73.94748 40.79060),78,75,0.087664,MultiPolygon (((-73.94103496899987 40.79254666...,0.000241,East Harlem South,75,Manhattan
1,2022-09-17 10:00:00,R252,M,333,40.7906,-73.94748,POINT (-73.94748 40.7906),POINT (-73.94748 40.79060),78,75,0.087664,MultiPolygon (((-73.94103496899987 40.79254666...,0.000241,East Harlem South,75,Manhattan
5,2022-04-25 09:00:00,R252,M,341,40.7906,-73.94748,POINT (-73.94748 40.7906),POINT (-73.94748 40.79060),78,75,0.087664,MultiPolygon (((-73.94103496899987 40.79254666...,0.000241,East Harlem South,75,Manhattan
8,2023-01-06 14:00:00,R252,M,689,40.7906,-73.94748,POINT (-73.94748 40.7906),POINT (-73.94748 40.79060),78,75,0.087664,MultiPolygon (((-73.94103496899987 40.79254666...,0.000241,East Harlem South,75,Manhattan
11,2022-08-13 11:00:00,R252,M,285,40.7906,-73.94748,POINT (-73.94748 40.7906),POINT (-73.94748 40.79060),78,75,0.087664,MultiPolygon (((-73.94103496899987 40.79254666...,0.000241,East Harlem South,75,Manhattan


In [18]:
# droping irrelevant columns 
subway_zones = subway_zones.drop(["station_complex_id", "borough_left", "latitude", "longitude","Georeference",
                                 "coordinates", "index_right", "OBJECTID", "Shape_Leng", "the_geom", "Shape_Area",
                                 "zone","borough_right"], 1)

  subway_zones = subway_zones.drop(["station_complex_id", "borough_left", "latitude", "longitude","Georeference",


In [19]:
# shape of new data frame
subway_zones.shape

(1148138, 3)

In [20]:
# first 5 rows 
subway_zones.head(5)

Unnamed: 0,transit_timestamp,ridership,LocationID
0,2023-02-05 22:00:00,56,75
1,2022-09-17 10:00:00,333,75
5,2022-04-25 09:00:00,341,75
8,2023-01-06 14:00:00,689,75
11,2022-08-13 11:00:00,285,75


# Grouping subway ridership by taxi zone
## References
- https://sparkbyexamples.com/pandas/pandas-aggregate-functions-with-examples/?expand_article=1
- https://kanoki.org/pandas-group-dataframe-by-timeinterval
- https://sparkbyexamples.com/pandas/pandas-reset-index-examples/

In [21]:
# converting timestamp column to datetime
subway_zones["transit_timestamp"] = pd.to_datetime(subway_zones["transit_timestamp"])

In [22]:
# first 5 rows
subway_zones.head(5)

Unnamed: 0,transit_timestamp,ridership,LocationID
0,2023-02-05 22:00:00,56,75
1,2022-09-17 10:00:00,333,75
5,2022-04-25 09:00:00,341,75
8,2023-01-06 14:00:00,689,75
11,2022-08-13 11:00:00,285,75


In [23]:
# data frame shape
subway_zones.shape

(1148138, 3)

In [24]:
# number of taxi zones with subway stations
subway_zones["LocationID"].nunique()

50

In [25]:
# grouping data frame to get to total ridership for each zone every hour
subway_grouped = subway_zones.groupby(["LocationID", pd.Grouper(key = "transit_timestamp", freq = "1H")]).agg({
    "ridership" : "sum"})

In [26]:
subway_grouped = subway_grouped.reset_index()

In [27]:
subway_grouped.head(5)

Unnamed: 0,LocationID,transit_timestamp,ridership
0,24,2022-02-01 00:00:00,25
1,24,2022-02-01 01:00:00,11
2,24,2022-02-01 02:00:00,6
3,24,2022-02-01 03:00:00,6
4,24,2022-02-01 04:00:00,8


In [28]:
# creating month, day and hour columns
subway_grouped["month"] = subway_grouped["transit_timestamp"].dt.month
subway_grouped["dayofweek"] = subway_grouped["transit_timestamp"].dt.dayofweek
subway_grouped["hour"] = subway_grouped["transit_timestamp"].dt.hour

In [29]:
# first 5 rows
subway_grouped.head(5)

Unnamed: 0,LocationID,transit_timestamp,ridership,month,dayofweek,hour
0,24,2022-02-01 00:00:00,25,2,1,0
1,24,2022-02-01 01:00:00,11,2,1,1
2,24,2022-02-01 02:00:00,6,2,1,2
3,24,2022-02-01 03:00:00,6,2,1,3
4,24,2022-02-01 04:00:00,8,2,1,4


In [30]:
# shape of grouped data frame
subway_grouped.shape

(482800, 6)

# Merging grouped subway data with taxi and weather data 
## References
- https://www.geeksforgeeks.org/how-to-rename-columns-in-pandas-dataframe/

In [31]:
# reading taxi and weather data 
taxi_weather = pd.read_csv("taxi_weather_grouped.csv")

In [32]:
# first 5 rows of taxi and weather data frame
taxi_weather.head(5)

Unnamed: 0.1,Unnamed: 0,time,temperature_2m (°C),rain (mm),snowfall (cm),windspeed_10m (km/h),Unnamed: 5,month,dayofweek,hour,DOLocationID,passenger_count
0,0,2022-02-01 00:00:00,-5.8,0.0,0.0,10.3,,2,1,0,4,13.0
1,1,2022-02-01 00:00:00,-5.8,0.0,0.0,10.3,,2,1,0,13,5.0
2,2,2022-02-01 00:00:00,-5.8,0.0,0.0,10.3,,2,1,0,24,14.0
3,3,2022-02-01 00:00:00,-5.8,0.0,0.0,10.3,,2,1,0,41,22.0
4,4,2022-02-01 00:00:00,-5.8,0.0,0.0,10.3,,2,1,0,42,16.0


In [33]:
# dropping Unnamed column from merge
taxi_weather = taxi_weather.drop("Unnamed: 0", 1)

  taxi_weather = taxi_weather.drop("Unnamed: 0", 1)


In [34]:
# confirming that the columns was dropped
taxi_weather.head(5)

Unnamed: 0,time,temperature_2m (°C),rain (mm),snowfall (cm),windspeed_10m (km/h),Unnamed: 5,month,dayofweek,hour,DOLocationID,passenger_count
0,2022-02-01 00:00:00,-5.8,0.0,0.0,10.3,,2,1,0,4,13.0
1,2022-02-01 00:00:00,-5.8,0.0,0.0,10.3,,2,1,0,13,5.0
2,2022-02-01 00:00:00,-5.8,0.0,0.0,10.3,,2,1,0,24,14.0
3,2022-02-01 00:00:00,-5.8,0.0,0.0,10.3,,2,1,0,41,22.0
4,2022-02-01 00:00:00,-5.8,0.0,0.0,10.3,,2,1,0,42,16.0


In [35]:
# convertint time column in taxi_weather data frame to datetime
taxi_weather["time"] = pd.to_datetime(taxi_weather["time"])

In [36]:
# renaming transit_timestamp column in subway data frame
subway_grouped.rename(columns = {"transit_timestamp" : "time"}, inplace = True)

In [37]:
# confirming column name change
subway_grouped.head(5)

Unnamed: 0,LocationID,time,ridership,month,dayofweek,hour
0,24,2022-02-01 00:00:00,25,2,1,0
1,24,2022-02-01 01:00:00,11,2,1,1
2,24,2022-02-01 02:00:00,6,2,1,2
3,24,2022-02-01 03:00:00,6,2,1,3
4,24,2022-02-01 04:00:00,8,2,1,4


In [38]:
# renaming DOLocationID column in taxi_weather data frame
taxi_weather.rename(columns = {"DOLocationID": "LocationID"}, inplace = True)

In [39]:
# confirming column name change
taxi_weather.head(5)

Unnamed: 0,time,temperature_2m (°C),rain (mm),snowfall (cm),windspeed_10m (km/h),Unnamed: 5,month,dayofweek,hour,LocationID,passenger_count
0,2022-02-01,-5.8,0.0,0.0,10.3,,2,1,0,4,13.0
1,2022-02-01,-5.8,0.0,0.0,10.3,,2,1,0,13,5.0
2,2022-02-01,-5.8,0.0,0.0,10.3,,2,1,0,24,14.0
3,2022-02-01,-5.8,0.0,0.0,10.3,,2,1,0,41,22.0
4,2022-02-01,-5.8,0.0,0.0,10.3,,2,1,0,42,16.0


In [40]:
# merging data frames 
merged_df = pd.merge(subway_grouped, taxi_weather, on = ["LocationID", "time", "hour", "dayofweek", "month"],
                      how = "inner")

In [41]:
# merged data frame shape
merged_df.shape

(422562, 12)

In [42]:
# first 5 rows
merged_df.head(5)

Unnamed: 0,LocationID,time,ridership,month,dayofweek,hour,temperature_2m (°C),rain (mm),snowfall (cm),windspeed_10m (km/h),Unnamed: 5,passenger_count
0,24,2022-02-01 00:00:00,25,2,1,0,-5.8,0.0,0.0,10.3,,14.0
1,24,2022-02-01 01:00:00,11,2,1,1,-7.1,0.0,0.0,11.0,,3.0
2,24,2022-02-01 02:00:00,6,2,1,2,-8.6,0.0,0.0,11.5,,2.0
3,24,2022-02-01 03:00:00,6,2,1,3,-9.8,0.0,0.0,12.0,,2.0
4,24,2022-02-01 07:00:00,254,2,1,7,-12.0,0.0,0.0,9.8,,5.0


## Total number of taxi and subway passengers in each zone
### References
- https://pandas.pydata.org/docs/getting_started/intro_tutorials/05_add_columns.html

In [43]:
# creating column total_people as the sum of passengers and subway ridership in each taxi zone
merged_df["total_people"] = (merged_df["ridership"] + merged_df["passenger_count"])

In [44]:
# first 5 rows 
merged_df.head(5)

Unnamed: 0,LocationID,time,ridership,month,dayofweek,hour,temperature_2m (°C),rain (mm),snowfall (cm),windspeed_10m (km/h),Unnamed: 5,passenger_count,total_people
0,24,2022-02-01 00:00:00,25,2,1,0,-5.8,0.0,0.0,10.3,,14.0,39.0
1,24,2022-02-01 01:00:00,11,2,1,1,-7.1,0.0,0.0,11.0,,3.0,14.0
2,24,2022-02-01 02:00:00,6,2,1,2,-8.6,0.0,0.0,11.5,,2.0,8.0
3,24,2022-02-01 03:00:00,6,2,1,3,-9.8,0.0,0.0,12.0,,2.0,8.0
4,24,2022-02-01 07:00:00,254,2,1,7,-12.0,0.0,0.0,9.8,,5.0,259.0


In [45]:
# dropping passenger_count, ridership and unnamed columns
merged_df = merged_df.drop(["ridership", "passenger_count", "Unnamed: 5"], 1)

  merged_df = merged_df.drop(["ridership", "passenger_count", "Unnamed: 5"], 1)


In [46]:
# final data frame first 5 rows
merged_df.head(5)

Unnamed: 0,LocationID,time,month,dayofweek,hour,temperature_2m (°C),rain (mm),snowfall (cm),windspeed_10m (km/h),total_people
0,24,2022-02-01 00:00:00,2,1,0,-5.8,0.0,0.0,10.3,39.0
1,24,2022-02-01 01:00:00,2,1,1,-7.1,0.0,0.0,11.0,14.0
2,24,2022-02-01 02:00:00,2,1,2,-8.6,0.0,0.0,11.5,8.0
3,24,2022-02-01 03:00:00,2,1,3,-9.8,0.0,0.0,12.0,8.0
4,24,2022-02-01 07:00:00,2,1,7,-12.0,0.0,0.0,9.8,259.0


In [47]:
# shape of final data frame
merged_df.shape

(422562, 10)

In [48]:
# saving data frame as a csv file 
merged_df.to_csv("transport_weather_hourly.csv")

# Creating for taxi and weather data frame for taxi zones without subway stations
## References
- https://datascienceparichay.com/article/filter-dataframe-on-list-of-values/

In [49]:
merged_df["LocationID"].unique()

array([ 24,  41,  42,  43,  45,  48,  74,  75,  79,  87,  88,  90, 100,
       107, 113, 114, 116, 125, 127, 141, 142, 143, 144, 148, 151, 152,
       153, 161, 162, 163, 164, 166, 186, 202, 209, 229, 230, 231, 232,
       234, 236, 237, 238, 239, 243, 244, 246, 249, 261, 263])

In [50]:
# list of taxi zones with subway stations 
zones_with_subway = [ 24,  41,  42,  43,  45,  48,  74,  75,  79,  87,  88,  90, 100,
       107, 113, 114, 116, 125, 127, 141, 142, 143, 144, 148, 151, 152,
       153, 161, 162, 163, 164, 166, 186, 202, 209, 229, 230, 231, 232,
       234, 236, 237, 238, 239, 243, 244, 246, 249, 261, 263]

In [51]:
# creating a list of taxi zones with no subway stations
taxi_only = []
for i in manhattan:
    if i not in zones_with_subway:
        taxi_only.append(i)
print(taxi_only)

[4, 12, 13, 50, 68, 103, 120, 128, 140, 137, 158, 170, 194, 211, 224, 233, 262]


In [52]:
print(len(taxi_only))

17


In [53]:
# filtering taxi and weather data frame for rows with in taxi only zones
taxi_weather_only = taxi_weather[taxi_weather["LocationID"].isin(taxi_only)]

In [54]:
# shape of data frame
taxi_weather_only.shape

(115867, 11)

In [55]:
# first 5 rows 
taxi_weather_only.head(5)

Unnamed: 0,time,temperature_2m (°C),rain (mm),snowfall (cm),windspeed_10m (km/h),Unnamed: 5,month,dayofweek,hour,LocationID,passenger_count
0,2022-02-01,-5.8,0.0,0.0,10.3,,2,1,0,4,13.0
1,2022-02-01,-5.8,0.0,0.0,10.3,,2,1,0,13,5.0
8,2022-02-01,-5.8,0.0,0.0,10.3,,2,1,0,50,12.0
9,2022-02-01,-5.8,0.0,0.0,10.3,,2,1,0,68,36.0
23,2022-02-01,-5.8,0.0,0.0,10.3,,2,1,0,137,14.0


In [56]:
# number of taxi zones 
taxi_weather_only["LocationID"].unique()

array([  4,  13,  50,  68, 137, 140, 158, 170, 211, 224, 233, 262,  12,
       194, 128, 120])

In [57]:
# saving taxi only zones as csv
taxi_weather_only.to_csv("taxi_weather_only.csv")

# Creating data frame with for zone 103 using the data from zone 12
- Only zone 103 is not include in the taxi only data frame or the taxi and subway data frame
- This is because this the only public transport that can be used to reach this zone is a ferry
- Zone 12, which represents the battery park zone will be used to represent this zone in the model instead because the ferry that travels to zone 103 departs from zone 12

## References 
- https://www.newyork.co.uk/ellis-island/
- https://sparkbyexamples.com/pandas/pandas-replace-by-examples/

In [58]:
# battery park zone
battery_park = [12]

In [59]:
# creating a data frame with data for zone 12 only
ellis_island_df = taxi_weather[taxi_weather["LocationID"].isin(battery_park)]

In [60]:
# data frame shape 
ellis_island_df.shape

(5402, 11)

In [61]:
# first 5 rows
ellis_island_df.head(5)

Unnamed: 0,time,temperature_2m (°C),rain (mm),snowfall (cm),windspeed_10m (km/h),Unnamed: 5,month,dayofweek,hour,LocationID,passenger_count
229,2022-02-01 04:00:00,-10.8,0.0,0.0,12.0,,2,1,4,12,1.0
326,2022-02-01 06:00:00,-12.5,0.0,0.0,12.2,,2,1,6,12,1.0
385,2022-02-01 07:00:00,-12.0,0.0,0.0,9.8,,2,1,7,12,1.0
446,2022-02-01 08:00:00,-12.8,0.0,0.0,9.7,,2,1,8,12,6.0
507,2022-02-01 09:00:00,-13.2,0.0,0.0,10.0,,2,1,9,12,2.0


In [62]:
# confirmng that only zone 12 is included in this data frame
ellis_island_df["LocationID"].unique()

array([12])

In [63]:
# replacing zone 
ellis_island_df["LocationID"] = ellis_island_df["LocationID"].replace(12, 103)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  ellis_island_df["LocationID"] = ellis_island_df["LocationID"].replace(12, 103)


In [64]:
# confirming LocationID change
ellis_island_df["LocationID"].unique()

array([103])

In [65]:
# first 5 rows of new data frame
ellis_island_df.head(5)

Unnamed: 0,time,temperature_2m (°C),rain (mm),snowfall (cm),windspeed_10m (km/h),Unnamed: 5,month,dayofweek,hour,LocationID,passenger_count
229,2022-02-01 04:00:00,-10.8,0.0,0.0,12.0,,2,1,4,103,1.0
326,2022-02-01 06:00:00,-12.5,0.0,0.0,12.2,,2,1,6,103,1.0
385,2022-02-01 07:00:00,-12.0,0.0,0.0,9.8,,2,1,7,103,1.0
446,2022-02-01 08:00:00,-12.8,0.0,0.0,9.7,,2,1,8,103,6.0
507,2022-02-01 09:00:00,-13.2,0.0,0.0,10.0,,2,1,9,103,2.0


In [66]:
# saving data frame as a csv file 
ellis_island_df.to_csv("ellis_island.csv")

# Concatenating Taxi and weather data frame with Ellis Island data frame
- Concatenating dataframe of taxi and weather information for zones without subway stations with the data frame for zone 103 (Governor's Island/Ellis Island/Liberty Island)

## References
- https://pandas.pydata.org/pandas-docs/stable/user_guide/merging.html

In [67]:
# list of data frames
df_list = [taxi_weather_only, ellis_island_df]

In [68]:
# concatenating data frames
all_taxi_only = pd.concat(df_list)

In [69]:
# shape of data frame
all_taxi_only.shape

(121269, 11)

In [70]:
# first 5 rows 
all_taxi_only.head(5)

Unnamed: 0,time,temperature_2m (°C),rain (mm),snowfall (cm),windspeed_10m (km/h),Unnamed: 5,month,dayofweek,hour,LocationID,passenger_count
0,2022-02-01,-5.8,0.0,0.0,10.3,,2,1,0,4,13.0
1,2022-02-01,-5.8,0.0,0.0,10.3,,2,1,0,13,5.0
8,2022-02-01,-5.8,0.0,0.0,10.3,,2,1,0,50,12.0
9,2022-02-01,-5.8,0.0,0.0,10.3,,2,1,0,68,36.0
23,2022-02-01,-5.8,0.0,0.0,10.3,,2,1,0,137,14.0


In [71]:
# confirming Ellis Island zone is included in the data frame
all_taxi_only["LocationID"].unique()

array([  4,  13,  50,  68, 137, 140, 158, 170, 211, 224, 233, 262,  12,
       194, 128, 120, 103])

In [72]:
# saving final data frame for zones that have no subway stations 
all_taxi_only.to_csv("all_taxi_only.csv")

# Concatenating all data frames to create one for all zones

In [89]:
# renaming passenger count column in data frame of zones without subway stations
all_taxi_only.rename(columns = {"passenger_count" : "total_people"}, inplace = True)

In [90]:
# list of data frames
df_list = [merged_df, all_taxi_only]

In [91]:
# concatenating data frames
all_zones = pd.concat(df_list)

In [92]:
# shape of data frame
all_zones.shape

(543831, 11)

In [93]:
# first 5 rows
all_zones.head(5)

Unnamed: 0,LocationID,time,month,dayofweek,hour,temperature_2m (°C),rain (mm),snowfall (cm),windspeed_10m (km/h),total_people,Unnamed: 5
0,24,2022-02-01 00:00:00,2,1,0,-5.8,0.0,0.0,10.3,39.0,
1,24,2022-02-01 01:00:00,2,1,1,-7.1,0.0,0.0,11.0,14.0,
2,24,2022-02-01 02:00:00,2,1,2,-8.6,0.0,0.0,11.5,8.0,
3,24,2022-02-01 03:00:00,2,1,3,-9.8,0.0,0.0,12.0,8.0,
4,24,2022-02-01 07:00:00,2,1,7,-12.0,0.0,0.0,9.8,259.0,


In [94]:
# dropping unnamed column
all_zones = all_zones.drop("Unnamed: 5", 1)

  all_zones = all_zones.drop("Unnamed: 5", 1)


In [95]:
all_zones.head(5)

Unnamed: 0,LocationID,time,month,dayofweek,hour,temperature_2m (°C),rain (mm),snowfall (cm),windspeed_10m (km/h),total_people
0,24,2022-02-01 00:00:00,2,1,0,-5.8,0.0,0.0,10.3,39.0
1,24,2022-02-01 01:00:00,2,1,1,-7.1,0.0,0.0,11.0,14.0
2,24,2022-02-01 02:00:00,2,1,2,-8.6,0.0,0.0,11.5,8.0
3,24,2022-02-01 03:00:00,2,1,3,-9.8,0.0,0.0,12.0,8.0
4,24,2022-02-01 07:00:00,2,1,7,-12.0,0.0,0.0,9.8,259.0


In [97]:
# saving data frames
all_zones.to_csv("complete_transport_weather.csv")