Loading and cleaning data

In [76]:
import pandas as pd
import numpy as np
import altair as alt

from zipfile import ZipFile
import requests
import io

In [77]:
!pip install geopandas
import geopandas as gpd

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [78]:
# September 2020 trips: read the CSV directly from the zipped download
sept20 = pd.read_csv(
    'https://s3.amazonaws.com/tripdata/202009-citibike-tripdata.csv.zip',
    compression='zip',
)

In [79]:
# Swap spaces in the column labels for underscores so the labels match the
# underscore names used later in the notebook (e.g. start_station_name)
sept20.columns = sept20.columns.str.replace(' ', '_', regex=False)

In [80]:
# Download the September 2021 trip archive. Fail loudly on an HTTP error so an
# error page is never handed to ZipFile as if it were the archive.
citibikeZipFile = requests.get('https://s3.amazonaws.com/tripdata/202109-citibike-tripdata.csv.zip')
citibikeZipFile.raise_for_status()

# Open the downloaded bytes as an in-memory zip archive (nothing is written to disk)
zipFiles = ZipFile(io.BytesIO(citibikeZipFile.content))

# Print the archive's member names so the name of the CSV to load can be checked
for name in zipFiles.namelist():
  print(name)

# Load only the trip CSV from the uncompressed archive into a DataFrame; the
# `with` block closes the member's file handle once pandas has finished reading.
# low_memory=False makes pandas infer each column's dtype from the whole file
# rather than chunk by chunk.
# NOTE(review): the recorded output of this cell ends with a truncated warning that
# looks like a mixed-dtype warning; confirm it no longer appears after this change.
with zipFiles.open('202109-citibike-tripdata.csv') as tripCsv:
  sept21 = pd.read_csv(tripCsv, low_memory=False)

202109-citibike-tripdata.csv
__MACOSX/._202109-citibike-tripdata.csv


  exec(code_obj, self.user_global_ns, self.user_ns)


September 2020 Analysis

In [81]:
# Trip total for September 2020: the number of rows in the frame
sept20.shape[0]

2488225

In [82]:
# Five busiest start stations, ranked by how many trips began at each
sept20['start_station_name'].value_counts().iloc[:5]

W 21 St & 6 Ave          13875
West St & Chambers St    13225
12 Ave & W 40 St         12869
1 Ave & E 68 St          12816
E 17 St & Broadway       11264
Name: start_station_name, dtype: int64

In [83]:
# Five busiest end stations, ranked by how many trips finished at each
sept20['end_station_name'].value_counts().iloc[:5]

W 21 St & 6 Ave          13967
West St & Chambers St    13783
12 Ave & W 40 St         12922
1 Ave & E 68 St          12689
E 17 St & Broadway       11446
Name: end_station_name, dtype: int64

In [84]:
# Median, across start stations, of the number of trips that began at each station
(
    sept20['start_station_name']
    .value_counts()
    .median()
)

1457.0

In [85]:
# Median, across end stations, of the number of trips that finished at each station
(
    sept20['end_station_name']
    .value_counts()
    .median()
)

1434.5

September 2021 Analysis

In [86]:
# Trip total for September 2021: the number of rows in the frame
sept21.shape[0]

3280560

In [87]:
# Five busiest start stations, ranked by how many trips began at each
sept21['start_station_name'].value_counts().iloc[:5]

W 21 St & 6 Ave             14435
E 17 St & Broadway          14065
Broadway & E 14 St          13155
Cleveland Pl & Spring St    12987
W 20 St & 10 Ave            12918
Name: start_station_name, dtype: int64

In [88]:
# Five busiest end stations, ranked by how many trips finished at each
sept21['end_station_name'].value_counts().iloc[:5]

W 21 St & 6 Ave             14420
E 17 St & Broadway          14092
Cleveland Pl & Spring St    13114
Broadway & E 14 St          12918
W 20 St & 10 Ave            12912
Name: end_station_name, dtype: int64

In [89]:
# Median, across start stations, of the number of trips that began at each station
(
    sept21['start_station_name']
    .value_counts()
    .median()
)

1133.5

In [90]:
# Median, across end stations, of the number of trips that finished at each station
(
    sept21['end_station_name']
    .value_counts()
    .median()
)

1078.0

Mapping

In [91]:
# Read the NYC 2020 Neighborhood Tabulation Areas layer from the ArcGIS feature
# service; the query asks for every feature and field as GeoJSON in EPSG:4326
ntaData = gpd.read_file(
    'https://services5.arcgis.com/GfwWNkhOj9bNBqoJ/arcgis/rest/services/NYC_Neighborhood_Tabulation_Areas_2020/FeatureServer/0/query?where=1=1&outFields=*&outSR=4326&f=pgeojson'
)


In [92]:
# Save the neighborhood polygons as a GeoJSON file in the working directory
ntaData.to_file(
    'ntaData.geojson',
    driver='GeoJSON',
)

September 2020

In [93]:
# Helper column of ones: counting it within a group gives that group's number of trips
sept20['trip_count'] = 1

# Per start station: the minimum recorded latitude and longitude, plus the trip count
sept20_starts = (
    sept20
    .groupby('start_station_name')
    .agg({
        'start_station_latitude': 'min',
        'start_station_longitude': 'min',
        'trip_count': 'count',
    })
    .reset_index()
)

# Per end station: the minimum recorded latitude and longitude, plus the trip count
sept20_ends = (
    sept20
    .groupby('end_station_name')
    .agg({
        'end_station_latitude': 'min',
        'end_station_longitude': 'min',
        'trip_count': 'count',
    })
    .reset_index()
)

September 2021

In [94]:
# Helper column of ones: counting it within a group gives that group's number of trips
sept21['trip_count'] = 1

# Per start station: the minimum recorded latitude and longitude, plus the trip count
sept21_starts = (
    sept21
    .groupby('start_station_name')
    .agg({
        'start_lat': 'min',
        'start_lng': 'min',
        'trip_count': 'count',
    })
    .reset_index()
)

# Per end station: the minimum recorded latitude and longitude, plus the trip count
sept21_ends = (
    sept21
    .groupby('end_station_name')
    .agg({
        'end_lat': 'min',
        'end_lng': 'min',
        'trip_count': 'count',
    })
    .reset_index()
)

In [95]:
# Attach point geometry (x = longitude, y = latitude) to the 2020 start-station table, tagged as EPSG:4326
geo_sept20_starts = gpd.GeoDataFrame(
    sept20_starts,
    geometry=gpd.points_from_xy(
        sept20_starts.start_station_longitude,
        sept20_starts.start_station_latitude,
    ),
    crs='epsg:4326',
)

In [96]:
# Attach point geometry (x = longitude, y = latitude) to the 2020 end-station table, tagged as EPSG:4326
geo_sept20_ends = gpd.GeoDataFrame(
    sept20_ends,
    geometry=gpd.points_from_xy(
        sept20_ends.end_station_longitude,
        sept20_ends.end_station_latitude,
    ),
    crs='epsg:4326',
)

In [97]:
# Attach point geometry (x = longitude, y = latitude) to the 2021 start-station table, tagged as EPSG:4326
geo_sept21_starts = gpd.GeoDataFrame(
    sept21_starts,
    geometry=gpd.points_from_xy(
        sept21_starts.start_lng,
        sept21_starts.start_lat,
    ),
    crs='epsg:4326',
)

In [98]:
# Attach point geometry (x = longitude, y = latitude) to the 2021 end-station table, tagged as EPSG:4326
geo_sept21_ends = gpd.GeoDataFrame(
    sept21_ends,
    geometry=gpd.points_from_xy(
        sept21_ends.end_lng,
        sept21_ends.end_lat,
    ),
    crs='epsg:4326',
)

In [99]:
# Export the 2020 start-station points as GeoJSON in the working directory
geo_sept20_starts.to_file(
    'geo_sept20_starts.geojson',
    driver='GeoJSON',
)

In [100]:
# Export the 2020 end-station points as GeoJSON in the working directory
geo_sept20_ends.to_file(
    'geo_sept20_ends.geojson',
    driver='GeoJSON',
)

In [101]:
# Export the 2021 start-station points as GeoJSON in the working directory
geo_sept21_starts.to_file(
    'geo_sept21_starts.geojson',
    driver='GeoJSON',
)

In [102]:
# Export the 2021 end-station points as GeoJSON in the working directory
geo_sept21_ends.to_file(
    'geo_sept21_ends.geojson',
    driver='GeoJSON',
)