In [29]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from scipy import stats
from datetime import datetime
import datetime as dt
import pyarrow.feather as feather

# Use a higher DPI both for figures rendered inline in the notebook and for
# figures written to disk.
plt.rcParams.update({"figure.dpi": 150, "savefig.dpi": 300})

# Toggle: True to save the figures, False to only display them in the notebook.
save_figure_option = False

In [30]:
# Read the CTA 'L' station daily entry totals CSV and preview the first rows.
station_entries_df = pd.read_csv(
    "data/CTA_Ridership_L_Station_Entries_Daily_Totals_20241112.csv"
)
station_entries_df.head(n=10)

Unnamed: 0,station_id,stationname,date,daytype,rides
0,41280,Jefferson Park,12/22/2017,W,6104
1,41000,Cermak-Chinatown,12/18/2017,W,3636
2,40280,Central-Lake,12/02/2017,A,1270
3,40140,Dempster-Skokie,12/19/2017,W,1759
4,40690,Dempster,12/03/2017,U,499
5,41660,Lake/State,12/30/2017,A,8615
6,40180,Oak Park-Forest Park,12/17/2017,U,442
7,40250,Kedzie-Homan-Forest Park,12/02/2017,A,1353
8,40120,35th/Archer,12/07/2017,W,3353
9,41420,Addison-North Main,12/19/2017,W,6034


In [31]:
# Inspect column dtypes, non-null counts and memory usage right after loading.
station_entries_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1232735 entries, 0 to 1232734
Data columns (total 5 columns):
 #   Column       Non-Null Count    Dtype 
---  ------       --------------    ----- 
 0   station_id   1232735 non-null  int64 
 1   stationname  1232735 non-null  object
 2   date         1232735 non-null  object
 3   daytype      1232735 non-null  object
 4   rides        1232735 non-null  int64 
dtypes: int64(2), object(3)
memory usage: 47.0+ MB


In [32]:
# Station names and day types repeat across many rows, so store both columns
# as pandas categoricals; info() then shows the new dtypes and memory usage.
station_entries_df = station_entries_df.astype(
    {"stationname": "category", "daytype": "category"}
)
station_entries_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1232735 entries, 0 to 1232734
Data columns (total 5 columns):
 #   Column       Non-Null Count    Dtype   
---  ------       --------------    -----   
 0   station_id   1232735 non-null  int64   
 1   stationname  1232735 non-null  category
 2   date         1232735 non-null  object  
 3   daytype      1232735 non-null  category
 4   rides        1232735 non-null  int64   
dtypes: category(2), int64(2), object(1)
memory usage: 31.7+ MB


In [33]:
# Convert the "date" column from MM/DD/YYYY strings to pandas datetimes.
# pd.to_datetime parses the whole column in one vectorized call instead of
# invoking datetime.strptime once per row, which matters at ~1.2M rows.
# The explicit format keeps parsing strict (month first) and unambiguous.
# A column that is already datetime is passed through, so re-running this
# cell does not fail the way strptime would on Timestamp values.
station_entries_df["date"] = pd.to_datetime(
    station_entries_df["date"], format="%m/%d/%Y"
)
station_entries_df.head()

Unnamed: 0,station_id,stationname,date,daytype,rides
0,41280,Jefferson Park,2017-12-22,W,6104
1,41000,Cermak-Chinatown,2017-12-18,W,3636
2,40280,Central-Lake,2017-12-02,A,1270
3,40140,Dempster-Skokie,2017-12-19,W,1759
4,40690,Dempster,2017-12-03,U,499


In [34]:
# Re-check the dtypes to confirm that "date" is now a datetime column.
station_entries_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1232735 entries, 0 to 1232734
Data columns (total 5 columns):
 #   Column       Non-Null Count    Dtype         
---  ------       --------------    -----         
 0   station_id   1232735 non-null  int64         
 1   stationname  1232735 non-null  category      
 2   date         1232735 non-null  datetime64[ns]
 3   daytype      1232735 non-null  category      
 4   rides        1232735 non-null  int64         
dtypes: category(2), datetime64[ns](1), int64(2)
memory usage: 31.7 MB


In [35]:
# Summary statistics (count, mean, min, quartiles, max, std) for the
# non-categorical columns.
station_entries_df.describe()

Unnamed: 0,station_id,date,rides
count,1232735.0,1232735,1232735.0
mean,40767.93,2012-11-11 05:14:51.254324224,2979.097
min,40010.0,2001-01-01 00:00:00,0.0
25%,40370.0,2006-12-12 00:00:00,933.0
50%,40760.0,2012-11-22 00:00:00,1940.0
75%,41160.0,2018-10-09 00:00:00,3918.0
max,41710.0,2024-08-31 00:00:00,36323.0
std,449.9409,,3061.73


In [36]:
# Percentage of rows that report zero rides.
zero_ride_mask = station_entries_df["rides"].eq(0)
int(zero_ride_mask.sum()) / len(station_entries_df) * 100

1.2009880469038356

1.2% of the data is listed as having 0 rides, which can happen for a variety of reasons, the most common being that the station is closed for work. For example, as part of the RPM (Red Purple Modernization) project, Berwyn and Lawrence have been closed for several years. Bryn Mawr was also closed for a few years and has only recently reopened, but only with service in the direction of 95th. It would be challenging for a model to pick up on these trends without external data, such as when long projects like this are happening; hence, these rows will be removed from the dataset.

In [37]:
# Collect the index labels of the zero-ride rows, then drop exactly those rows.
zero_ride_index = station_entries_df.index[station_entries_df["rides"] == 0]
station_entries_df = station_entries_df.drop(index=zero_ride_index)

# Confirm the drop: no zero-ride rows should remain.
int((station_entries_df["rides"] == 0).sum())

0

In [38]:
# Number of distinct station ids remaining after the zero-ride rows are removed.
station_entries_df["station_id"].nunique()

148

In [47]:
# Number of distinct station names; compare against the station id count.
station_entries_df["stationname"].nunique()

149

In [42]:
# Read the CTA list of 'L' stops (one row per stop/direction) and preview it.
station_info_df = pd.read_csv(
    "data/CTA_-_System_Information_-_List_of__L__Stops_20241211.csv"
)
station_info_df.head(n=10)

Unnamed: 0,STOP_ID,DIRECTION_ID,STOP_NAME,STATION_NAME,STATION_DESCRIPTIVE_NAME,MAP_ID,ADA,RED,BLUE,G,BRN,P,Pexp,Y,Pnk,O,Location
0,30082,E,Cicero (Loop-bound),Cicero,Cicero (Pink Line),40420,True,False,False,False,False,False,False,False,True,False,"(41.85182, -87.745336)"
1,30151,E,Central Park (Loop-bound),Central Park,Central Park (Pink Line),40780,True,False,False,False,False,False,False,False,True,False,"(41.853839, -87.714842)"
2,30184,W,Halsted/63rd (Ashland-bound),Halsted,Halsted (Green Line),40940,True,False,False,True,False,False,False,False,False,False,"(41.778943, -87.644244)"
3,30044,N,Cumberland (O'Hare-bound),Cumberland,Cumberland (Blue Line),40230,True,False,True,False,False,False,False,False,False,False,"(41.984246, -87.838028)"
4,30092,E,Racine (O'Hare-bound),Racine,Racine (Blue Line),40470,False,False,True,False,False,False,False,False,False,False,"(41.87592, -87.659458)"
5,30253,N,Paulina (Kimball-bound),Paulina,Paulina (Brown Line),41310,True,False,False,False,True,False,False,False,False,False,"(41.943623, -87.670907)"
6,30162,W,18th (54th/Cermak-bound),18th,18th (Pink Line),40830,True,False,False,False,False,False,False,False,True,False,"(41.857908, -87.669147)"
7,30374,S,Clark/Lake (Forest Pk-bound),Clark/Lake,"Clark/Lake (Blue, Brown, Green, Orange, Purple...",40380,True,False,True,False,False,False,False,False,False,False,"(41.885737, -87.630886)"
8,30248,S,Jefferson Park (Forest Pk-bound),Jefferson Park,Jefferson Park (Blue Line),41280,True,False,True,False,False,False,False,False,False,False,"(41.970634, -87.760892)"
9,30104,S,Diversey (Loop-bound),Diversey,Diversey (Brown & Purple lines),40530,True,False,False,False,True,False,True,False,False,False,"(41.932732, -87.653131)"


In [45]:
# Show every stop whose station name is exactly "Addison".
station_info_df.loc[station_info_df["STATION_NAME"].eq("Addison")]

Unnamed: 0,STOP_ID,DIRECTION_ID,STOP_NAME,STATION_NAME,STATION_DESCRIPTIVE_NAME,MAP_ID,ADA,RED,BLUE,G,BRN,P,Pexp,Y,Pnk,O,Location
47,30239,N,Addison (O'Hare-bound),Addison,Addison (Blue Line),41240,True,False,True,False,False,False,False,False,False,False,"(41.94738, -87.71906)"
82,30240,S,Addison (Forest Pk-bound),Addison,Addison (Blue Line),41240,True,False,True,False,False,False,False,False,False,False,"(41.94738, -87.71906)"
92,30274,S,Addison (95th-bound),Addison,Addison (Red Line),41420,True,True,False,False,False,False,False,False,False,False,"(41.947428, -87.653626)"
130,30273,N,Addison (Howard-bound),Addison,Addison (Red Line),41420,True,True,False,False,False,False,False,False,False,False,"(41.947428, -87.653626)"
221,30278,S,Addison (Loop-bound),Addison,Addison (Brown Line),41440,True,False,False,False,True,False,False,False,False,False,"(41.947028, -87.674642)"
262,30277,N,Addison (Kimball-bound),Addison,Addison (Brown Line),41440,True,False,False,False,True,False,False,False,False,False,"(41.947028, -87.674642)"


The CTA does provide a helpful file which has information on each station, and after a bit of exploring, it turns out that MAP_ID in this station information file corresponds to the station_id in the station entries file. Hence, this is how we will join these two datasets. However, before doing that, we will explore a single station's ridership totals to see if there is anything we will need to be aware of before we do more cleaning. State/Lake is considered the epicenter of the Loop, and all train lines except Red, Blue and Yellow visit this station, so it's the one we will plot.