In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from scipy import stats
from datetime import datetime
import pyarrow.feather as feather

#Setup the figures to be higher DPI when displayed in the notebook and saved
plt.rcParams['figure.dpi'] = 150
plt.rcParams['savefig.dpi'] = 300

#Determine if you want to save the figures or just display them in the notebook
save_figure_option = False

In [8]:
#loading weather data to make it from long to wide 
df = pd.read_csv('USW00094946_wide.csv')
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 9261 entries, 0 to 9260
Data columns (total 22 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   Date    9261 non-null   object 
 1   AWND    9143 non-null   float64
 2   FMTM    2343 non-null   float64
 3   PGTM    4335 non-null   float64
 4   PRCP    9231 non-null   float64
 5   SNOW    18 non-null     float64
 6   SNWD    33 non-null     float64
 7   TAVG    2184 non-null   float64
 8   TMAX    9242 non-null   float64
 9   TMIN    9243 non-null   float64
 10  TSUN    1255 non-null   float64
 11  WDF2    9143 non-null   float64
 12  WDF5    9126 non-null   float64
 13  WSF2    9143 non-null   float64
 14  WSF5    9126 non-null   float64
 15  WT01    2448 non-null   float64
 16  WT02    467 non-null    float64
 17  WT03    971 non-null    float64
 18  WT05    1 non-null      float64
 19  WT06    91 non-null     float64
 20  WT08    776 non-null    float64
 21  WT09    33 non-null     float64
dtype

In [9]:
df['TMAX'].describe()

count    9242.000000
mean      170.737070
std       118.386047
min      -210.000000
25%        83.000000
50%       183.000000
75%       272.000000
max       417.000000
Name: TMAX, dtype: float64

In [10]:
#Filter the data from 2017 to 2024 
# Ensure the 'Date' column is in datetime format
df.reset_index(inplace=True)  # Reset index to access 'Date' as a column if it's currently an index
df['Date'] = pd.to_datetime(df['Date'])

# Define start and end dates
start_date = '2017-01-01'
end_date = '2024-12-09'

# Filter the dataset for the desired date range
df_filtered = df[(df['Date'] >= start_date) & (df['Date'] <= end_date)]


# Display the filtered dataset
df_filtered.head()


Unnamed: 0,index,Date,AWND,FMTM,PGTM,PRCP,SNOW,SNWD,TAVG,TMAX,...,WDF5,WSF2,WSF5,WT01,WT02,WT03,WT05,WT06,WT08,WT09
6361,6361,2017-01-01,33.0,,,0.0,,,,28.0,...,130.0,89.0,107.0,,,,,,,
6362,6362,2017-01-02,63.0,,,0.0,,,,6.0,...,350.0,143.0,174.0,1.0,,,,1.0,,
6363,6363,2017-01-03,76.0,,,0.0,,,,-77.0,...,340.0,139.0,174.0,,,,,,,
6364,6364,2017-01-04,31.0,,,13.0,,,,-105.0,...,350.0,81.0,94.0,1.0,,,,,1.0,
6365,6365,2017-01-05,51.0,,,0.0,,,,-127.0,...,320.0,112.0,139.0,,,,,,,


In [11]:
df_filtered.tail()

Unnamed: 0,index,Date,AWND,FMTM,PGTM,PRCP,SNOW,SNWD,TAVG,TMAX,...,WDF5,WSF2,WSF5,WT01,WT02,WT03,WT05,WT06,WT08,WT09
9256,9256,2024-12-05,37.0,,130.0,0.0,,,,11.0,...,340.0,72.0,98.0,,,,,,,
9257,9257,2024-12-06,13.0,,330.0,0.0,,,,167.0,...,240.0,40.0,58.0,,,,,,,
9258,9258,2024-12-07,28.0,,1220.0,0.0,,,,172.0,...,260.0,81.0,107.0,,,,,,,
9259,9259,2024-12-08,30.0,,1355.0,0.0,,,,194.0,...,350.0,143.0,183.0,,,,,,,
9260,9260,2024-12-09,,,,0.0,,,,72.0,...,,,,,,,,,,


In [13]:
df_filtered = df_filtered.fillna(0)  # Replace NaNs with zeros

df_filtered.tail()

Unnamed: 0,index,Date,AWND,FMTM,PGTM,PRCP,SNOW,SNWD,TAVG,TMAX,...,WDF5,WSF2,WSF5,WT01,WT02,WT03,WT05,WT06,WT08,WT09
9256,9256,2024-12-05,37.0,0.0,130.0,0.0,0.0,0.0,0.0,11.0,...,340.0,72.0,98.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
9257,9257,2024-12-06,13.0,0.0,330.0,0.0,0.0,0.0,0.0,167.0,...,240.0,40.0,58.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
9258,9258,2024-12-07,28.0,0.0,1220.0,0.0,0.0,0.0,0.0,172.0,...,260.0,81.0,107.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
9259,9259,2024-12-08,30.0,0.0,1355.0,0.0,0.0,0.0,0.0,194.0,...,350.0,143.0,183.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
9260,9260,2024-12-09,0.0,0.0,0.0,0.0,0.0,0.0,0.0,72.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [14]:
df_filtered.head()

Unnamed: 0,index,Date,AWND,FMTM,PGTM,PRCP,SNOW,SNWD,TAVG,TMAX,...,WDF5,WSF2,WSF5,WT01,WT02,WT03,WT05,WT06,WT08,WT09
6361,6361,2017-01-01,33.0,0.0,0.0,0.0,0.0,0.0,0.0,28.0,...,130.0,89.0,107.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
6362,6362,2017-01-02,63.0,0.0,0.0,0.0,0.0,0.0,0.0,6.0,...,350.0,143.0,174.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0
6363,6363,2017-01-03,76.0,0.0,0.0,0.0,0.0,0.0,0.0,-77.0,...,340.0,139.0,174.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
6364,6364,2017-01-04,31.0,0.0,0.0,13.0,0.0,0.0,0.0,-105.0,...,350.0,81.0,94.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0
6365,6365,2017-01-05,51.0,0.0,0.0,0.0,0.0,0.0,0.0,-127.0,...,320.0,112.0,139.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [15]:
#The whole weather dataset is now filtered to be from 2017-2024
df_filtered['AWND'].describe()

count    2900.000000
mean       42.223103
std        22.770255
min         0.000000
25%        27.000000
50%        38.000000
75%        54.000000
max       181.000000
Name: AWND, dtype: float64

In [16]:
df_filtered['PRCP'].describe()

count    2900.000000
mean       16.933103
std        66.267817
min         0.000000
25%         0.000000
50%         0.000000
75%         0.000000
max      1384.000000
Name: PRCP, dtype: float64

In [17]:
df_filtered['PGTM'].describe()

count    2900.000000
mean       44.072759
std       249.876079
min         0.000000
25%         0.000000
50%         0.000000
75%         0.000000
max      2354.000000
Name: PGTM, dtype: float64

In [18]:
df_filtered['SNOW'].describe()
#will need to remove this 

count    2900.0
mean        0.0
std         0.0
min         0.0
25%         0.0
50%         0.0
75%         0.0
max         0.0
Name: SNOW, dtype: float64

In [19]:
df_filtered['SNWD'].describe()
#need to remove

count    2900.0
mean        0.0
std         0.0
min         0.0
25%         0.0
50%         0.0
75%         0.0
max         0.0
Name: SNWD, dtype: float64

In [20]:
df_filtered['TAVG'].describe()
#remove

count    2900.0
mean        0.0
std         0.0
min         0.0
25%         0.0
50%         0.0
75%         0.0
max         0.0
Name: TAVG, dtype: float64

In [21]:
df_filtered['TMAX'].describe()

count    2900.000000
mean      174.283793
std       120.509788
min      -210.000000
25%        83.000000
50%       189.000000
75%       278.000000
max       394.000000
Name: TMAX, dtype: float64

In [22]:
df_filtered['TMIN'].describe()

count    2900.000000
mean       29.652759
std       109.317244
min      -382.000000
25%       -49.000000
50%        22.000000
75%       128.000000
max       244.000000
Name: TMIN, dtype: float64

In [23]:
df_filtered.describe()

Unnamed: 0,index,AWND,FMTM,PGTM,PRCP,SNOW,SNWD,TAVG,TMAX,TMIN,...,WDF5,WSF2,WSF5,WT01,WT02,WT03,WT05,WT06,WT08,WT09
count,2900.0,2900.0,2900.0,2900.0,2900.0,2900.0,2900.0,2900.0,2900.0,2900.0,...,2900.0,2900.0,2900.0,2900.0,2900.0,2900.0,2900.0,2900.0,2900.0,2900.0
mean,7810.5,42.223103,0.0,44.072759,16.933103,0.0,0.0,0.0,174.283793,29.652759,...,223.434483,98.964138,133.973103,0.285517,0.045862,0.144138,0.0,0.016552,0.068966,0.006552
std,837.302215,22.770255,0.0,249.876079,66.267817,0.0,0.0,0.0,120.509788,109.317244,...,110.117364,39.091204,53.598053,0.451738,0.209222,0.35129,0.0,0.127606,0.253439,0.080691
min,6361.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,-210.0,-382.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,7085.75,27.0,0.0,0.0,0.0,0.0,0.0,0.0,83.0,-49.0,...,150.0,72.0,98.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
50%,7810.5,38.0,0.0,0.0,0.0,0.0,0.0,0.0,189.0,22.0,...,230.0,98.0,130.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
75%,8535.25,54.0,0.0,0.0,0.0,0.0,0.0,0.0,278.0,128.0,...,330.0,125.0,165.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
max,9260.0,181.0,0.0,2354.0,1384.0,0.0,0.0,0.0,394.0,244.0,...,360.0,282.0,398.0,1.0,1.0,1.0,0.0,1.0,1.0,1.0


In [39]:
#Filtering by columns we need: 
keep = ['AWND', 'PRCP', 'PGTM', 'TMAX', 'TMIN', 'Date']

df_filtered_keep = df_filtered[keep]
df_filtered_keep.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 2900 entries, 6361 to 9260
Data columns (total 6 columns):
 #   Column  Non-Null Count  Dtype         
---  ------  --------------  -----         
 0   AWND    2900 non-null   float64       
 1   PRCP    2900 non-null   float64       
 2   PGTM    2900 non-null   float64       
 3   TMAX    2900 non-null   float64       
 4   TMIN    2900 non-null   float64       
 5   Date    2900 non-null   datetime64[ns]
dtypes: datetime64[ns](1), float64(5)
memory usage: 158.6 KB


In [41]:
df_filtered_keep['TAVG_new'] = (df_filtered_keep['TMAX'] + df_filtered_keep['TMIN']) / 2
df_filtered_keep.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 2900 entries, 6361 to 9260
Data columns (total 7 columns):
 #   Column    Non-Null Count  Dtype         
---  ------    --------------  -----         
 0   AWND      2900 non-null   float64       
 1   PRCP      2900 non-null   float64       
 2   PGTM      2900 non-null   float64       
 3   TMAX      2900 non-null   float64       
 4   TMIN      2900 non-null   float64       
 5   Date      2900 non-null   datetime64[ns]
 6   TAVG_new  2900 non-null   float64       
dtypes: datetime64[ns](1), float64(6)
memory usage: 181.2 KB


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_filtered_keep['TAVG_new'] = (df_filtered_keep['TMAX'] + df_filtered_keep['TMIN']) / 2


In [42]:
df_filtered_keep.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 2900 entries, 6361 to 9260
Data columns (total 7 columns):
 #   Column    Non-Null Count  Dtype         
---  ------    --------------  -----         
 0   AWND      2900 non-null   float64       
 1   PRCP      2900 non-null   float64       
 2   PGTM      2900 non-null   float64       
 3   TMAX      2900 non-null   float64       
 4   TMIN      2900 non-null   float64       
 5   Date      2900 non-null   datetime64[ns]
 6   TAVG_new  2900 non-null   float64       
dtypes: datetime64[ns](1), float64(6)
memory usage: 181.2 KB


In [43]:
df_filtered_keep['TAVG_new'].describe()

count    2900.000000
mean      101.968276
std       111.572786
min      -293.500000
25%        19.500000
50%       103.000000
75%       205.500000
max       303.000000
Name: TAVG_new, dtype: float64

In [44]:
df_filtered_keep.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 2900 entries, 6361 to 9260
Data columns (total 7 columns):
 #   Column    Non-Null Count  Dtype         
---  ------    --------------  -----         
 0   AWND      2900 non-null   float64       
 1   PRCP      2900 non-null   float64       
 2   PGTM      2900 non-null   float64       
 3   TMAX      2900 non-null   float64       
 4   TMIN      2900 non-null   float64       
 5   Date      2900 non-null   datetime64[ns]
 6   TAVG_new  2900 non-null   float64       
dtypes: datetime64[ns](1), float64(6)
memory usage: 181.2 KB


In [36]:
##loading CTA data to see the dates
df_CTA = pd.read_csv("CTA_Ridership_L_Station_Entries_Daily_Totals_20241112.csv")
df_CTA.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1232735 entries, 0 to 1232734
Data columns (total 5 columns):
 #   Column       Non-Null Count    Dtype 
---  ------       --------------    ----- 
 0   station_id   1232735 non-null  int64 
 1   stationname  1232735 non-null  object
 2   date         1232735 non-null  object
 3   daytype      1232735 non-null  object
 4   rides        1232735 non-null  int64 
dtypes: int64(2), object(3)
memory usage: 47.0+ MB


In [37]:
df_CTA['date'].head()

0    12/22/2017
1    12/18/2017
2    12/02/2017
3    12/19/2017
4    12/03/2017
Name: date, dtype: object

In [38]:
df_CTA['date'].tail()

1232730    08/27/2024
1232731    08/28/2024
1232732    08/29/2024
1232733    08/30/2024
1232734    08/31/2024
Name: date, dtype: object

In [45]:
#Lining up the dates for the weather data and the CTA data 

end_date = '2024-08-31'

# Filter the dataset to include only rows with Date <= end_date
df_weather = df_filtered_keep[df_filtered_keep['Date'] <= end_date]

# Display the filtered dataset
print(df_weather.tail())

      AWND  PRCP    PGTM   TMAX   TMIN       Date  TAVG_new
9156  11.0   0.0     0.0  294.0  167.0 2024-08-27     230.5
9157  41.0   0.0     0.0  328.0  167.0 2024-08-28     247.5
9158  25.0  23.0     0.0  261.0  117.0 2024-08-29     189.0
9159  12.0   0.0     0.0  300.0   94.0 2024-08-30     197.0
9160  18.0   0.0  1720.0  317.0   89.0 2024-08-31     203.0


In [46]:
print(df_weather.head())

      AWND  PRCP  PGTM   TMAX   TMIN       Date  TAVG_new
6361  33.0   0.0   0.0   28.0  -93.0 2017-01-01     -32.5
6362  63.0   0.0   0.0    6.0 -105.0 2017-01-02     -49.5
6363  76.0   0.0   0.0  -77.0 -149.0 2017-01-03    -113.0
6364  31.0  13.0   0.0 -105.0 -199.0 2017-01-04    -152.0
6365  51.0   0.0   0.0 -127.0 -243.0 2017-01-05    -185.0


In [48]:
#save the new dataframe as a feather file
df_weather.reset_index(inplace =True)
df_weather.to_feather('Weather_data')

In [49]:
df_loaded = pd.read_feather('Weather_data')
df_loaded.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2800 entries, 0 to 2799
Data columns (total 8 columns):
 #   Column    Non-Null Count  Dtype         
---  ------    --------------  -----         
 0   index     2800 non-null   int64         
 1   AWND      2800 non-null   float64       
 2   PRCP      2800 non-null   float64       
 3   PGTM      2800 non-null   float64       
 4   TMAX      2800 non-null   float64       
 5   TMIN      2800 non-null   float64       
 6   Date      2800 non-null   datetime64[ns]
 7   TAVG_new  2800 non-null   float64       
dtypes: datetime64[ns](1), float64(6), int64(1)
memory usage: 175.1 KB
