In [1]:
import pandas as pd
import gzip
import os

In [2]:
# Function to read a .gz file and return a DataFrame
def read_gz_to_dataframe(file_path, columnnames=None):
    with gzip.open(file_path, 'rt') as f:
        df = pd.read_csv(f, header=None, names=columnnames)
    return df

# Function to load and concatenate all .gz files in a folder
def load_and_concatenate_files(folder_path, column_names=None):
    dataframes = []
    for file_name in os.listdir(folder_path):
        if file_name.endswith('.gz'):
            file_path = os.path.join(folder_path, file_name)
            df = read_gz_to_dataframe(file_path, column_names)
            dataframes.append(df)
    concatenated_df = pd.concat(dataframes, ignore_index=True)
    return concatenated_df


# Function to filter data and produce the summary table
def summarize_flow(df, confidence_level, timestamp_col, percent_observed_col, total_flow_col):
    # Ensure the timestamp is a datetime object
    df[timestamp_col] = pd.to_datetime(df[timestamp_col])
    
    # Filter the DataFrame by the specified confidence level
    filtered_df = df[df[percent_observed_col] >= confidence_level]
    
    # Add columns for the year, month, and day of the week
    filtered_df['year'] = filtered_df[timestamp_col].dt.year
    filtered_df['month'] = filtered_df[timestamp_col].dt.month
    filtered_df['day_of_week'] = filtered_df[timestamp_col].dt.day_name()
    
    # Group by the station, year, month, and day of the week
    summary = filtered_df.groupby(['Station', 'year', 'month', 'day_of_week']).agg(
        average_flow=(total_flow_col, 'mean'),
        observed_days=(timestamp_col, 'nunique')
    ).reset_index()
    
    # Sort by station, year, month, and day of the week
    day_order = ['Monday', 'Tuesday', 'Wednesday', 'Thursday', 'Friday', 'Saturday', 'Sunday']
    summary['day_of_week'] = pd.Categorical(summary['day_of_week'], categories=day_order, ordered=True)
    summary = summary.sort_values(['Station', 'year', 'month', 'day_of_week'])
    summary['confidence_level'] = confidence_level
    
    return summary
# Example usage
columnnames = ['Timestamp',	'Station',	'District',	'Route',	'Direction of Travel',	'Lane Type',	'Station Length',	'Samples',	'Percent_Observed',	'total_flow',	'Delay (V_t=35)',	'Delay (V_t=40)',	'Delay (V_t=45)',	'Delay (V_t=50)',	'Delay (V_t=55)',	'Delay (V_t=60)']
folder_path = r'F:\Research and Analysis\Transportation\TrafficCounts\PEMS\Daily_Counts'
df = load_and_concatenate_files(folder_path, columnnames)

# Specify the column names and confidence level
timestamp_col = 'Timestamp'
percent_observed_col = 'Percent_Observed'
total_flow_col = 'total_flow'
confidence_level_75 = 75
confidence_level_25 = 25
# Produce the summary table
summary_table_75 = summarize_flow(df, confidence_level_75, timestamp_col, percent_observed_col, total_flow_col)
summary_table_25 = summarize_flow(df, confidence_level_25, timestamp_col, percent_observed_col, total_flow_col)
# Display the summary table
summary_table = pd.concat([summary_table_75, summary_table_25])
print(summary_table)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  filtered_df['year'] = filtered_df[timestamp_col].dt.year
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  filtered_df['month'] = filtered_df[timestamp_col].dt.month
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  filtered_df['day_of_week'] = filtered_df[timestamp_col].dt.day_name()
A value is trying t

        Station  year  month day_of_week  average_flow  observed_days  \
1        308511  2020      1      Monday   3918.333333              3   
5        308511  2020      1     Tuesday   2922.000000              2   
6        308511  2020      1   Wednesday   8571.000000              1   
4        308511  2020      1    Thursday   7928.000000              1   
0        308511  2020      1      Friday   9210.000000              2   
...         ...   ...    ...         ...           ...            ...   
374170  3900024  2024      2   Wednesday   9477.000000              1   
374168  3900024  2024      2    Thursday   9412.000000              2   
374164  3900024  2024      2      Friday  16971.000000              2   
374166  3900024  2024      2    Saturday   5671.500000              2   
374167  3900024  2024      2      Sunday   8088.000000              1   

        confidence_level  
1                     75  
5                     75  
6                     75  
4              

In [None]:
summary_table.to_csv(r'F:\Research and Analysis\Transportation\TrafficCounts\PEMS\summary_table.csv', index=False)

In [None]:
station_metadata = pd.read_csv(r'F:\Research and Analysis\Transportation\TrafficCounts\PEMS\d03_text_meta_2024_06_04.txt', sep='\t')
station_metadata.to_csv(r'F:\Research and Analysis\Transportation\TrafficCounts\PEMS\station_metadata.csv', index=False)

In [None]:
pems_geodatabase = r'F:\Research and Analysis\Transportation\TrafficCounts\PEMS\PEMS_Locations.gdb'
#make a feature class from the station metadata
import arcpy
arcpy.env.workspace = pems_geodatabase
arcpy.management.XYTableToPoint(r'F:\Research and Analysis\Transportation\TrafficCounts\PEMS\station_metadata.csv', 'station_locations', 'Longitude', 'Latitude')

In [4]:
model_day_lookup = pd.read_csv(r'F:\Research and Analysis\Transportation\TrafficCounts\PEMS\modelday_lookup.csv')

In [5]:
# make timestamp column of df into a date column with just date
df['Date'] = pd.to_datetime(df['Timestamp']).dt.date
model_day_lookup['date'] = pd.to_datetime(model_day_lookup['date']).dt.date

In [12]:
total_flow_col = 'total_flow'
df_model_days = pd.merge(df, model_day_lookup, left_on='Date', right_on='date', how='inner')
percent_observed_col = 'Percent_Observed'
total_flow_col = 'total_flow'
confidence_level_75 = 75
# Filter the DataFrame by the specified confidence level
df_model_days = df_model_days[df_model_days[percent_observed_col] >= confidence_level_75]
df_model_days =df_model_days[df_model_days['modelday'] == 'yes']
df_model_days = df_model_days.drop(columns=['date', 'modelday'])
df_model_days['year'] = df_model_days['Timestamp'].dt.year
df_model_days_grouped = df_model_days.groupby(['Station', 'year']).agg(average_flow=(total_flow_col, 'mean'),
        observed_days=(timestamp_col, 'nunique')
    ).reset_index()

In [14]:
df_model_days_grouped.to_csv(r'F:\Research and Analysis\Transportation\TrafficCounts\PEMS\model_day_average_flow_75.csv')