In [1]:
import os
import gzip
import csv
import datetime as dt
import pandas as pd
import numpy as np

In [None]:
# Collect the names of the gzip-compressed files found in the data directory.
directory_path = 'data/'
# os.listdir returns entries in arbitrary order, so the names are sorted to keep
# the listing (and anything that later iterates over it) reproducible between runs.
# os.fsdecode is retained so the suffix test also works if a name arrives as bytes.
file_list = sorted(
    filename
    for filename in map(os.fsdecode, os.listdir(directory_path))
    if filename.endswith('.gz')
)

file_list

## Notes

### Filter the data:

- only lane type = `ML` or `HV` will be collected
- `total_flow` and `avg_speed` are kept as observed when `pct_observed >= 70`; values with `pct_observed < 70` are replaced by interpolation

In [2]:
# Load one day of 5-minute station data and keep only the HV / ML lane types.
directory_path = 'data/'
pemsdata_columns = [
    'timestamp', 'station', 'district', 'freeway_num', 'direction',
    'lane_type', 'station_length', 'samples', 'pct_observed', 'total_flow',
    'avg_occupancy', 'avg_speed',
]

# The file is read without a header row; only the first 12 columns are loaded
# and then labelled with the names above.
source_file = directory_path + 'd12_text_station_5min_2019_10_08.txt.gz'
raw_df = pd.read_csv(source_file, header=None, usecols=range(12))
raw_df.columns = pemsdata_columns
raw_df['timestamp'] = pd.to_datetime(raw_df['timestamp'], format="%m/%d/%Y %H:%M:%S")

# Keep only rows whose lane_type is 'HV' or 'ML'.
wanted_lane = raw_df['lane_type'].isin(['HV', 'ML'])
raw_df = raw_df[wanted_lane]
# Optional parquet export, left disabled.
# NOTE(review): after the lane filter the index label 0 may no longer exist, so a
# positional lookup (.iloc[0]) is used here instead of ['timestamp'][0].
# raw_df.to_parquet(directory_path + "d12_5min_" + f"{raw_df['timestamp'].iloc[0].strftime('%y%m%d')}" + ".parquet")


In [45]:
# Keep total_flow where pct_observed is at least 70; otherwise replace it with an
# interpolated value.
# Series.interpolate() only fills entries that are already NaN, so the low-quality
# readings have to be masked to NaN *before* interpolating. Passing the interpolated
# series as the `other` argument of where() would simply hand the original readings
# back for those rows.
# NOTE(review): this interpolates along the frame's row order over all rows at once;
# confirm that neighbouring rows are comparable, or interpolate per group instead.
raw_df['total_flow'].where(raw_df['pct_observed'] >= 70).interpolate()

2          8.0
5         11.0
7         44.0
9          3.0
10        67.0
          ... 
695512    11.0
695513     0.0
695514     4.0
695518    93.0
695519     1.0
Name: total_flow, Length: 443808, dtype: float64

In [3]:
# Set total_flow to NaN where pct_observed is below 70, then fill those gaps by
# interpolating within each freeway_num group.
# interpolate() only acts on missing values, and where() returns a new Series
# without modifying raw_df, so the masked result must be kept and used as the
# input of the grouped interpolation.
flow_masked = raw_df['total_flow'].where(raw_df['pct_observed'] >= 70)

# transform() returns a Series aligned to the original index, whereas the index
# shape produced by groupby(...).apply(...) differs between pandas versions.
flow_interpolated = (
    flow_masked
    .groupby(raw_df['freeway_num'])
    .transform(lambda group: group.interpolate())
)
flow_interpolated

2          8.0
5         11.0
7         44.0
9          3.0
10        67.0
          ... 
695512    11.0
695513     0.0
695514     4.0
695518    93.0
695519     1.0
Name: total_flow, Length: 443808, dtype: float64

In [49]:
# Sanity check: count the rows where plain interpolation of total_flow (with any
# remaining NaN set to 0) equals the where()-based version that falls back to the
# interpolated series when pct_observed is below 70.
# If the count equals the number of rows, the where()-based fallback changes nothing
# compared with interpolating the column directly.
flow_interp = raw_df['total_flow'].interpolate()
flow_where = raw_df['total_flow'].where(raw_df['pct_observed'] >= 70, flow_interp)
flow_interp.fillna(0).eq(flow_where).sum()

443808