In [None]:
import ast
from os.path import exists
from utils import *

In [None]:
# Base directory for all data paths. "__file__" is deliberately a string
# literal here: realpath() resolves it against the current working directory,
# so `dirname` ends up being the directory the notebook is run from.
dirname = os.path.dirname(os.path.realpath("__file__"))

# Input folders (raw meta / hourly data) and the output folder, all relative
# to `dirname`.
meta_folder, hourly_folder, processed_folder = [
    os.path.join(dirname, relative_path)
    for relative_path in ('../data/meta', '../data/hourly', '../processed')
]

# Load config

In [None]:
# config.txt is looked up in the current working directory and parsed as a
# Python literal (literal_eval never executes code from the file).
# Further down, `config` is iterated with .items() as
# {configuration name: collection of station ids}.
with open('config.txt') as config_file:
    config_text = config_file.read()

config = ast.literal_eval(config_text)

# Choose a district, load metadata

In [None]:
# Read the dictionary stored in meta_vds_dict.txt (a Python literal); its keys
# are what the interactive district prompt below would offer.
vds_dict_path = os.path.join(processed_folder, 'meta_vds_dict.txt')
with open(vds_dict_path) as vds_dict_file:
    vds_dict = ast.literal_eval(vds_dict_file.read())

# District to process. Swap in the commented-out prompt to choose interactively.
my_district = 4 # int(input('Choose a district: {}'.format(vds_dict.keys())))

# load_vds_table is not defined in this notebook (presumably it comes from
# `utils`); it returns the station table and the station ids for the district.
vds_table, all_vdss = load_vds_table(processed_folder,my_district)

# Collect table of files available for the given district

In [None]:
# Index the raw hourly text files that belong to `my_district`.
#
# The file stem is split on "_": token 0 is one letter followed by the
# district number, token 4 is the year and token 5 is the month.
rows = []
for filename in os.listdir(hourly_folder):

    stem, extension = os.path.splitext(filename)

    if extension != '.txt':
        continue

    tokens = stem.split("_")
    try:
        district = int(tokens[0][1:])
        year = int(tokens[4])
        month = int(tokens[5])
    except (IndexError, ValueError):
        # A .txt file that does not follow the naming scheme (notes, readme,
        # ...) should not abort the whole scan.
        print(f'Skipping {filename}: unexpected file name')
        continue

    if district != my_district:
        continue

    rows.append([district, year, month, filename])

# os.listdir() returns entries in arbitrary order. The processing step appends
# to the per-station CSV files in table order, so sort chronologically to keep
# those files (and anything plotted from them) in time order.
files_table = (
    pd.DataFrame(rows, columns=['district', 'year', 'month', 'filename'])
    .sort_values(['year', 'month', 'filename'])
    .reset_index(drop=True)
)

# Process each file
# Save to {vds}_hourly.csv

In [None]:
# Set to True to (re)build the per-station {vds}_hourly.csv files.
# Disabled by default: the step is slow and it APPENDS to files that already
# exist, so running it twice over the same raw files duplicates their rows.
# Delete the old {vds}_hourly.csv files first for a clean rebuild.
RUN_HOURLY_PROCESSING = False

if RUN_HOURLY_PROCESSING:

    # Fixed leading columns of every raw hourly file. They are followed by
    # three columns (flow, occupancy, speed) for each lane.
    cols = ['timestamp', 'station', 'district', 'route', 'dir', 'lanetype', 'stn_length', 'samples', 'perc_obs',
            'total_flow', 'avg_occ', 'avg_speed', 'delay_35', 'delay_40', 'delay_45', 'delay_50', 'delay_55', 'delay_60']

    # Fixed columns that are not carried over to the per-station files.
    unused_cols = ['district', 'delay_35', 'delay_40', 'delay_45', 'delay_50', 'delay_55', 'delay_60']

    for _, row in files_table.iterrows():

        text_file = os.path.join(hourly_folder, row['filename'])

        print(text_file)

        df = pd.read_csv(text_file, header=None)

        # The raw files have no header and the number of lane columns varies
        # from file to file, so derive the lane count from the file width.
        n_lane_cols = df.shape[1] - len(cols)
        if n_lane_cols < 0 or n_lane_cols % 3 != 0:
            raise ValueError(
                f'{text_file}: {df.shape[1]} columns cannot be split into '
                f'{len(cols)} fixed columns plus 3 columns per lane'
            )
        nlanes = n_lane_cols // 3

        lane_cols = []
        for lane in range(1, nlanes + 1):
            lane_cols.extend([f'lane_flw_{lane}', f'lane_avg_occ_{lane}', f'lane_avg_spd_{lane}'])

        df.columns = cols + lane_cols

        # Keep only the stations of interest, then drop the per-lane columns
        # and the other unused columns in one go.
        df = df[df['station'].isin(list(all_vdss))]
        df = df.drop(columns=lane_cols + unused_cols)

        # Store in one file per vds
        for vds in all_vdss:

            df_vds = df[df['station'] == vds].set_index('timestamp')

            filename = os.path.join(processed_folder, f'{vds}_hourly.csv')
            is_new_file = not exists(filename)

            # Nothing to add to an existing file. A missing file is still
            # created (header only) so every vds ends up with a CSV.
            if df_vds.empty and not is_new_file:
                continue

            # Append instead of re-reading and rewriting the whole file for
            # every raw file. Columns are always written in the same order, so
            # appended rows line up with the existing header.
            df_vds.to_csv(filename, mode='a', header=is_new_file)

    print('done')

# Plot

In [None]:
# Load one station's processed hourly file for the plots below; the
# timestamp column becomes the index.
filename = os.path.join(processed_folder, '400045_hourly.csv')
df_vds = pd.read_csv(filename).set_index('timestamp')

In [None]:
# Line plot of the total_flow column against the frame's index
df_vds['total_flow'].plot.line()

In [None]:
# Line plot of the avg_occ column against the frame's index
df_vds['avg_occ'].plot.line()

In [None]:
# Line plot of the avg_speed column against the frame's index
df_vds['avg_speed'].plot.line()

# Daily station health
# Save to vds_health_{cfg_name}.csv

In [None]:
# For every configuration, build a table with one row per calendar day and one
# column per station, holding that station's mean `perc_obs` over the day
# (NaN where a station has no rows for the day), and save it as
# vds_health_{cfg_name}.csv in the processed folder.
for cfg_name, cfg_vdss in config.items():

    print(cfg_name)

    # Daily mean perc_obs per station: {vds: Series indexed by date}
    daily_means = {}
    for vds in cfg_vdss:

        # Each hourly file is read once (it used to be read twice).
        filename = os.path.join(processed_folder,f'{vds}_hourly.csv')
        df_vds = pd.read_csv(filename)
        df_vds['date'] = [pd.Timestamp(t).date() for t in df_vds['timestamp']]

        # One grouped pass instead of re-scanning the frame for every day.
        # astype(float) keeps the mean well defined for a header-only file,
        # whose columns would otherwise be of object dtype.
        daily_means[vds] = df_vds['perc_obs'].astype(float).groupby(df_vds['date']).mean()

    # Rows are the union of all days seen in any station file, in date order;
    # columns follow the order given in the configuration.
    vds_health = pd.DataFrame(daily_means, columns=list(cfg_vdss)).sort_index()
    # Keep the saved index column unnamed, as in earlier output files.
    vds_health.index.name = None

    # Save to file
    filename = os.path.join(processed_folder,f'vds_health_{cfg_name}.csv')
    vds_health.to_csv(filename)