WARNING: Remember to remove hourly files from the processed folder before running this script. This script is cumulative in the sense that it appends to existing processed files, so if you do not remove the files, they will contain redundant information which may cause downstream methods to fail.

In [None]:
from utils import *
import logging

# Base directory for the relative data paths below. "__file__" is a quoted string
# literal here (not the module attribute), so realpath() resolves it against the
# current working directory and `dirname` ends up being that directory.
dirname = os.path.dirname(os.path.realpath("__file__"))
meta_folder = os.path.join(dirname, '../data/meta')
# Folder scanned for the raw hourly .txt files.
hourly_folder = os.path.join(dirname, '../data/hourly')
# Destination of the per-station "<vds>_hourly.csv" files; get_processed_folder()
# is not defined in this file (presumably provided by `utils` -- confirm).
processed_folder = get_processed_folder()
# filemode='w' truncates process_hourly.log on every run, so the log only ever
# reflects the most recent execution. The `encoding` keyword needs Python 3.9+.
logging.basicConfig(filename='process_hourly.log',
                    encoding='utf-8',
                    level=logging.INFO,
                    filemode='w')

def load_filestable(my_district):
    """Index the hourly ``.txt`` files in ``hourly_folder`` for one district.

    The file name (without extension) is split on underscores: the first token
    is one leading character followed by the district number, the fifth token
    is the year and the sixth is the month -- i.e. names shaped like
    ``d07_text_station_hour_2019_01.txt``.

    Files without a ``.txt`` extension are ignored. ``.txt`` files whose name
    does not follow that layout are skipped with a logged warning instead of
    aborting the whole run.

    Args:
        my_district: District number (int) to select.

    Returns:
        pandas.DataFrame with columns ``district``, ``year``, ``month`` and
        ``filename`` (base name only), one row per matching file, ordered by
        year, then month, then file name.
    """
    rows = []
    for filename in os.listdir(hourly_folder):

        stem, extension = os.path.splitext(filename)
        if extension != '.txt':
            continue

        tokens = stem.split("_")
        try:
            district = int(tokens[0][1:])
            year = int(tokens[4])
            month = int(tokens[5])
        except (IndexError, ValueError):
            # A stray text file (notes, readme, ...) must not kill the run.
            logging.warning("skipping %s: unexpected file name format", filename)
            continue

        if district != my_district:
            continue

        rows.append([district, year, month, filename])

    # os.listdir() returns entries in arbitrary order; sort so that files are
    # always processed in the same (chronological) sequence.
    rows.sort(key=lambda entry: (entry[1], entry[2], entry[3]))

    return pd.DataFrame(rows, columns=['district', 'year', 'month', 'filename'])

In [None]:
# Districts to process; each one is handled independently by the loop below.
all_districts = [7, 3, 8, 12]

# Names of the leading columns of every hourly file (the files have no header
# row). Any columns beyond these are treated as per-lane triples of
# (flow, average occupancy, average speed).
cols = ['timestamp', 'station', 'district', 'route', 'dir', 'lanetype', 'stn_length', 'samples', 'perc_obs',
        'total_flow', 'avg_occ', 'avg_speed', 'delay_35', 'delay_40', 'delay_45', 'delay_50', 'delay_55', 'delay_60']

for my_district in all_districts:

    # Only stations listed in the district's VDS table are kept.
    vds_table = load_vds_table(my_district)
    all_vdss = set(vds_table['ID'].values)

    files_table = load_filestable(my_district)

    for hourly_name in files_table['filename']:

        text_file = os.path.join(hourly_folder, hourly_name)

        print(text_file)

        df = pd.read_csv(text_file, header=None)

        # figure out the header for the file: the fixed columns followed by
        # one (flow, occupancy, speed) triple per lane
        nlanes = (df.shape[1] - len(cols)) // 3
        lane_cols = [f'{prefix}_{lane}'
                     for lane in range(1, nlanes + 1)
                     for prefix in ('lane_flw', 'lane_avg_occ', 'lane_avg_spd')]
        df.columns = cols + lane_cols

        # filter all_vdss
        df = df[df['station'].isin(all_vdss)]

        # Drop lane information and the other columns that are not needed in
        # the processed files
        df = df.drop(columns=lane_cols + ['district', 'delay_35', 'delay_40', 'delay_45',
                                          'delay_50', 'delay_55', 'delay_60'])

        # store in files per vds
        for vds in all_vdss:

            filename = os.path.join(processed_folder, f'{vds}_hourly.csv')
            already_processed = exists(filename)

            is_vds = df['station'] == vds
            if not is_vds.any():
                logging.warning(f"{vds} not found in {text_file}")
                if already_processed:
                    # Nothing new to append: re-reading and rewriting the
                    # existing file would only cost I/O. When no file exists
                    # yet we fall through so a header-only file is still
                    # created.
                    continue

            df_vds = df[is_vds].set_index('timestamp')

            if already_processed:
                # Cumulative behaviour: new rows are appended to whatever the
                # processed file already contains (duplicates are NOT removed).
                a = pd.read_csv(filename).set_index('timestamp')
                df_vds = pd.concat((a, df_vds), ignore_index=False)

            # NOTE(review): timestamps are never parsed, so this orders them as
            # read_csv returned them (lexicographically if they are strings) --
            # confirm that this matches chronological order for the timestamp
            # format used in the hourly files.
            df_vds = df_vds.sort_index()
            df_vds.to_csv(filename)

print('done')