In [None]:
import ast
import os
import warnings

import dill
import numpy as np
import pandas as pd
from pandas.core.common import SettingWithCopyWarning

from utils import *

warnings.simplefilter(action="ignore", category=SettingWithCopyWarning)

In [None]:
# realpath() is handed the literal string "__file__" (not the variable), so the
# path is resolved against the current working directory and `dirname` is
# normally that directory.
dirname = os.path.dirname(os.path.realpath("__file__"))

# Input and output folders, both expressed relative to `dirname`.
meta_folder, processed_folder = (
    os.path.join(dirname, relative)
    for relative in ('../data/meta', '../processed')
)

# Load configuration file

In [None]:
# load_routes is not defined in this notebook; presumably it is provided by
# `from utils import *` — verify. It returns two values; `all_vdss` is used
# further down to restrict each metadata file's VDS ids via set intersection.
routes, all_vdss = load_routes()


# Gather list of metadata files.
# Save to processed/meta_files.csv

In [None]:
# Build (or reload) the table of metadata files: one row per '.txt' file in
# meta_folder with the district and date parsed from the file name.
if True:  # switch to False to reload the cached CSV instead of rescanning

    rows = list()

    # os.listdir returns names in arbitrary order; sorting makes the table
    # (and the index written to the CSV) reproducible between runs.
    for filename in sorted(os.listdir(meta_folder)):

        stem, extension = os.path.splitext(filename)

        if extension != '.txt':
            continue

        # The stem is split on "_": token 0 is a one-character prefix followed
        # by the district number, tokens 3, 4 and 5 are year, month and day.
        tokens = stem.split("_")
        district = int(tokens[0][1:])
        year = int(tokens[3])
        month = int(tokens[4])
        day = int(tokens[5])

        rows.append([district, year, month, day, filename])

    files_table = pd.DataFrame(rows, columns=['district', 'year', 'month', 'day', 'filename'])

    files_table.to_csv(os.path.join(processed_folder, 'meta_files.csv'))

else:
    # to_csv above stores the row index as an unnamed first column. Read it
    # back as the index so the cached table has exactly the same columns as a
    # freshly built one (otherwise an extra 'Unnamed: 0' column appears).
    files_table = pd.read_csv(os.path.join(processed_folder, 'meta_files.csv'), index_col=0)

In [None]:
# Display the table of metadata files built (or reloaded) above.
files_table

# Create dictionary of district -> VDS ids.
# Save to processed/meta_vds_dict.txt

In [None]:
# Build (or reload) vds_dict: district -> set of VDS ids that appear in that
# district's metadata files and are also present in all_vdss.
if True:  # switch to False to reload the cached text file instead

    def _as_native(value):
        """Return numpy scalars as plain Python values; pass anything else through."""
        return value.item() if isinstance(value, np.generic) else value

    vds_dict = dict()

    for index, row in files_table.iterrows():

        df = pd.read_csv(os.path.join(meta_folder, row['filename']), sep='\t')

        # A file without data rows has no district to read and no VDS to add.
        if df.empty:
            continue

        # The district is taken from the file content (first row), not from
        # the file name.
        district = _as_native(df.loc[0, 'District'])

        # Keep only the ids that are also in all_vdss. Keys and ids are stored
        # as plain Python values: the dictionary is saved via str(), and
        # NumPy >= 2 renders scalars as e.g. "np.int64(7)", which
        # ast.literal_eval in the reload branch cannot parse.
        wanted = {_as_native(vds) for vds in set(df['ID'].values).intersection(all_vdss)}

        vds_dict.setdefault(district, set()).update(wanted)

    with open(os.path.join(processed_folder, 'meta_vds_dict.txt'), 'w') as f:
        f.write(str(vds_dict))

else:

    with open(os.path.join(processed_folder, 'meta_vds_dict.txt')) as f:
        vds_dict = ast.literal_eval(f.read())

In [None]:
# Display the district -> VDS ids dictionary built (or reloaded) above.
vds_dict

# Gather metadata history for each VDS
# Save as `processed/meta_vds_hist_<district>.pickle`

In [None]:
# For every district, build one table per VDS holding its metadata rows from
# all of the district's files (stamped with the file date), keep only the rows
# that differ from the previous date, and pickle the resulting
# {vds: DataFrame} dictionary.
if True:

    for district, vdss in vds_dict.items():

        if len(vdss)==0:
            continue

        district_files = files_table[files_table['district']==district]

        if district_files.empty:
            # No snapshot exists for this district, so every history is empty;
            # an empty dictionary is still written so later cells find the file.
            print(f'WARNING: no metadata files listed for district {district}.')

        # Collect the per-file slices in lists and concatenate once per VDS;
        # concatenating inside the loop re-copies the growing table every time.
        snapshots = {vds: [] for vds in vdss}

        for ind, file_row in district_files.iterrows():

            # Access by column label: positional access on a label-indexed
            # Series is deprecated in pandas and depends on exact column order.
            date = pd.Timestamp(year=file_row['year'], month=file_row['month'], day=file_row['day'])

            df = pd.read_csv(os.path.join(meta_folder, file_row['filename']), sep='\t')

            for vds in vdss:
                snapshot = df.loc[df['ID']==vds, :].copy()
                snapshot.insert(0, 'date', date)
                snapshots[vds].append(snapshot)

        vds_tables = dict()

        for vds, frames in snapshots.items():

            if not frames:
                continue

            vds_table = pd.concat(frames).sort_values('date')

            # keep differences: drop a row when every non-date column equals
            # the previous row's value (or both are NaN)
            value_cols = [col for col in vds_table.columns if col != 'date']
            current = vds_table[value_cols]
            previous = current.shift()
            both_nan = previous.isna() & current.isna()
            equal_val = previous == current
            unchanged = (both_nan | equal_val).all(axis=1)

            # Copy so the State_PM assignment below writes to an independent
            # frame rather than to a slice of the unfiltered table.
            vds_table = vds_table.loc[~unchanged].copy()

            # correct State_PM that contain characters: strip alphabetic
            # characters from the text form of each value, then cast to float
            try:
                vds_table['State_PM'] = [
                    float(''.join(ch for ch in text if not ch.isalpha()))
                    for text in (str(value) for value in vds_table['State_PM'])
                ]
            except ValueError:
                # float() rejected at least one cleaned value; the column is
                # left unchanged for this VDS.
                print(f"Warning: Cannot cast state PM for vds {vds} to float: {vds_table['State_PM']}")

            vds_tables[vds] = vds_table

        with open(os.path.join(processed_folder, f'meta_vds_hist_{district}.pickle'),'wb') as f:
            dill.dump(vds_tables,f)

# Save as `processed/meta_vds_hist_<district>.csv`

In [None]:
# For every district with at least one VDS, reload the pickled
# {vds: DataFrame} dictionary and write all of its tables, stacked, to one CSV.
if True:

    for district, vdss in vds_dict.items():

        print(district)

        if len(vdss)==0:
            continue

        # NOTE(review): unpickling can execute arbitrary code; only load
        # pickle files that were produced by this notebook.
        with open(os.path.join(processed_folder, f'meta_vds_hist_{district}.pickle'),'rb') as f:
            vds_tables = dill.load(f)

        # Stack all tables in a single concat instead of growing the frame one
        # table at a time (which re-copies the accumulated rows on every step).
        tables = list(vds_tables.values())
        df = pd.concat(tables) if tables else pd.DataFrame()

        df.to_csv(os.path.join(processed_folder, f'meta_vds_hist_{district}.csv'))

# Check VDS info

In [None]:
# For every district, reload the pickled per-VDS histories and warn about each
# VDS whose history holds more than one distinct value in a checked column.
if True:

    # Checked column -> wording used in the warning, in reporting order.
    checked_columns = {
        'Fwy': 'freeways',
        'Dir': 'directions',
        'District': 'districts',
        'County': 'counties',
        'State_PM': 'state PMs',
        'Abs_PM': 'absolute PMs',
        'Latitude': 'latitude',
        'Longitude': 'longitude',
        'Type': 'type',
        'Lanes': 'lanes',
    }

    for district, vdss in vds_dict.items():

        print(district)

        if len(vdss)==0:
            continue

        # NOTE(review): unpickling can execute arbitrary code; only load
        # pickle files that were produced by this notebook.
        with open(os.path.join(processed_folder, f'meta_vds_hist_{district}.pickle'),'rb') as f:
            vds_tables = dill.load(f)

        for vds, vds_table in vds_tables.items():

            for column, label in checked_columns.items():

                # nunique(dropna=False) counts NaN as one value and does not
                # sort, so it also works on object columns that mix strings
                # with NaN, where np.unique raises TypeError.
                if vds_table[column].nunique(dropna=False) > 1:
                    print(f'WARNING: VDS {vds} has multiple {label}.')