In [None]:
import numpy as np
from utils import *
import warnings
from pandas.core.common import SettingWithCopyWarning
import logging

warnings.simplefilter(action="ignore", category=SettingWithCopyWarning)

In [None]:
# Input folder with the raw metadata files and output folder for results.
meta_folder = os.path.join(get_data_folder(), 'meta')
processed_folder = get_processed_folder()

# Map every route name to whatever load_ordered_route_vdss_from_excel returns
# for it (later cells skip entries that are None).
route_names = load_route_names()
routes = {}
for rname in route_names:
    routes[rname] = load_ordered_route_vdss_from_excel(rname)

# Send log records of level INFO and above to a file that is truncated on
# every run (filemode 'w').
logging.basicConfig(
    filename='process_meta.log',
    filemode='w',
    level=logging.INFO,
    encoding='utf-8',
)

In [None]:
# Inspect the per-route mapping built in the previous cell.
routes

# All VDSs to process

In [None]:
# Union of the entries of every route; routes stored as None are ignored.
route_vdss = set().union(
    *(vds_ids for vds_ids in routes.values() if vds_ids is not None)
)

In [None]:
# Inspect the combined set of VDS ids collected from all routes.
route_vdss

# Gather list of metadata files.

In [None]:
# Build one row per metadata file from the information encoded in its name.
# After dropping the '.txt' extension the name is split on '_':
#   field 0 without its first character -> district number
#   fields 3, 4, 5                      -> year, month, day
rows = []

# os.listdir returns names in arbitrary order; sorting makes the resulting
# table identical from run to run.
for filename in sorted(os.listdir(meta_folder)):

    stem, extension = os.path.splitext(filename)
    if extension != '.txt':
        continue

    fields = stem.split('_')
    try:
        district = int(fields[0][1:])
        year, month, day = (int(fields[pos]) for pos in (3, 4, 5))
    except (IndexError, ValueError):
        # A stray .txt file with a different naming scheme should not abort
        # the whole run; record it and move on.
        logging.warning(f'Skipping metadata file with unexpected name: {filename}')
        continue

    rows.append([district, year, month, day, filename])

files_table = pd.DataFrame(rows, columns=['district', 'year', 'month', 'day', 'filename'])

In [None]:
# Inspect the table of parsed metadata file names
# (columns: district, year, month, day, filename).
files_table

# Create dictionary of district -> VDS ids.

In [None]:
# district -> set of VDS ids that appear in that district's metadata files
# AND belong to one of the routes.
district2vdss = {}

for _, row in files_table.iterrows():

    meta_path = os.path.join(meta_folder, row['filename'])
    df = pd.read_csv(meta_path, sep='\t', dtype={'ID': str})

    # The district key is read from the first data row of the file itself.
    district = df.loc[0, 'District']

    # Keep only the ids of this file that are part of some route.
    dfvds = route_vdss.intersection(df['ID'].values)

    district2vdss[district] = district2vdss.get(district, set()) | dfvds

In [None]:
# Inspect the district -> set of route VDS ids mapping built above.
district2vdss

# Gather meta data history for each VDS
# Save as processed/meta_district_vds_hist_{district}.csv

In [None]:
def check_vds(logging, vds_table, vds):
    """Log a warning for every monitored attribute of a VDS that varies.

    Args:
        logging: Object exposing ``warning(msg)``; the notebook passes the
            ``logging`` module itself.
        vds_table: DataFrame with the metadata rows of a single VDS. It must
            contain every column listed in ``checks`` below.
        vds: Identifier of the VDS; only used in the log messages.

    Returns:
        None. The only effect is the emitted warnings, one per column that
        holds more than one distinct value (NaN counts as a single value).

    Raises:
        KeyError: if one of the monitored columns is missing.
    """
    # (column name, wording used in the warning), in reporting order.
    checks = (
        ('Fwy', 'freeways'),
        ('Dir', 'directions'),
        ('District', 'districts'),
        ('County', 'counties'),
        ('State_PM', 'state PMs'),
        ('Abs_PM', 'absolute PMs'),
        ('Latitude', 'latitude'),
        ('Longitude', 'longitude'),
        ('Type', 'type'),
        ('Lanes', 'lanes'),
    )

    for column, label in checks:
        # nunique is hash based, so unlike np.unique it does not have to sort
        # the values: a column mixing strings and NaN no longer raises
        # TypeError, and repeated NaNs are counted once on every numpy version.
        if vds_table[column].nunique(dropna=False) > 1:
            logging.warning(f'VDS {vds} has multiple {label}.')

In [None]:
# Columns that are ignored when deciding whether two consecutive metadata
# rows of a VDS differ.
_IGNORED_DIFF_COLUMNS = ('date', 'User_ID_1', 'User_ID_2', 'User_ID_3', 'User_ID_4')


def _state_pm_to_float(value):
    """Parse one State_PM entry as float, ignoring alphabetic characters.

    Missing values are returned as NaN. Converting them to text first would
    yield 'nan', which loses every character and then fails to parse.

    Raises:
        ValueError: if what remains after removing letters is not a number.
    """
    if pd.isna(value):
        return np.nan
    return float(''.join(ch for ch in str(value) if not ch.isalpha()))


for district, vdss in district2vdss.items():

    print(district)

    if len(vdss) == 0:
        continue

    district_files = files_table[files_table['district'] == district]
    if district_files.empty:
        # Without files there is nothing to build a history from.
        logging.warning(f'No metadata files found for district {district}.')
        continue

    # vds -> list of per-file row selections. They are concatenated once per
    # VDS afterwards instead of growing a DataFrame inside the nested loop.
    pieces = {vds: [] for vds in vdss}

    for _, file_row in district_files.iterrows():

        # Access by label: integer keys on a labelled Series are deprecated
        # as positional lookups in pandas.
        file_date = pd.Timestamp(year=int(file_row['year']),
                                 month=int(file_row['month']),
                                 day=int(file_row['day']))

        df = pd.read_csv(os.path.join(meta_folder, file_row['filename']),
                         dtype={'ID': str}, sep='\t')
        # Stamp every row with the date encoded in the file name.
        df.insert(0, 'date', file_date)

        for vds in vdss:
            pieces[vds].append(df.loc[df['ID'] == vds, :])

    # keep differences
    district_tables = []
    for vds, vds_pieces in pieces.items():

        vds_table = pd.concat(vds_pieces)

        # correct State_PM that contain characters
        try:
            vds_table['State_PM'] = [_state_pm_to_float(pm) for pm in vds_table['State_PM']]
        except (TypeError, ValueError):
            # Leave the column untouched when an entry cannot be parsed.
            logging.warning(f"Cannot cast state PM for vds {vds} to float: {vds_table['State_PM']}")

        vds_table = vds_table.sort_values('date')

        # A row is dropped when, in every compared column, it either equals
        # the previous row or both values are missing.
        cols = np.setdiff1d(vds_table.columns.values, _IGNORED_DIFF_COLUMNS)
        current = vds_table[cols]
        previous = current.shift()
        unchanged = ((previous.isna() & current.isna()) | (previous == current)).all(axis=1)
        vds_table = vds_table.loc[~unchanged]

        check_vds(logging, vds_table, vds)

        district_tables.append(vds_table)

    district_hist = pd.concat(district_tables)
    district_hist.to_csv(os.path.join(processed_folder, f'meta_district_vds_hist_{district}.csv'))

# Tables for routes

In [None]:
# For every route, write the metadata rows of its VDSs, in the order in which
# the route lists them, to processed/meta_route_vds_hist_{route_name}.csv.
for route_name, vdss in routes.items():
    print(route_name)

    # Identity test, matching the `is not None` check used when collecting
    # route_vdss; `== None` is ambiguous if the entry is an array-like.
    if vdss is None:
        continue

    districts = get_districts_for_vdss(vdss)
    vds_table = load_districts_config(districts)

    # One selection per VDS, concatenated once instead of growing the table
    # inside the loop starting from an empty frame.
    parts = [vds_table[vds_table['ID'] == vds] for vds in vdss]
    route_table = pd.concat(parts) if parts else pd.DataFrame()

    route_table.to_csv(os.path.join(processed_folder, f'meta_route_vds_hist_{route_name}.csv'))

print('done')