# Overview

The goal of this file is to load the raw database files that ia in the `raw_data` folder, extract and format the data in a way that is usefull for training the model. The output files are saved in the `datasets` folder.

Produces:
- 'datasets/train.csv' : training data. 80% of the data.
- 'datasets/test.csv' : testing data. 20% of the data.

In [None]:
import pandas as pd
import numpy as np
from tqdm import tqdm
import sys
import os
# install pyodbc
os.system('pip install pyodbc')
import pyodbc

ModuleNotFoundError: No module named 'pyodbc'

In [2]:
conn_str = (
    r'DRIVER={Microsoft Access Driver (*.mdb, *.accdb)};'
    r'DBQ=./Bucket_114495.mdb;'
)
conn = pyodbc.connect(conn_str)

# Get a list of all tables
cursor = conn.cursor()
table_list = [row.table_name for row in cursor.tables(tableType='TABLE')]

print(table_list)

NameError: name 'pyodbc' is not defined

In [None]:
#load and write the IRI data csv file to overwrite the MON_HSS_PROFILE_SECTION parquet file
iri_data = pd.read_csv('raw_data/IRI_DATA.csv')
iri_data.to_parquet('./datasets/MON_HSS_PROFILE_SECTION.parquet')

In [None]:
rut_data = pd.read_csv('raw_data/Materials.csv')
rut_data.to_parquet('./datasets/MATERIALS.parquet')

In [None]:
#load all data from parquet files
data_dir = './datasets/'
data = {}
for file in tqdm(os.listdir(data_dir)):
    if file.endswith('.parquet'):
        data[".".join(file.split('.')[:-1])] = pd.read_parquet(data_dir + file)

# Preprocessing

This will happen in a few steps.

- Weather data will be loaded and processed into a single useful dataframe.
- IRI dataframe will be loaded and columns from other dataframes will be added to it.
- The final result will be saved to another parquet file.

In [None]:
HUMIDITY = data['CLM_VWS_HUMIDITY_ANNUAL'].set_index(['SHRP_ID', 'STATE_CODE'])[['MAX_ANN_HUM_AVG', 'MIN_ANN_HUM_AVG']]
HUMIDITY = HUMIDITY.groupby(['SHRP_ID', 'STATE_CODE']).mean()
print(HUMIDITY)

In [None]:
TEMPERATURE = data['CLM_VWS_TEMP_ANNUAL'].set_index(['SHRP_ID', 'STATE_CODE'])[['MEAN_ANN_TEMP_AVG', 'FREEZE_THAW_YR']]
TEMPERATURE = TEMPERATURE.groupby(['SHRP_ID', 'STATE_CODE']).mean()
print(TEMPERATURE)

In [None]:
PRECIPIATION = data['CLM_VWS_PRECIP_ANNUAL'].set_index(['SHRP_ID', 'STATE_CODE'])[['TOTAL_ANN_PRECIP', 'TOTAL_SNOWFALL_YR']]
PRECIPIATION = PRECIPIATION.groupby(['SHRP_ID', 'STATE_CODE']).mean()
print(PRECIPIATION)

In [None]:
CONSTRUCTION_MATERIAL = data['MATERIALS'].set_index(['SHRP_ID', 'STATE_CODE'])[['CONSTRUCTION_NO', 'LAYER_NO', 'LAYER_TYPE']]
# filter for the largest construction number
CONSTRUCTION_MATERIAL = CONSTRUCTION_MATERIAL[
    CONSTRUCTION_MATERIAL['CONSTRUCTION_NO'] == CONSTRUCTION_MATERIAL.groupby(['SHRP_ID', 'STATE_CODE'])['CONSTRUCTION_NO'].transform('max')]
# filter for the largest layer number
CONSTRUCTION_MATERIAL = CONSTRUCTION_MATERIAL[
    CONSTRUCTION_MATERIAL['LAYER_NO'] == CONSTRUCTION_MATERIAL.groupby(['SHRP_ID', 'STATE_CODE'])['LAYER_NO'].transform('max')]
CONSTRUCTION_MATERIAL = CONSTRUCTION_MATERIAL[['LAYER_TYPE']]
# replace with index to convert Character codes to numeric codes
CONSTRUCTION_MATERIAL['LAYER_TYPE'] = pd.factorize(CONSTRUCTION_MATERIAL['LAYER_TYPE'])[0]
print(CONSTRUCTION_MATERIAL)

In [None]:
RUT = data['MON_T_PROF_INDEX_SECTION'].set_index(['SHRP_ID', 'STATE_CODE'])[['MAX_MEAN_DEPTH_WIRE_REF']]
RUT = RUT.groupby(['SHRP_ID', 'STATE_CODE']).mean()
print(RUT)

In [None]:
FINAL_DATA_DIR = './training_data/'


IRI = data['MON_HSS_PROFILE_SECTION'].set_index(['SHRP_ID', 'STATE_CODE'])
IRI = IRI[['VISIT_DATE', 'IRI_LEFT_WHEEL_PATH', 'IRI_RIGHT_WHEEL_PATH']]
IRI.reset_index(inplace=True)
IRI['VISIT_DATE'] = pd.to_datetime(IRI['VISIT_DATE'], format='%m/%d/%Y')
IRI['IRI_LEFT_WHEEL_PATH'] = IRI['IRI_LEFT_WHEEL_PATH'].astype(float)
IRI['IRI_RIGHT_WHEEL_PATH'] = IRI['IRI_RIGHT_WHEEL_PATH'].astype(float)
IRI = IRI.groupby(['SHRP_ID', 'STATE_CODE', 'VISIT_DATE'], as_index=False).agg({
    'IRI_LEFT_WHEEL_PATH': 'mean',
    'IRI_RIGHT_WHEEL_PATH': 'mean'
})
IRI.set_index(['SHRP_ID', 'STATE_CODE'], inplace=True)

IRI.to_parquet(FINAL_DATA_DIR + 'IRI-only.parquet')
IRI.to_csv(FINAL_DATA_DIR + 'IRI-only.csv')

ESAL = data['TRF_HIST_EST_ESAL'].set_index(['SHRP_ID', 'STATE_CODE'])
ESAL = ESAL['AADT_ALL_VEHIC']

CRACK = data['MON_DIS_AC_CRACK_INDEX'].set_index(['SHRP_ID', 'STATE_CODE'])
CRACK = CRACK['MEPDG_TRANS_CRACK_LENGTH_AC']
CRACK = CRACK.astype(float)

CONSTRUCTION = data['CONSTRUCTION_HIST'].set_index(['SHRP_ID', 'STATE_CODE'])
CONSTRUCTION = CONSTRUCTION[['IMP_DATE', 'IMP_TYPE']]

# Merge traffic and IRI
final = IRI.merge(ESAL, how='left', left_index=True, right_index=True)
# Merge crack data
final = final.merge(CRACK, how='left', left_index=True, right_index=True)
# Merge weather data
final = final.merge(HUMIDITY, how='left', left_index=True, right_index=True)
final = final.merge(TEMPERATURE, how='left', left_index=True, right_index=True)
final = final.merge(PRECIPIATION, how='left', left_index=True, right_index=True)
# Merge construction material data
final = final.merge(CONSTRUCTION_MATERIAL, how='left', left_index=True, right_index=True)
# Merge rutting data
final = final.merge(RUT, how='left', left_index=True, right_index=True)

final = final.reset_index()


final = final.groupby(['SHRP_ID',
                       'STATE_CODE',
                       'VISIT_DATE', 
                       'IRI_LEFT_WHEEL_PATH',
                       'IRI_RIGHT_WHEEL_PATH',
                       'MAX_ANN_HUM_AVG',
                       'MIN_ANN_HUM_AVG',
                       'MEAN_ANN_TEMP_AVG',
                       'FREEZE_THAW_YR',
                       'TOTAL_ANN_PRECIP',
                       'TOTAL_SNOWFALL_YR',
                       'LAYER_TYPE',
                       'MAX_MEAN_DEPTH_WIRE_REF'], as_index=False).agg({
    'AADT_ALL_VEHIC': 'mean',
    'MEPDG_TRANS_CRACK_LENGTH_AC': 'mean'
})

final.set_index(['SHRP_ID', 'STATE_CODE'], inplace=True)
# remove duplicates and de-NAN the values
final = final.fillna(-1)
final = final[~final.duplicated(keep='first')]
# replace MEPDG_TRANS_CRACK_LENGTH_AC 0s with -1s
final['MEPDG_TRANS_CRACK_LENGTH_AC'].replace(0, -1, inplace=True)

# save to parquet and csv
final.to_parquet(FINAL_DATA_DIR + 'final_data.parquet')
final.to_csv(FINAL_DATA_DIR + 'final_data.csv')

# # save construction data to parquet and csv
CONSTRUCTION.to_parquet(FINAL_DATA_DIR + 'construction_data.parquet')
CONSTRUCTION.to_csv(FINAL_DATA_DIR + 'construction_data.csv')