In [3]:
# Author: Tiago Tamagusko (tamagusko@gmail.com)
# Version: 3.0 (2022-11-15)

In [11]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import LabelEncoder
from sklearn.tree import DecisionTreeRegressor, plot_tree
from sklearn.ensemble import RandomForestRegressor
import xgboost as xgb
import pickle
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error

from sklearn.model_selection import train_test_split

%matplotlib inline

In [12]:
# Regressors
DT_reg = DecisionTreeRegressor(random_state=42, max_depth=10)
RF_reg = RandomForestRegressor(random_state=42, max_depth=10)
XG_reg = xgb.XGBRegressor(random_state=42, max_depth=4)

In [13]:
# Data

IRI = pd.read_csv('data/raw/iri.csv')
IMPROVEMENT = pd.read_csv('data/raw/improvements.csv')
TRAFFIC = pd.read_csv('data/raw/aadtt.csv')
SN = pd.read_csv('data/raw/sn.csv')
PREC = pd.read_csv('data/raw/precipitation.csv')
TEMP = pd.read_csv('data/raw/temperature.csv')

In [14]:
# Cleaning data
IRI = IRI[['STATE_CODE', 'SHRP_ID', 'VISIT_DATE', 'MRI']]
TRAFFIC = TRAFFIC[['STATE_CODE', 'SHRP_ID', 'YEAR', 'AADTT_ALL_TRUCKS_TREND']]
SN = SN[['STATE_CODE', 'SHRP_ID', 'SN_VALUE']]
PREC = PREC.drop(['STATE_CODE_EXP', 'TOTAL_SNOWFALL_YR'], axis=1)
TEMP = TEMP.drop(['STATE_CODE_EXP', 'FREEZE_THAW_YR', 'FREEZE_INDEX_YR'], axis=1)

IRI['VISIT_DATE'] = pd.DatetimeIndex(IRI['VISIT_DATE']).year
IRI.rename(columns={'VISIT_DATE': 'YEAR'}, inplace=True)

In [15]:
# Processing climate data
CLIMATE = pd.merge(PREC, TEMP, on=[
                   'SHRP_ID', 'STATE_CODE', 'YEAR', 'VWS_ID']).drop(['VWS_ID'], axis=1)
# Processing data
DATA = pd.merge(IRI, CLIMATE, how='right', on=[
                'SHRP_ID', 'STATE_CODE', 'YEAR'])
DATA = pd.merge(DATA, TRAFFIC, how='right', on=[
                'SHRP_ID', 'STATE_CODE', 'YEAR'])
DATA = pd.merge(DATA, SN, how='right', on=['SHRP_ID', 'STATE_CODE'])
DATA.shape

(193915, 8)

In [16]:
# drop NA and duplicates
DATA = DATA.dropna()
DATA = DATA.drop_duplicates()
DATA.shape

(39471, 8)

In [17]:
# Sort dataset
DATA = DATA.sort_values(['YEAR'], ascending=[True])

In [18]:
# Save processed data
DATA.to_csv('data/processed/ltpp-data.csv', index=None, header=True)

In [19]:
# Drop unused columns
DATA = DATA.drop(['STATE_CODE', 'SHRP_ID'], axis=1)
# Rename columns
DATA = DATA.rename({'YEAR': 'Year',
                    'MRI': 'IRI',
                    'TOTAL_ANN_PRECIP': 'Precipitation',
                    'MEAN_ANN_TEMP_AVG': 'Temperature',
                    'AADTT_ALL_TRUCKS_TREND': 'AADTT',
                    'SN_VALUE': 'SN',
                    }, axis=1)

In [20]:
DATA.head()

Unnamed: 0,Year,IRI,Precipitation,Temperature,AADTT,SN
47181,1989,139699995517731,151600006103516,919999980926514,160,270000004768372
138695,1989,157299995422363,13195,121000003814697,640,609999990463257
138694,1989,164800000190735,13195,121000003814697,640,609999990463257
99608,1989,874000012874603,807099975585938,65,760,790000009536743
99607,1989,953999996185303,807099975585938,65,760,790000009536743


In [27]:
# verify data
DATA.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 39471 entries, 47181 to 65964
Data columns (total 6 columns):
 #   Column         Non-Null Count  Dtype 
---  ------         --------------  ----- 
 0   Year           39471 non-null  int64 
 1   IRI            39471 non-null  object
 2   Precipitation  39471 non-null  object
 3   Temperature    39471 non-null  object
 4   AADTT          39471 non-null  int64 
 5   SN             39471 non-null  object
dtypes: int64(2), object(4)
memory usage: 2.1+ MB


In [29]:
DATA['IRI'] = pd.to_numeric(DATA['IRI'])

ValueError: Unable to parse string "1,39699995517731" at position 0

In [21]:
# Split into target and dataset
y = DATA['IRI']  # target
X = DATA.drop(['IRI'], axis=1)

In [24]:
# Split in 75/25 (train/test)
train_threshold = 0.75

# Split dataset
X_train, X_test, y_train, y_test = train_test_split(
    X, y, train_size=train_threshold, random_state=42)

In [25]:
DT_reg.fit(X_train, y_train)
RF_reg.fit(X_train, y_train)
XG_reg.fit(X_train, y_train)
print('Data trained! =)')

ValueError: could not convert string to float: '1331,90002441406'