In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import MinMaxScaler
from sklearn.neural_network import MLPRegressor
import seaborn as sns
from census import Census
import statsmodels.formula.api as smf
import statsmodels.api as sm
import scipy.stats as sps
from sklearn.preprocessing import FunctionTransformer, OneHotEncoder, StandardScaler, PolynomialFeatures
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LinearRegression, ElasticNetCV,LogisticRegression
from sklearn.model_selection import train_test_split,GridSearchCV
from sklearn.metrics import mean_squared_error,accuracy_score,precision_score,classification_report,confusion_matrix
from sklearn.ensemble import RandomForestClassifier
from sklearn.datasets import make_classification
from sklearn.impute import SimpleImputer
import sklearn
import warnings
warnings.filterwarnings("ignore")
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

In [None]:
# Load the preprocessed dataset from the working directory.
df = pd.read_csv('preprocessed_data.csv')

# Show the dimensions and a small preview rather than rendering the whole frame.
print('shape:', df.shape)
df.head()

## 🌵 Each time you run these models, change the `random_state` value used in the train/test split below.

### Run the models three times in total, and remember to record the results in the Google Sheet.

In [None]:
# Seed for the train/test split. Change this value for each of the repeated
# runs so that every run evaluates the models on a different partition.
SPLIT_SEED = 12345  #<--change

# Hold out 25% of the rows for testing.
train, test = train_test_split(df, test_size=0.25, random_state=SPLIT_SEED)

# Echo the seed and split sizes so recorded results can be matched to the run.
print('split-seed:', SPLIT_SEED, '| train rows:', len(train), '| test rows:', len(test))

In [None]:
# Predictor columns fed to every model, and the regression target column.
feat_cols = [
    'AADT_ALL_VEHIC',
    'construction_time',
    'MEPDG_TRANS_CRACK_LENGTH_AC',
]
out_col = 'Mean_IRI'

# Split each partition into its feature frame (x) and target series (y).
train_x, train_y = train[feat_cols], train[out_col]
test_x, test_y = test[feat_cols], test[out_col]

### Regression

1. Random Forest

In [None]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error
from sklearn.metrics import r2_score
import time

# Random-forest regression pipeline: each feature column is min-max scaled
# independently, then passed to a 100-tree forest capped at depth 5.
rm_pipe = Pipeline([
    ('columns', ColumnTransformer([
        ('AADT', MinMaxScaler(), ['AADT_ALL_VEHIC']),
        ('Crack', MinMaxScaler(), ['MEPDG_TRANS_CRACK_LENGTH_AC']),
        ('time', MinMaxScaler(), ['construction_time'])
    ])),
    ('model', RandomForestRegressor(max_depth=5, random_state=0, n_estimators=100))
])

# time.perf_counter() is a monotonic, high-resolution clock intended for
# measuring elapsed intervals; time.time() is wall-clock time, which can be
# coarse and can jump if the system clock is adjusted.
start_time = time.perf_counter()
rm_pipe.fit(train_x, train_y)
end_time = time.perf_counter()
print('training-time:', end_time - start_time)

# Predict each split exactly once and reuse the result for both metrics.
rm_train_pred = rm_pipe.predict(train_x)
print('train-MSE:', mean_squared_error(train_y, rm_train_pred))
print('train-r2:', r2_score(train_y, rm_train_pred))

# The timed test-set prediction is the same one that is scored below.
start_time1 = time.perf_counter()
rm_test_pred = rm_pipe.predict(test_x)
end_time1 = time.perf_counter()
print('test-MSE:', mean_squared_error(test_y, rm_test_pred))
print('test-r2:', r2_score(test_y, rm_test_pred))

print('test-time:', end_time1 - start_time1)

2. KNN

In [None]:
from sklearn.neighbors import KNeighborsRegressor

# K-nearest-neighbours regression pipeline: each feature column is min-max
# scaled independently, then passed to a 5-neighbour regressor.
knn_pipe = Pipeline([
    ('columns', ColumnTransformer([
        ('AADT', MinMaxScaler(), ['AADT_ALL_VEHIC']),
        ('Crack', MinMaxScaler(), ['MEPDG_TRANS_CRACK_LENGTH_AC']),
        ('time', MinMaxScaler(), ['construction_time'])
    ])),
    ('model', KNeighborsRegressor(n_neighbors=5))
])

# time.perf_counter() is a monotonic, high-resolution clock intended for
# measuring elapsed intervals; time.time() is wall-clock time, which can be
# coarse and can jump if the system clock is adjusted.
start_time = time.perf_counter()
knn_pipe.fit(train_x, train_y)
end_time = time.perf_counter()
print('training-time:', end_time - start_time)

# Predict each split exactly once and reuse the result for both metrics;
# KNN does its neighbour search at predict time, so repeated calls are costly.
knn_train_pred = knn_pipe.predict(train_x)
print('train-MSE:', mean_squared_error(train_y, knn_train_pred))
print('train-r2:', r2_score(train_y, knn_train_pred))

# The timed test-set prediction is the same one that is scored below.
start_time1 = time.perf_counter()
knn_test_pred = knn_pipe.predict(test_x)
end_time1 = time.perf_counter()
print('test-MSE:', mean_squared_error(test_y, knn_test_pred))
print('test-r2:', r2_score(test_y, knn_test_pred))

print('test-time:', end_time1 - start_time1)

In [None]:
from sklearn.linear_model import LinearRegression

# Ordinary least-squares regression pipeline: each feature column is min-max
# scaled independently, then passed to a LinearRegression model.
lr_pipe = Pipeline([
    ('columns', ColumnTransformer([
        ('AADT', MinMaxScaler(), ['AADT_ALL_VEHIC']),
        ('Crack', MinMaxScaler(), ['MEPDG_TRANS_CRACK_LENGTH_AC']),
        ('time', MinMaxScaler(), ['construction_time'])
    ])),
    ('model', LinearRegression())
])

# time.perf_counter() is a monotonic, high-resolution clock intended for
# measuring elapsed intervals. Linear-model fit/predict calls are very short,
# so the coarser wall-clock time.time() can report an interval of 0.0.
start_time = time.perf_counter()
lr_pipe.fit(train_x, train_y)
end_time = time.perf_counter()
print('training-time:', end_time - start_time)

# Predict each split exactly once and reuse the result for both metrics.
lr_train_pred = lr_pipe.predict(train_x)
print('train-MSE:', mean_squared_error(train_y, lr_train_pred))
print('train-r2:', r2_score(train_y, lr_train_pred))

# The timed test-set prediction is the same one that is scored below.
start_time1 = time.perf_counter()
lr_test_pred = lr_pipe.predict(test_x)
end_time1 = time.perf_counter()
print('test-MSE:', mean_squared_error(test_y, lr_test_pred))
print('test-r2:', r2_score(test_y, lr_test_pred))

print('test-time:', end_time1 - start_time1)

In [None]:
from sklearn.svm import SVR

# Support-vector regression pipeline: each feature column is min-max scaled
# independently, then passed to an SVR with an RBF kernel.
svr_pipe = Pipeline([
    ('columns', ColumnTransformer([
        ('AADT', MinMaxScaler(), ['AADT_ALL_VEHIC']),
        ('Crack', MinMaxScaler(), ['MEPDG_TRANS_CRACK_LENGTH_AC']),
        ('time', MinMaxScaler(), ['construction_time'])
    ])),
    ('model', SVR(kernel='rbf'))
])

# time.perf_counter() is a monotonic, high-resolution clock intended for
# measuring elapsed intervals; time.time() is wall-clock time, which can be
# coarse and can jump if the system clock is adjusted.
start_time = time.perf_counter()
svr_pipe.fit(train_x, train_y)
end_time = time.perf_counter()
print('training-time:', end_time - start_time)

# Predict each split exactly once and reuse the result for both metrics.
svr_train_pred = svr_pipe.predict(train_x)
print('train-MSE:', mean_squared_error(train_y, svr_train_pred))
print('train-r2:', r2_score(train_y, svr_train_pred))

# The timed test-set prediction is the same one that is scored below.
start_time1 = time.perf_counter()
svr_test_pred = svr_pipe.predict(test_x)
end_time1 = time.perf_counter()
print('test-MSE:', mean_squared_error(test_y, svr_test_pred))
print('test-r2:', r2_score(test_y, svr_test_pred))

print('test-time:', end_time1 - start_time1)

In [None]:
# !pip install xgboost

In [None]:
import xgboost as xgb

# Gradient-boosted tree regression pipeline: each feature column is min-max
# scaled independently, then passed to an XGBRegressor with default settings.
xgb_pipe = Pipeline([
    ('columns', ColumnTransformer([
        ('AADT', MinMaxScaler(), ['AADT_ALL_VEHIC']),
        ('Crack', MinMaxScaler(), ['MEPDG_TRANS_CRACK_LENGTH_AC']),
        ('time', MinMaxScaler(), ['construction_time'])
    ])),
    ('model', xgb.XGBRegressor())
])

# time.perf_counter() is a monotonic, high-resolution clock intended for
# measuring elapsed intervals; time.time() is wall-clock time, which can be
# coarse and can jump if the system clock is adjusted.
start_time = time.perf_counter()
xgb_pipe.fit(train_x, train_y)
end_time = time.perf_counter()
print('training-time:', end_time - start_time)

# Predict each split exactly once and reuse the result for both metrics.
xgb_train_pred = xgb_pipe.predict(train_x)
print('train-MSE:', mean_squared_error(train_y, xgb_train_pred))
print('train-r2:', r2_score(train_y, xgb_train_pred))

# The timed test-set prediction is the same one that is scored below.
start_time1 = time.perf_counter()
xgb_test_pred = xgb_pipe.predict(test_x)
end_time1 = time.perf_counter()
print('test-MSE:', mean_squared_error(test_y, xgb_test_pred))
print('test-r2:', r2_score(test_y, xgb_test_pred))

print('test-time:', end_time1 - start_time1)