In [1]:
#Libraries for data wrangling
import pandas as pd
import numpy as np
from xgboost import XGBRegressor
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.pipeline import Pipeline

In [2]:
from sklearn.model_selection import cross_val_score

In [3]:
from sklearn.model_selection import KFold
from sklearn.impute import SimpleImputer,KNNImputer

In [4]:
#Reading the training dataset
train=pd.read_excel('dataset.xlsx').drop(['Compressive\nstrength tests', 'Elastic\nmodulus tests',
       'Flexural\nstrength tests', 'Splitting\ntensile strength tests'],axis=1)
target_col='Compressive\nstrength (f’c) (MPa)'
features=list(train.columns)
#drop row with all the features na.
train=train.dropna(how='all')

In [5]:
model_x=XGBRegressor()
model_g=GradientBoostingRegressor()
X=train.drop([target_col],axis=1)
y=train[target_col]

# statistical imputation

In [6]:
strategies = ['mean', 'median', 'most_frequent', 'constant']

In [7]:
for s in strategies:
    imputer = SimpleImputer(strategy=s)
    pipeline1 = Pipeline(steps=[('i', imputer), ('m', model_x)])
    pipeline2 = Pipeline(steps=[('i', imputer), ('m', model_g)])
    cv =KFold(n_splits=10, random_state=1,shuffle=True)
    scores1 = cross_val_score(pipeline1, X, y, scoring='neg_mean_absolute_error', cv=cv, n_jobs=-1)
    scores2 = cross_val_score(pipeline2, X, y, scoring='neg_mean_absolute_error', cv=cv, n_jobs=-1)
    print('Mean Accuracy xgb: %.3f (%.3f)' % (np.mean(scores1), np.std(scores1)))
    print('Mean Accuracy gbdt: %.3f (%.3f)' % (np.mean(scores2), np.std(scores2)))

Mean Accuracy xgb: -4.337 (0.702)
Mean Accuracy gbdt: -5.614 (0.915)
Mean Accuracy xgb: -4.288 (0.546)
Mean Accuracy gbdt: -5.698 (0.910)
Mean Accuracy xgb: -4.461 (0.479)
Mean Accuracy gbdt: -5.739 (0.882)
Mean Accuracy xgb: -4.232 (0.585)
Mean Accuracy gbdt: -5.381 (0.680)


In [8]:
neighbour=[2,3,4,5,6]

In [9]:
for n in neighbour:
    imputer = KNNImputer(n_neighbors=n, weights="uniform")
    pipeline1 = Pipeline(steps=[('i', imputer), ('m', model_x)])
    pipeline2 = Pipeline(steps=[('i', imputer), ('m', model_g)])
    cv =KFold(n_splits=10, random_state=1,shuffle=True)
    scores1 = cross_val_score(pipeline1, X, y, scoring='neg_mean_absolute_error', cv=cv, n_jobs=-1)
    scores2 = cross_val_score(pipeline2, X, y, scoring='neg_mean_absolute_error', cv=cv, n_jobs=-1)
    print('Mean Accuracy xgb: %.3f (%.3f)' % (np.mean(scores1), np.std(scores1)))
    print('Mean Accuracy gbdt: %.3f (%.3f)' % (np.mean(scores2), np.std(scores2)))

Mean Accuracy xgb: -5.190 (0.682)
Mean Accuracy gbdt: -5.792 (0.878)
Mean Accuracy xgb: -5.322 (0.787)
Mean Accuracy gbdt: -5.796 (0.843)
Mean Accuracy xgb: -5.270 (1.030)
Mean Accuracy gbdt: -5.851 (0.711)
Mean Accuracy xgb: -5.251 (0.762)
Mean Accuracy gbdt: -5.903 (0.798)
Mean Accuracy xgb: -5.108 (0.789)
Mean Accuracy gbdt: -5.574 (0.627)


In [10]:
imputer=SimpleImputer(strategy='constant')
train=pd.DataFrame(imputer.fit_transform(train),columns=features)

In [11]:
for feature in features:
    skewed_col=pd.DataFrame(train[feature])
    print(skewed_col.skew())
#only the parent concrete strength got higher skewness than 3.   

Effective\nwater- to-cement ratio    0.353837
dtype: float64
Aggregate-\nto-cement ratio (a/c)   -0.217836
dtype: float64
RCA\nreplacement ratio (RCA %)    0.033084
dtype: float64
Parent\nconcrete strength(MPa)    3.396532
dtype: float64
Nominal\nmaximum RCA size(mm)   -0.839659
dtype: float64
Nominal\nmaximum NA size(mm)   -0.909702
dtype: float64
Bulk\ndensity of RCA (kg/m3)   -0.919482
dtype: float64
Bulk\ndensity of NA (kg/m3)   -0.55538
dtype: float64
Water\nabsorption of RCA(WARCA) (%)    0.100572
dtype: float64
Water\nabsorption of NA    1.128026
dtype: float64
Los\nAngeles abrasion of RCA    1.840858
dtype: float64
Los\nAngeles abrasion of NA    1.898719
dtype: float64
Density of\nhardened\nconcrete\nAD (qad)\n(kg/m3)    1.695857
dtype: float64
Density of\nhardened\nconcrete\nSSD (qSSD)\n(kg/m3)    2.137571
dtype: float64
Compressive\nstrength (f’c) (MPa)    0.911971
dtype: float64


# iterative imputation

In [12]:
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer


In [13]:
# define imputer
imputer = IterativeImputer(estimator=GradientBoostingRegressor(), n_nearest_features=None,max_iter=4, imputation_order='ascending')
models=[('xgb',XGBRegressor()),('gdbt', GradientBoostingRegressor())]

In [14]:
for name,model in models:
    pipeline = Pipeline(steps=[('i', imputer), (name, model)])
    cv =KFold(n_splits=5, random_state=47,shuffle=True)
    scores = cross_val_score(pipeline, X, y, scoring='neg_mean_absolute_error', cv=cv, n_jobs=-1)
    print('Mean Accuracy : %.3f (%.3f)' % (np.mean(scores), np.std(scores)))

Mean Accuracy : -5.496 (0.490)
Mean Accuracy : -5.834 (0.384)
