In [1]:
import pandas as pd
import numpy as np
from sklearn.metrics import r2_score, mean_squared_error, mean_absolute_error
from sklearn.model_selection import StratifiedKFold, KFold
from xgboost import XGBRegressor
from sklearn.impute import KNNImputer,SimpleImputer
from sklearn.preprocessing import MinMaxScaler, PowerTransformer,FunctionTransformer

In [2]:
def imputer(train,method='none'):
    """Fill missing values in ``train`` using the requested strategy.

    Parameters
    ----------
    train : pandas.DataFrame
        Frame to impute. It is handed as-is to the estimator's
        ``fit_transform``.
    method : str, default 'none'
        ``'SimpleImputer'`` replaces ``np.nan`` with the column mean,
        ``'KNNImputer'`` uses 2 uniformly weighted nearest neighbours.
        Any other value returns ``train`` untouched.

    Returns
    -------
    pandas.DataFrame
        For the two imputing methods: a new frame built from the imputed
        array, labelled with ``train``'s own column names and carrying a
        fresh default index (the original index is not kept). Otherwise
        the very same ``train`` object.
    """
    if method == 'SimpleImputer':
        estimator = SimpleImputer(missing_values=np.nan, strategy='mean')
    elif method == 'KNNImputer':
        estimator = KNNImputer(n_neighbors=2, weights="uniform")
    else:
        return train

    # Column labels come from the argument itself rather than from a
    # notebook-level variable, so the helper works regardless of which
    # cells have been executed and on any frame it is given.
    return pd.DataFrame(estimator.fit_transform(train), columns=train.columns)
     

In [3]:
# Load the spreadsheet and discard the four "... tests" columns.
train = (
    pd.read_excel('dataset.xlsx')
    .drop(
        columns=[
            'Compressive\nstrength tests',
            'Elastic\nmodulus tests',
            'Flexural\nstrength tests',
            'Splitting\ntensile strength tests',
        ]
    )
)

# NOTE(review): "â€™" looks like a mis-decoded apostrophe; confirm this
# literal matches the spreadsheet header exactly before changing it.
target_col = 'Compressive\nstrength (fâ€™c) (MPa)'

# Column names (target included) captured before any rows are removed.
features = train.columns.tolist()

# Remove rows in which every cell is missing.
train = train.dropna(how='all')

# NOTE(review): the target column is still part of `train` here, so it is
# imputed together with the predictors; confirm that is intended.
train = imputer(train, method='KNNImputer')

# `test` holds the same rows as `train`, minus the target column.
test = train.drop(columns=[target_col])

In [4]:
def _univariate_cv_mae(frame, feature, target, splitter):
    """Mean cross-validated MAE of a default XGBRegressor fed a single column.

    Parameters
    ----------
    frame : pandas.DataFrame
        Data containing both ``feature`` and ``target``.
    feature : str
        Name of the only predictor column used.
    target : str
        Name of the column to predict.
    splitter : cross-validator
        Object whose ``split(X, y)`` yields (fit_idx, val_idx) pairs.

    Returns
    -------
    float
        Average of the per-fold mean absolute errors.
    """
    X = frame[[feature]].values
    y = frame[target].values
    fold_mae = []
    for fit_idx, val_idx in splitter.split(X, y):
        model = XGBRegressor()
        model.fit(X[fit_idx], y[fit_idx])
        fold_mae.append(mean_absolute_error(y[val_idx], model.predict(X[val_idx])))
    return np.mean(fold_mae)


sc = MinMaxScaler()
# One splitter for everything: with an integer seed each split() call yields
# the same folds, so every candidate column is scored on identical partitions.
kf = KFold(n_splits=10, shuffle=True, random_state=42)

# Non-target columns with more than 100 distinct values.
cont_cols = [f for f in train.columns if f not in [target_col] and train[f].nunique() > 100]
print(cont_cols)
print("------------------------------")
for col in cont_cols:

    # Fit the scaler once per column and reuse the scaled arrays below;
    # every scaled transform works on the same [0, 1] version of `col`.
    train_scaled = sc.fit_transform(train[[col]])
    test_scaled = sc.transform(test[[col]])

    # Log transformation of the scaled values
    train["log_"+col] = np.log1p(train_scaled)
    test["log_"+col] = np.log1p(test_scaled)

    # Square root transformation of the scaled values
    train["sqrt_"+col] = np.sqrt(train_scaled)
    test["sqrt_"+col] = np.sqrt(test_scaled)

    # Box-Cox transformation; +1 shifts the scaled values to be strictly positive
    box_cox = PowerTransformer(method='box-cox')
    train["bx_cx_"+col] = box_cox.fit_transform(train_scaled + 1)
    test["bx_cx_"+col] = box_cox.transform(test_scaled + 1)

    # Yeo-Johnson transformation, applied to the unscaled column
    yeo_johnson = PowerTransformer(method='yeo-johnson')
    train["y_J_"+col] = yeo_johnson.fit_transform(train[[col]])
    test["y_J_"+col] = yeo_johnson.transform(test[[col]])

    # Power transformations of the scaled values, exponents 0.25 and 0.1
    train["pow_"+col] = np.power(train_scaled, 0.25)
    test["pow_"+col] = np.power(test_scaled, 0.25)
    train["pow2_"+col] = np.power(train_scaled, 0.1)
    test["pow2_"+col] = np.power(test_scaled, 0.1)

    # Log of the 0.1-power transformation
    train["log_pow2"+col] = np.log1p(train["pow2_"+col])
    test["log_pow2"+col] = np.log1p(test["pow2_"+col])

    temp_cols = [col, "log_"+col, "sqrt_"+col, "bx_cx_"+col, "y_J_"+col, "pow_"+col, "pow2_"+col, "log_pow2"+col]

    # Score the original column and each transformation by univariate CV MAE;
    # the lowest error wins (first candidate wins on ties).
    MAE = [(f, _univariate_cv_mae(train, f, target_col, kf)) for f in temp_cols]
    best_col, best_mae = min(MAE, key=lambda scored: scored[1])

    # Drop every other candidate whose Pearson correlation with the winner exceeds 0.9
    corr_with_best_col = train[temp_cols].corr(method='pearson')[best_col]
    cols_to_drop = [f for f in temp_cols if f != best_col and corr_with_best_col[f] > 0.9]
    final_selection = [f for f in temp_cols if f not in cols_to_drop]
    if cols_to_drop:
        train = train.drop(columns=cols_to_drop)
        test = test.drop(columns=cols_to_drop)

    print(col, final_selection, best_mae)

['Bulk\ndensity of RCA (kg/m3)', 'Density of\nhardened\nconcrete\nAD (qad)\n(kg/m3)', 'Density of\nhardened\nconcrete\nSSD (qSSD)\n(kg/m3)']
------------------------------
Bulk
density of RCA (kg/m3) ['bx_cx_Bulk\ndensity of RCA (kg/m3)', 'pow_Bulk\ndensity of RCA (kg/m3)', 'pow2_Bulk\ndensity of RCA (kg/m3)', 'log_pow2Bulk\ndensity of RCA (kg/m3)'] 7.839673692703248
Density of
hardened
concrete
AD (qad)
(kg/m3) ['Density of\nhardened\nconcrete\nAD (qad)\n(kg/m3)', 'pow2_Density of\nhardened\nconcrete\nAD (qad)\n(kg/m3)', 'log_pow2Density of\nhardened\nconcrete\nAD (qad)\n(kg/m3)'] 8.346966341140481
Density of
hardened
concrete
SSD (qSSD)
(kg/m3) ['Density of\nhardened\nconcrete\nSSD (qSSD)\n(kg/m3)', 'pow_Density of\nhardened\nconcrete\nSSD (qSSD)\n(kg/m3)', 'pow2_Density of\nhardened\nconcrete\nSSD (qSSD)\n(kg/m3)', 'log_pow2Density of\nhardened\nconcrete\nSSD (qSSD)\n(kg/m3)'] 10.272632942606645


In [5]:
from sklearn.model_selection import train_test_split

# Hold out 33% of the rows. The seed is fixed (same value the notebook uses
# for KFold) so the split, and therefore the importances shown below, are
# identical on every Restart & Run All.
X_train, X_test, y_train, y_test = train_test_split(
    train.drop(columns=[target_col]),
    train[target_col],
    test_size=0.33,
    random_state=42,
)
model = XGBRegressor()
model.fit(X_train, y_train)

# One row per feature: (importance, column name), in training-column order.
np.column_stack((model.feature_importances_, X_train.columns))

array([[0.039636146277189255, 'Effective\nwater- to-cement ratio'],
       [0.02500918135046959, 'Aggregate-\nto-cement ratio (a/c)'],
       [0.009774562902748585, 'RCA\nreplacement ratio (RCA %)'],
       [0.29666340351104736, 'Parent\nconcrete strength(MPa)'],
       [0.20828048884868622, 'Nominal\nmaximum RCA size(mm)'],
       [0.09794824570417404, 'Nominal\nmaximum NA size(mm)'],
       [0.025057395920157433, 'Bulk\ndensity of NA (kg/m3)'],
       [0.038900118321180344, 'Water\nabsorption of RCA(WARCA) (%)'],
       [0.041675638407468796, 'Water\nabsorption of NA'],
       [0.07052599638700485, 'Los\nAngeles abrasion of RCA'],
       [0.054415613412857056, 'Los\nAngeles abrasion of NA'],
       [0.040289852768182755,
        'Density of\nhardened\nconcrete\nAD (qad)\n(kg/m3)'],
       [0.021808026358485222,
        'Density of\nhardened\nconcrete\nSSD (qSSD)\n(kg/m3)'],
       [0.03001531958580017, 'bx_cx_Bulk\ndensity of RCA (kg/m3)'],
       [0.0, 'pow_Bulk\ndensity of RCA (kg/