## Importing Required Libraries

In [29]:
# Basic Import
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt 
import seaborn as sns
# Modelling
from sklearn.neighbors import KNeighborsRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor,AdaBoostRegressor, GradientBoostingRegressor
from sklearn.linear_model import LinearRegression, Ridge,Lasso
from sklearn.metrics import r2_score, mean_absolute_error, mean_squared_error
from sklearn.model_selection import RandomizedSearchCV
from catboost import CatBoostRegressor
from xgboost import XGBRegressor
import warnings
warnings.filterwarnings('ignore')

## Loading the Dataset

In [2]:
df = pd.read_csv('Concrete_Data.csv')
df.head()

Unnamed: 0,cement,slag,flyash,water,superplasticizer,coarseaggregate,fineaggregate,age,csMPa
0,540.0,0.0,0.0,162.0,2.5,1040.0,676.0,28,79.99
1,540.0,0.0,0.0,162.0,2.5,1055.0,676.0,28,61.89
2,332.5,142.5,0.0,228.0,0.0,932.0,594.0,270,40.27
3,332.5,142.5,0.0,228.0,0.0,932.0,594.0,365,41.05
4,198.6,132.4,0.0,192.0,0.0,978.4,825.5,360,44.3


## Dropping the duplicate values

In [3]:
df.drop_duplicates(inplace=True)

In [4]:
df.reset_index(inplace=True)

## Preparing x and y variables

In [5]:
x = df.drop(columns=['csMPa'],axis=1)

In [6]:
x

Unnamed: 0,index,cement,slag,flyash,water,superplasticizer,coarseaggregate,fineaggregate,age
0,0,540.0,0.0,0.0,162.0,2.5,1040.0,676.0,28
1,1,540.0,0.0,0.0,162.0,2.5,1055.0,676.0,28
2,2,332.5,142.5,0.0,228.0,0.0,932.0,594.0,270
3,3,332.5,142.5,0.0,228.0,0.0,932.0,594.0,365
4,4,198.6,132.4,0.0,192.0,0.0,978.4,825.5,360
...,...,...,...,...,...,...,...,...,...
1000,1025,276.4,116.0,90.3,179.6,8.9,870.1,768.3,28
1001,1026,322.2,0.0,115.6,196.0,10.4,817.9,813.4,28
1002,1027,148.5,139.4,108.6,192.7,6.1,892.4,780.0,28
1003,1028,159.1,186.7,0.0,175.6,11.3,989.6,788.9,28


In [7]:
y = df['csMPa']

In [8]:
y

0       79.99
1       61.89
2       40.27
3       41.05
4       44.30
        ...  
1000    44.28
1001    31.18
1002    23.70
1003    32.77
1004    32.40
Name: csMPa, Length: 1005, dtype: float64

In [9]:
# Creating Pipeline with Column Transformer
num_features = x.select_dtypes(exclude="object").columns

from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler

# Numerical Pipeline
num_pipeline = Pipeline(
                steps = [
                ('imputer',SimpleImputer(strategy='median')),
                ('scaler',StandardScaler())                
                ]
            )

preprocessor = ColumnTransformer(
                [
                ('num_pipeline',num_pipeline,num_features),
                ]
            )

In [10]:
x = preprocessor.fit_transform(x)

In [11]:
x.shape

(1005, 9)

In [12]:
# separate dataset into train and test
from sklearn.model_selection import train_test_split
x_train, x_test, y_train, y_test = train_test_split(x,y,test_size=0.2,random_state=42)
x_train.shape, x_test.shape

((804, 9), (201, 9))

### Create an Evaluate Function to give all metrics after model Training

In [13]:
def evaluate_model(true, predicted):
    mae = mean_absolute_error(true, predicted)
    mse = mean_squared_error(true, predicted)
    rmse = np.sqrt(mean_squared_error(true, predicted))
    r2_square = r2_score(true, predicted)
    return mae, rmse, r2_square

In [14]:
models = {
    "Linear Regression": LinearRegression(),
    "Lasso": Lasso(),
    "Ridge": Ridge(),
    "K-Neighbors Regressor": KNeighborsRegressor(),
    "Decision Tree": DecisionTreeRegressor(),
    "Random Forest Regressor": RandomForestRegressor(),
    "XGBRegressor": XGBRegressor(), 
    "CatBoosting Regressor": CatBoostRegressor(verbose=False),
    "AdaBoost Regressor": AdaBoostRegressor(),
    "GradientBoosting Regressor": GradientBoostingRegressor()
}
model_list = []
r2_list =[]

for i in range(len(list(models))):
    model = list(models.values())[i]
    model.fit(x_train, y_train) # Train model

    # Make predictions
    y_train_pred = model.predict(x_train)
    y_test_pred = model.predict(x_test)
    
    # Evaluate Train and Test dataset
    model_train_mae , model_train_rmse, model_train_r2 = evaluate_model(y_train, y_train_pred)

    model_test_mae , model_test_rmse, model_test_r2 = evaluate_model(y_test, y_test_pred)

    
    print(list(models.keys())[i])
    model_list.append(list(models.keys())[i])
    
    print('Model performance for Training set')
    print("- Root Mean Squared Error: {:.4f}".format(model_train_rmse))
    print("- Mean Absolute Error: {:.4f}".format(model_train_mae))
    print("- R2 Score: {:.4f}".format(model_train_r2))

    print('----------------------------------')
    
    print('Model performance for Test set')
    print("- Root Mean Squared Error: {:.4f}".format(model_test_rmse))
    print("- Mean Absolute Error: {:.4f}".format(model_test_mae))
    print("- R2 Score: {:.4f}".format(model_test_r2))
    r2_list.append(model_test_r2)
    
    print('='*35)
    print('\n')

Linear Regression
Model performance for Training set
- Root Mean Squared Error: 9.9923
- Mean Absolute Error: 7.9464
- R2 Score: 0.6106
----------------------------------
Model performance for Test set
- Root Mean Squared Error: 11.2057
- Mean Absolute Error: 8.8956
- R2 Score: 0.5791


Lasso
Model performance for Training set
- Root Mean Squared Error: 10.6324
- Mean Absolute Error: 8.6274
- R2 Score: 0.5591
----------------------------------
Model performance for Test set
- Root Mean Squared Error: 12.0778
- Mean Absolute Error: 9.6322
- R2 Score: 0.5110


Ridge
Model performance for Training set
- Root Mean Squared Error: 9.9926
- Mean Absolute Error: 7.9517
- R2 Score: 0.6106
----------------------------------
Model performance for Test set
- Root Mean Squared Error: 11.2091
- Mean Absolute Error: 8.8979
- R2 Score: 0.5788


K-Neighbors Regressor
Model performance for Training set
- Root Mean Squared Error: 7.3138
- Mean Absolute Error: 5.5271
- R2 Score: 0.7914
-------------------

## Results

In [15]:
pd.DataFrame(list(zip(model_list, r2_list)), columns=['Model Name', 'R2_Score']).sort_values(by=["R2_Score"],ascending=False)

Unnamed: 0,Model Name,R2_Score
6,XGBRegressor,0.943434
7,CatBoosting Regressor,0.94259
5,Random Forest Regressor,0.915569
9,GradientBoosting Regressor,0.908715
4,Decision Tree,0.861181
8,AdaBoost Regressor,0.80262
3,K-Neighbors Regressor,0.71255
0,Linear Regression,0.579089
2,Ridge,0.578838
1,Lasso,0.511025


## Tuning Catboost

In [16]:
# Initializing catboost
cbr = CatBoostRegressor(verbose=False)

# Creating the hyperparameter grid
param_dist = {'depth'          : [4,5,6,7,8,9, 10],
              'learning_rate' : [0.01,0.02,0.03,0.04],
              'iterations'    : [300,400,500,600,700,800,900]}

#Instantiate RandomSearchCV object
rs_cv = RandomizedSearchCV(cbr , param_dist, scoring='r2', cv =5, n_jobs=-1)

# Fit the model
rs_cv.fit(x_train, y_train.values.flatten())

# Print the tuned parameters and score
print(rs_cv.best_params_)
print(rs_cv.best_score_)

{'learning_rate': 0.04, 'iterations': 800, 'depth': 4}
0.9237274758178733


## Function to print evaluated model results

In [17]:
def print_evaluated_results(model,xtrain,ytrain,xtest,ytest):
    ytrain_pred = model.predict(xtrain)
    ytest_pred = model.predict(xtest)

    # Evaluate Train and Test dataset
    model_train_mae , model_train_rmse, model_train_r2 = evaluate_model(ytrain, ytrain_pred)
    model_test_mae , model_test_rmse, model_test_r2 = evaluate_model(ytest, ytest_pred)

    # Printing results
    print('Model performance for Training set')
    print("- Root Mean Squared Error: {:.4f}".format(model_train_rmse))
    print("- Mean Absolute Error: {:.4f}".format(model_train_mae))
    print("- R2 Score: {:.4f}".format(model_train_r2))

    print('----------------------------------')
    
    print('Model performance for Test set')
    print("- Root Mean Squared Error: {:.4f}".format(model_test_rmse))
    print("- Mean Absolute Error: {:.4f}".format(model_test_mae))
    print("- R2 Score: {:.4f}".format(model_test_r2))

In [18]:
# Selecting best model
best_cbr = rs_cv.best_estimator_

# Evaluate Train and Test dataset
print_evaluated_results(best_cbr,x_train,y_train,x_test,y_test)

Model performance for Training set
- Root Mean Squared Error: 2.7175
- Mean Absolute Error: 1.9742
- R2 Score: 0.9712
----------------------------------
Model performance for Test set
- Root Mean Squared Error: 4.4658
- Mean Absolute Error: 3.0462
- R2 Score: 0.9331


## Tuning XGBoost

In [34]:
# Initializing xgboost
xgb = XGBRegressor()

# Parameters
params = {
 'tree_method': ['exact','hist','gpu_hist','approx'],
 'learning_rate' : [0.05,0.10,0.15,0.20,0.25,0.30],
 'max_depth' : [ 3, 4, 5, 6, 8, 10, 12, 15],
 'min_child_weight' : [ 1, 3, 5, 7 ],
 'gamma': [ 0.0, 0.1, 0.2 , 0.3, 0.4 ],
 'colsample_bytree' : [ 0.3, 0.4, 0.5 , 0.7 ],
 'n_estimators':[300,400,500,600,700,800,900]
}

rs_xgb=RandomizedSearchCV(xgb,param_distributions=params,scoring='r2',n_jobs=-1,cv=5)
rs_xgb.fit(x_train, y_train.values.flatten())

# Print the tuned parameters and score
print(rs_xgb.best_params_)
print(rs_xgb.best_score_)

{'tree_method': 'gpu_hist', 'n_estimators': 800, 'min_child_weight': 1, 'max_depth': 3, 'learning_rate': 0.25, 'gamma': 0.2, 'colsample_bytree': 0.7}
0.9181831386479384


In [35]:
# Selecting best xgb model
best_xgb = rs_xgb.best_estimator_

# Evaluate Train and Test dataset
print_evaluated_results(best_xgb,x_train,y_train,x_test,y_test)

Model performance for Training set
- Root Mean Squared Error: 1.4518
- Mean Absolute Error: 0.9902
- R2 Score: 0.9918
----------------------------------
Model performance for Test set
- Root Mean Squared Error: 3.9867
- Mean Absolute Error: 2.8095
- R2 Score: 0.9467


## Tuning RandomForest

In [30]:
# Initializing RandomForest
rfr = RandomForestRegressor(bootstrap=False)

# Parametrs
params = {"n_estimators": [100, 200, 500, 600, 800, 900],
          "criterion": ['squared_error', 'absolute_error', 'friedman_mse', 'poisson'],
          "max_depth": [None, 2, 3, 5, 7, 8],
          "min_samples_split": [2, 5, 7, 9, 10],
          "min_samples_leaf": [1, 2, 5, 7, 9, 10],
          "max_features": ["auto", "sqrt", "log2"]}

rs_rfr = RandomizedSearchCV(RandomForestRegressor(), params, cv=5, verbose=0, scoring="r2", n_jobs=-1)
rs_rfr.fit(x_train, y_train.values.flatten())

# Print the tuned parameters and score
print(rs_rfr.best_params_)
print(rs_rfr.best_score_)

{'n_estimators': 600, 'min_samples_split': 2, 'min_samples_leaf': 2, 'max_features': 'auto', 'max_depth': None, 'criterion': 'friedman_mse'}
0.8859258011326304


In [31]:
# Selecting best randomforest model
best_rfr = rs_rfr.best_estimator_

# Evaluate Train and Test dataset
print_evaluated_results(best_rfr,x_train,y_train,x_test,y_test)

Model performance for Training set
- Root Mean Squared Error: 2.4687
- Mean Absolute Error: 1.7180
- R2 Score: 0.9762
----------------------------------
Model performance for Test set
- Root Mean Squared Error: 5.2095
- Mean Absolute Error: 3.5912
- R2 Score: 0.9090


## Tuning GradientBoosting

In [32]:
# Initializing GradientBoosting
gbr = GradientBoostingRegressor()

# Parametrs
params = {"n_estimators": [100, 200, 500, 600, 800, 900],
          "criterion": ['friedman_mse', 'squared_error'],
          "learning_rate": [0.05,0.10,0.15,0.20,0.25,0.30],
          "max_depth": [None, 2, 3, 5, 7, 8],
          "min_samples_split": [2, 5, 7, 9, 10],
          "min_samples_leaf": [1, 2, 5, 7, 9, 10]}

rs_gbr = RandomizedSearchCV(GradientBoostingRegressor(), params, cv=5, verbose=0, scoring="r2", n_jobs=-1)
rs_gbr.fit(x_train, y_train.values.flatten())

# Print the tuned parameters and score
print(rs_gbr.best_params_)
print(rs_gbr.best_score_)

{'n_estimators': 600, 'min_samples_split': 9, 'min_samples_leaf': 9, 'max_depth': 5, 'learning_rate': 0.05, 'criterion': 'friedman_mse'}
0.9200241221561065


In [33]:
# Selecting best gradientboosting model
best_gbr = rs_gbr.best_estimator_

# Evaluate Train and Test dataset
print_evaluated_results(best_gbr,x_train,y_train,x_test,y_test)

Model performance for Training set
- Root Mean Squared Error: 1.4525
- Mean Absolute Error: 0.9803
- R2 Score: 0.9918
----------------------------------
Model performance for Test set
- Root Mean Squared Error: 4.1934
- Mean Absolute Error: 2.7319
- R2 Score: 0.9411
