In [1]:
# import the necessary libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

# for train-test split
from sklearn.model_selection import train_test_split

# for scaling
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import PowerTransformer

# for model training
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression, Lasso, Ridge, ElasticNet
from sklearn.svm import SVR
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor

# for evaluation
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score

In [2]:
# reading the dataset
df = pd.read_csv('data/concrete_data.csv')

In [3]:
df.head()

Unnamed: 0,cement,blast_furnace_slag,fly_ash,water,superplasticizer,coarse_aggregate,fine_aggregate,age,concrete_compressive_strength
0,540.0,0.0,0.0,162.0,2.5,1040.0,676.0,28,79.99
1,540.0,0.0,0.0,162.0,2.5,1055.0,676.0,28,61.89
2,332.5,142.5,0.0,228.0,0.0,932.0,594.0,270,40.27
3,332.5,142.5,0.0,228.0,0.0,932.0,594.0,365,41.05
4,198.6,132.4,0.0,192.0,0.0,978.4,825.5,360,44.3


In [4]:
# separating dataset into features and labels
X = df.drop('concrete_compressive_strength',axis=1)
y = df[['concrete_compressive_strength']]

In [5]:
X.head()

Unnamed: 0,cement,blast_furnace_slag,fly_ash,water,superplasticizer,coarse_aggregate,fine_aggregate,age
0,540.0,0.0,0.0,162.0,2.5,1040.0,676.0,28
1,540.0,0.0,0.0,162.0,2.5,1055.0,676.0,28
2,332.5,142.5,0.0,228.0,0.0,932.0,594.0,270
3,332.5,142.5,0.0,228.0,0.0,932.0,594.0,365
4,198.6,132.4,0.0,192.0,0.0,978.4,825.5,360


In [6]:
y.head()

Unnamed: 0,concrete_compressive_strength
0,79.99
1,61.89
2,40.27
3,41.05
4,44.3


In [7]:
# creating a pipeline for handle datset and scaling
numerical_cols = X.select_dtypes(exclude='object').columns

In [8]:
numerical_cols

Index(['cement', 'blast_furnace_slag', 'fly_ash', 'water', 'superplasticizer',
       'coarse_aggregate', 'fine_aggregate ', 'age'],
      dtype='object')

In [9]:
## Numerical Pipeline
num_pipeline=Pipeline(
    steps=[
    ('imputer',SimpleImputer(strategy='median')),
    ('scaler',PowerTransformer(method='yeo-johnson'))
    ]
)
preprocessor=ColumnTransformer([
('num_pipeline',num_pipeline,numerical_cols)
])

In [10]:
# train-test split
X_train,X_test,y_train,y_test=train_test_split(X,y,test_size=0.30,random_state=42)

In [11]:
X_train=pd.DataFrame(preprocessor.fit_transform(X_train),columns=preprocessor.get_feature_names_out())
X_test=pd.DataFrame(preprocessor.transform(X_test),columns=preprocessor.get_feature_names_out())

In [12]:
X_train.head()

Unnamed: 0,num_pipeline__cement,num_pipeline__blast_furnace_slag,num_pipeline__fly_ash,num_pipeline__water,num_pipeline__superplasticizer,num_pipeline__coarse_aggregate,num_pipeline__fine_aggregate,num_pipeline__age
0,-0.793093,-1.058122,1.114033,-0.762516,0.542379,0.414249,1.779195,0.070018
1,0.50467,-1.058122,-0.868937,0.112153,-1.206559,1.137548,0.097634,-1.08947
2,0.453516,1.167725,-0.868937,-1.238962,1.190829,-1.548958,1.412397,-1.698165
3,0.771969,0.162478,1.202744,-1.321216,0.902977,-0.406726,0.330264,-1.698165
4,-1.228135,1.119626,1.215014,-0.124204,1.516252,-1.727678,-0.426811,0.070018


In [13]:
X_test.head()

Unnamed: 0,num_pipeline__cement,num_pipeline__blast_furnace_slag,num_pipeline__fly_ash,num_pipeline__water,num_pipeline__superplasticizer,num_pipeline__coarse_aggregate,num_pipeline__fine_aggregate,num_pipeline__age
0,-0.018154,0.901535,-0.868937,2.154452,-1.206559,-0.534007,-1.254564,2.465296
1,0.799059,1.11736,-0.868937,-0.796064,0.967012,-0.372351,-0.244945,-1.08947
2,0.997721,1.11736,-0.868937,-1.71499,1.694042,-0.372351,-0.244945,0.070018
3,0.799059,1.11736,-0.868937,-0.796064,0.967012,-0.372351,-0.244945,-1.698165
4,-1.482981,-1.058122,1.262175,0.955451,0.601714,-1.905011,1.251437,0.070018


In [14]:
# model-training
models = {
    'Linear Regression': LinearRegression(),
    'Lasso': Lasso(),
    'Ridge': Ridge(),
    'ElasticNet': ElasticNet(),
    'SVR': SVR(),
    'Decision Tree': DecisionTreeRegressor(),
    'Random Forest': RandomForestRegressor()
}

In [15]:
# Train and evaluate each model
for name, model in models.items():
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    mse = mean_squared_error(y_test, y_pred)
    mae = mean_absolute_error(y_test, y_pred)
    r2 = r2_score(y_test, y_pred)
    print(f"Model: {name}")
    print(f"MSE: {mse}")
    print(f"MAE: {mae}")
    print(f"R^2 Score: {r2}")
    print("-" * 40)

Model: Linear Regression
MSE: 53.94327737004961
MAE: 5.8257083121572215
R^2 Score: 0.8006341175591372
----------------------------------------
Model: Lasso
MSE: 61.73374339743973
MAE: 6.115200474288795
R^2 Score: 0.7718417784596493
----------------------------------------
Model: Ridge
MSE: 53.975154617033716
MAE: 5.826410047972781
R^2 Score: 0.8005163042599714
----------------------------------------
Model: ElasticNet
MSE: 88.2420261249994
MAE: 7.623606810985491
R^2 Score: 0.6738713280971715
----------------------------------------
Model: SVR
MSE: 62.18775513728981
MAE: 6.021275628306828
R^2 Score: 0.7701638223626773
----------------------------------------
Model: Decision Tree
MSE: 48.444599352750814
MAE: 4.503042071197411
R^2 Score: 0.8209563680530533
----------------------------------------


  y = column_or_1d(y, warn=True)
  model.fit(X_train, y_train)


Model: Random Forest
MSE: 30.95626627466287
MAE: 3.794060926182773
R^2 Score: 0.8855905009147795
----------------------------------------


# Conclusion - 
    we can observe that Random Forset is best model as it has low MSE and MAE and high R^2 score but we can increase this more with hyperparamter tuning.

In [16]:
import warnings
warnings.filterwarnings('ignore')

In [17]:
from sklearn.model_selection import RandomizedSearchCV
from scipy.stats import randint

param_dist = {
    'n_estimators': randint(100, 500),
    'max_depth': [None, 5, 10, 15, 20],
    'min_samples_split': randint(2, 20),
    'min_samples_leaf': randint(1, 10),
    'max_features': ['log2', 'sqrt'],
    'bootstrap': [True, False]
}

# Create the Random Forest regressor
rf_model = RandomForestRegressor(random_state=42)

In [18]:
random_search = RandomizedSearchCV(estimator=rf_model, param_distributions=param_dist, n_iter=10, cv=5)
random_search.fit(X_train, y_train)

In [19]:
best_params_random = random_search.best_params_
best_score_random = random_search.best_score_
print(best_params_random)
print(best_score_random)

{'bootstrap': False, 'max_depth': 15, 'max_features': 'log2', 'min_samples_leaf': 6, 'min_samples_split': 11, 'n_estimators': 300}
0.8655797743889432


In [20]:
best_rf_model_random = RandomForestRegressor(**best_params_random)
best_rf_model_random.fit(X_train, y_train)

In [21]:
y_pred_random = best_rf_model_random.predict(X_test)

In [22]:
r2_random = r2_score(y_test, y_pred_random)

In [23]:
r2_random

0.8636756253072037

We can see that randomizedsearchcv has reduced the r2 score which is not good therefore we will use original model only.

# Best Model
    Model: Random Forest
    MSE: 30.95626627466287
    MAE: 3.794060926182773
    R^2 Score: 0.8855905009147795

# Best Scaler
    PowerTransformer
    method = 'yeo-johnson'