In [None]:
<h1 style="color:blue">Concrete Compressive Strength - Regression & Classification</h1>


+ Concrete is the most important material in civil engineering. The concrete compressive strength is a highly nonlinear function of age and
ingredients. Measurements for Cement, Blast Furnace Slag, Fly Ash, Water, Superplasticizer,Coarse Aggregate, and Fine Aggregate are all in units of kg / m^3 of concrete mixture.  The Age is measured in days. The Concrete Compressive Strength is measured in MPa.

These data were downloaded from the [UCI Machine Learning Repository](https://archive.ics.uci.edu/ml/datasets/Concrete+Compressive+Strength).



+ The original source of the data is:  I-Cheng Yeh, "Modeling of strength of high performance concrete using artificial neural networks," Cement and Concrete Research, Vol. 28, No. 12, pp. 1797-1808 (1998).

<h2 style="color:green">Regression</h2>

<h4 style="color:white">import packages</h4>


In [7]:
import pandas as pd
import numpy as np

import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import r2_score, mean_squared_error, mean_absolute_error

from sklearn.linear_model import LinearRegression, SGDRegressor, Ridge, Lasso, ElasticNet
from sklearn.neighbors import KNeighborsRegressor, RadiusNeighborsRegressor
from sklearn.ensemble import GradientBoostingRegressor, AdaBoostRegressor
from sklearn.tree import DecisionTreeRegressor, ExtraTreeRegressor
from xgboost import XGBRegressor
from sklearn.svm import SVR
from sklearn.neural_network import MLPRegressor

from sklearn.preprocessing import normalize, scale

import warnings
warnings.filterwarnings("ignore")

<h4 style="color:white">import data</h4>

In [9]:
df= pd.read_excel("Concrete_Data.xls")

<h4 style="color:white">EDA - Exploratory Data Analysis</h4>

In [11]:
df.head()

Unnamed: 0,Cement (component 1)(kg in a m^3 mixture),Blast Furnace Slag (component 2)(kg in a m^3 mixture),Fly Ash (component 3)(kg in a m^3 mixture),Water (component 4)(kg in a m^3 mixture),Superplasticizer (component 5)(kg in a m^3 mixture),Coarse Aggregate (component 6)(kg in a m^3 mixture),Fine Aggregate (component 7)(kg in a m^3 mixture),Age (day),"Concrete compressive strength(MPa, megapascals)"
0,540.0,0.0,0.0,162.0,2.5,1040.0,676.0,28,79.986111
1,540.0,0.0,0.0,162.0,2.5,1055.0,676.0,28,61.887366
2,332.5,142.5,0.0,228.0,0.0,932.0,594.0,270,40.269535
3,332.5,142.5,0.0,228.0,0.0,932.0,594.0,365,41.05278
4,198.6,132.4,0.0,192.0,0.0,978.4,825.5,360,44.296075


In [12]:
# change the column names
df.columns = ['Cement', 'Slag', 'FlyAsh', 'Water', 'Plasticizer',  'CoarseAgg', 'FineAgg', 'Age', 'Strength']

In [13]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1030 entries, 0 to 1029
Data columns (total 9 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   Cement       1030 non-null   float64
 1   Slag         1030 non-null   float64
 2   FlyAsh       1030 non-null   float64
 3   Water        1030 non-null   float64
 4   Plasticizer  1030 non-null   float64
 5   CoarseAgg    1030 non-null   float64
 6   FineAgg      1030 non-null   float64
 7   Age          1030 non-null   int64  
 8   Strength     1030 non-null   float64
dtypes: float64(8), int64(1)
memory usage: 72.6 KB


In [14]:
df.describe()

Unnamed: 0,Cement,Slag,FlyAsh,Water,Plasticizer,CoarseAgg,FineAgg,Age,Strength
count,1030.0,1030.0,1030.0,1030.0,1030.0,1030.0,1030.0,1030.0,1030.0
mean,281.165631,73.895485,54.187136,181.566359,6.203112,972.918592,773.578883,45.662136,35.817836
std,104.507142,86.279104,63.996469,21.355567,5.973492,77.753818,80.175427,63.169912,16.705679
min,102.0,0.0,0.0,121.75,0.0,801.0,594.0,1.0,2.331808
25%,192.375,0.0,0.0,164.9,0.0,932.0,730.95,7.0,23.707115
50%,272.9,22.0,0.0,185.0,6.35,968.0,779.51,28.0,34.442774
75%,350.0,142.95,118.27,192.0,10.16,1029.4,824.0,56.0,46.136287
max,540.0,359.4,200.1,247.0,32.2,1145.0,992.6,365.0,82.599225


<h4 style="color:white">Finding Outliers</h4>



- Outliers are present in the following columns:
  - **Water**
  - **Plasticizer**
  - **Age**
  - **Strength**


In [17]:
outliers = df.quantile(q= .97, numeric_only= True)

In [18]:
df = df[(df["Age"] < outliers["Age"])]   
df = df[(df["Plasticizer"] < outliers["Plasticizer"])] 
df = df[(df["Water"] < outliers["Water"])] 
df = df[(df["Strength"] < outliers["Strength"])] 

<h4 style="color:white">Feature Engineering</h4>

In [20]:
# To increase importance by squaring the values
df["Cement"] = df["Cement"] ** 2  # Cement values are squared to enhance their impact on strength, as higher cement content generally leads to increased durability.
df["Water"] = df["Water"] ** 2    # Water values are squared to reflect their critical role in the water-cement ratio, influencing the concrete's final strength.
df["Plasticizer"] = df["Plasticizer"] ** 2  # Plasticizer values are squared to emphasize their effect on reducing water content while maintaining workability, which can improve strength.
df["FineAgg"] = df["FineAgg"] ** 2  # Fine aggregate values are squared to account for their contribution to the overall structure and compactness of the concrete mix, thereby enhancing strength.

In [21]:
x= df.drop(['Strength'],axis=1)
y= df[['Strength']]

In [22]:
x= pd.get_dummies(x, drop_first= True)

<h4 style="color:white">Train test split</h4>

In [24]:
x_train, x_test, y_train, y_test = train_test_split(x,y,test_size=0.2, random_state= 42)

In [25]:
def algo_test(x,y):
        # define all models
        L=LinearRegression()
        R=Ridge()
        Lass=Lasso()
        E=ElasticNet()
        sgd=SGDRegressor()
        ETR=ExtraTreeRegressor()
        GBR=GradientBoostingRegressor()
        kn=KNeighborsRegressor()
        rkn=RadiusNeighborsRegressor(radius=1.0)
        ada=AdaBoostRegressor()
        dt=DecisionTreeRegressor()
        xgb=XGBRegressor()
        svr=SVR()
        mlp_regressor = MLPRegressor()

       
        
        algos=[L,R,Lass,E,sgd,ETR,GBR,ada,kn,dt,xgb,svr,mlp_regressor]
        algo_names=['Linear','Ridge','Lasso','ElasticNet','SGD','Extra Tree','Gradient Boosting',
                    'KNeighborsRegressor','AdaBoost','Decision Tree','XGBRegressor','SVR','mlp_regressor']
        
        x_train, x_test, y_train, y_test=train_test_split(x,y,test_size=.20,random_state=42)
        
        r_squared= []
        rmse= []
        mae= []
        
        # Create a dataframe to compile error and accuracy rates
        result=pd.DataFrame(columns=['R_Squared','RMSE','MAE'],index=algo_names)
        
        
        for algo in algos:
            p=algo.fit(x_train,y_train).predict(x_test)
            r_squared.append(r2_score(y_test,p))
            rmse.append(mean_squared_error(y_test,p)**.5)
            mae.append(mean_absolute_error(y_test,p))
        
            

        # Place my accuracy and error rates into the table named 'result'
        result.R_Squared=r_squared
        result.RMSE=rmse
        result.MAE=mae
        
        # Return the 'result' table sorted by accuracy rate (r2_score)
        rtable=result.sort_values('R_Squared',ascending=False)
        return rtable

In [26]:
algo_test(x,y)

Unnamed: 0,R_Squared,RMSE,MAE
XGBRegressor,0.9253752,4.442759,2.827605
Gradient Boosting,0.8963165,5.2368,3.768625
Decision Tree,0.8198776,6.902318,4.357541
Extra Tree,0.8105952,7.077935,4.55586
KNeighborsRegressor,0.8086224,7.114703,5.783913
Lasso,0.7423836,8.254636,6.348616
ElasticNet,0.7423173,8.255698,6.348213
Ridge,0.7422312,8.257076,6.346919
Linear,0.7422312,8.257077,6.346917
AdaBoost,0.4537251,12.02034,10.04


**XGBRegressor** achieved the best performance with an R² of 0.925, RMSE of 4.44, and MAE of 2.83, indicating it is the most accurate model in this comparison.


<h2 style="color:green">Classification</h2>

In [29]:
from sklearn.naive_bayes import GaussianNB
from sklearn.naive_bayes import BernoulliNB

from sklearn.metrics import accuracy_score, confusion_matrix, classification_report

from sklearn.preprocessing import StandardScaler, LabelEncoder

from sklearn.linear_model import LogisticRegression

from sklearn.tree import DecisionTreeClassifier

from sklearn.ensemble import RandomForestClassifier

from sklearn.ensemble import GradientBoostingClassifier

from sklearn.neighbors import KNeighborsClassifier


from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense

In [30]:

def strength_classifier(x):
    """
    Concrete basınç dayanımına göre sınıflandırma.
    """
    if x < 17.0:
        return "non-structural"
    elif x < 28.0:
        return "residential"
    elif x < 70.0:
        return "commercial"
    else:
        return "high-strength"

df["ConcreteClass"] = df["Strength"].apply(strength_classifier)
df.drop("Strength", axis=1, inplace=True)


In [31]:
# Çevresel (green) sınıflandırması
def green_classifier(s):
    """
    Betonun çevresel (yeşil) özelliklerini sınıflandırma.
    """
    if (s.Slag + s.FlyAsh < 150.0) and (s.Plasticizer < 10.0):
        return "n/a"
    else:
        return "green"

# Green kategorik değişkenini oluşturma
df["Green"] = df.apply(green_classifier, axis=1)

# Plasticizer'ı kategorik hale getirme
df["Plasticizer"] = df["Plasticizer"].apply(lambda x: "yes" if x > 0 else "no")

# Veriyi kontrol et
print(df.head())


       Cement   Slag  FlyAsh    Water Plasticizer  CoarseAgg    FineAgg  Age  \
1   291600.00    0.0     0.0  26244.0         yes     1055.0  456976.00   28   
10   39441.96  132.4     0.0  36864.0          no      978.4  681450.25   90   
11   39441.96  132.4     0.0  36864.0          no      978.4  681450.25   28   
16   19488.16  209.4     0.0  36864.0          no     1047.0  651087.61   90   
21   19488.16  209.4     0.0  36864.0          no     1047.0  651087.61   28   

   ConcreteClass  Green  
1     commercial    n/a  
10    commercial    n/a  
11    commercial    n/a  
16    commercial  green  
21    commercial  green  


In [32]:

X = df.drop(columns=['ConcreteClass'])  
y = df['ConcreteClass'] 

X = pd.get_dummies(X)

# Label encoding for target variable (ConcreteClass)
le = LabelEncoder()
y = le.fit_transform(y)


In [33]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [34]:
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

<h4 style="color:red">1/gaussian/</h4>

In [36]:
g = GaussianNB()
b= BernoulliNB()

In [37]:
g.fit(x_train,y_train)

In [38]:
g_pred= g.predict(x_test)
accuracy_score(g_pred,y_test)

0.712707182320442

In [39]:
confusion_matrix(g_pred, y_test)

array([[88,  0,  4, 12],
       [ 1,  1,  0,  0],
       [ 3,  0, 23, 16],
       [13,  0,  3, 17]], dtype=int64)

<h4 style="color:red">2/bernoulli/</h4>

In [41]:
b.fit(x_train,y_train)

In [42]:
b_pred= b.predict(x_test)

In [43]:
accuracy_score(b_pred, y_test)

0.5303867403314917

In [44]:
confusion_matrix(b_pred,y_test)

array([[88,  1, 22, 31],
       [ 0,  0,  0,  0],
       [17,  0,  8, 14],
       [ 0,  0,  0,  0]], dtype=int64)

In [45]:
print(classification_report(b_pred,y_test))

              precision    recall  f1-score   support

           0       0.84      0.62      0.71       142
           1       0.00      0.00      0.00         0
           2       0.27      0.21      0.23        39
           3       0.00      0.00      0.00         0

    accuracy                           0.53       181
   macro avg       0.28      0.21      0.24       181
weighted avg       0.71      0.53      0.61       181



<h4 style="color:red">3/logistic regression/</h4>

In [47]:
l= LogisticRegression()

In [48]:
l.fit(x_train, y_train)

l.fit(x_train, y_train)

In [50]:
lpred= l.predict(x_test)

In [51]:
print(classification_report(lpred,y_test))

              precision    recall  f1-score   support

           0       0.98      0.59      0.74       175
           1       0.00      0.00      0.00         0
           2       0.13      0.67      0.22         6
           3       0.00      0.00      0.00         0

    accuracy                           0.59       181
   macro avg       0.28      0.31      0.24       181
weighted avg       0.95      0.59      0.72       181



<h4 style="color:red">4/decision tree/</h4>

In [53]:
d= DecisionTreeClassifier()

In [54]:
d.fit(x_train, y_train)

In [55]:
dpred= d.predict(x_test)

In [56]:
print(classification_report(dpred,y_test))

              precision    recall  f1-score   support

           0       0.89      0.93      0.91       100
           1       1.00      0.50      0.67         2
           2       0.77      0.74      0.75        31
           3       0.67      0.62      0.65        48

    accuracy                           0.81       181
   macro avg       0.83      0.70      0.74       181
weighted avg       0.81      0.81      0.81       181



In [57]:
confusion_matrix(dpred,y_test)

array([[93,  0,  0,  7],
       [ 1,  1,  0,  0],
       [ 0,  0, 23,  8],
       [11,  0,  7, 30]], dtype=int64)

<h4 style="color:red">5/random forest/</h4>

In [59]:
r= RandomForestClassifier()

In [60]:
r.fit(x_train, y_train)

In [61]:
rpred= r.predict(x_test)

In [62]:
print(classification_report(rpred,y_test))

              precision    recall  f1-score   support

           0       0.95      0.92      0.93       109
           1       1.00      1.00      1.00         1
           2       0.80      0.75      0.77        32
           3       0.67      0.77      0.71        39

    accuracy                           0.86       181
   macro avg       0.85      0.86      0.86       181
weighted avg       0.86      0.86      0.86       181



<h4 style="color:red">6/gradient boosting/</h4>

In [64]:
h= GradientBoostingClassifier()

In [65]:
h.fit(x_train, y_train)

In [66]:
hpred= h.predict(x_test)

In [67]:
print(classification_report(hpred,y_test))

              precision    recall  f1-score   support

           0       0.96      0.93      0.94       109
           1       1.00      1.00      1.00         1
           2       0.80      0.77      0.79        31
           3       0.67      0.75      0.71        40

    accuracy                           0.86       181
   macro avg       0.86      0.86      0.86       181
weighted avg       0.87      0.86      0.86       181



<h4 style="color:red">7/Kneighbors/</h4>

In [69]:
k= KNeighborsClassifier()

In [70]:
k.fit(x_train, y_train)

In [71]:
kpred= k.predict(x_test)

In [72]:
print(classification_report(kpred,y_test))

              precision    recall  f1-score   support

           0       0.85      0.66      0.74       135
           1       1.00      1.00      1.00         1
           2       0.40      0.46      0.43        26
           3       0.07      0.16      0.09        19

    accuracy                           0.58       181
   macro avg       0.58      0.57      0.57       181
weighted avg       0.70      0.58      0.63       181



### Model Performance Comparison

Among the 7 models tested, **Random Forest** achieved the best performance with the following metrics:

| Class             | Precision | Recall | F1-Score | Support |
|-------------------|-----------|--------|----------|---------|
| **commercial**     | 0.97      | 0.94   | 0.96     | 108     |
| **high-strength**  | 1.00      | 1.00   | 1.00     | 1       |
| **non-structural** | 0.80      | 0.73   | 0.76     | 33      |
| **residential**    | 0.69      | 0.79   | 0.74     | 39      |

| Metric            | Score |
|-------------------|-------|
| **Accuracy**       | 0.87  |
| **Macro avg**      | 0.87 (Precision) / 0.87 (Recall) / 0.86 (F1-Score) |
| **Weighted avg**   | 0.88 (Precision) / 0.87 (Recall) / 0.87 (F1-Score) |

**Accuracy** score of **0.87** was achieved using **Random Forest**.


In [74]:
# generate ANN
model = Sequential()
model.add(Dense(120, activation='relu', input_dim=X_train.shape[1]))
model.add(Dense(80,activation= "relu"))
model.add(Dense(64,activation= "relu"))
model.add(Dense(32,activation= "relu"))
model.add(Dense(16,activation= "relu"))
model.add(Dense(8,activation= "relu"))
model.add(Dense(4, activation='softmax'))  


model.compile(optimizer='adam', loss='sparse_categorical_crossentropy', metrics=['accuracy'])
model.fit(X_train, y_train, epochs=100, batch_size=32, validation_data=(X_test, y_test))


Epoch 1/100
[1m23/23[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 9ms/step - accuracy: 0.4771 - loss: 1.2940 - val_accuracy: 0.5801 - val_loss: 1.0891
Epoch 2/100
[1m23/23[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step - accuracy: 0.6067 - loss: 0.9967 - val_accuracy: 0.5801 - val_loss: 0.9110
Epoch 3/100
[1m23/23[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step - accuracy: 0.6077 - loss: 0.8579 - val_accuracy: 0.5801 - val_loss: 0.8230
Epoch 4/100
[1m23/23[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step - accuracy: 0.6263 - loss: 0.7477 - val_accuracy: 0.6188 - val_loss: 0.7661
Epoch 5/100
[1m23/23[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step - accuracy: 0.6272 - loss: 0.7364 - val_accuracy: 0.6685 - val_loss: 0.7409
Epoch 6/100
[1m23/23[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step - accuracy: 0.7124 - loss: 0.6965 - val_accuracy: 0.7790 - val_loss: 0.6543
Epoch 7/100
[1m23/23[0m [32m━━━

<keras.src.callbacks.history.History at 0x1fc57c2ad50>

In [75]:
# Test verisi üzerinde modeli değerlendirme
y_pred = np.argmax(model.predict(X_test), axis=1)

# Performans raporu ve karışıklık matrisi
print(classification_report(y_test, y_pred, target_names=le.classes_))
print(confusion_matrix(y_test, y_pred))


[1m6/6[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 11ms/step
                precision    recall  f1-score   support

    commercial       0.90      0.94      0.92       105
 high-strength       0.00      0.00      0.00         1
non-structural       0.75      0.80      0.77        30
   residential       0.69      0.60      0.64        45

      accuracy                           0.83       181
     macro avg       0.59      0.59      0.58       181
  weighted avg       0.82      0.83      0.82       181

[[99  0  0  6]
 [ 1  0  0  0]
 [ 0  0 24  6]
 [10  0  8 27]]
