In [10]:
import pandas as pd

In [11]:
df = pd.read_csv("../Dataset/cement_data.csv")
df

Unnamed: 0,Cement (component 1)(kg in a m^3 mixture),Blast Furnace Slag (component 2)(kg in a m^3 mixture),Fly Ash (component 3)(kg in a m^3 mixture),Water (component 4)(kg in a m^3 mixture),Superplasticizer (component 5)(kg in a m^3 mixture),Coarse Aggregate (component 6)(kg in a m^3 mixture),Fine Aggregate (component 7)(kg in a m^3 mixture),Age (day),"Concrete compressive strength(MPa, megapascals)"
0,540.0,0.0,0.0,162.0,2.5,1040.0,676.0,28,79.986111
1,540.0,0.0,0.0,162.0,2.5,1055.0,676.0,28,61.887366
2,332.5,142.5,0.0,228.0,0.0,932.0,594.0,270,40.269535
3,332.5,142.5,0.0,228.0,0.0,932.0,594.0,365,41.052780
4,198.6,132.4,0.0,192.0,0.0,978.4,825.5,360,44.296075
...,...,...,...,...,...,...,...,...,...
1025,276.4,116.0,90.3,179.6,8.9,870.1,768.3,28,44.284354
1026,322.2,0.0,115.6,196.0,10.4,817.9,813.4,28,31.178794
1027,148.5,139.4,108.6,192.7,6.1,892.4,780.0,28,23.696601
1028,159.1,186.7,0.0,175.6,11.3,989.6,788.9,28,32.768036


In [12]:
from sklearn.model_selection import train_test_split

X = df.drop(columns="Concrete compressive strength(MPa, megapascals) ", axis= 1)

y = df["Concrete compressive strength(MPa, megapascals) "]

X_train, X_test, y_train, y_test = train_test_split(X,y, test_size= 0.2)
X_train.shape, X_test.shape, y_train.shape, y_test.shape

((824, 8), (206, 8), (824,), (206,))

In [13]:
import pickle
from sklearn.preprocessing import StandardScaler 

##standard Scaling- Standardization
def scaler_standard(X_train, X_test):
    #scaling the data
    scaler = StandardScaler()
    X_train_scaled = scaler.fit_transform(X_train)
    X_test_scaled = scaler.transform(X_test)
    
    #saving the model
    file = open('../Model/standardScalar.pkl','wb')
    pickle.dump(scaler,file)
    file.close()
    
    return X_train_scaled, X_test_scaled

In [14]:
X_train_scaled, X_test_scaled = scaler_standard(X_train, X_test)

In [15]:
X_train_scaled.shape, y_train.shape, X_test_scaled.shape, y_test.shape

((824, 8), (824,), (206, 8), (206,))

In [17]:
from sklearn.linear_model import LinearRegression, Ridge, Lasso
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.metrics import mean_squared_error, r2_score

models = {
    'Linear Regression': LinearRegression(),
    'Ridge Regression': Ridge(alpha=1),
    'Lasso Regression': Lasso(alpha=1),
    'Random Forest Regression': RandomForestRegressor(n_estimators=100, max_depth=5),
    'Gradient Boosting Regression': GradientBoostingRegressor(n_estimators=100, learning_rate=0.1)
}

for n,v in models.items():
    c = v
    c.fit(X_train_scaled, y_train)
    y_pred = c.predict(X_test_scaled)
    mse = mean_squared_error(y_test, y_pred)
    r2 = r2_score(y_test, y_pred)
    print(f"{n}: mse: {mse}, r2:{r2}")

Linear Regression: mse: 106.73271906646335, r2:0.62913637112087
Ridge Regression: mse: 106.79791038292589, r2:0.6289098511895297
Lasso Regression: mse: 122.14532212872787, r2:0.5755822786912972
Random Forest Regression: mse: 45.07126981938105, r2:0.8433910910393155
Gradient Boosting Regression: mse: 24.712972560919336, r2:0.9141299616041291


## Random Forest is the best

In [19]:
r = RandomForestRegressor()
r.fit(X_train_scaled, y_train)
y_pred = r.predict(X_test_scaled)
mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)
print(mse)
print(r2)

22.720720973512833
0.9210524279275598


In [20]:
from xgboost import XGBRegressor
r = XGBRegressor()
r.fit(X_train_scaled, y_train)
y_pred = r.predict(X_test_scaled)
mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)
print(mse)
print(r2)

19.824634470421284
0.9311154447744892


In [23]:
from sklearn.model_selection import train_test_split, GridSearchCV
param_grid = {
    'n_estimators': [100, 200, 300],
    'max_depth': [3, 4, 5],
    'learning_rate': [0.1, 0.01, 0.001]
}

# Initialize XGBoostRegressor
xgb = XGBRegressor()

# Initialize GridSearchCV
grid_search = GridSearchCV(estimator=xgb, param_grid=param_grid, cv=5, scoring='neg_root_mean_squared_error')

# Perform grid search on training data
grid_search.fit(X_train_scaled, y_train)

# Get the best parameters and best score
best_params = grid_search.best_params_
best_score = grid_search.best_score_
print("Best Parameters:", best_params)
print("Best Score (neg_mean_squared_error):", best_score)

# Use the best estimator to make predictions on the testing set
best_estimator = grid_search.best_estimator_
y_pred = best_estimator.predict(X_test_scaled)

# Calculate Mean Squared Error (MSE) on testing set
mse = mean_squared_error(y_test, y_pred)
print("Mean Squared Error:", mse)
r2_score(y_test, y_pred)

Best Parameters: {'learning_rate': 0.1, 'max_depth': 4, 'n_estimators': 300}
Best Score (neg_mean_squared_error): -4.881766365421443
Mean Squared Error: 16.053179787526446


0.9442200989244559

In [24]:
reg = XGBRegressor(learning_rate=0.1, max_depth=4, n_estimators=300)
reg.fit(X_train_scaled, y_train)
y_pred = reg.predict(X_test_scaled)
mse = mean_squared_error(y_test, y_pred)
print("Mean Squared Error:", mse)
r2_score(y_test, y_pred)

Mean Squared Error: 16.053179787526446


0.9442200989244559

In [25]:
import pickle
file = open('../Model/modelForPrediction.pkl','wb')
pickle.dump(reg,file)
file.close()