In [2]:
#importing library
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LinearRegression,Ridge
from sklearn.svm import LinearSVR,SVR
from sklearn.tree import DecisionTreeRegressor
from sklearn.neural_network import MLPRegressor
from sklearn.ensemble import RandomForestRegressor,GradientBoostingRegressor,AdaBoostRegressor
from sklearn.model_selection import KFold,GridSearchCV

In [4]:
#loading the dataset
df=pd.read_csv('../input/yeh-concret-data/Concrete_Data_Yeh.csv')
#showing the data
df

Unnamed: 0,cement,slag,flyash,water,superplasticizer,coarseaggregate,fineaggregate,age,csMPa
0,540.0,0.0,0.0,162.0,2.5,1040.0,676.0,28,79.99
1,540.0,0.0,0.0,162.0,2.5,1055.0,676.0,28,61.89
2,332.5,142.5,0.0,228.0,0.0,932.0,594.0,270,40.27
3,332.5,142.5,0.0,228.0,0.0,932.0,594.0,365,41.05
4,198.6,132.4,0.0,192.0,0.0,978.4,825.5,360,44.30
...,...,...,...,...,...,...,...,...,...
1025,276.4,116.0,90.3,179.6,8.9,870.1,768.3,28,44.28
1026,322.2,0.0,115.6,196.0,10.4,817.9,813.4,28,31.18
1027,148.5,139.4,108.6,192.7,6.1,892.4,780.0,28,23.70
1028,159.1,186.7,0.0,175.6,11.3,989.6,788.9,28,32.77


In [5]:
#getting information about the dataset
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1030 entries, 0 to 1029
Data columns (total 9 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   cement            1030 non-null   float64
 1   slag              1030 non-null   float64
 2   flyash            1030 non-null   float64
 3   water             1030 non-null   float64
 4   superplasticizer  1030 non-null   float64
 5   coarseaggregate   1030 non-null   float64
 6   fineaggregate     1030 non-null   float64
 7   age               1030 non-null   int64  
 8   csMPa             1030 non-null   float64
dtypes: float64(8), int64(1)
memory usage: 72.5 KB


# #Preprocessing

In [12]:
#creating preprocessing dataset
def preprocess_inputs(df):
    #copying the dataset
    df=df.copy()
    #selecting target columns and storing it in y variable
    y=df['csMPa'].copy()
    #selecting feature columns and storing it in x variable
    x=df.drop('csMPa',axis=1)
    #train_test_split
    x_train,x_test,y_train,y_test=train_test_split(x,y,train_size=0.7,random_state=123)
    
    
    #Scale x  with a standard Scaler
    scaler=StandardScaler()
    scaler.fit(x_train)
    x_train=pd.DataFrame(scaler.transform(x_train),columns=x_train.columns)
    x_test=pd.DataFrame(scaler.transform(x_test),columns=x_test.columns)
    
    
    #returning the dataframe
    return x_train,x_test,y_train,y_test

In [13]:
#calling the function
x_train,x_test,y_train,y_test=preprocess_inputs(df)

print(x_train.shape)
print(x_test.shape)
print(y_train.shape)
print(y_test.shape)

(721, 8)
(309, 8)
(721,)
(309,)


In [15]:
x_train.var()

cement              1.001389
slag                1.001389
flyash              1.001389
water               1.001389
superplasticizer    1.001389
coarseaggregate     1.001389
fineaggregate       1.001389
age                 1.001389
dtype: float64

# Model Selection

In [18]:
#creating a dictionary with models
models={
    'Linear Regression':LinearRegression(),
    'L2 (Ridge) Regression': Ridge(),
    'Support Vector Machine': LinearSVR(),
    'Support Vector Machine(RBF Kernel)': SVR(),
    'Decision Tree': DecisionTreeRegressor(),
    'Neural Network': MLPRegressor(),
    'Random Forest': RandomForestRegressor(),
    'Graient Boosting': GradientBoostingRegressor(),
    'AdaBoost': AdaBoostRegressor()
}

In [19]:
models

{'Linear Regression': LinearRegression(),
 'L2 (Ridge) Regression': Ridge(),
 'Support Vector Machine': LinearSVR(),
 'Support Vector Machine(RBF Kernel)': SVR(),
 'Decision Tree': DecisionTreeRegressor(),
 'Neural Network': MLPRegressor(),
 'Random Forest': RandomForestRegressor(),
 'Graient Boosting': GradientBoostingRegressor(),
 'AdaBoost': AdaBoostRegressor()}

In [22]:
#creating looop through the model
for name,model in models.items():
    model.fit(x_train,y_train)
    print(name+'trained.')

Linear Regressiontrained.
L2 (Ridge) Regressiontrained.
Support Vector Machinetrained.
Support Vector Machine(RBF Kernel)trained.
Decision Treetrained.




Neural Networktrained.
Random Foresttrained.
Graient Boostingtrained.
AdaBoosttrained.


# #Model Optimization

In [21]:
best_model=GradientBoostingRegressor()
best_model.fit(x_train,y_train)
print('Model R^2 (Before Optimization):{:.5f}'.format(best_model.score(x_test,y_test)))

Model R^2 (Before Optimization):0.90696
