In [1]:
# Importing the libraries

import pandas as pd
import numpy as np 
import matplotlib.pyplot as plt

%matplotlib inline

# Ignore harmless warnings 

import warnings 
warnings.filterwarnings("ignore")

# Set to display all the columns in dataset

pd.set_option("display.max_columns", None)

# Import psql to run queries 

import pandasql as psql

In [2]:
#load the dataset
concrete=pd.read_csv(r"E:\AIML INTERNSHIP\Datasets\Concrete_Data_V1.0.csv",header=0)
concrete.head()

Unnamed: 0,Cement,Slag,Flyash,Water,SuperPlasticizer,CoarseAggregate,FineAggregate,Age,CSinMPa
0,540.0,0.0,0.0,162.0,2.5,1040.0,676.0,28,79.99
1,540.0,0.0,0.0,162.0,2.5,1055.0,676.0,28,61.89
2,332.5,142.5,0.0,228.0,0.0,932.0,594.0,270,40.27
3,332.5,142.5,0.0,228.0,0.0,932.0,594.0,365,41.05
4,198.6,132.4,0.0,192.0,0.0,978.4,825.5,360,44.3


In [3]:
concrete.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1030 entries, 0 to 1029
Data columns (total 9 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   Cement            1030 non-null   float64
 1   Slag              1030 non-null   float64
 2   Flyash            1030 non-null   float64
 3   Water             1030 non-null   float64
 4   SuperPlasticizer  1030 non-null   float64
 5   CoarseAggregate   1030 non-null   float64
 6   FineAggregate     1030 non-null   float64
 7   Age               1030 non-null   int64  
 8   CSinMPa           1030 non-null   float64
dtypes: float64(8), int64(1)
memory usage: 72.6 KB


In [4]:
concrete.shape

(1030, 9)

In [5]:
# Eliminate the outlier in 'CSinMPa' and write data to new file

CSinMPa_UL = round(concrete.CSinMPa.mean() + 3 * concrete.CSinMPa.std(),3)
CSinMPa_LL = round(concrete.CSinMPa.mean() - 3 * concrete.CSinMPa.std(),3)
concrete_new = concrete[(concrete.CSinMPa > CSinMPa_LL) & (concrete.CSinMPa < CSinMPa_UL)]
concrete_bk = concrete_new
concrete_new.shape

(1030, 9)

In [6]:
concrete_new.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1030 entries, 0 to 1029
Data columns (total 9 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   Cement            1030 non-null   float64
 1   Slag              1030 non-null   float64
 2   Flyash            1030 non-null   float64
 3   Water             1030 non-null   float64
 4   SuperPlasticizer  1030 non-null   float64
 5   CoarseAggregate   1030 non-null   float64
 6   FineAggregate     1030 non-null   float64
 7   Age               1030 non-null   int64  
 8   CSinMPa           1030 non-null   float64
dtypes: float64(8), int64(1)
memory usage: 72.6 KB


In [7]:
concrete.isnull().sum()

Cement              0
Slag                0
Flyash              0
Water               0
SuperPlasticizer    0
CoarseAggregate     0
FineAggregate       0
Age                 0
CSinMPa             0
dtype: int64

In [8]:
concrete_new.duplicated().any()

True

In [9]:
concrete_new.describe()

Unnamed: 0,Cement,Slag,Flyash,Water,SuperPlasticizer,CoarseAggregate,FineAggregate,Age,CSinMPa
count,1030.0,1030.0,1030.0,1030.0,1030.0,1030.0,1030.0,1030.0,1030.0
mean,281.167864,73.895825,54.18835,181.567282,6.20466,972.918932,773.580485,45.662136,35.817961
std,104.506364,86.279342,63.997004,21.354219,5.973841,77.753954,80.17598,63.169912,16.705742
min,102.0,0.0,0.0,121.8,0.0,801.0,594.0,1.0,2.33
25%,192.375,0.0,0.0,164.9,0.0,932.0,730.95,7.0,23.71
50%,272.9,22.0,0.0,185.0,6.4,968.0,779.5,28.0,34.445
75%,350.0,142.95,118.3,192.0,10.2,1029.4,824.0,56.0,46.135
max,540.0,359.4,200.1,247.0,32.2,1145.0,992.6,365.0,82.6


In [10]:
concrete_new.columns

Index(['Cement', 'Slag', 'Flyash', 'Water', 'SuperPlasticizer',
       'CoarseAggregate', 'FineAggregate', 'Age', 'CSinMPa'],
      dtype='object')

In [12]:
concrete_new.head()

Unnamed: 0,Cement,Slag,Flyash,Water,SuperPlasticizer,CoarseAggregate,FineAggregate,Age,CSinMPa
0,540.0,0.0,0.0,162.0,2.5,1040.0,676.0,28,79.99
1,540.0,0.0,0.0,162.0,2.5,1055.0,676.0,28,61.89
2,332.5,142.5,0.0,228.0,0.0,932.0,594.0,270,40.27
3,332.5,142.5,0.0,228.0,0.0,932.0,594.0,365,41.05
4,198.6,132.4,0.0,192.0,0.0,978.4,825.5,360,44.3


In [13]:
# Identify the independent and targrt (dependent) variables

IndepVar = []
for col in concrete_new.columns:
    if col != 'CSinMPa':
        IndepVar.append(col)
TargetVar ='CSinMPa'

x = concrete_new[IndepVar]
y = concrete_new[TargetVar]

In [14]:
 # Split the data into train and test (random sampling)

from sklearn.model_selection import train_test_split 

x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.3, random_state=42)

# Display the shape for train & test data

x_train.shape, x_test.shape, y_train.shape, y_test.shape

((721, 8), (309, 8), (721,), (309,))

In [None]:
# Scaling the features by using MinMaxScaler

from sklearn.preprocessing import MinMaxScaler

mmscaler = MinMaxScaler(feature_range=(0, 1))

x_train = mmscaler.fit_transform(x_train)
x_train = pd.DataFrame(x_train)

x_test = mmscaler.fit_transform(x_test)
x_test = pd.DataFrame(x_test)

# Logistic Regression Algorithm

In [None]:
# Build the multi regression model

from sklearn.linear_model import LinearRegression  

# Create object for the model

ModelMLR = LinearRegression()

# Train the model with training data

ModelMLR.fit(x_train, y_train)

# Predict the model with test dataset

y_pred = ModelMLR.predict(x_test)

# Evaluation metrics for Regression analysis

from sklearn import metrics

print('Mean Absolute Error (MAE):', round(metrics.mean_absolute_error(y_test, y_pred),3))  
print('Mean Squared Error (MSE):', round(metrics.mean_squared_error(y_test, y_pred),3))  
print('Root Mean Squared Error (RMSE):', round(np.sqrt(metrics.mean_squared_error(y_test, y_pred)),3))
print('R2_score:', round(metrics.r2_score(y_test, y_pred),6))
print('Root Mean Squared Log Error (RMSLE):', round(np.log(np.sqrt(metrics.mean_squared_error(y_test, y_pred))),3))
print('Mean Absolute Percentage Error (MAPE):', round(metrics.mean_absolute_percentage_error(y_test, y_pred)*100,3), '%')
# Define the function to calculate the MAPE - Mean Absolute Percentage Error

def MAPE (y_test, y_pred): 
    y_test, y_pred = np.array(y_test), np.array(y_pred)
    return np.mean(np.abs((y_test - y_pred) / y_test)) * 100

# Evaluation of MAPE 

result = MAPE(y_test, y_pred)
print('Mean Absolute Percentage Error (MAPE):', round(result, 3), '%')

# Calculate Adjusted R squared values 

r_squared = round(metrics.r2_score(y_test, y_pred),6)
adjusted_r_squared = round(1 - (1-r_squared)*(len(y)-1)/(len(y)-x.shape[1]-1),6)
print('Adj R Square: ', adjusted_r_squared)


Mean Absolute Error (MAE): 8.355
Mean Squared Error (MSE): 111.763
Root Mean Squared Error (RMSE): 10.572
R2_score: 0.58694
Root Mean Squared Log Error (RMSLE): 2.358
Mean Absolute Percentage Error (MAPE): 32.463 %
Mean Absolute Percentage Error (MAPE): 32.463 %
Adj R Square:  0.583703


# compare regression algorithms

In [None]:
#load the dataset
RGR=pd.read_csv(r"C:\Users\Dlc\vamsi munagala\Concrete_Data_V1.0.csv",header=0)
rgr_bk=RGR.copy()
RGR.head()

Unnamed: 0,Test_ID,Cement,Slag,Flyash,Water,SuperPlasticizer,CoarseAggregate,FineAggregate,Age,CSinMPa
0,C3417,540.0,0.0,0.0,162.0,2.5,1040.0,676.0,28,79.99
1,P7675,540.0,0.0,0.0,162.0,2.5,1055.0,676.0,28,61.89
2,Y5697,332.5,142.5,0.0,228.0,0.0,932.0,594.0,270,40.27
3,R3821,332.5,142.5,0.0,228.0,0.0,932.0,594.0,365,41.05
4,P3773,198.6,132.4,0.0,192.0,0.0,978.4,825.5,360,44.3


In [None]:
# Build the Regression / Regressor models

from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import ExtraTreesRegressor
#from sklearn.neighbors import KNeighborsRegressor
#from sklearn.linear_model import BayesianRidge
#from sklearn.svm import SVR

In [None]:
# Build the multi regression model

from sklearn.linear_model import LinearRegression  

# Create object for the model

ModelMLR = LinearRegression()

# Train the model with training data

ModelMLR.fit(x_train, y_train)

# Predict the model with test dataset

y_pred = ModelMLR.predict(x_test)
# Evaluation metrics for Regression analysis

from sklearn import metrics

print('Mean Absolute Error (MAE):', round(metrics.mean_absolute_error(y_test, y_pred),3))  
print('Mean Squared Error (MSE):', round(metrics.mean_squared_error(y_test, y_pred),3))  
print('Root Mean Squared Error (RMSE):', round(np.sqrt(metrics.mean_squared_error(y_test, y_pred)),3))
print('R2_score:', round(metrics.r2_score(y_test, y_pred),6))
print('Root Mean Squared Log Error (RMSLE):', round(np.log(np.sqrt(metrics.mean_squared_error(y_test, y_pred))),3))
print('Mean Absolute Percentage Error (MAPE):', round(metrics.mean_absolute_percentage_error(y_test, y_pred)*100,3), '%')
# Define the function to calculate the MAPE - Mean Absolute Percentage Error
def MAPE (y_test, y_pred): 
    y_test, y_pred = np.array(y_test), np.array(y_pred)
    return np.mean(np.abs((y_test - y_pred) / y_test)) * 100

# Evaluation of MAPE 

result = MAPE(y_test, y_pred)
print('Mean Absolute Percentage Error (MAPE):', round(result, 3), '%')

Mean Absolute Error (MAE): 8.355
Mean Squared Error (MSE): 111.763
Root Mean Squared Error (RMSE): 10.572
R2_score: 0.58694
Root Mean Squared Log Error (RMSLE): 2.358
Mean Absolute Percentage Error (MAPE): 32.463 %
Mean Absolute Percentage Error (MAPE): 32.463 %


In [None]:
# Calculate Adjusted R squared values 

r_squared = round(metrics.r2_score(y_test, y_pred),6)
adjusted_r_squared = round(1 - (1-r_squared)*(len(y)-1)/(len(y)-x.shape[1]-1),6)
print('Adj R Square: ', adjusted_r_squared)

Adj R Square:  0.583703


In [None]:
#display the final results

Results =pd.DataFrame({'CSinMPa_A':y_test,'CSinMPa_P':y_pred})

#merge two Dataframes on index of both the dataframes
ResultsFinal =concrete_bk.merge(Results,left_index=True,right_index=True)

#display 5 records randomly

ResultsFinal.sample(5)

Unnamed: 0,Cement,Slag,Flyash,Water,SuperPlasticizer,CoarseAggregate,FineAggregate,Age,CSinMPa,CSinMPa_A,CSinMPa_P
581,181.9,272.8,0.0,185.7,0.0,1012.4,714.3,7,12.37,12.37,34.838495
165,425.0,106.3,0.0,151.4,18.6,936.0,803.7,91,66.7,66.7,66.447377
76,469.0,117.2,0.0,137.8,32.2,852.1,840.5,3,40.2,40.2,68.887965
247,238.1,0.0,94.1,186.7,7.0,949.9,847.0,56,39.59,39.59,27.948245
39,237.5,237.5,0.0,228.0,0.0,932.0,594.0,180,36.25,36.25,46.207198


In [None]:
RGRResults = pd.read_csv(r"C:\Users\Dlc\vamsi munagala\RGRResults.csv",header =0)
RGRResults.head()

Unnamed: 0,Model Name,Mean_Absolute_Error_MAE,Adj_R_Square,Root_Mean_Squared_Error_RMSE,Mean_Absolute_Percentage_Error_MAPE,Mean_Squared_Error_MSE,Root_Mean_Squared_Log_Error_RMSLE,R2_score


In [None]:
# Build the Regression / Regressor models

from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import ExtraTreesRegressor
#from sklearn.neighbors import KNeighborsRegressor
#from sklearn.linear_model import BayesianRidge
#from sklearn.svm import SVR

# Create objects of Regression / Regressor models with default hyper-parameters

ModelMLR = LinearRegression()
ModelDCR = DecisionTreeRegressor()
ModelRFR = RandomForestRegressor()
ModelETR = ExtraTreesRegressor()
#ModelKNN = KNeighborsRegressor(n_neighbors=5)
#ModelBRR = BayesianRidge()
#ModelSVR = SVR()

# Evalution matrix for all the algorithms

#MM = [ModelMLR, ModelDCR, ModelRFR, ModelETR, ModelKNN, ModelBRR, ModelSVR] 
MM = [ModelMLR, ModelDCR, ModelRFR, ModelETR]

for models in MM:
    
    # Fit the model with train data
    
    models.fit(x_train, y_train)
    
    # Predict the model with test data

    y_pred = models.predict(x_test)
    
    # Print the model name
    
    print('Model Name: ', models)
    
    # Evaluation metrics for Regression analysis

    from sklearn import metrics

    print('Mean Absolute Error (MAE):', round(metrics.mean_absolute_error(y_test, y_pred),3))  
    print('Mean Squared Error (MSE):', round(metrics.mean_squared_error(y_test, y_pred),3))  
    print('Root Mean Squared Error (RMSE):', round(np.sqrt(metrics.mean_squared_error(y_test, y_pred)),3))
    print('R2_score:', round(metrics.r2_score(y_test, y_pred),6))
    print('Root Mean Squared Log Error (RMSLE):', round(np.log(np.sqrt(metrics.mean_squared_error(y_test, y_pred))),3))
    
    # Define the function to calculate the MAPE - Mean Absolute Percentage Error

    def MAPE (y_test, y_pred):
        y_test, y_pred = np.array(y_test), np.array(y_pred)
        return np.mean(np.abs((y_test - y_pred) / y_test)) * 100
    
    # Evaluation of MAPE 

    result = MAPE(y_test, y_pred)
    print('Mean Absolute Percentage Error (MAPE):', round(result, 2), '%')
    
    # Calculate Adjusted R squared values 

    r_squared = round(metrics.r2_score(y_test, y_pred),6)
    adjusted_r_squared = round(1 - (1-r_squared)*(len(y)-1)/(len(y)-x.shape[1]-1),6)
    print('Adj R Square: ', adjusted_r_squared)
    print('------------------------------------------------------------------------------------------------------------')
    #-------------------------------------------------------------------------------------------
    new_row = {'Model Name' : models,
               'Mean_Absolute_Error_MAE' : metrics.mean_absolute_error(y_test, y_pred),
               'Adj_R_Square' : adjusted_r_squared,
               'Root_Mean_Squared_Error_RMSE' : np.sqrt(metrics.mean_squared_error(y_test, y_pred)),
               'Mean_Absolute_Percentage_Error_MAPE' : result,
               'Mean_Squared_Error_MSE' : metrics.mean_squared_error(y_test, y_pred),
               'Root_Mean_Squared_Log_Error_RMSLE': np.log(np.sqrt(metrics.mean_squared_error(y_test, y_pred))),
               'R2_score' : metrics.r2_score(y_test, y_pred)}
    RGRResults = RGRResults.append(new_row, ignore_index=True)
    #-------------------------------------------------------------------------------------------





Model Name:  LinearRegression()
Mean Absolute Error (MAE): 8.355
Mean Squared Error (MSE): 111.763
Root Mean Squared Error (RMSE): 10.572
R2_score: 0.58694
Root Mean Squared Log Error (RMSLE): 2.358
Mean Absolute Percentage Error (MAPE): 32.46 %
Adj R Square:  0.583703
------------------------------------------------------------------------------------------------------------
Model Name:  DecisionTreeRegressor()
Mean Absolute Error (MAE): 4.899
Mean Squared Error (MSE): 54.621
Root Mean Squared Error (RMSE): 7.391
R2_score: 0.798128
Root Mean Squared Log Error (RMSLE): 2.0
Mean Absolute Percentage Error (MAPE): 16.95 %
Adj R Square:  0.796546
------------------------------------------------------------------------------------------------------------
Model Name:  RandomForestRegressor()
Mean Absolute Error (MAE): 3.891
Mean Squared Error (MSE): 32.278
Root Mean Squared Error (RMSE): 5.681
R2_score: 0.880706
Root Mean Squared Log Error (RMSLE): 1.737
Mean Absolute Percentage Error (MAPE)

In [None]:
RGRResults.head(10)

Unnamed: 0,Model Name,Mean_Absolute_Error_MAE,Adj_R_Square,Root_Mean_Squared_Error_RMSE,Mean_Absolute_Percentage_Error_MAPE,Mean_Squared_Error_MSE,Root_Mean_Squared_Log_Error_RMSLE,R2_score
0,LinearRegression(),8.354868,0.583703,10.57182,32.462977,111.763381,2.358192,0.58694
1,DecisionTreeRegressor(),4.899256,0.796546,7.390626,16.951688,54.621347,2.000212,0.798128
2,"(DecisionTreeRegressor(max_features='auto', ra...",3.891467,0.879771,5.681351,13.043969,32.277755,1.737189,0.880706
3,"(ExtraTreeRegressor(random_state=1141241845), ...",4.041175,0.876267,5.763566,13.211867,33.218691,1.751556,0.877229


In [None]:
y_predF=ModelETR.predict(x_test)

In [None]:
#display the final results

Results =pd.DataFrame({'CSinMPa_A':y_test,'CSinMPa_F':y_predF})

#merge two Dataframes on index of both the dataframes
ResultsFinal =concrete_bk.merge(Results,left_index=True,right_index=True)

#display 5 records randomly

ResultsFinal.sample(5)

Unnamed: 0,Cement,Slag,Flyash,Water,SuperPlasticizer,CoarseAggregate,FineAggregate,Age,CSinMPa,CSinMPa_A,CSinMPa_F
825,397.0,0.0,0.0,185.0,0.0,1040.0,734.0,28,39.09,39.09,35.4112
445,165.0,0.0,143.6,163.8,0.0,1005.6,900.9,56,36.56,36.56,28.5176
910,144.0,136.0,106.0,178.0,7.0,941.0,774.0,28,26.14,26.14,27.4467
1025,276.4,116.0,90.3,179.6,8.9,870.1,768.3,28,44.28,44.28,43.4441
72,425.0,106.3,0.0,153.5,16.5,852.1,887.1,3,33.4,33.4,33.1361
