### Importing Libraries

In [1]:
import pandas as pd 
import numpy as np
import matplotlib.pyplot as plt

### Loading the dataset

In [2]:
# Read the tab-separated slump dataset from the project-relative Datasets folder.
df = pd.read_csv(filepath_or_buffer='Datasets/Slump.csv', sep='\t')

### Performing EDA

In [3]:
# (rows, columns) of the loaded frame.
df.shape

(103, 10)

In [4]:
# Preview the first five rows to sanity-check the column names and values.
df.head()

Unnamed: 0,Cement,Slag,Fly ash,Water,SP,Coarse Aggr.,Fine Aggr.,SLUMP(cm),FLOW(cm),Compressive Strength (28-day)(Mpa)
0,273.0,82.0,105.0,210.0,9.0,904.0,680.0,23.0,62.0,34.99
1,163.0,149.0,191.0,180.0,12.0,843.0,746.0,0.0,20.0,41.14
2,162.0,148.0,191.0,179.0,16.0,840.0,743.0,1.0,20.0,41.81
3,162.0,148.0,190.0,179.0,19.0,838.0,741.0,3.0,21.5,42.08
4,154.0,112.0,144.0,220.0,10.0,923.0,658.0,20.0,64.0,26.82


In [5]:
# Count the missing entries in each column.
df.isna().sum()

Cement                                0
Slag                                  0
Fly ash                               0
Water                                 0
SP                                    0
Coarse Aggr.                          0
Fine Aggr.                            0
SLUMP(cm)                             0
FLOW(cm)                              0
Compressive Strength (28-day)(Mpa)    0
dtype: int64

In [6]:
# Inspect the column dtypes: any non-numeric (categorical) column would need
# encoding before it can be fed to the tree model.
df.dtypes

Cement                                float64
Slag                                  float64
Fly ash                               float64
Water                                 float64
SP                                    float64
Coarse Aggr.                          float64
Fine Aggr.                            float64
SLUMP(cm)                             float64
FLOW(cm)                              float64
Compressive Strength (28-day)(Mpa)    float64
dtype: object

In [7]:
# Shape check repeated after the inspections above; those cells only read df,
# so the result matches the earlier shape cell.
df.shape

(103, 10)

In [8]:
# Pairwise correlation matrix over all columns (DataFrame.corr defaults to Pearson).
df.corr()

Unnamed: 0,Cement,Slag,Fly ash,Water,SP,Coarse Aggr.,Fine Aggr.,SLUMP(cm),FLOW(cm),Compressive Strength (28-day)(Mpa)
Cement,1.0,-0.243553,-0.486535,0.221091,-0.106387,-0.309857,0.056959,0.145913,0.186461,0.445656
Slag,-0.243553,1.0,-0.322619,-0.026775,0.306504,-0.223792,-0.183522,-0.284037,-0.327231,-0.331522
Fly ash,-0.486535,-0.322619,1.0,-0.241321,-0.143508,0.17262,-0.282854,-0.118621,-0.055423,0.44438
Water,0.221091,-0.026775,-0.241321,1.0,-0.155456,-0.602201,0.114591,0.466568,0.632026,-0.25432
SP,-0.106387,0.306504,-0.143508,-0.155456,1.0,-0.104159,0.05829,-0.212902,-0.176314,-0.037909
Coarse Aggr.,-0.309857,-0.223792,0.17262,-0.602201,-0.104159,1.0,-0.488537,-0.188157,-0.325787,-0.16061
Fine Aggr.,0.056959,-0.183522,-0.282854,0.114591,0.05829,-0.488537,1.0,0.202272,0.19037,-0.154532
SLUMP(cm),0.145913,-0.284037,-0.118621,0.466568,-0.212902,-0.188157,0.202272,1.0,0.906135,-0.223499
FLOW(cm),0.186461,-0.327231,-0.055423,0.632026,-0.176314,-0.325787,0.19037,0.906135,1.0,-0.124189
Compressive Strength (28-day)(Mpa),0.445656,-0.331522,0.44438,-0.25432,-0.037909,-0.16061,-0.154532,-0.223499,-0.124189,1.0


In [9]:
# Separate predictors from the target: X takes every column except the last,
# y takes the last column (Compressive Strength (28-day)(Mpa)).
# NOTE(review): X therefore also contains SLUMP(cm) and FLOW(cm) -- presumably
# measured outcomes of the mix rather than ingredients; confirm they are meant
# to be used as predictors.
X = df.iloc[:, :-1].to_numpy()
y = df.iloc[:, -1].to_numpy()

In [10]:
# Feature matrix dimensions: (samples, features).
X.shape

(103, 9)

In [11]:
# Target is still a flat 1-D vector at this point.
y.shape

(103,)

In [12]:
# Turn the target into a single-column 2-D array (one row per sample).
y = np.reshape(y, (-1, 1))

In [13]:
# Hold out 20% of the rows for testing; the fixed random_state keeps the split
# reproducible across runs.
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=0
)

### Model Building without Hyperparameter Tuning

In [14]:
from sklearn.tree import DecisionTreeRegressor
# Baseline: a decision tree with default hyperparameters, seeded for
# reproducibility. fit() returns the estimator itself, so it can be chained.
model = DecisionTreeRegressor(random_state=0).fit(X_train, y_train)
# Predictions on the held-out split, used for scoring in the next cell.
y_pred = model.predict(X_test)

In [15]:
from sklearn.metrics import r2_score, mean_squared_error
# R^2 of the untuned tree on the held-out test split.
r2_score(y_true=y_test, y_pred=y_pred)

0.8360911078435956

### Hyperparameter Tuning

In [16]:
from sklearn.model_selection import GridSearchCV
# Search space for the tree: depth 1-7, minimum leaf size 1-5, and 2-7
# candidate features per split (7 * 5 * 6 = 210 combinations).
param_grid = dict(
    max_depth=list(range(1, 8)),
    min_samples_leaf=list(range(1, 6)),
    max_features=list(range(2, 8)),
)
# Unfitted template estimator that the grid search clones for each candidate.
estimator = DecisionTreeRegressor(random_state=0)

In [17]:
# Exhaustive search over param_grid with 5-fold cross-validation.
best_model = GridSearchCV(estimator=estimator, param_grid=param_grid, cv=5)

In [18]:
# Run the cross-validated search on the training split only, so the test split
# stays unseen during tuning.
best_model.fit(X=X_train, y=y_train)

GridSearchCV(cv=5, error_score=nan,
             estimator=DecisionTreeRegressor(ccp_alpha=0.0, criterion='mse',
                                             max_depth=None, max_features=None,
                                             max_leaf_nodes=None,
                                             min_impurity_decrease=0.0,
                                             min_impurity_split=None,
                                             min_samples_leaf=1,
                                             min_samples_split=2,
                                             min_weight_fraction_leaf=0.0,
                                             presort='deprecated',
                                             random_state=0, splitter='best'),
             iid='deprecated', n_jobs=None,
             param_grid={'max_depth': [1, 2, 3, 4, 5, 6, 7],
                         'max_features': [2, 3, 4, 5, 6, 7],
                         'min_samples_leaf': [1, 2, 3, 4, 5]},
             pre_d

In [19]:
# Estimator configured with the best-scoring parameter combination from the search.
best_model.best_estimator_

DecisionTreeRegressor(ccp_alpha=0.0, criterion='mse', max_depth=5,
                      max_features=7, max_leaf_nodes=None,
                      min_impurity_decrease=0.0, min_impurity_split=None,
                      min_samples_leaf=5, min_samples_split=2,
                      min_weight_fraction_leaf=0.0, presort='deprecated',
                      random_state=0, splitter='best')

In [20]:
# The winning hyperparameter values on their own, as a dict.
best_model.best_params_

{'max_depth': 5, 'max_features': 7, 'min_samples_leaf': 5}

### Model Building with Hyperparameter Tuning

In [21]:
# Rebuild the regressor with the hyperparameters selected by the grid search
# (max_depth=5, max_features=7, min_samples_leaf=5) and the same seed as before.
#
# Only the non-default arguments are passed. The previous call was a pasted
# estimator repr that also spelled out criterion='mse', min_impurity_split=None
# and presort='deprecated'. Those were defaults at the time, but they have since
# been renamed or removed from scikit-learn, so passing them makes this cell
# raise on current releases. Leaving them out gives the same tree on the old
# version and keeps the notebook runnable on new ones.
model = DecisionTreeRegressor(
    max_depth=5,
    max_features=7,
    min_samples_leaf=5,
    random_state=0,
)

In [22]:
# Fit the tuned tree on the training split and predict the held-out split in
# one chain; fit() returns the fitted estimator, so `model` is fitted afterwards.
y_pred = model.fit(X_train, y_train).predict(X_test)

### Model Evaluation

In [23]:
# R^2 of the tuned tree on the held-out test split (compare with the baseline score).
r2_score(y_true=y_test, y_pred=y_pred)

0.7620913447133147