In [2]:
from warnings  import filterwarnings
filterwarnings('ignore')

In [3]:
path  =  r"C:\Users\admin\Downloads\PowerPlant.csv"

In [4]:
import pandas as pd
import numpy as np
import  seaborn as  sns
import matplotlib.pyplot as plt


In [5]:
df  =  pd.read_csv(path)

# basic checks and quality checks

In [6]:
df.head()

Unnamed: 0,AT,V,AP,RH,PE
0,8.34,40.77,1010.84,90.01,480.48
1,23.64,58.49,1011.4,74.2,445.75
2,29.74,56.9,1007.15,41.91,438.76
3,19.07,49.69,1007.22,76.79,453.09
4,11.8,40.66,1017.13,97.2,464.43


In [7]:
df.shape

(9568, 5)

In [8]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 9568 entries, 0 to 9567
Data columns (total 5 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   AT      9568 non-null   float64
 1   V       9568 non-null   float64
 2   AP      9568 non-null   float64
 3   RH      9568 non-null   float64
 4   PE      9568 non-null   float64
dtypes: float64(5)
memory usage: 373.9 KB


In [9]:
df.isnull().sum()

AT    0
V     0
AP    0
RH    0
PE    0
dtype: int64

In [10]:
df.duplicated().sum()

np.int64(41)

In [13]:
df.drop_duplicates( keep = 'first' , inplace  =  True   , ignore_index =  True)

In [14]:
df.shape

(9527, 5)

# split x  &y

In [15]:
df.columns

Index(['AT', 'V', 'AP', 'RH', 'PE'], dtype='object')

In [16]:
x  =  df.drop('PE' ,  axis  = 1)
y  =  df['PE']

In [17]:
x.head()

Unnamed: 0,AT,V,AP,RH
0,8.34,40.77,1010.84,90.01
1,23.64,58.49,1011.4,74.2
2,29.74,56.9,1007.15,41.91
3,19.07,49.69,1007.22,76.79
4,11.8,40.66,1017.13,97.2


In [18]:
y.head()

0    480.48
1    445.75
2    438.76
3    453.09
4    464.43
Name: PE, dtype: float64

# create pipeline

In [19]:
from sklearn.pipeline  import Pipeline
from sklearn.preprocessing  import  StandardScaler
from sklearn.impute import SimpleImputer

In [20]:
num_pipeline =   Pipeline(steps  =  [ ('imputer' , SimpleImputer()) ,  ('scaler' ,  StandardScaler())])

In [22]:
x_scaled  =  num_pipeline.fit_transform(x)

In [23]:
x_scaled  =  pd.DataFrame(x_scaled  , columns  =  x.columns)

In [24]:
x_scaled.head()

Unnamed: 0,AT,V,AP,RH
0,-1.520448,-1.066041,-0.403535,1.141599
1,0.534897,0.330813,-0.309262,0.059223
2,1.354348,0.205475,-1.024725,-2.1514
3,-0.07902,-0.362884,-1.012941,0.236538
4,-1.055645,-1.074713,0.655349,1.633837


# train  test  split

In [25]:
from  sklearn.model_selection  import  train_test_split

In [26]:
x_train  , x_test  , y_train  ,  y_test  =  train_test_split(x_scaled  , y  ,  random_state   = 23  ,  test_size  = 0.2)

In [27]:
print(x_train.shape ,  y_train.shape)
print(x_test.shape ,  y_test.shape)

(7621, 4) (7621,)
(1906, 4) (1906,)


# Model  fitting

In [31]:
from sklearn.tree  import  DecisionTreeRegressor

In [39]:
model  =  DecisionTreeRegressor(max_depth  =  4   ,  min_samples_split=15  , )

In [40]:
model.fit(x_train  , y_train)

0,1,2
,criterion,'squared_error'
,splitter,'best'
,max_depth,4
,min_samples_split,15
,min_samples_leaf,1
,min_weight_fraction_leaf,0.0
,max_features,
,random_state,
,max_leaf_nodes,
,min_impurity_decrease,0.0


In [41]:
model.score(x_train  ,  y_train )

0.9276707710472929

In [42]:
model.score(x_test  , y_test)

0.9193557987169496

# model  evaluation 

In [45]:
from  sklearn.metrics  import (mean_squared_error  ,  
root_mean_squared_error  ,
 r2_score ,
 mean_absolute_error)

In [46]:
def evaluate(model  ,  x  , y):
    pred  =  model.predict(x)
    MSE = mean_squared_error (y ,  pred)
    MAE  =   mean_absolute_error(y  , pred)
    RMSE =  root_mean_squared_error(y ,  pred)
    r2  =  r2_score(y , pred)

    print(f"MSE  -->  {MSE}" )
    print(f"RMSE  -->  {RMSE}" )
    print(f"MAE  -->  {MAE}" )
    print(f"r2  -->  {r2}" )




# training evaluation

In [47]:
evaluate(model  ,  x_train  ,  y_train)

MSE  -->  21.026289120369075
RMSE  -->  4.5854431760048096
MAE  -->  3.5600757424615113
r2  -->  0.9276707710472929


# testing evalaution

In [48]:
evaluate(model  , x_test  ,  y_test)

MSE  -->  23.28002777881791
RMSE  -->  4.824938111397691
MAE  -->  3.6819395910796184
r2  -->  0.9193557987169496


# hyper parameter  tunning
-  grid  search  cv

In [51]:
para =  {"criterion" : ['squared_error' , 'absolute_error' ],
        "max_depth"  :   range(2  , 5) ,  
        "min_samples_split":range(10  ,  20)}


In [53]:
from  sklearn.model_selection  import GridSearchCV
DT  = DecisionTreeRegressor()

In [54]:
gscv  = GridSearchCV(DT  ,  param_grid  =  para ,  cv  = 5  )

In [55]:
gscv.fit(x_train  ,  y_train)

0,1,2
,estimator,DecisionTreeRegressor()
,param_grid,"{'criterion': ['squared_error', 'absolute_error'], 'max_depth': range(2, 5), 'min_samples_split': range(10, 20)}"
,scoring,
,n_jobs,
,refit,True
,cv,5
,verbose,0
,pre_dispatch,'2*n_jobs'
,error_score,
,return_train_score,False

0,1,2
,criterion,'squared_error'
,splitter,'best'
,max_depth,4
,min_samples_split,10
,min_samples_leaf,1
,min_weight_fraction_leaf,0.0
,max_features,
,random_state,
,max_leaf_nodes,
,min_impurity_decrease,0.0


In [56]:
gscv.best_params_

{'criterion': 'squared_error', 'max_depth': 4, 'min_samples_split': 10}

In [57]:
gscv.score(x_train  ,  y_train)

0.9276707710472929

In [58]:
gscv.score(x_test  ,  y_test)

0.9193557987169496

In [59]:
evaluate(gscv ,  x_train  ,  y_train )

MSE  -->  21.026289120369075
RMSE  -->  4.5854431760048096
MAE  -->  3.560075742461509
r2  -->  0.9276707710472929


In [60]:
evaluate(gscv  ,  x_test  ,  y_test)

MSE  -->  23.280027778817917
RMSE  -->  4.824938111397691
MAE  -->  3.681939591079613
r2  -->  0.9193557987169496


# sample  file

In [61]:
sample =   pd.read_csv(r"C:\Users\admin\Downloads\test_PowerPlant.csv")

In [63]:
sample_x  =   num_pipeline.transform(sample)

In [64]:
sample_x  = pd.DataFrame(sample_x , columns  =  x.columns)

In [65]:
sample_x.head()

Unnamed: 0,AT,V,AP,RH
0,0.38041,0.356827,-0.590397,0.811615
1,1.238819,0.047028,-1.023041,-1.830315
2,1.105826,1.49118,-2.006171,0.858853
3,0.810286,0.628,-0.312629,0.051007
4,-1.007284,-1.062888,2.113209,-0.161908


In [66]:
sample_pred  =   gscv.predict(sample_x)

In [67]:
sample_pred

array([444.94753507, 439.85990676, 434.56208491, 444.94753507,
       469.59130112, 444.94753507, 450.22630769, 481.67360714,
       465.73017331, 476.7432128 , 457.72740541, 472.11744828,
       450.22630769, 439.85990676, 432.23518433, 432.23518433,
       450.22630769, 465.73017331, 444.60168831, 434.56208491,
       434.56208491, 469.59130112, 481.67360714, 434.56208491,
       462.66899061, 444.94753507, 439.85990676, 434.56208491,
       439.85990676, 444.94753507, 481.67360714, 465.73017331,
       434.56208491, 444.94753507, 434.56208491, 444.94753507,
       465.73017331, 469.59130112, 434.56208491, 476.7432128 ,
       450.22630769, 450.22630769, 481.67360714, 432.23518433,
       456.04356932, 444.94753507, 450.22630769, 450.22630769,
       456.04356932, 456.04356932, 476.7432128 , 450.22630769,
       469.59130112, 462.66899061, 465.73017331, 439.85990676,
       432.23518433, 450.22630769, 469.59130112, 476.7432128 ,
       438.81386293, 434.56208491, 457.72740541, 450.22

In [68]:
sample_pred  = pd.DataFrame(sample_pred)

In [69]:
sample_pred

Unnamed: 0,0
0,444.947535
1,439.859907
2,434.562085
3,444.947535
4,469.591301
...,...
95,434.562085
96,476.743213
97,485.798904
98,438.813863
