###  power  plant dataset

#####  Read file / data ingestion

In [1]:
import pandas   as  pd
import numpy as  np
import  seaborn as sns
import matplotlib.pyplot as  plt

In [2]:
import warnings
warnings.filterwarnings('ignore')


In [3]:
path  =  r"D:\datasets\PowerPlant (1).csv"

In [4]:
df  =  pd.read_csv(path)

####  basic data quality  checks

In [5]:
df.head()

Unnamed: 0,AT,V,AP,RH,PE
0,8.34,40.77,1010.84,90.01,480.48
1,23.64,58.49,1011.4,74.2,445.75
2,29.74,56.9,1007.15,41.91,438.76
3,19.07,49.69,1007.22,76.79,453.09
4,11.8,40.66,1017.13,97.2,464.43


In [6]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 9568 entries, 0 to 9567
Data columns (total 5 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   AT      9568 non-null   float64
 1   V       9568 non-null   float64
 2   AP      9568 non-null   float64
 3   RH      9568 non-null   float64
 4   PE      9568 non-null   float64
dtypes: float64(5)
memory usage: 373.9 KB


In [7]:
df.isnull().sum()

AT    0
V     0
AP    0
RH    0
PE    0
dtype: int64

In [8]:
df.duplicated().sum()

np.int64(41)

In [9]:
df.drop_duplicates(keep  =  'first' ,  
ignore_index  =  True  ,  inplace  =  True)

In [10]:
df.shape

(9527, 5)

In [11]:
df.columns

Index(['AT', 'V', 'AP', 'RH', 'PE'], dtype='object')

####  seperate  x  and  y 

In [12]:
x  =  df.drop('PE' , axis =  1)
y =  df['PE']

##### create  pipeline

In [13]:
from  sklearn.pipeline  import Pipeline
from sklearn.preprocessing import StandardScaler
from  sklearn.impute import SimpleImputer

In [14]:
num_pipeline  =  Pipeline(steps  =  [('imputer' ,  SimpleImputer(strategy='mean' ) ) , 
                                      ('scaler' ,  StandardScaler())])


In [15]:
num_pipeline

0,1,2
,steps,"[('imputer', ...), ('scaler', ...)]"
,transform_input,
,memory,
,verbose,False

0,1,2
,missing_values,
,strategy,'mean'
,fill_value,
,copy,True
,add_indicator,False
,keep_empty_features,False

0,1,2
,copy,True
,with_mean,True
,with_std,True


In [16]:
##  apply  on  data
x_scaled =  num_pipeline.fit_transform(x)


In [17]:
x_scaled

array([[-1.52044806, -1.0660414 , -0.40353495,  1.1415991 ],
       [ 0.53489677,  0.33081328, -0.30926226,  0.05922255],
       [ 1.35434798,  0.20547474, -1.02472466, -2.15139976],
       ...,
       [-0.49277565, -0.86345017,  0.16210121,  0.36456091],
       [-0.26977745,  0.43959767,  0.89944835,  1.45926053],
       [ 0.54027022, -0.23596918, -0.23182397, -0.14342353]],
      shape=(9527, 4))

In [18]:
###  convert  in dataframe 
x_scaled =  pd.DataFrame(x_scaled , columns = x.columns)

In [19]:
x_scaled.head()

Unnamed: 0,AT,V,AP,RH
0,-1.520448,-1.066041,-0.403535,1.141599
1,0.534897,0.330813,-0.309262,0.059223
2,1.354348,0.205475,-1.024725,-2.1514
3,-0.07902,-0.362884,-1.012941,0.236538
4,-1.055645,-1.074713,0.655349,1.633837


In [20]:
###  train  test split 
from sklearn.model_selection import train_test_split
x_train  ,  x_test , y_train  ,  y_test =  train_test_split(x_scaled , y  , 
                                                             test_size  =  0.2  ,
                                                             random_state=3)

In [21]:
print(x_train.shape  ,  y_train.shape)
print(x_test.shape  , y_test.shape)

(7621, 4) (7621,)
(1906, 4) (1906,)


In [22]:
x_train.head()

Unnamed: 0,AT,V,AP,RH
2678,-0.25097,-0.740477,1.222669,-0.247485
58,-0.53039,-1.11728,-0.071897,1.651637
1951,-0.194549,-0.980906,0.241223,-1.1279
8950,-0.195893,-0.124032,2.126677,-1.3059
2781,0.802226,1.362688,-0.521376,0.974553


In [23]:
x_test.head()

Unnamed: 0,AT,V,AP,RH
7718,1.644514,1.507734,-0.406902,-0.873223
7006,-1.296107,-1.03451,0.933117,0.924576
2181,-0.82593,-0.87291,-0.607231,0.115361
4887,1.048061,0.596468,-0.142602,-0.496
8947,0.530867,-0.404664,-1.826043,-0.527493


In [24]:
y_train.head()

2678    457.72
58      462.60
1951    468.40
8950    463.47
2781    433.76
Name: PE, dtype: float64

In [25]:
y_test.head()

7718    432.27
7006    479.40
2181    465.89
4887    439.72
8947    443.50
Name: PE, dtype: float64

### Model  fitting

In [26]:
from sklearn.tree  import DecisionTreeRegressor
model  =  DecisionTreeRegressor(max_depth= 3 )


In [27]:
model.fit(x_train ,  y_train)

0,1,2
,criterion,'squared_error'
,splitter,'best'
,max_depth,3
,min_samples_split,2
,min_samples_leaf,1
,min_weight_fraction_leaf,0.0
,max_features,
,random_state,
,max_leaf_nodes,
,min_impurity_decrease,0.0


In [28]:
model.score(x_train,  y_train)

0.9122605209046077

In [29]:
model.score(x_test  , y_test)

0.9022861251047828

#### model  evaluation

In [31]:
from sklearn.metrics import (mean_absolute_error , 
                             mean_squared_error , 
                               mean_absolute_percentage_error , 
                    root_mean_squared_error,
                               r2_score)

In [34]:
###  create  fundtion  for  model evaluation
def  evaluate(model  , x ,  y):
    pred  =  model.predict(x)
    MSE  =  mean_squared_error(y  ,  pred)
    RMSE = root_mean_squared_error(y  ,  pred)
    MAE  =  mean_absolute_error(y  ,  pred)
    MAPE  =  mean_absolute_percentage_error(y  ,  pred)
    r2 =  r2_score(y  ,  pred)
    
    print(f'MSE  -- >  {MSE}')
    print(f'RMSE  -- >  {RMSE}')
    print(f'MAE  -- >  {MAE}')
    print(f'MAPE  -- >  {MAPE}')
    print(f'R2_SCORE -- > {r2}')


####  training Evaluation

In [35]:
evaluate(model  ,  x_train ,  y_train)

MSE  -- >  25.3678410815547
RMSE  -- >  5.036649787463359
MAE  -- >  3.9547671331415115
MAPE  -- >  0.008715455767665655
R2_SCORE -- > 0.9122605209046077


In [36]:
y_train.head()

2678    457.72
58      462.60
1951    468.40
8950    463.47
2781    433.76
Name: PE, dtype: float64

#### testing evaluation

In [37]:
evaluate(model  ,  x_test  ,  y_test)

MSE  -- >  28.823983558906868
RMSE  -- >  5.368797217152728
MAE  -- >  4.143691273219847
MAPE  -- >  0.009129051065845935
R2_SCORE -- > 0.9022861251047828


#### cross  validation score

In [38]:
from sklearn.model_selection import cross_val_score
DTR   =  DecisionTreeRegressor(max_depth  =  3)
cvs  =  cross_val_score(DTR  ,  x_train , y_train ,  cv  =  5  ,  n_jobs  =  -1  )

In [39]:
cvs

array([0.90457656, 0.91174207, 0.9044196 , 0.91347055, 0.90911642])

In [40]:
cvs.mean()

np.float64(0.90866503850818)

In [41]:
cvs.std()

np.float64(0.0036743471424623723)

####  grid search cv 

In [49]:
from sklearn.model_selection import GridSearchCV
param  =  {'max_depth' :  range(2 , 5) ,  
           "criterion" :  ['absolute_error' , 'squared_error'] , 
           'min_samples_split' :  range(2 , 10)}
DTR1  =  DecisionTreeRegressor( )
DTR_grid  =  GridSearchCV( DTR1  ,  param_grid  =  param   ,
                           cv =  5  ,  n_jobs= -1)


In [50]:
DTR_grid.fit(x_train  ,  y_train)

0,1,2
,estimator,DecisionTreeRegressor()
,param_grid,"{'criterion': ['absolute_error', 'squared_error'], 'max_depth': range(2, 5), 'min_samples_split': range(2, 10)}"
,scoring,
,n_jobs,-1
,refit,True
,cv,5
,verbose,0
,pre_dispatch,'2*n_jobs'
,error_score,
,return_train_score,False

0,1,2
,criterion,'squared_error'
,splitter,'best'
,max_depth,4
,min_samples_split,7
,min_samples_leaf,1
,min_weight_fraction_leaf,0.0
,max_features,
,random_state,
,max_leaf_nodes,
,min_impurity_decrease,0.0


In [51]:
DTR_grid.best_params_

{'criterion': 'squared_error', 'max_depth': 4, 'min_samples_split': 7}

### evaluate 

In [53]:
evaluate(DTR_grid  ,  x_train  ,  y_train)

MSE  -- >  20.39190932786583
RMSE  -- >  4.515740174973072
MAE  -- >  3.4895329007497793
MAPE  -- >  0.007690166501282645
R2_SCORE -- > 0.9294707225405809


In [54]:
evaluate(DTR_grid ,  x_test  ,  y_test)

MSE  -- >  24.844760514884236
RMSE  -- >  4.984451877075777
MAE  -- >  3.7683471479380386
MAPE  -- >  0.008298624098851157
R2_SCORE -- > 0.9157757700009909
