# Ensemble Techniques
Combining multiple models to improve accuracy
1. Bagging - Random Forest (parallel Trees are created)
2. Boosting - GradientBoost (sequentially corrective models are added)

In [10]:
from warnings import filterwarnings
filterwarnings("ignore")

### Step 1 : Read the Dataset

In [11]:
import pandas as pd
df = pd.read_csv("PowerPlant (1).csv")
df

Unnamed: 0,AT,V,AP,RH,PE
0,8.34,40.77,1010.84,90.01,480.48
1,23.64,58.49,1011.40,74.20,445.75
2,29.74,56.90,1007.15,41.91,438.76
3,19.07,49.69,1007.22,76.79,453.09
4,11.80,40.66,1017.13,97.20,464.43
...,...,...,...,...,...
9563,15.12,48.92,1011.80,72.93,462.59
9564,33.41,77.95,1010.30,59.72,432.90
9565,15.99,43.34,1014.20,78.66,465.96
9566,17.65,59.87,1018.58,94.65,450.93


### Step 2 : Basic data quality check

In [12]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 9568 entries, 0 to 9567
Data columns (total 5 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   AT      9568 non-null   float64
 1   V       9568 non-null   float64
 2   AP      9568 non-null   float64
 3   RH      9568 non-null   float64
 4   PE      9568 non-null   float64
dtypes: float64(5)
memory usage: 373.9 KB


In [13]:
df.isna().sum()

AT    0
V     0
AP    0
RH    0
PE    0
dtype: int64

In [14]:
df.duplicated().sum()

41

In [20]:
df = df.drop_duplicates(keep = "first",ignore_index=True)

In [21]:
df.duplicated().sum()

0

In [22]:
df

Unnamed: 0,AT,V,AP,RH,PE
0,8.34,40.77,1010.84,90.01,480.48
1,23.64,58.49,1011.40,74.20,445.75
2,29.74,56.90,1007.15,41.91,438.76
3,19.07,49.69,1007.22,76.79,453.09
4,11.80,40.66,1017.13,97.20,464.43
...,...,...,...,...,...
9522,15.12,48.92,1011.80,72.93,462.59
9523,33.41,77.95,1010.30,59.72,432.90
9524,15.99,43.34,1014.20,78.66,465.96
9525,17.65,59.87,1018.58,94.65,450.93


In [24]:
df.nunique().sort_values()

V      634
AP    2517
AT    2773
RH    4546
PE    4836
dtype: int64

### Step 3 : Seperating X and Y

In [25]:
x = df.drop(columns="PE")
y = df[["PE"]]

In [26]:
x.head()

Unnamed: 0,AT,V,AP,RH
0,8.34,40.77,1010.84,90.01
1,23.64,58.49,1011.4,74.2
2,29.74,56.9,1007.15,41.91
3,19.07,49.69,1007.22,76.79
4,11.8,40.66,1017.13,97.2


In [27]:
y.head()

Unnamed: 0,PE
0,480.48
1,445.75
2,438.76
3,453.09
4,464.43


### Step 4: Create Preprocessing pipeline for X

In [28]:
x.dtypes

AT    float64
V     float64
AP    float64
RH    float64
dtype: object

In [29]:
from sklearn.pipeline import make_pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler

In [36]:
num_pipe = make_pipeline(
    SimpleImputer(strategy="mean"),
    StandardScaler()
).set_output(transform="pandas")


In [37]:
xpre = num_pipe.fit_transform(x)
xpre

Unnamed: 0,AT,V,AP,RH
0,-1.520448,-1.066041,-0.403535,1.141599
1,0.534897,0.330813,-0.309262,0.059223
2,1.354348,0.205475,-1.024725,-2.151400
3,-0.079020,-0.362884,-1.012941,0.236538
4,-1.055645,-1.074713,0.655349,1.633837
...,...,...,...,...
9522,-0.609648,-0.423583,-0.241925,-0.027724
9523,1.847362,1.864831,-0.494441,-0.932100
9524,-0.492776,-0.863450,0.162101,0.364561
9525,-0.269777,0.439598,0.899448,1.459261


### Step 5 : Perform the train test split

In [38]:
from sklearn.model_selection import train_test_split
xtrain,xtest,ytrain,ytest = train_test_split(xpre,y,test_size=0.2,random_state=42)
xtrain.head()

Unnamed: 0,AT,V,AP,RH
5695,-0.152905,0.324507,0.089713,1.817999
7300,-1.497611,-1.153542,0.323712,0.90883
9066,-0.885038,-0.954892,0.29341,0.988245
6424,1.398679,1.223949,-0.47929,-1.082031
6773,-0.824586,-0.942279,0.796758,1.506499


In [39]:
xtrain.shape

(7621, 4)

In [40]:
xtest.shape

(1906, 4)

### Step 6 : Model building

Random forest

In [46]:
from sklearn.ensemble import RandomForestRegressor
rfr = RandomForestRegressor(
    
        n_estimators =1,
        min_samples_leaf=5,
        min_samples_split=5,
        max_depth=1,
        criterion= 'poisson'
    )


In [47]:
rfr.fit(xtrain,ytrain)

In [48]:
rfr.score(xtrain,ytrain)

0.717237905962081

In [49]:
rfr.score(xtest,ytest)

0.7255787425883435

### Tuning Random Forest

In [69]:
param1 ={
    "max_depth" : [2,3,4,5],
    "min_samples_split" :[4,5,6],
    "criterion" : ["poisson",'squared_error']
}

### Randomized serch CV is faster then grid search cv

In [72]:
from sklearn.model_selection import RandomizedSearchCV
model1 = RandomForestRegressor()
rscv1 = RandomizedSearchCV(
    estimator= model1,
    param_distributions= param1,
    scoring= "r2",
    cv=5
   
)

In [73]:
rscv1.fit(xtrain,ytrain)

In [74]:
rscv1.best_score_

0.9375928391887687

In [75]:
rscv1.best_params_

{'min_samples_split': 5, 'max_depth': 5, 'criterion': 'squared_error'}

In [76]:
best_rf = rscv1.best_estimator_
best_rf

In [77]:
best_rf.score(xtrain,ytrain)

0.9414887696592059

In [78]:
best_rf.score(xtest,ytest)

0.9422594451315114

### Gradient Boosting

In [104]:
from sklearn.ensemble import GradientBoostingRegressor
gbr = GradientBoostingRegressor(
    n_estimators=1,
    learning_rate= 0.6,
    max_depth= 1,
    min_samples_leaf= 5,
    min_samples_split= 5
)
gbr.fit(xtrain,ytrain)

In [105]:
gbr.score(xtrain,ytrain)

0.6028340413015374

In [106]:
gbr.score(xtest,ytest)

0.5986075530579043

### Tuning Gradient Boost

In [107]:
param2 = {
    "learning_rate" : [0.05,0.1,0.2],
    "max_depth" : [2,3,4,5],
    "min_samples_split" : [ 5,6,7,8],
    "n_estimators" :[50,100,200]
}

In [108]:
model2 = GradientBoostingRegressor()
rscv2 = RandomizedSearchCV(
    estimator= model2,
    param_distributions= param2,
    scoring= "r2",
    cv = 5
)
rscv2.fit(xtrain,ytrain)

In [109]:
rscv2.best_score_

0.962630212627088

In [110]:
rscv2.best_params_

{'n_estimators': 200,
 'min_samples_split': 5,
 'max_depth': 5,
 'learning_rate': 0.2}

In [111]:
best_gb = rscv2.best_estimator_
best_gb

In [112]:
best_gb.score(xtrain,ytrain)

0.986422659607636

In [113]:
best_gb.score(xtest,ytest)

0.9695777147293496

### Step 7 - Detail Model Evaluation

Random Forest

In [115]:
from sklearn.metrics import mean_squared_error, mean_absolute_error, \
                            mean_absolute_percentage_error, r2_score

In [116]:
def evaluate_model(model, x, y):
    ypred = model.predict(x)
    mse = mean_squared_error(y, ypred)
    mae = mean_absolute_error(y, ypred)
    mape = mean_absolute_percentage_error(y, ypred)
    r2 = r2_score(y, ypred)
    print(f"MSE : {mse:.2f}")
    print(f"MAE : {mae:.2f}")
    print(f"MAPE : {mape:.4f}")
    print(f"R2 : {r2:.4f}")

In [117]:
evaluate_model(best_rf,xtrain,ytrain)

MSE : 16.82
MAE : 3.16
MAPE : 0.0070
R2 : 0.9415


In [119]:
evaluate_model(best_rf,xtest,ytest)

MSE : 17.43
MAE : 3.27
MAPE : 0.0072
R2 : 0.9423


In [118]:
evaluate_model(best_gb,xtrain,ytrain)

MSE : 3.90
MAE : 1.49
MAPE : 0.0033
R2 : 0.9864


In [120]:
evaluate_model(best_gb,xtest,ytest)

MSE : 9.18
MAE : 2.17
MAPE : 0.0048
R2 : 0.9696


In [122]:
xpred_test_rf = best_rf.predict(xtest)
xpred_test_rf[0:5]

array([433.433691  , 481.30924993, 479.25998578, 450.1858617 ,
       447.52512756])

In [123]:
ytest.head()

Unnamed: 0,PE
1087,429.38
6308,485.29
8021,480.4
9483,452.3
4459,446.47


In [124]:
xpred_test_gb  = best_gb.predict(xtest)
xpred_test_gb[0:5]

array([430.30384712, 483.48722215, 472.69835913, 450.85646154,
       449.09988542])

### both random forest and gradient boost are performing equally well

### here lets select gradient boost for out of sample prediction

### because gradient boost was slightly better in cv results

### Step 8 Out of sample prediction

In [125]:
xnew = pd.read_csv("test_PowerPlant (2).csv")
xnew.head()

Unnamed: 0,AT,V,AP,RH
0,22.49,58.82,1009.73,85.19
1,28.88,54.89,1007.16,46.6
2,27.89,73.21,1001.32,85.88
3,25.69,62.26,1011.38,74.08
4,12.16,40.81,1025.79,70.97


In [128]:
num_pipe

In [129]:
x_new_pre = num_pipe.transform(xnew)
x_new_pre

Unnamed: 0,AT,V,AP,RH
0,0.380410,0.356827,-0.590397,0.811615
1,1.238819,0.047028,-1.023041,-1.830315
2,1.105826,1.491180,-2.006171,0.858853
3,0.810286,0.628000,-0.312629,0.051007
4,-1.007284,-1.062888,2.113209,-0.161908
...,...,...,...,...
95,0.898948,1.197147,-0.805877,-0.196139
96,-1.058331,-1.129893,-0.314313,0.658261
97,-1.885843,-1.017956,1.291690,0.881445
98,0.712221,1.631496,-0.839546,0.491899


In [144]:
preds = best_gb.predict(x_new_pre)
preds[0:5]

array([444.55060276, 440.54570487, 432.02299286, 440.9730766 ,
       474.46205622])

### Save the results in new dataframe

In [159]:
results = pd.DataFrame(preds,columns=["PE_Pred"])
results.round(2)

Unnamed: 0,PE_Pred
0,444.55
1,440.55
2,432.02
3,440.97
4,474.46
...,...
95,433.48
96,473.98
97,491.02
98,439.83
