# Ensemble Techniques
Combining multiple models to improve accuracy
1. Bagging - Random Forest (parallel Trees are created)
2. Boosting - GradientBoost (sequentially corrective models are added)

In [10]:
from warnings import filterwarnings
filterwarnings("ignore")

### Step 1 : Read the Dataset

In [11]:
import pandas as pd
df = pd.read_csv("PowerPlant (1).csv")
df

Unnamed: 0,AT,V,AP,RH,PE
0,8.34,40.77,1010.84,90.01,480.48
1,23.64,58.49,1011.40,74.20,445.75
2,29.74,56.90,1007.15,41.91,438.76
3,19.07,49.69,1007.22,76.79,453.09
4,11.80,40.66,1017.13,97.20,464.43
...,...,...,...,...,...
9563,15.12,48.92,1011.80,72.93,462.59
9564,33.41,77.95,1010.30,59.72,432.90
9565,15.99,43.34,1014.20,78.66,465.96
9566,17.65,59.87,1018.58,94.65,450.93


### Step 2 : Basic data quality check

In [12]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 9568 entries, 0 to 9567
Data columns (total 5 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   AT      9568 non-null   float64
 1   V       9568 non-null   float64
 2   AP      9568 non-null   float64
 3   RH      9568 non-null   float64
 4   PE      9568 non-null   float64
dtypes: float64(5)
memory usage: 373.9 KB


In [13]:
df.isna().sum()

AT    0
V     0
AP    0
RH    0
PE    0
dtype: int64

In [14]:
df.duplicated().sum()

41

In [20]:
df = df.drop_duplicates(keep = "first",ignore_index=True)

In [21]:
df.duplicated().sum()

0

In [22]:
df

Unnamed: 0,AT,V,AP,RH,PE
0,8.34,40.77,1010.84,90.01,480.48
1,23.64,58.49,1011.40,74.20,445.75
2,29.74,56.90,1007.15,41.91,438.76
3,19.07,49.69,1007.22,76.79,453.09
4,11.80,40.66,1017.13,97.20,464.43
...,...,...,...,...,...
9522,15.12,48.92,1011.80,72.93,462.59
9523,33.41,77.95,1010.30,59.72,432.90
9524,15.99,43.34,1014.20,78.66,465.96
9525,17.65,59.87,1018.58,94.65,450.93


In [24]:
df.nunique().sort_values()

V      634
AP    2517
AT    2773
RH    4546
PE    4836
dtype: int64

### Step 3 : Seperating X and Y

In [25]:
x = df.drop(columns="PE")
y = df[["PE"]]

In [26]:
x.head()

Unnamed: 0,AT,V,AP,RH
0,8.34,40.77,1010.84,90.01
1,23.64,58.49,1011.4,74.2
2,29.74,56.9,1007.15,41.91
3,19.07,49.69,1007.22,76.79
4,11.8,40.66,1017.13,97.2


In [27]:
y.head()

Unnamed: 0,PE
0,480.48
1,445.75
2,438.76
3,453.09
4,464.43


### Step 4: Create Preprocessing pipeline for X

In [28]:
x.dtypes

AT    float64
V     float64
AP    float64
RH    float64
dtype: object

In [29]:
from sklearn.pipeline import make_pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler

In [36]:
num_pipe = make_pipeline(
    SimpleImputer(strategy="mean"),
    StandardScaler()
).set_output(transform="pandas")


In [37]:
xpre = num_pipe.fit_transform(x)
xpre

Unnamed: 0,AT,V,AP,RH
0,-1.520448,-1.066041,-0.403535,1.141599
1,0.534897,0.330813,-0.309262,0.059223
2,1.354348,0.205475,-1.024725,-2.151400
3,-0.079020,-0.362884,-1.012941,0.236538
4,-1.055645,-1.074713,0.655349,1.633837
...,...,...,...,...
9522,-0.609648,-0.423583,-0.241925,-0.027724
9523,1.847362,1.864831,-0.494441,-0.932100
9524,-0.492776,-0.863450,0.162101,0.364561
9525,-0.269777,0.439598,0.899448,1.459261


### Step 5 : Perform the train test split

In [38]:
from sklearn.model_selection import train_test_split
xtrain,xtest,ytrain,ytest = train_test_split(xpre,y,test_size=0.2,random_state=42)
xtrain.head()

Unnamed: 0,AT,V,AP,RH
5695,-0.152905,0.324507,0.089713,1.817999
7300,-1.497611,-1.153542,0.323712,0.90883
9066,-0.885038,-0.954892,0.29341,0.988245
6424,1.398679,1.223949,-0.47929,-1.082031
6773,-0.824586,-0.942279,0.796758,1.506499


In [39]:
xtrain.shape

(7621, 4)

In [40]:
xtest.shape

(1906, 4)

### Step 6 : Model building

Random forest

In [46]:
from sklearn.ensemble import RandomForestRegressor
rfr = RandomForestRegressor(
    
        n_estimators =1,
        min_samples_leaf=5,
        min_samples_split=5,
        max_depth=1,
        criterion= 'poisson'
    )


In [47]:
rfr.fit(xtrain,ytrain)

In [48]:
rfr.score(xtrain,ytrain)

0.717237905962081

In [49]:
rfr.score(xtest,ytest)

0.7255787425883435

### Tuning Random Forest