### Machine Learning Classification Problem-

_Due to the imbalance between electricity supply and demand, leading to instability, this work (Notebook) is based on building a binary classification model to predict if a grid is stable or unstable using the UCI Electrical Grid 
Stability SImulated dataset_

In [1]:
# Importing all necessary libraries

import pandas as pd
import sklearn.utils
import xgboost as xgb
import lightgbm as lgb
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.metrics import confusion_matrix
from xgboost import XGBClassifier
from sklearn.metrics import classification_report
from sklearn.model_selection import RandomizedSearchCV

In [2]:
# importing dataset and returning the first 3 rows.

data= pd.read_csv('Data_for_UCI_named.csv')
data.head(3)

Unnamed: 0,tau1,tau2,tau3,tau4,p1,p2,p3,p4,g1,g2,g3,g4,stab,stabf
0,2.95906,3.079885,8.381025,9.780754,3.763085,-0.782604,-1.257395,-1.723086,0.650456,0.859578,0.887445,0.958034,0.055347,unstable
1,9.304097,4.902524,3.047541,1.369357,5.067812,-1.940058,-1.872742,-1.255012,0.413441,0.862414,0.562139,0.78176,-0.005957,stable
2,8.971707,8.848428,3.046479,1.214518,3.405158,-1.207456,-1.27721,-0.920492,0.163041,0.766689,0.839444,0.109853,0.003471,unstable


In [3]:
# dropping stab column

data= data.drop(columns= ['stab'], axis= 1)
data.head(3)

Unnamed: 0,tau1,tau2,tau3,tau4,p1,p2,p3,p4,g1,g2,g3,g4,stabf
0,2.95906,3.079885,8.381025,9.780754,3.763085,-0.782604,-1.257395,-1.723086,0.650456,0.859578,0.887445,0.958034,unstable
1,9.304097,4.902524,3.047541,1.369357,5.067812,-1.940058,-1.872742,-1.255012,0.413441,0.862414,0.562139,0.78176,stable
2,8.971707,8.848428,3.046479,1.214518,3.405158,-1.207456,-1.27721,-0.920492,0.163041,0.766689,0.839444,0.109853,unstable


In [4]:
# checking the class distributions of the target variable 'staf' to check for imbalance classes

data['stabf'].value_counts()

unstable    6380
stable      3620
Name: stabf, dtype: int64

In [5]:
# separating out features and target variable

X= data.iloc[:, : -1]
y= data['stabf']

In [6]:
# splitting dataset into train and test sets

X_train, X_test, y_train, y_test= train_test_split(X, y, test_size= 0.2, random_state= 1)

In [7]:
# checking the class distribution of y (target variable)

y_train.value_counts()

unstable    5092
stable      2908
Name: stabf, dtype: int64

In [8]:
# instantiating the standard scalar function and transforming X_train

scaler= StandardScaler()
normalised_train_df= scaler.fit_transform(X_train)

# converting X_train to a dataframe
normalised_train_df= pd.DataFrame(normalised_train_df, columns= X_train.columns)

In [9]:
# checking for confirmation

normalised_train_df.head(3)

Unnamed: 0,tau1,tau2,tau3,tau4,p1,p2,p3,p4,g1,g2,g3,g4
0,0.367327,-0.986042,0.650447,1.547527,-0.29149,0.061535,1.293862,-0.845074,0.160918,0.339859,0.585568,0.492239
1,-0.064659,0.089437,1.035079,-1.641494,0.619865,-0.067235,-1.502925,0.486613,-0.293143,-1.558488,1.429649,-1.443521
2,-1.46785,1.298418,-0.502536,1.166046,-0.180521,0.490603,0.68256,-0.855302,1.39935,1.451534,-1.045743,0.492489


In [10]:
# check for shape

normalised_train_df.shape

(8000, 12)

In [11]:
# normalising x_test with standard scalar function and transforming X_test

X_test= X_test.reset_index(drop= True)
normalised_test_df= scaler.transform(X_test)

In [12]:
# converting X_test to a dataframe
normalised_test_df= pd.DataFrame(normalised_test_df, columns= X_test.columns)

In [13]:
# check for confirmation

normalised_test_df.head(3)

Unnamed: 0,tau1,tau2,tau3,tau4,p1,p2,p3,p4,g1,g2,g3,g4
0,0.593951,-0.412733,1.503924,1.116943,0.403423,-1.492971,-0.785033,1.566781,-0.901007,1.167203,-1.50733,1.084726
1,0.20219,0.374416,-0.1888,-0.522268,-0.225967,-1.058483,0.420047,1.028627,-1.625721,-0.39566,1.414651,1.226011
2,-1.079044,-0.313745,-0.884634,0.01708,-0.943122,0.112653,0.801335,0.733004,1.457108,-1.438495,0.651821,-1.682168


In [14]:
# check for shape

normalised_test_df.shape

(2000, 12)

_Fixing Classifiers and ensemble methods_

#### 1). Random forest Classifier

In [15]:
# instantiating Random Forest Classifer and fitting the model

rfc= RandomForestClassifier(random_state= 1)
rfc.fit(normalised_train_df, y_train)

RandomForestClassifier(random_state=1)

#### Confusion matrix on Random Forest Classifier

In [16]:
new_predrfc= rfc.predict(normalised_test_df)

cm_rfc= confusion_matrix(y_true= y_test, y_pred= new_predrfc, labels= ['stable', 'unstable'])

cm_rfc

array([[ 625,   87],
       [  55, 1233]], dtype=int64)

#### Classification report on Random Forest Classifier

In [17]:
print(classification_report(y_test, new_predrfc, digits= 4))

              precision    recall  f1-score   support

      stable     0.9191    0.8778    0.8980       712
    unstable     0.9341    0.9573    0.9456      1288

    accuracy                         0.9290      2000
   macro avg     0.9266    0.9176    0.9218      2000
weighted avg     0.9288    0.9290    0.9286      2000



#### 2). XGBoost

In [18]:
# setting parameters

param= {'max_depth': 3, 'eta': 0.3, 'objective': 'multi:softprob',
       'num_class': 2}

steps= 20

In [19]:
# instantiating XGBoost Classifier and fitting model

xg_cla= xgb.XGBClassifier(param= param, random_state= 1)
xg_cla.fit(normalised_train_df, y_train)

XGBClassifier(param={'eta': 0.3, 'max_depth': 3, 'num_class': 2,
                     'objective': 'multi:softprob'},
              random_state=1)

#### Confusion matrix on XGBoost

In [20]:
new_predxgb= xg_cla.predict(normalised_test_df)

cm_xgb= confusion_matrix(y_true= y_test, y_pred= new_predxgb, labels= ['stable', 'unstable'])
cm_xgb

array([[ 603,  109],
       [  52, 1236]], dtype=int64)

#### Classification report on XGBoost

In [21]:
print(classification_report(y_test, new_predxgb, digits= 4))

              precision    recall  f1-score   support

      stable     0.9206    0.8469    0.8822       712
    unstable     0.9190    0.9596    0.9389      1288

    accuracy                         0.9195      2000
   macro avg     0.9198    0.9033    0.9105      2000
weighted avg     0.9195    0.9195    0.9187      2000



#### 3). LightGBM

In [22]:
# instantiating lightGBM and fitting the model

lig_cla= lgb.LGBMClassifier(random_state= 1)
lig_cla.fit(normalised_train_df, y_train)

LGBMClassifier(random_state=1)

In [23]:
new_predlgb= lig_cla.predict(normalised_test_df)

cm_lig= confusion_matrix(y_true= y_test, y_pred= new_predlgb, labels= ['stable', 'unstable'])

cm_lig

array([[ 635,   77],
       [  48, 1240]], dtype=int64)

#### Classification report on LightGBM

In [24]:
print(classification_report(y_test, new_predlgb, digits=4))

              precision    recall  f1-score   support

      stable     0.9297    0.8919    0.9104       712
    unstable     0.9415    0.9627    0.9520      1288

    accuracy                         0.9375      2000
   macro avg     0.9356    0.9273    0.9312      2000
weighted avg     0.9373    0.9375    0.9372      2000



#### 4). EXtra Tree Classifier

In [25]:
# instantiating the Extra Trees Classifier () function and fitting the model

etc= ExtraTreesClassifier(random_state= 1)
etc.fit(normalised_train_df, y_train)

ExtraTreesClassifier(random_state=1)

In [26]:
new_predetc= etc.predict(normalised_test_df)

cm_etc= confusion_matrix(y_true= y_test, y_pred= new_predetc, labels= ['stable', 'unstable'])

cm_etc

array([[ 606,  106],
       [  38, 1250]], dtype=int64)

#### Classification report on Extra Tree Classifier

In [27]:
print(classification_report(y_test, new_predetc, digits=4))

              precision    recall  f1-score   support

      stable     0.9410    0.8511    0.8938       712
    unstable     0.9218    0.9705    0.9455      1288

    accuracy                         0.9280      2000
   macro avg     0.9314    0.9108    0.9197      2000
weighted avg     0.9287    0.9280    0.9271      2000



#### Randomized Search CV on Extra Trees Classifier (Tuning Extra Tree Classifier model)

In [28]:
# define parameters

etc_param= {'n_estimators': [50, 100, 300, 500, 1000],
           'min_samples_leaf':[1, 2, 4, 6, 8],
           'min_samples_split': [2, 3, 5, 7, 9],
           'max_features': ['auto', 'sqrt', 'log2', None]}

In [29]:
# Intitialising Random search

etc_rscv= RandomizedSearchCV(estimator= etc, param_distributions= etc_param,cv=5, n_iter=10, 
                             scoring = 'accuracy', n_jobs = -1, verbose = 1, random_state= 1)

In [30]:
# fitting the model

etc_rscv.fit(normalised_train_df, y_train)

Fitting 5 folds for each of 10 candidates, totalling 50 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  42 tasks      | elapsed:  2.2min
[Parallel(n_jobs=-1)]: Done  50 out of  50 | elapsed:  2.6min finished


RandomizedSearchCV(cv=5, estimator=ExtraTreesClassifier(random_state=1),
                   n_jobs=-1,
                   param_distributions={'max_features': ['auto', 'sqrt', 'log2',
                                                         None],
                                        'min_samples_leaf': [1, 2, 4, 6, 8],
                                        'min_samples_split': [2, 3, 5, 7, 9],
                                        'n_estimators': [50, 100, 300, 500,
                                                         1000]},
                   random_state=1, scoring='accuracy', verbose=1)

In [31]:
# Printing out the best parameter

etc_rscv.best_params_

{'n_estimators': 1000,
 'min_samples_split': 2,
 'min_samples_leaf': 8,
 'max_features': None}

In [32]:
new_pred_rscv= etc_rscv.predict(normalised_test_df)

cm_etc_rscv= confusion_matrix(y_true= y_test, y_pred= new_pred_rscv, labels= ['stable', 'unstable'])

cm_etc_rscv

array([[ 619,   93],
       [  53, 1235]], dtype=int64)

#### Classification report on Randomized search CV of Extra Trees Classifier

In [33]:
print(classification_report(y_test, new_pred_rscv, digits= 4))

              precision    recall  f1-score   support

      stable     0.9211    0.8694    0.8945       712
    unstable     0.9300    0.9589    0.9442      1288

    accuracy                         0.9270      2000
   macro avg     0.9256    0.9141    0.9193      2000
weighted avg     0.9268    0.9270    0.9265      2000



_Note the drop in accuracy of the Extra Tree Classifier model. This is based on the used parameters_

In [34]:
# determining the importance of the features

print(etc.feature_importances_)

[0.11739736 0.11844468 0.11316851 0.11546569 0.03950675 0.04037132
 0.04070628 0.04057864 0.08978291 0.09367636 0.09688268 0.09401882]


_The most importanat feature is tau2, while the least is p1_