## Load Libraries

In [1]:
# import pandas
import pandas as pd
# import numpy
import numpy as np
# import seaborn
import seaborn as sns
# import matplotlib
from matplotlib import pyplot as plt
# import train_test_split to split the dataset into training and testing set
from sklearn.model_selection import train_test_split
# import StandardScaler
from sklearn.preprocessing import StandardScaler
# import accuracy_score, confusion_matrix, classification_report & f1_score to evaluate model predictions
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report, f1_score
# import cross_val_score, RandomizedSearchCV, KFold & StratifiedKFold for cross-validation and hyperparameter search
from sklearn.model_selection import cross_val_score, RandomizedSearchCV, KFold, StratifiedKFold
# import RandomForestClassifier, ExtraTreesClassifier to create and select  Classifier model
from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier
# import XGBClassifier for Gradient boost classifier
from xgboost import XGBClassifier
# import LightGBM Classifier
from lightgbm import LGBMClassifier

import warnings
warnings.filterwarnings('ignore')





from imblearn.over_sampling import SMOTE



  import pandas.util.testing as tm


### Load the Dataset


https://archive.ics.uci.edu/ml/datasets/Electrical+Grid+Stability+Simulated+Data+

In [3]:
# The CSV is read through a relative path, i.e. from the notebook's working directory.
dataset_path = "Data_for_UCI_named.csv"
eco_footprint = pd.read_csv(dataset_path)
# Preview the first rows as the cell output.
eco_footprint.head()

Unnamed: 0,tau1,tau2,tau3,tau4,p1,p2,p3,p4,g1,g2,g3,g4,stab,stabf
0,2.95906,3.079885,8.381025,9.780754,3.763085,-0.782604,-1.257395,-1.723086,0.650456,0.859578,0.887445,0.958034,0.055347,unstable
1,9.304097,4.902524,3.047541,1.369357,5.067812,-1.940058,-1.872742,-1.255012,0.413441,0.862414,0.562139,0.78176,-0.005957,stable
2,8.971707,8.848428,3.046479,1.214518,3.405158,-1.207456,-1.27721,-0.920492,0.163041,0.766689,0.839444,0.109853,0.003471,unstable
3,0.716415,7.6696,4.486641,2.340563,3.963791,-1.027473,-1.938944,-0.997374,0.446209,0.976744,0.929381,0.362718,0.028871,unstable
4,3.134112,7.608772,4.943759,9.857573,3.525811,-1.125531,-1.845975,-0.554305,0.79711,0.45545,0.656947,0.820923,0.04986,unstable


Get descriptive statistics of the dataset

In [5]:
# Summary statistics (count, mean, std, min, quartiles, max) of the numeric columns
eco_footprint.describe()

Unnamed: 0,tau1,tau2,tau3,tau4,p1,p2,p3,p4,g1,g2,g3,g4,stab
count,10000.0,10000.0,10000.0,10000.0,10000.0,10000.0,10000.0,10000.0,10000.0,10000.0,10000.0,10000.0,10000.0
mean,5.25,5.250001,5.250004,5.249997,3.75,-1.25,-1.25,-1.25,0.525,0.525,0.525,0.525,0.015731
std,2.742548,2.742549,2.742549,2.742556,0.75216,0.433035,0.433035,0.433035,0.274256,0.274255,0.274255,0.274255,0.036919
min,0.500793,0.500141,0.500788,0.500473,1.58259,-1.999891,-1.999945,-1.999926,0.050009,0.050053,0.050054,0.050028,-0.08076
25%,2.874892,2.87514,2.875522,2.87495,3.2183,-1.624901,-1.625025,-1.62496,0.287521,0.287552,0.287514,0.287494,-0.015557
50%,5.250004,5.249981,5.249979,5.249734,3.751025,-1.249966,-1.249974,-1.250007,0.525009,0.525003,0.525015,0.525002,0.017142
75%,7.62469,7.624893,7.624948,7.624838,4.28242,-0.874977,-0.875043,-0.875065,0.762435,0.76249,0.76244,0.762433,0.044878
max,9.999469,9.999837,9.99945,9.999443,5.864418,-0.500108,-0.500072,-0.500025,0.999937,0.999944,0.999982,0.99993,0.109403


In [6]:
# Column names, non-null counts, dtypes and memory usage of the frame
eco_footprint.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10000 entries, 0 to 9999
Data columns (total 14 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   tau1    10000 non-null  float64
 1   tau2    10000 non-null  float64
 2   tau3    10000 non-null  float64
 3   tau4    10000 non-null  float64
 4   p1      10000 non-null  float64
 5   p2      10000 non-null  float64
 6   p3      10000 non-null  float64
 7   p4      10000 non-null  float64
 8   g1      10000 non-null  float64
 9   g2      10000 non-null  float64
 10  g3      10000 non-null  float64
 11  g4      10000 non-null  float64
 12  stab    10000 non-null  float64
 13  stabf   10000 non-null  object 
dtypes: float64(13), object(1)
memory usage: 1.1+ MB


There are 13 numerical variables; the remaining column, stabf, is categorical.

In [7]:
# Count the missing values in each column
eco_footprint.isna().sum()

tau1     0
tau2     0
tau3     0
tau4     0
p1       0
p2       0
p3       0
p4       0
g1       0
g2       0
g3       0
g4       0
stab     0
stabf    0
dtype: int64

No missing data in the dataset

As instructed, the "stab" column is dropped. This also helps since we are creating a binary classifier.

In [None]:
# Drop the 'stab' column so that only the categorical 'stabf' label remains as target.
# Rebinding the name (instead of inplace=True) together with errors='ignore' keeps this
# cell idempotent: re-running it once 'stab' is already gone no longer raises KeyError.
eco_footprint = eco_footprint.drop(columns='stab', errors='ignore')

The distribution of the Labels in the Dataset

In [8]:
# Class balance: number of rows for each 'stabf' label
eco_footprint['stabf'].value_counts()



unstable    6380
stable      3620
Name: stabf, dtype: int64

Extracting Features and Labels

In [9]:
# Separate the label column from the predictors
target_column = 'stabf'
# target variable: the label column itself
y = eco_footprint[target_column]
# feature variables: every column except the label
X = eco_footprint.drop(columns = [target_column])

Split the dataset into an 80-20 train-test split using a random state of 1

In [10]:
# Hold out 20% of the rows as the test set; random_state pins the shuffle
split_parts = train_test_split(X, y, random_state = 1, test_size = 0.2)
X_train, X_test, y_train, y_test = split_parts

In [11]:
# Standardise the features. The scaler is fitted on the training split only, and that
# same fitted scaler is then applied to the test split. Fitting a second scaler on the
# test split would put the two splits on different scales and leak test-set statistics
# into preprocessing.
scaler = StandardScaler()
# normalise train set: learn mean/std here and apply them
normalized_X_train = pd.DataFrame(scaler.fit_transform(X_train), columns = X_train.columns)
# normalise X_test with the mean/std learned from the train set
normalized_X_test = pd.DataFrame(scaler.transform(X_test), columns = X_test.columns)

In [None]:
# function to train model
def train_model(model, X = None, y = None):
    """Fit ``model`` on the given data and return the result of ``model.fit``.

    Parameters
    ----------
    model : object exposing ``fit(X, y)``
    X : training features; when omitted, the notebook-level ``normalized_X_train`` is used
    y : training labels; when omitted, the notebook-level ``y_train`` is used

    Returns
    -------
    Whatever ``model.fit(X, y)`` returns.
    """
    # Resolve the defaults at call time rather than at definition time, so that
    # re-running the split/scaling cells is picked up without re-defining this function.
    if X is None:
        X = normalized_X_train
    if y is None:
        y = y_train
    return model.fit(X, y)

In [None]:
# function for classification report and accuracy
def model_accuracy(model, X_train = None, y_train = None, X_test = None, y_test = None):
    """Fit ``model``, predict on the test features and print a classification report and accuracy.

    Parameters
    ----------
    model : estimator exposing ``fit(X, y)`` and ``predict(X)``
    X_train, y_train : training features / labels; when omitted, the notebook-level
        ``normalized_X_train`` and ``y_train`` are used
    X_test, y_test : test features / labels; when omitted, the notebook-level
        ``normalized_X_test`` and ``y_test`` are used. ``y_test`` is a parameter so that
        a custom ``X_test`` can be scored against its own labels.

    Returns
    -------
    None. The report and the accuracy (rounded to 4 decimals) are printed.
    """
    # Omitted arguments are resolved at call time. The lookup goes through globals()
    # because the parameters y_train / y_test shadow the notebook variables of the same name.
    notebook_vars = globals()
    if X_train is None:
        X_train = notebook_vars['normalized_X_train']
    if y_train is None:
        y_train = notebook_vars['y_train']
    if X_test is None:
        X_test = notebook_vars['normalized_X_test']
    if y_test is None:
        y_test = notebook_vars['y_test']

    model.fit(X_train, y_train)                                 # train model
    y_pred = model.predict(X_test)                              # test model
    print(f'Classification report for {model} is:')
    print(classification_report(y_test, y_pred, digits = 5))    # print classification_report
    print('')
    accuracy = round(accuracy_score(y_test, y_pred), 4)         # accuracy of model
    print(f'Accuracy is {accuracy}')                            # prints model accuracy
    return

In [None]:
# random forest classifier
# `rfc` must be created before it is evaluated; random_state=1 mirrors the other models
# in this notebook. NOTE(review): default hyperparameters are assumed here — confirm.
rfc = RandomForestClassifier(random_state=1)
model_accuracy(rfc)

In [None]:
# LightGBM classifier
lgbmc = LGBMClassifier(random_state=1)

In [None]:
# fit the LightGBM model and print its classification report and accuracy
model_accuracy(lgbmc)

In [None]:
# XGBoost (gradient boosting) classifier
xgbc = XGBClassifier(max_depth=3, learning_rate=0.1, random_state=1)
# fit it and print the classification report and accuracy
model_accuracy(xgbc)

In [None]:
# extra trees classifier, seeded with random_state=1
extc = ExtraTreesClassifier(random_state=1)
# fit it and print the classification report and accuracy
model_accuracy(extc)

In [None]:
# Cells of the confusion matrix: true/false positives and false/true negatives
TP, FP, FN, TN = 255, 1380, 45, 20
# precision: share of predicted positives that are truly positive
precision = TP / (TP + FP)
# recall: share of actual positives that were found. TP / (TP + FN) is recall,
# not accuracy, so it is named accordingly (TN is not needed for F1).
recall = TP / (TP + FN)
# F1 is the harmonic mean of precision and recall
F1_score = round((2 * precision * recall) / (precision + recall), 4)
f'The F1 score of the classifier is {F1_score}'

In [None]:
# Search space for the hyperparameter search, keyed by estimator argument name.
# NOTE(review): 'auto' for max_features is only accepted by older scikit-learn
# releases — confirm against the installed version.
hyperparameter = dict(
    # number of trees in the forest / number of boosting rounds
    n_estimators = [50, 100, 300, 500, 1000],
    # minimum number of samples required at a leaf node
    min_samples_leaf = [1, 2, 4, 6, 8],
    # minimum number of samples required to split an internal node
    min_samples_split = [2, 3, 5, 7, 9],
    # number of features considered when looking for the best split
    max_features = ['auto', 'sqrt', 'log2', None],
)

# Expose each candidate list under its own name as well (same list objects as in the grid).
n_estimators = hyperparameter['n_estimators']
min_samples_split = hyperparameter['min_samples_split']
min_samples_leaf = hyperparameter['min_samples_leaf']
max_features = hyperparameter['max_features']

In [None]:
# Randomized search around the extra trees classifier: 10 sampled candidates,
# 5-fold cross-validation, scored on accuracy.
search_settings = dict(cv = 5, n_iter = 10, scoring = 'accuracy',
                       n_jobs = -1, verbose = 1, random_state = 1)
radomized_cv = RandomizedSearchCV(extc, hyperparameter, **search_settings)
# train_model fits on its default training data and returns the result of fit()
search_param = train_model(radomized_cv)
# best parameter combination found (cell output)
search_param.best_params_

In [None]:
# Build an extra trees classifier from the best parameters found by the search
best_params = search_param.best_params_
hypertuned_extc = ExtraTreesClassifier(random_state = 1, **best_params)
hypertuned_extc.fit(normalized_X_train, y_train)
# Score the tuned model on the scaled test split
hypertund_y_pred = hypertuned_extc.predict(normalized_X_test)
accuracy = round(accuracy_score(y_test, hypertund_y_pred), 4)
print(f'Accuracy is {accuracy}')