## Imports

In [1]:
# Importing relevant libraries
import pandas as pd
import numpy as np
from sklearn.metrics import confusion_matrix, accuracy_score, recall_score, precision_score, f1_score

In [2]:
# Importing the daytaset
df = pd.read_csv('Data_for_UCI_named.csv')

df.head()

Unnamed: 0,tau1,tau2,tau3,tau4,p1,p2,p3,p4,g1,g2,g3,g4,stab,stabf
0,2.95906,3.079885,8.381025,9.780754,3.763085,-0.782604,-1.257395,-1.723086,0.650456,0.859578,0.887445,0.958034,0.055347,unstable
1,9.304097,4.902524,3.047541,1.369357,5.067812,-1.940058,-1.872742,-1.255012,0.413441,0.862414,0.562139,0.78176,-0.005957,stable
2,8.971707,8.848428,3.046479,1.214518,3.405158,-1.207456,-1.27721,-0.920492,0.163041,0.766689,0.839444,0.109853,0.003471,unstable
3,0.716415,7.6696,4.486641,2.340563,3.963791,-1.027473,-1.938944,-0.997374,0.446209,0.976744,0.929381,0.362718,0.028871,unstable
4,3.134112,7.608772,4.943759,9.857573,3.525811,-1.125531,-1.845975,-0.554305,0.79711,0.45545,0.656947,0.820923,0.04986,unstable


In [3]:
print(df['stabf'].value_counts())

unstable    6380
stable      3620
Name: stabf, dtype: int64


## Splitting the Dataset

In [4]:
# Splitting the dataset into Independent and Dependent variables
x = df.drop(columns= ['stabf', 'stab'])
y = df['stabf']

In [5]:
x.head()

Unnamed: 0,tau1,tau2,tau3,tau4,p1,p2,p3,p4,g1,g2,g3,g4
0,2.95906,3.079885,8.381025,9.780754,3.763085,-0.782604,-1.257395,-1.723086,0.650456,0.859578,0.887445,0.958034
1,9.304097,4.902524,3.047541,1.369357,5.067812,-1.940058,-1.872742,-1.255012,0.413441,0.862414,0.562139,0.78176
2,8.971707,8.848428,3.046479,1.214518,3.405158,-1.207456,-1.27721,-0.920492,0.163041,0.766689,0.839444,0.109853
3,0.716415,7.6696,4.486641,2.340563,3.963791,-1.027473,-1.938944,-0.997374,0.446209,0.976744,0.929381,0.362718
4,3.134112,7.608772,4.943759,9.857573,3.525811,-1.125531,-1.845975,-0.554305,0.79711,0.45545,0.656947,0.820923


In [6]:
# Using Label Encoder to encode the categorical variables
from sklearn.preprocessing import LabelEncoder
encoder = LabelEncoder()
y = encoder.fit_transform(y)

y

array([1, 0, 1, ..., 0, 1, 1])

In [7]:
# splitting the data to test and train sets
from sklearn.model_selection import train_test_split
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size= 0.2, random_state= 1)

In [8]:
from sklearn.preprocessing import StandardScaler
sc_x = StandardScaler()
sc_y = StandardScaler()
sc_x.fit_transform(x_train)
sc_x.transform(x_test)
sc_y.fit_transform(y_train.reshape(len(y_train), 1))

array([[ 0.75570623],
       [ 0.75570623],
       [ 0.75570623],
       ...,
       [ 0.75570623],
       [ 0.75570623],
       [-1.32326552]])

## Building the model using RandomForest and ExtraTrees Classifiers

In [9]:
# Using Random Forest Classifier
from sklearn.ensemble import RandomForestClassifier
rfc = RandomForestClassifier(n_estimators = 50, random_state = 1)
rfc.fit(x_train, y_train)

RandomForestClassifier(n_estimators=50, random_state=1)

In [10]:
y_pred = rfc.predict(x_test)

# Confusion matrix
cm = confusion_matrix(y_test, y_pred)
print(cm)

# Accuracy
Accuracy = accuracy_score(y_test, y_pred)
print('Accuracy: {:.4f}%'.format(Accuracy*100))

# Precision
precision = precision_score(y_test, y_pred)
print( 'Precision: {:.2f}%' .format(precision* 100))

# Recall
recall = recall_score(y_test, y_pred)
print( 'Recall: {:.2f}%' .format(recall* 100))

# F1-Score
f1 = f1_score(y_test, y_pred)
print( 'F1: {:.2f}%' .format(f1* 100))

[[ 621   91]
 [  65 1223]]
Accuracy: 92.2000%
Precision: 93.07%
Recall: 94.95%
F1: 94.00%


In [18]:
# Using Extra Tress Classifier
from sklearn.ensemble import ExtraTreesClassifier
etc = ExtraTreesClassifier()
etc.fit(x_train, y_train)

ExtraTreesClassifier()

In [20]:
print(etc.feature_importances_)

[0.11630607 0.1174384  0.11388319 0.11415955 0.03977294 0.04118699
 0.0403828  0.03990442 0.09036913 0.0947575  0.09787551 0.09396349]


In [12]:
y_pred = etc.predict(x_test)

# Confusion matrix
cm = confusion_matrix(y_test, y_pred)
print(cm)

# Accuracy
Accuracy = accuracy_score(y_test, y_pred)
print('Accuracy: {:.2f}%'.format(Accuracy*100))

# Precision
precision = precision_score(y_test, y_pred)
print( 'Precision: {:.2f}%' .format(precision* 100))

# Recall
recall = recall_score(y_test, y_pred)
print( 'Recall: {:.2f}%' .format(recall* 100))

# F1-Score
f1 = f1_score(y_test, y_pred)
print( 'F1: {:.2f}%' .format(f1* 100))

[[ 602  110]
 [  25 1263]]
Accuracy: 93.25%
Precision: 91.99%
Recall: 98.06%
F1: 94.93%


In [13]:
# Using XGBoost classifier
from xgboost import XGBClassifier
xgb = XGBClassifier(random_state= 1)
xgb.fit(x_train, y_train)





XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
              colsample_bynode=1, colsample_bytree=1, gamma=0, gpu_id=-1,
              importance_type='gain', interaction_constraints='',
              learning_rate=0.300000012, max_delta_step=0, max_depth=6,
              min_child_weight=1, missing=nan, monotone_constraints='()',
              n_estimators=100, n_jobs=2, num_parallel_tree=1, random_state=1,
              reg_alpha=0, reg_lambda=1, scale_pos_weight=1, subsample=1,
              tree_method='exact', validate_parameters=1, verbosity=None)

In [14]:
y_pred = xgb.predict(x_test)

# Confusion matrix
cm = confusion_matrix(y_test, y_pred)
print(cm)

# Accuracy
Accuracy = accuracy_score(y_test, y_pred)
print('Accuracy: {:.4f}%'.format(Accuracy*100))

# Precision
precision = precision_score(y_test, y_pred)
print( 'Precision: {:.2f}%' .format(precision* 100))

# Recall
recall = recall_score(y_test, y_pred)
print( 'Recall: {:.2f}%' .format(recall* 100))

# F1-Score
f1 = f1_score(y_test, y_pred)
print( 'F1: {:.2f}%' .format(f1* 100))

[[ 648   64]
 [  45 1243]]
Accuracy: 94.5500%
Precision: 95.10%
Recall: 96.51%
F1: 95.80%


In [15]:
# Using Lightgbm Classifier
from lightgbm import LGBMClassifier
lgbc = LGBMClassifier(random_state= 1)
lgbc.fit(x_train, y_train)

LGBMClassifier(random_state=1)

In [16]:
y_pred = lgbc.predict(x_test)

# Confusion matrix
cm = confusion_matrix(y_test, y_pred)
print(cm)

# Accuracy
Accuracy = accuracy_score(y_test, y_pred)
print('Accuracy: {:.2f}%'.format(Accuracy*100))

# Precision
precision = precision_score(y_test, y_pred)
print( 'Precision: {:.2f}%' .format(precision* 100))

# Recall
recall = recall_score(y_test, y_pred)
print( 'Recall: {:.2f}%' .format(recall* 100))

# F1-Score
f1 = f1_score(y_test, y_pred)
print( 'F1: {:.2f}%' .format(f1* 100))

[[ 634   78]
 [  44 1244]]
Accuracy: 93.90%
Precision: 94.10%
Recall: 96.58%
F1: 95.33%


In [17]:
# Using Randomized Search Grid to to improve the ExtratreesClassifier model
n_estimators = [50, 100, 300, 500, 1000]
min_samples_split = [2, 3, 5, 7, 9]
min_samples_leaf = [1, 2, 4, 6, 8]
max_features = ['auto', 'sqrt', 'log2', None]

from sklearn.model_selection import RandomizedSearchCV
parameters = {'n_estimators': n_estimators, 
              'min_samples_split': min_samples_split,
              'min_samples_leaf': min_samples_leaf, 
              'max_features': max_features,
             }

rsc = RandomizedSearchCV(estimator= etc, param_distributions= parameters, n_iter=10, scoring = 'accuracy', n_jobs = -1, verbose = 1, random_state)
rsc.fit(x_train, y_train)
best_score = rsc.best_score_
best_parameters = rsc.best_params_
print("Best Score: {:.2f} %".format(best_score*100))
print("Best Parameters:", best_parameters)

Best Score: 92.08 %
Best Parameters: {'n_estimators': 1000, 'min_samples_split': 5, 'min_samples_leaf': 2, 'max_features': None}
