In [1]:
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report, f1_score, roc_auc_score, recall_score
from sklearn.model_selection import GridSearchCV, train_test_split
import pickle

In [2]:
# Loading the data
df = pd.read_csv('Group_9_data_cleaned.csv', index_col=0)
df.head(5)

Unnamed: 0,current_1,current_2,current_3,current_4,current_5,current_6,current_7,current_8,current_9,current_10,...,current_40,current_41,current_42,current_43,current_44,current_45,current_46,current_47,current_48,output
0,-3.0146e-07,8.2603e-06,-1.2e-05,-2e-06,-1.4386e-06,-2.1e-05,0.031718,0.03171,0.031721,-0.032963,...,-0.63308,2.9646,8.1198,-1.4961,-1.4961,-1.4961,-1.4996,-1.4996,-1.4996,1
1,2.9132e-06,-5.2477e-06,3e-06,-6e-06,2.7789e-06,-4e-06,0.030804,0.03081,0.030806,-0.03352,...,-0.59314,7.6252,6.169,-1.4967,-1.4967,-1.4967,-1.5005,-1.5005,-1.5005,1
2,-2.9517e-06,-3.184e-06,-1.6e-05,-1e-06,-1.5753e-06,1.7e-05,0.032877,0.03288,0.032896,-0.029834,...,-0.63252,2.7784,5.3017,-1.4983,-1.4983,-1.4982,-1.4985,-1.4985,-1.4985,1
3,-1.3226e-06,8.8201e-06,-1.6e-05,-5e-06,-7.2829e-07,4e-06,0.02941,0.029401,0.029417,-0.030156,...,-0.62289,6.5534,6.2606,-1.4963,-1.4963,-1.4963,-1.4975,-1.4975,-1.4976,1
4,-6.8366e-08,5.6663e-07,-2.6e-05,-6e-06,-7.9406e-07,1.3e-05,0.030119,0.030119,0.030145,-0.031393,...,-0.6301,4.5155,9.5231,-1.4958,-1.4958,-1.4958,-1.4959,-1.4959,-1.4959,1


In [3]:
df.columns

Index(['current_1', 'current_2', 'current_3', 'current_4', 'current_5',
       'current_6', 'current_7', 'current_8', 'current_9', 'current_10',
       'current_11', 'current_12', 'current_13', 'current_14', 'current_15',
       'current_16', 'current_17', 'current_18', 'current_19', 'current_20',
       'current_21', 'current_22', 'current_23', 'current_24', 'current_25',
       'current_26', 'current_27', 'current_28', 'current_29', 'current_30',
       'current_31', 'current_32', 'current_33', 'current_34', 'current_35',
       'current_36', 'current_37', 'current_38', 'current_39', 'current_40',
       'current_41', 'current_42', 'current_43', 'current_44', 'current_45',
       'current_46', 'current_47', 'current_48', 'output'],
      dtype='object')

In [4]:
X = df.drop(['output'],axis=1)
y = df['output']

In [5]:
# dividing the dataset into train,test and validation sets
X_train, X_rem, y_train, y_rem  = train_test_split(X, y, train_size = 0.5, random_state = 42)
X_valid, X_test, y_valid, y_test = train_test_split(X_rem, y_rem, test_size = 0.5, random_state = 42)

In [6]:
# The shape of training dataset
print(f" The input training  data shape is {X_train.shape}")
print(f" The  output training data shape is {y_train.shape}")

 The input training  data shape is (29254, 48)
 The  output training data shape is (29254,)


In [7]:
# The shape of validation dataset
print(f" The input training  data shape is {X_valid.shape}")
print(f" The  output training data shape is {y_valid.shape}")

 The input training  data shape is (14627, 48)
 The  output training data shape is (14627,)


In [8]:
# The shape of the test dataset 
print(f" The input training  data shape is {X_valid.shape}")
print(f" The  output training data shape is {y_valid.shape}")

 The input training  data shape is (14627, 48)
 The  output training data shape is (14627,)


In [9]:
# selecting a random model 
model_rf = RandomForestClassifier()
model_rf.fit(X_train, y_train)

In [10]:
# accuracy scores on the three datasets
print(f" Random model training score is {model_rf.score(X_train,y_train)}")
print(f" Random model validation data score is {model_rf.score(X_valid,y_valid)}")
print(f" Random model testing score is {model_rf.score(X_test,y_test)}")

 Random model training score is 1.0
 Random model validation data score is 0.9979489984275655
 Random model testing score is 0.9980858627290129


In [11]:
# classification report for validation dataset
y_pred_va = model_rf.predict(X_valid)
print(classification_report(y_pred_va,y_valid))

              precision    recall  f1-score   support

           1       1.00      1.00      1.00      1344
           2       1.00      1.00      1.00      1342
           3       1.00      0.99      1.00      1303
           4       1.00      1.00      1.00      1299
           5       0.99      1.00      0.99      1295
           6       1.00      1.00      1.00      1337
           7       1.00      1.00      1.00      1396
           8       1.00      1.00      1.00      1335
           9       0.99      1.00      1.00      1308
          10       1.00      1.00      1.00      1310
          11       1.00      1.00      1.00      1358

    accuracy                           1.00     14627
   macro avg       1.00      1.00      1.00     14627
weighted avg       1.00      1.00      1.00     14627



### Finding the best model using Hyperparameter validation

In [17]:
# Define a grid of hyperparameters to search over
param_grid = {
    'n_estimators': [50, 100],
    'max_depth': [5, 10],
    'min_samples_split': [2, 5],
    'min_samples_leaf': [1, 2],
}

In [18]:
# Create a Grid Search CV object and fit it to the Validation data
grid_search = GridSearchCV(model_rf, param_grid, cv=5)
grid_search.fit(X_valid, y_valid)

GridSearchCV(cv=5, estimator=RandomForestClassifier(),
             param_grid={'max_depth': [5, 10], 'min_samples_leaf': [1, 2],
                         'min_samples_split': [2, 5],
                         'n_estimators': [50, 100]})

In [19]:
grid_search.best_params_

{'max_depth': 10,
 'min_samples_leaf': 1,
 'min_samples_split': 5,
 'n_estimators': 100}

In [7]:
# best classification tree for the above data
best_rf = RandomForestClassifier(max_depth = 10, min_samples_leaf= 1, min_samples_split= 5, n_estimators= 100)
best_rf.fit(X_train,y_train)
filename = 'RF_class.sav'
pickle.dump(best_rf, open(filename, 'wb'))

### Accuracy of the models

In [21]:
# Best model training 
best_rf.fit(X_train,y_train)
# Accuracy scores of best tree on different datasets
y_train_pred = best_rf.predict(X_train)
print(f" The training data accuracy for the best model is {accuracy_score(y_train,y_train_pred)}")
y_valid_pred = best_rf.predict(X_valid) 
print(f" The validation data accuracy for the best model is {accuracy_score(y_valid,y_valid_pred)}")
y_test_pred = best_rf.predict(X_test)
print(f" The testing data accuracy for the best model is {accuracy_score(y_test,y_test_pred)}")      

 The training data accuracy for the best model is 0.9998632665618377
 The validation data accuracy for the best model is 0.9993163328091885
 The testing data accuracy for the best model is 0.9991112934098988


### Confusion matrices for Training data, Validation data and Testing data

In [22]:
### Training Data
print(confusion_matrix(y_train,y_train_pred))

[[2643    0    0    0    0    0    0    0    0    0    0]
 [   0 2623    0    0    0    0    0    0    0    0    0]
 [   0    0 2685    0    0    0    0    0    0    0    0]
 [   0    0    0 2675    4    0    0    0    0    0    0]
 [   0    0    0    0 2716    0    0    0    0    0    0]
 [   0    0    0    0    0 2626    0    0    0    0    0]
 [   0    0    0    0    0    0 2609    0    0    0    0]
 [   0    0    0    0    0    0    0 2665    0    0    0]
 [   0    0    0    0    0    0    0    0 2693    0    0]
 [   0    0    0    0    0    0    0    0    0 2663    0]
 [   0    0    0    0    0    0    0    0    0    0 2652]]


In [23]:
# confusion matrix for validation data
print(confusion_matrix(y_valid,y_valid_pred))

[[1345    0    0    0    0    0    0    0    0    0    0]
 [   0 1338    0    0    0    0    0    0    0    0    1]
 [   0    0 1293    0    0    0    0    0    0    0    0]
 [   0    0    0 1294    5    0    0    0    0    0    0]
 [   0    0    0    0 1305    0    0    0    0    0    0]
 [   0    0    0    0    0 1336    0    0    0    0    1]
 [   0    0    0    0    0    0 1396    0    0    0    0]
 [   0    0    0    0    0    0    0 1335    0    0    0]
 [   0    0    0    0    0    0    0    0 1312    2    0]
 [   0    0    0    0    0    0    0    0    1 1307    0]
 [   0    0    0    0    0    0    0    0    0    0 1356]]


In [24]:
# Confusion matrix for testing data
print(confusion_matrix(y_test,y_test_pred))

[[1331    0    0    0    0    0    0    0    0    0    0]
 [   1 1354    1    0    0    0    0    0    0    0    1]
 [   0    0 1341    0    0    0    0    0    0    0    0]
 [   0    0    0 1339    2    0    0    0    0    0    0]
 [   0    0    0    1 1297    0    0    0    0    0    0]
 [   0    0    0    0    0 1356    0    0    0    0    0]
 [   0    0    0    0    1    0 1313    0    0    0    0]
 [   0    1    0    0    0    0    0 1318    0    0    0]
 [   0    0    0    0    0    0    0    2 1309    1    0]
 [   0    0    0    0    0    0    0    0    2 1346    0]
 [   0    0    0    0    0    0    0    0    0    0 1311]]


### Classification Report for all the models

In [25]:
#Classification report for the training data
print(classification_report(y_train,y_train_pred))

              precision    recall  f1-score   support

           1       1.00      1.00      1.00      2643
           2       1.00      1.00      1.00      2623
           3       1.00      1.00      1.00      2685
           4       1.00      1.00      1.00      2679
           5       1.00      1.00      1.00      2716
           6       1.00      1.00      1.00      2626
           7       1.00      1.00      1.00      2609
           8       1.00      1.00      1.00      2665
           9       1.00      1.00      1.00      2693
          10       1.00      1.00      1.00      2663
          11       1.00      1.00      1.00      2652

    accuracy                           1.00     29254
   macro avg       1.00      1.00      1.00     29254
weighted avg       1.00      1.00      1.00     29254



In [26]:
# Classification report for the testing data
print(classification_report(y_test,y_test_pred))

              precision    recall  f1-score   support

           1       1.00      1.00      1.00      1331
           2       1.00      1.00      1.00      1357
           3       1.00      1.00      1.00      1341
           4       1.00      1.00      1.00      1341
           5       1.00      1.00      1.00      1298
           6       1.00      1.00      1.00      1356
           7       1.00      1.00      1.00      1314
           8       1.00      1.00      1.00      1319
           9       1.00      1.00      1.00      1312
          10       1.00      1.00      1.00      1348
          11       1.00      1.00      1.00      1311

    accuracy                           1.00     14628
   macro avg       1.00      1.00      1.00     14628
weighted avg       1.00      1.00      1.00     14628



In [None]:
https://drive.google.com/file/d/1KPfSSlcXfQyrPiFMQRSsTcFMYojc40Db/view?usp=sharing

In [27]:
# classification report for the validation data
print(classification_report(y_valid,y_valid_pred))

              precision    recall  f1-score   support

           1       1.00      1.00      1.00      1345
           2       1.00      1.00      1.00      1339
           3       1.00      1.00      1.00      1293
           4       1.00      1.00      1.00      1299
           5       1.00      1.00      1.00      1305
           6       1.00      1.00      1.00      1337
           7       1.00      1.00      1.00      1396
           8       1.00      1.00      1.00      1335
           9       1.00      1.00      1.00      1314
          10       1.00      1.00      1.00      1308
          11       1.00      1.00      1.00      1356

    accuracy                           1.00     14627
   macro avg       1.00      1.00      1.00     14627
weighted avg       1.00      1.00      1.00     14627



### ROC_AUC score of the models 

In [28]:
# roc auc score for training data
y_train_prob = best_rf.predict_proba(X_train)
print(f" The roc_auc score for the training data is {roc_auc_score(y_train,y_train_prob,average='weighted',multi_class='ovr')}")
# roc auc score for the validation data
y_valid_prob = best_rf.predict_proba(X_valid)
print(f" The roc_auc score for the validation data is {roc_auc_score(y_valid,y_valid_prob,average='weighted',multi_class='ovr')}")
# roc auc score for the testing data
y_test_prob = best_rf.predict_proba(X_test)
print(f" The roc_auc score for the testing data is {roc_auc_score(y_test,y_test_prob,average='weighted',multi_class='ovr')}")

 The roc_auc score for the training data is 1.0
 The roc_auc score for the validation data is 0.9999998046868175
 The roc_auc score for the testing data is 0.9999935118697014


### Recall score of the models

In [29]:
# recall score for training data
print(f" The recall score for the training data is {recall_score(y_train,y_train_pred,average= 'macro')}")
# recall score for validation data
print(f" The recall score for the validation data is {recall_score(y_valid,y_valid_pred,average= 'macro')}")
# recall score for test data
print(f" The recall score for the testing data is {recall_score(y_test,y_test_pred,average= 'macro')}")

 The recall score for the training data is 0.9998642641419796
 The recall score for the validation data is 0.9993063200128557
 The recall score for the testing data is 0.9991125409659506


### F1 score of the models

In [30]:
# F-1 score for training data
print(f" The f-1 score for the training data is {f1_score(y_train,y_train_pred,average= 'weighted')}")
# F-1 score for validation data
print(f" The f-1 score for the validation data is {f1_score(y_valid,y_valid_pred,average= 'weighted')}")
# F-1 score for test data
print(f" The f-1 score for the testing data is {f1_score(y_test,y_test_pred,average= 'weighted')}")

 The f-1 score for the training data is 0.9998632657913588
 The f-1 score for the validation data is 0.9993163550087673
 The f-1 score for the testing data is 0.999111220832485
