In [9]:
import pandas as pd
# Read only the first 10,000 rows of the merged dataset; this sample is used
# to decide which columns to keep and which dtypes they carry.
START_DATA = pd.read_csv('data/merged_dataset.csv', nrows=10000)

# Per-column boolean mask: True where the sample contains at least one NaN.
nan_cols = START_DATA.isna().any()
cols_with_nan = START_DATA.columns[nan_cols]

# Keep the columns that are NaN-free in the sample and are not object-typed.
df = (
    START_DATA
    .drop(columns=cols_with_nan)
    .select_dtypes(exclude=['object'])
)

# Names of the surviving columns, grouped by the dtype pandas inferred.
float64_cols = df.select_dtypes(include=['float64']).columns.tolist()
int64_cols = df.select_dtypes(include=['int64']).columns.tolist()

# All surviving column names.
cols_to_read = df.columns

  START_DATA = pd.read_csv('data/merged_dataset.csv', nrows = 10000)


In [10]:
# Keep only the column names that are shorter than 14 characters.
cols_to_read_final = [name for name in cols_to_read if len(name) < 14]

In [11]:
# 'alarm_info' is loaded on its own from data/y_test.csv, so it is excluded
# from the feature columns. Filtering instead of list.remove() keeps this cell
# idempotent: remove() raises ValueError when the cell is run a second time.
cols_to_read_final = [name for name in cols_to_read_final if name != 'alarm_info']
len(cols_to_read_final)

12406

In [7]:
# Build the column -> dtype mapping passed to pd.read_csv so the test CSVs are
# read with narrower dtypes than pandas' float64/int64 defaults.

# Every column inferred as float64 is read as float32.
float64_cols_dict = dict.fromkeys(float64_cols, 'float32')

# Every column inferred as int64 is read as uint8.
# NOTE(review): int64_cols was inferred from only the first 10,000 rows of
# merged_dataset.csv, and uint8 can only represent 0..255 — confirm that no
# integer column in the full files holds values outside that range.
int64_cols_dict = dict.fromkeys(int64_cols, 'uint8')

# Single mapping covering both groups of columns.
dtype_to_read = {**float64_cols_dict, **int64_cols_dict}

In [12]:
# Test features: only the selected columns, read with the compact dtypes.
X_test = pd.read_csv(
    'data/X_test.csv',
    usecols=cols_to_read_final,
    dtype=dtype_to_read,
)

# Test target: the single 'alarm_info' column, kept as a one-column DataFrame.
y_test = pd.read_csv(
    'data/y_test.csv',
    usecols=['alarm_info'],
    dtype=dtype_to_read,
)

In [13]:
# Summarise the loaded test features: index range, column count, dtype
# breakdown and memory usage.
X_test.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 37774 entries, 0 to 37773
Columns: 12406 entries, 01 to zyuganov
dtypes: float32(15), uint8(12391)
memory usage: 448.5 MB


In [14]:
import joblib
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix, classification_report

In [15]:
# Show the dimensions of the features and of the target, one per line.
print(X_test.shape, y_test.shape, sep='\n')

(37774, 12406)
(37774, 1)


**Logistic Regression**

In [16]:
# Load the persisted logistic-regression model and evaluate it on the test set.
# NOTE: joblib.load unpickles the file; only load model files you trust.
logreg = joblib.load('trained_models/LogisticRegression')
y_pred = logreg.predict(X_test)

print(confusion_matrix(y_test, y_pred))
print(classification_report(y_test, y_pred))
# Accuracy is computed from the predictions already in hand;
# logreg.score(X_test, y_test) would run predict() over X_test a second time
# to arrive at the same number.
print(accuracy_score(y_test, y_pred))

[[26807  1374]
 [ 7669  1924]]
              precision    recall  f1-score   support

           0       0.78      0.95      0.86     28181
           1       0.58      0.20      0.30      9593

    accuracy                           0.76     37774
   macro avg       0.68      0.58      0.58     37774
weighted avg       0.73      0.76      0.71     37774

0.7606025308413195


**Random Forest Classifier with max depth 5**

In [40]:
# Load the persisted random forest saved as 'RandomForestClassifier5' and
# evaluate it on the test set.
# NOTE: joblib.load unpickles the file; only load model files you trust.
RFC5 = joblib.load('trained_models/RandomForestClassifier5')
y_pred = RFC5.predict(X_test)

print(confusion_matrix(y_test, y_pred))
print(classification_report(y_test, y_pred))
# Accuracy is computed from the predictions already in hand;
# RFC5.score(X_test, y_test) would run predict() over X_test a second time
# to arrive at the same number.
print(accuracy_score(y_test, y_pred))

[[27641   540]
 [ 8536  1057]]
              precision    recall  f1-score   support

           0       0.76      0.98      0.86     28181
           1       0.66      0.11      0.19      9593

    accuracy                           0.76     37774
   macro avg       0.71      0.55      0.52     37774
weighted avg       0.74      0.76      0.69     37774

0.7597289140678773


**Random Forest Classifier with max depth 20**

In [41]:
# Load the persisted random forest saved as 'RandomForestClassifier20' and
# evaluate it on the test set.
# NOTE: joblib.load unpickles the file; only load model files you trust.
RFC20 = joblib.load('trained_models/RandomForestClassifier20')

y_pred = RFC20.predict(X_test)

print(confusion_matrix(y_test, y_pred))
print(classification_report(y_test, y_pred))
# Accuracy is computed from the predictions already in hand;
# RFC20.score(X_test, y_test) would run predict() over X_test a second time
# to arrive at the same number.
print(accuracy_score(y_test, y_pred))

[[27330   851]
 [ 5088  4505]]
              precision    recall  f1-score   support

           0       0.84      0.97      0.90     28181
           1       0.84      0.47      0.60      9593

    accuracy                           0.84     37774
   macro avg       0.84      0.72      0.75     37774
weighted avg       0.84      0.84      0.83     37774

0.8427754540159899


**Random Forest Classifier with max depth 50**

In [42]:
# Load the persisted random forest saved as 'RandomForestClassifier50' and
# evaluate it on the test set.
# NOTE: joblib.load unpickles the file; only load model files you trust.
RFC50 = joblib.load('trained_models/RandomForestClassifier50')

y_pred = RFC50.predict(X_test)

print(confusion_matrix(y_test, y_pred))
print(classification_report(y_test, y_pred))
# Accuracy is computed from the predictions already in hand;
# RFC50.score(X_test, y_test) would run predict() over X_test a second time
# to arrive at the same number.
print(accuracy_score(y_test, y_pred))

[[26430  1751]
 [ 2927  6666]]
              precision    recall  f1-score   support

           0       0.90      0.94      0.92     28181
           1       0.79      0.69      0.74      9593

    accuracy                           0.88     37774
   macro avg       0.85      0.82      0.83     37774
weighted avg       0.87      0.88      0.87     37774

0.8761582040556997


**Random Forest Classifier with max depth 100**

In [43]:
# Load the persisted random forest saved as 'RandomForestClassifier100' and
# evaluate it on the test set.
# NOTE: joblib.load unpickles the file; only load model files you trust.
RFC100 = joblib.load('trained_models/RandomForestClassifier100')

y_pred = RFC100.predict(X_test)

print(confusion_matrix(y_test, y_pred))
print(classification_report(y_test, y_pred))
# Accuracy is computed from the predictions already in hand;
# RFC100.score(X_test, y_test) would run predict() over X_test a second time
# to arrive at the same number.
print(accuracy_score(y_test, y_pred))

[[26445  1736]
 [ 2932  6661]]
              precision    recall  f1-score   support

           0       0.90      0.94      0.92     28181
           1       0.79      0.69      0.74      9593

    accuracy                           0.88     37774
   macro avg       0.85      0.82      0.83     37774
weighted avg       0.87      0.88      0.87     37774

0.8764229364112882


**k-NN with k = 5**

In [14]:
# Load the persisted k-NN model saved as '5NearestNeighbours' and evaluate it
# on the test set.
# NOTE: joblib.load unpickles the file; only load model files you trust.
knn = joblib.load('trained_models/5NearestNeighbours')

y_pred = knn.predict(X_test)

print(confusion_matrix(y_test, y_pred))
print(classification_report(y_test, y_pred))
# Accuracy is computed from the predictions already in hand;
# knn.score(X_test, y_test) would repeat the whole neighbour search over
# X_test to arrive at the same number.
print(accuracy_score(y_test, y_pred))

[[25249  2932]
 [ 3876  5717]]
              precision    recall  f1-score   support

           0       0.87      0.90      0.88     28181
           1       0.66      0.60      0.63      9593

    accuracy                           0.82     37774
   macro avg       0.76      0.75      0.75     37774
weighted avg       0.81      0.82      0.82     37774

0.8197702123153492


**k-NN with k = 20**

In [20]:
# Load the persisted k-NN model saved as '20NearestNeighbours' and evaluate it
# on the test set.
# NOTE: joblib.load unpickles the file; only load model files you trust.
knn = joblib.load('trained_models/20NearestNeighbours')

y_pred = knn.predict(X_test)

print(confusion_matrix(y_test, y_pred))
print(classification_report(y_test, y_pred))
# Accuracy is computed from the predictions already in hand;
# knn.score(X_test, y_test) would repeat the whole neighbour search over
# X_test to arrive at the same number.
print(accuracy_score(y_test, y_pred))

[[26125  2056]
 [ 5456  4137]]
              precision    recall  f1-score   support

           0       0.83      0.93      0.87     28181
           1       0.67      0.43      0.52      9593

    accuracy                           0.80     37774
   macro avg       0.75      0.68      0.70     37774
weighted avg       0.79      0.80      0.79     37774

0.8011330544819187


**k-NN with k = 50**

In [21]:
# Load the persisted k-NN model saved as '50NearestNeighbours' and evaluate it
# on the test set.
# NOTE: joblib.load unpickles the file; only load model files you trust.
knn = joblib.load('trained_models/50NearestNeighbours')

y_pred = knn.predict(X_test)

print(confusion_matrix(y_test, y_pred))
print(classification_report(y_test, y_pred))
# Accuracy is computed from the predictions already in hand;
# knn.score(X_test, y_test) would repeat the whole neighbour search over
# X_test to arrive at the same number.
print(accuracy_score(y_test, y_pred))

[[26296  1885]
 [ 6260  3333]]
              precision    recall  f1-score   support

           0       0.81      0.93      0.87     28181
           1       0.64      0.35      0.45      9593

    accuracy                           0.78     37774
   macro avg       0.72      0.64      0.66     37774
weighted avg       0.76      0.78      0.76     37774

0.7843754963731667


**Stochastic Gradient Descent**

In [19]:
# Load the persisted model saved as 'StochasticGradientDescent' and evaluate
# it on the test set.
# NOTE: joblib.load unpickles the file; only load model files you trust.
# NOTE(review): the saved output of this cell contains a link to
# scikit-learn's model-persistence limitations page, which looks like the
# warning emitted when a model was pickled under a different scikit-learn
# version — confirm the versions match before trusting these metrics.
sgd = joblib.load('trained_models/StochasticGradientDescent')

y_pred = sgd.predict(X_test)

print(confusion_matrix(y_test, y_pred))
print(classification_report(y_test, y_pred))
# Accuracy is computed from the predictions already in hand;
# sgd.score(X_test, y_test) would run predict() over X_test a second time
# to arrive at the same number.
print(accuracy_score(y_test, y_pred))

https://scikit-learn.org/stable/model_persistence.html#security-maintainability-limitations


[[11562 16619]
 [ 1330  8263]]
              precision    recall  f1-score   support

           0       0.90      0.41      0.56     28181
           1       0.33      0.86      0.48      9593

    accuracy                           0.52     37774
   macro avg       0.61      0.64      0.52     37774
weighted avg       0.75      0.52      0.54     37774

0.5248318949542014
