In [1]:
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split

from evidently import ColumnMapping
from evidently.test_suite import TestSuite
from evidently.test_preset import DataStabilityTestPreset

from evidently.report import Report
from evidently.metric_preset import DataDriftPreset, DataQualityPreset, ClassificationPreset
import warnings
warnings.filterwarnings('ignore')
print("Hello world")

  @numba.jit()
  @numba.jit()
  @numba.jit()


Hello world


  @numba.jit()


In [2]:
# Reference data (later split and used to fit the model) and the current data
# that is compared against it, loaded in that order.
ref_data, cur_data = (
    pd.read_csv(csv_path)
    for csv_path in (
        "../data/processed/preprocessed.csv",
        "../data/processed/test.csv",
    )
)

In [3]:
# Class balance of the reference target column.
ref_data.loc[:, 'Machine failure'].value_counts()

Machine failure
1    38617
0    19295
Name: count, dtype: int64

In [4]:
# For both datasets: keep "Machine failure" as the label, and remove it
# together with "type_of_failure" from the feature frame.
ref_y = ref_data["Machine failure"]
ref_X = ref_data.drop(columns=["Machine failure", "type_of_failure"])

cur_y = cur_data["Machine failure"]
cur_X = cur_data.drop(columns=["Machine failure", "type_of_failure"])

In [5]:
# Hold out 20% of the reference data; the fixed seed keeps the split stable
# between runs.
(ref_X_train, ref_X_test, ref_y_train, ref_y_test) = train_test_split(
    ref_X,
    ref_y,
    test_size=0.2,
    random_state=42,
)

In [6]:
# Inspect the held-out reference labels from the split above. At this point the
# Series still carries the shuffled row index of the original reference data.
ref_y_test

16971    1
53134    1
46497    0
9584     0
1566     0
        ..
32887    1
35872    1
4623     0
6231     0
40380    0
Name: Machine failure, Length: 11583, dtype: int64

In [7]:
# Inspect the current-data feature frame (cur_data without the
# "Machine failure" and "type_of_failure" columns).
cur_X

Unnamed: 0,Type,Rotational speed [rpm],Torque [Nm],Tool wear [min],Air temperature [c],Process temperature [c]
0,0.0,0.116454,0.717242,0.808000,0.416913,0.542584
1,0.0,0.165882,0.505503,0.832677,0.799828,0.600806
2,0.0,0.181121,0.491575,0.839321,0.207279,0.319268
3,2.0,0.175786,0.542582,0.241107,0.315217,0.382716
4,1.0,0.211621,0.581239,0.973157,0.708003,0.722145
...,...,...,...,...,...,...
11578,1.0,0.125980,0.815221,0.460557,0.357166,0.431582
11579,0.0,0.249834,0.397051,0.898098,0.724233,0.634204
11580,0.0,0.163534,0.526165,0.816208,0.657651,0.623478
11581,0.0,0.296275,0.310440,0.577075,0.206522,0.333333


In [8]:
# Class balance of the reference labels. ref_y is the same "Machine failure"
# column of ref_data that was counted earlier, so the numbers are identical.
ref_y.value_counts()

Machine failure
1    38617
0    19295
Name: count, dtype: int64

In [9]:
# Seed the forest (same seed as the train/test split) so that a
# "Restart & Run All" rebuilds the same model and the saved reports are
# reproducible.
rf = RandomForestClassifier(random_state=42)
# Fit on the DataFrame/Series themselves rather than on `.values`: every later
# predict / predict_proba call in this notebook passes DataFrames, and fitting
# on bare arrays drops the column names, so scikit-learn can no longer check
# that the prediction-time columns match the training columns.
rf.fit(ref_X_train, ref_y_train)



## Using hard class predictions (`predict`)

In [10]:
# Hard class predictions for the held-out reference rows and for the current
# data, each stored as a one-column frame named "Prediction" with a fresh
# 0..n-1 index.
ref_pred = pd.DataFrame({"Prediction": rf.predict(ref_X_test)})
cur_pred = pd.DataFrame({"Prediction": rf.predict(cur_X)})



In [11]:
# Put features, true label and hard prediction side by side for Evidently.
# The split pieces still carry the shuffled original index while ref_pred has
# a fresh 0..n-1 index, so they are re-indexed before the column-wise concat
# (otherwise rows would be aligned by mismatching labels).
# reset_index(drop=True) returns copies: ref_X_test / ref_y_test themselves are
# left untouched, which keeps this cell idempotent and avoids changing state
# that other cells read.
ref_merged = pd.concat(
    [
        ref_X_test.reset_index(drop=True),
        ref_y_test.reset_index(drop=True),
        ref_pred,
    ],
    axis=1,
)
ref_merged

Unnamed: 0,Type,Rotational speed [rpm],Torque [Nm],Tool wear [min],Air temperature [c],Process temperature [c],Machine failure,Prediction
0,1.0,0.122459,0.661519,0.653691,0.759223,0.558424,1,1
1,0.0,0.099819,0.564633,0.813977,0.236123,0.378512,1,1
2,2.0,0.186429,0.536209,0.026251,0.750368,0.707195,0,0
3,0.0,0.210128,0.515110,0.067194,0.434783,0.592593,0,0
4,1.0,0.136787,0.563187,0.584980,0.293478,0.296296,0,0
...,...,...,...,...,...,...,...,...
11578,0.0,0.036932,0.954168,0.022469,0.583528,0.695252,1,1
11579,0.0,0.088382,0.881834,0.572687,0.776007,0.755737,1,1
11580,0.0,0.207800,0.506868,0.707510,0.836957,0.666667,0,0
11581,1.0,0.190338,0.508242,0.584980,0.652174,0.654321,0,0


In [12]:
# Same layout for the current data: features, true label, hard prediction.
# The pieces are re-indexed as copies (no in-place mutation of cur_X / cur_y)
# so that all three line up on a 0..n-1 index for the column-wise concat.
cur_merged = pd.concat(
    [
        cur_X.reset_index(drop=True),
        cur_y.reset_index(drop=True),
        cur_pred,
    ],
    axis=1,
)
cur_merged

Unnamed: 0,Type,Rotational speed [rpm],Torque [Nm],Tool wear [min],Air temperature [c],Process temperature [c],Machine failure,Prediction
0,0.0,0.116454,0.717242,0.808000,0.416913,0.542584,1,1
1,0.0,0.165882,0.505503,0.832677,0.799828,0.600806,1,1
2,0.0,0.181121,0.491575,0.839321,0.207279,0.319268,1,1
3,2.0,0.175786,0.542582,0.241107,0.315217,0.382716,0,0
4,1.0,0.211621,0.581239,0.973157,0.708003,0.722145,1,1
...,...,...,...,...,...,...,...,...
11578,1.0,0.125980,0.815221,0.460557,0.357166,0.431582,1,1
11579,0.0,0.249834,0.397051,0.898098,0.724233,0.634204,1,1
11580,0.0,0.163534,0.526165,0.816208,0.657651,0.623478,1,1
11581,0.0,0.296275,0.310440,0.577075,0.206522,0.333333,0,0


In [13]:
# Column roles for the label-based report: ground truth lives in
# "Machine failure", the model's hard label in "Prediction".
# target_names supplies readable class names for the report
# (presumably in class order 0, 1 -- confirm against the Evidently docs).
cm = ColumnMapping(
    target="Machine failure",
    prediction="Prediction",
    target_names=["No Failure", "Machine Failure"],
)

In [14]:
# Label-based classification report: reference test split vs. current data,
# written to disk as a standalone HTML file.
classification_performance_report = Report(metrics=[ClassificationPreset()])
classification_performance_report.run(
    reference_data=ref_merged,
    current_data=cur_merged,
    column_mapping=cm,
)
classification_performance_report.save_html("../reports/classification_performance_report.html")

## Using class probabilities (`predict_proba`)

In [15]:
# Per-class probabilities for the reference test split and the current data.
# NOTE(review): the column names assume predict_proba returns class 0 first and
# class 1 second -- confirm against rf.classes_.
ref_probas = pd.DataFrame(
    rf.predict_proba(ref_X_test),
    columns=['No Failure', 'Machine Failure'],
)
cur_probas = pd.DataFrame(
    rf.predict_proba(cur_X),
    columns=['No Failure', 'Machine Failure'],
)

In [16]:
# Inspect the per-class probabilities predicted for the reference test split.
ref_probas

Unnamed: 0,No Failure,Machine Failure
0,0.00,1.00
1,0.06,0.94
2,1.00,0.00
3,1.00,0.00
4,0.98,0.02
...,...,...
11578,0.00,1.00
11579,0.01,0.99
11580,1.00,0.00
11581,1.00,0.00


In [17]:
# Count how often each distinct (No Failure, Machine Failure) probability pair
# occurs; this shows how concentrated the predictions are at the 0/1 extremes.
ref_probas.value_counts()

No Failure  Machine Failure
0.00        1.00               5790
1.00        0.00               3074
0.01        0.99                992
0.02        0.98                370
0.99        0.01                325
                               ... 
0.57        0.43                  1
0.83        0.17                  1
0.60        0.40                  1
0.69        0.31                  1
0.53        0.47                  1
Name: count, Length: 94, dtype: int64

In [18]:
def _labelled_with_probas(features, target, probas):
    """Return features + string label column + class-probability columns.

    Parameters
    ----------
    features : pd.DataFrame
        Feature frame; it is NOT modified (a re-indexed copy is used).
    target : pd.Series
        Numeric labels in the same row order as ``features``. 0 becomes
        'No Failure'; any other value becomes 'Machine Failure'.
    probas : pd.DataFrame
        Probability columns with a 0..n-1 index, in the same row order.

    Returns
    -------
    pd.DataFrame
        ``features`` columns, then 'Attrition', then the ``probas`` columns.
    """
    # Labels are matched to rows by position (plain array), so it does not
    # matter whether `target` still carries a shuffled index.
    label_text = np.where(target.to_numpy() == 0, 'No Failure', 'Machine Failure')
    # The label column keeps the name 'Attrition' because the column mapping
    # defined later in the notebook uses that name as its target.
    labelled = features.reset_index(drop=True).assign(Attrition=label_text)
    return pd.concat([labelled, probas], axis=1)


# Build the merged frames without writing the label column into ref_X_test /
# cur_X: those frames are the model inputs, and adding a string column to them
# in place would break any re-run of the predict / predict_proba cells.
ref_merged = _labelled_with_probas(ref_X_test, ref_y_test, ref_probas)

cur_merged = _labelled_with_probas(cur_X, cur_y, cur_probas)

In [19]:
# Check the merged reference frame: feature columns, the string label column
# 'Attrition', then the two class-probability columns.
ref_merged

Unnamed: 0,Type,Rotational speed [rpm],Torque [Nm],Tool wear [min],Air temperature [c],Process temperature [c],Attrition,No Failure,Machine Failure
0,1.0,0.122459,0.661519,0.653691,0.759223,0.558424,Machine Failure,0.00,1.00
1,0.0,0.099819,0.564633,0.813977,0.236123,0.378512,Machine Failure,0.06,0.94
2,2.0,0.186429,0.536209,0.026251,0.750368,0.707195,No Failure,1.00,0.00
3,0.0,0.210128,0.515110,0.067194,0.434783,0.592593,No Failure,1.00,0.00
4,1.0,0.136787,0.563187,0.584980,0.293478,0.296296,No Failure,0.98,0.02
...,...,...,...,...,...,...,...,...,...
11578,0.0,0.036932,0.954168,0.022469,0.583528,0.695252,Machine Failure,0.00,1.00
11579,0.0,0.088382,0.881834,0.572687,0.776007,0.755737,Machine Failure,0.01,0.99
11580,0.0,0.207800,0.506868,0.707510,0.836957,0.666667,No Failure,1.00,0.00
11581,1.0,0.190338,0.508242,0.584980,0.652174,0.654321,No Failure,1.00,0.00


In [20]:
# Column mapping for the probability-based report: the target is the string
# label column, and `prediction` lists the two probability columns, whose
# names are the same strings as the label values.
cm = ColumnMapping()
cm.target = 'Attrition'
cm.prediction = ['No Failure', 'Machine Failure']
# Treat a machine failure as the positive class. The label-based section of
# this notebook scores failure (label 1) as positive, so using 'No Failure'
# here would make precision/recall/ROC in the two reports describe opposite
# classes.
cm.pos_label = 'Machine Failure'

In [21]:
# Probability-based classification report: reference test split vs. current
# data, using the column mapping that points at the probability columns.
classification_performance_report = Report(metrics=[
    ClassificationPreset(),
])

classification_performance_report.run(
    reference_data=ref_merged,
    current_data=cur_merged,
    column_mapping=cm,
)
# Write to its own file: the label-based report earlier in this notebook is
# saved as classification_performance_report.html, and reusing that path would
# silently overwrite it.
classification_performance_report.save_html("../reports/classification_performance_report_probas.html")