# Big G Express - Data Exploration

## Team: Elden Ring

<img src="https://eldenring.wiki.fextralife.com/file/Elden-Ring/mirel_pastor_of_vow.jpg" alt="PRAISE DOG" style="width:806px;height:600px;"/>

#### PRAISE THE DOG!

In [1]:
import pandas as pd
import numpy as np
from datetime import datetime, timedelta

In [2]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, accuracy_score, confusion_matrix

from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler

from sklearn.ensemble import GradientBoostingClassifier

from imblearn.over_sampling import SMOTE

from sklearn.impute import SimpleImputer

from sklearn.metrics import roc_auc_score

from joblib import dump, load

In [89]:
# Pickled inputs: the filtered fault events and the imputed diagnostic readings.
diagnostics_imputed = pd.read_pickle('../data/diagnostics_imputed.pkl')
faults = pd.read_pickle('../data/faults_filtered.pkl')

# Target labels. Only one target file is loaded at a time; swap the path to
# change the prediction horizon. Alternatives that have been used:
#   '../data/target_derate.pkl'               (the base model, 6 hr)
#   '../data/target_derate3h.pkl'
#   '../data/target_derate12h.pkl'
#   '../data/target_derate6h_noderaterow.pkl'
#   '../data/target_75derate.pkl'             (was loaded as y_75derate)
y_derate = pd.read_pickle('../data/target_derate24h.pkl')

In [90]:
# 'ServiceDistance' is almost entirely NaN (only about 250 values), so it goes.
diagnostics_imputed = diagnostics_imputed.drop(columns=['ServiceDistance'])

# Fault columns that are not useful for predictions.
faults = faults.drop(
    columns=[
        'ESS_Id',
        'active',
        'eventDescription',
        'ecuSoftwareVersion',
        'ecuSerialNumber',
        'ecuModel',
        'ecuMake',
        'ecuSource',
        'MCTNumber',
        'Latitude',
        'Longitude',
        'LocationTimeStamp',
    ]
)

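Remember that some columns still have gaps: where a particular truck never reported a given reading, those cells are still NaN. They are filled with the overall column mean below.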

In [91]:
# Simple fallback imputation: every remaining NaN in these columns is replaced
# by that column's overall mean.
mean_filled_cols = [
    'AcceleratorPedal',
    'CruiseControlSetSpeed',
    'EngineTimeLtd',
    'FuelLevel',
    'FuelTemperature',
    'SwitchedBatteryVoltage',
    'Throttle',
]

for col in mean_filled_cols:
    diagnostics_imputed[col] = diagnostics_imputed[col].fillna(value=diagnostics_imputed[col].mean())

In [None]:
# this took 30 min and didn't stop ...
# from sklearn.experimental import enable_iterative_imputer
# from sklearn.impute import IterativeImputer, KNNImputer
# scaler = StandardScaler().fit(diagnostics_imputed)

# knn_filled = scaler.inverse_transform(KNNImputer().fit_transform(scaler.transform(diagnostics_imputed)))

# diagnostics_imputed = IterativeImputer().fit_transform(diagnostics_imputed)

## Merging and prepping the data

In [None]:
faults_diagnostics = faults.merge(diagnostics_imputed, left_on='RecordID', right_on='FaultId', how='inner')

In [None]:
# One categorical code per fault row, written as "<spn>_<fmi>".
spn_text = faults_diagnostics['spn'].astype(str)
fmi_text = faults_diagnostics['fmi'].astype(str)
faults_diagnostics['spn_fmi'] = spn_text + '_' + fmi_text

# One-hot encode that code, then order the rows by truck and time so the
# per-truck rolling windows below see each truck's events chronologically.
faults_diagnostics = (
    pd.get_dummies(faults_diagnostics, columns=['spn_fmi'], prefix='spn_fmi')
    .sort_values(by=['EquipmentID', 'EventTimeStamp'])
)

In [None]:
# Diagnostic readings to aggregate; 'EventTimeStamp' is included because the
# rolling window is defined on it.
diagnostics_cols = [
    'EventTimeStamp',
    'activeTransitionCount',
    'AcceleratorPedal',
    'BarometricPressure',
    'CruiseControlSetSpeed',
    'DistanceLtd',
    'EngineCoolantTemperature',
    'EngineLoad',
    'EngineOilPressure',
    'EngineOilTemperature',
    'EngineRpm',
    'EngineTimeLtd',
    'FuelLevel',
    'FuelLtd',
    'FuelRate',
    'FuelTemperature',
    'IntakeManifoldTemperature',
    'LampStatus',
    'Speed',
    'SwitchedBatteryVoltage',
    'Throttle',
    'TurboBoostPressure',
]

# There are too many one-hot encoded columns to list by hand, so collect them
# by name.
faults_cols = ['EventTimeStamp', *[col for col in faults_diagnostics.columns if 'spn_fmi' in col]]

In [None]:
# Per truck: did each spn_fmi code appear at least once in the trailing day?
# reset_index() turns the (EquipmentID, original row index) index into columns.
faults_rolling = (
    faults_diagnostics.groupby('EquipmentID')[faults_cols]
    .rolling(window='1d', on='EventTimeStamp')
    .max()
    .reset_index()
)

In [None]:
# Per truck: trailing one-day mean of every diagnostic reading.
# reset_index() turns the (EquipmentID, original row index) index into columns.
diagnostics_rolling = (
    faults_diagnostics.groupby('EquipmentID')[diagnostics_cols]
    .rolling(window='1d', on='EventTimeStamp')
    .mean()
    .reset_index()
)

In [None]:
# The rolled frames carry the original row index in 'level_1'; joining on it
# brings the matching RecordID back, after which 'level_1' is no longer needed.
record_ids = faults_diagnostics['RecordID']

faults_rolling = pd.merge(
    record_ids,
    faults_rolling,
    left_index=True,
    right_on='level_1',
)
faults_rolling = faults_rolling.drop(columns='level_1')

diagnostics_rolling = pd.merge(
    record_ids,
    diagnostics_rolling,
    left_index=True,
    right_on='level_1',
)
diagnostics_rolling = diagnostics_rolling.drop(columns='level_1')

In [None]:
faults_diagnostics_rolling =  pd.merge(diagnostics_rolling.drop(columns=['EquipmentID', 'EventTimeStamp']),
            faults_rolling.drop(columns=['EquipmentID', 'EventTimeStamp']),
            on = 'RecordID')

In [None]:
faults_diagnostics_rolling = faults_diagnostics_rolling.sort_values('RecordID').drop(columns='RecordID')

In [None]:
#faults_diagnostics_rolling = faults_rolling.drop(columns=['RecordID', 'EquipmentID', 'EventTimeStamp_x', 'EventTimeStamp_y'])


## Training and test

In [None]:
# Row-level split, stratified on the target so that the rare derate rows are
# represented in the same proportion in train and test.
# NOTE: this split does NOT group by truck; events of one truck can land on
# both sides. The truck-based split further down addresses that.
# The predictors were sorted by RecordID before that column was dropped, so
# sorting the targets the same way lines the two up positionally (this assumes
# both cover exactly the same records).
y_all = y_derate.sort_values('RecordID')['target']

X_train, X_test, y_train, y_test = train_test_split(
    faults_diagnostics_rolling,
    y_all,
    train_size=0.8,
    test_size=0.2,
    stratify=y_all,
    random_state=42,
)

In [None]:
y_train.value_counts()

In [None]:
y_test.value_counts()

In [None]:
gbr = Pipeline(
    steps = [
        ('gb', GradientBoostingClassifier(verbose=True)) #n_estimators = 1000, learning_rate=0.01
    ]
)

gbr.fit(X_train, y_train)

In [None]:
confusion_matrix(y_train, gbr.predict(X_train))

In [None]:
print(classification_report(y_train, gbr.predict(X_train)))

In [None]:
importances = pd.DataFrame({
    'variable': gbr.feature_names_in_,
    'importance': gbr['gb'].feature_importances_
})

importances.sort_values('importance', ascending = False).head(20)

NOTE: during one of the trainings a particular spn-fmi code got to the top with an importance of 0.8! It was the 46262 code. After inspecting it: although it appears only once in the dataset, there are many events in a small timeframe around it, so the rolling window spreads it over all of those rows and it got picked up as important.

In [None]:
oversampler = SMOTE(k_neighbors=5, random_state=42)

In [None]:
X_smote, y_smote = oversampler.fit_resample(X_train, y_train)

In [None]:
y_smote.value_counts()

In [None]:
gbr_smoted = Pipeline(
    steps = [
        ('gb', GradientBoostingClassifier(verbose=True))#n_estimators = 1000, learning_rate=0.01
    ]
)

gbr_smoted.fit(X_smote, y_smote)

In [None]:
y_predict = gbr_smoted.predict(X_train)

In [None]:
confusion_matrix(y_train, y_predict)

In [None]:
confusion_matrix(y_test, gbr.predict(X_test))

In [None]:
confusion_matrix(y_test, gbr_smoted.predict(X_test))

In [None]:
print(classification_report(y_train, y_predict))

In [None]:
importances = pd.DataFrame({
    'variable': gbr_smoted.feature_names_in_,
    'importance': gbr_smoted['gb'].feature_importances_
})

importances.sort_values('importance', ascending = False).head(25)

In [None]:
gbr.predict_proba(X_test)[:,1]

In [None]:
roc_auc_score(y_true=y_test, y_score=gbr.predict_proba(X_test)[:,1])

In [None]:
roc_auc_score(y_true=y_test, y_score=gbr_smoted.predict_proba(X_test)[:,1])

## Better Train-Test split

The above is a good start. However, there are trucks whose events end up mixed between both train and test split. Instead, we want to make sure that each individual truck only appears in one.

In [92]:
print(faults['EquipmentID'].nunique())
print(faults.loc[faults['spn'] == 5246]['EquipmentID'].nunique())

1042
189


First off, get the two lists of trucks that had (or not) a full derate.

In [93]:
# Rows that are derate events (spn 5246).
is_derate_fault = faults['spn'] == 5246

all_trucks = faults['EquipmentID'].unique()
derate_trucks = faults.loc[is_derate_fault, 'EquipmentID'].unique()

# Everything in all_trucks that never shows up among the derate trucks.
no_derate_trucks = all_trucks[~np.isin(all_trucks, derate_trucks)]

Secondly, put those lists together, marking whether a derate occurred (1) or not (0).

In [94]:
# One row per truck with a 0/1 'derate' flag, used to stratify the truck split.
with_derate = pd.DataFrame({'EquipmentID': derate_trucks}).assign(derate=1)
without_derate = pd.DataFrame({'EquipmentID': no_derate_trucks}).assign(derate=0)

trucks_df = pd.concat([with_derate, without_derate])

Lastly, use the train_test_split, by accounting for the proportion of 'derates' in both (using stratify)

In [95]:
trucks_train, trucks_test = train_test_split(trucks_df, stratify=trucks_df['derate'], train_size = 0.8, test_size = 0.2, random_state = 42)

In [96]:
# this was just to verify that the proportions of trucks with and without derate in two samples are equal
# print(trucks_train['derate'].value_counts(normalize=True))
# print(trucks_test['derate'].value_counts(normalize=True))

# print(faults.loc[faults['EquipmentID'].isin(trucks_train['EquipmentID'])].shape[0])
# print(faults.loc[faults['EquipmentID'].isin(trucks_test['EquipmentID'])].shape[0])

Finally, use that information to split the diagnostics and targets.

In [97]:
# The predictor/target frames are keyed by RecordID only, so translate the
# truck split into the RecordIDs that belong to each side.
in_train_trucks = faults['EquipmentID'].isin(trucks_train['EquipmentID'])
in_test_trucks = faults['EquipmentID'].isin(trucks_test['EquipmentID'])

records_train = faults.loc[in_train_trucks, 'RecordID']
records_test = faults.loc[in_test_trucks, 'RecordID']

In [98]:
# Targets for each side, ordered by RecordID so they line up with predictors
# that are ordered the same way.
train_rows = y_derate['RecordID'].isin(records_train)
test_rows = y_derate['RecordID'].isin(records_test)

y_train = y_derate.loc[train_rows].sort_values('RecordID')['target']
y_test = y_derate.loc[test_rows].sort_values('RecordID')['target']

Now that the y_train and y_test are sorted, time to do the same for the X_train and X_test.

In [99]:
faults_diagnostics = faults.merge(diagnostics_imputed, left_on='RecordID', right_on='FaultId', how='inner').drop(columns='FaultId')

Next it depends on how these get prepared, so I'll build a function. It takes the merged faults + diagnostics dataframe, applies the rolling windows, and returns the train and test predictors.

In [100]:
def _rolling_by_truck(frame, columns, time_window, agg):
    """Aggregate `columns` over a trailing time window per truck and flatten the index."""
    return (
        frame
        .groupby('EquipmentID')[columns]
        .rolling(window=time_window, on='EventTimeStamp')
        .agg(agg)
        .reset_index()
    )


def _attach_record_ids(keys, rolled):
    """Join rolled rows back to their source rows via the original index kept in 'level_1'."""
    return pd.merge(keys, rolled, left_index=True, right_on='level_1').drop(columns='level_1')


def _select_records(predictors, record_ids):
    """Return the rows whose RecordID is in `record_ids`, ordered by RecordID, without that column."""
    return (
        predictors
        .loc[predictors['RecordID'].isin(record_ids)]
        .sort_values('RecordID')
        .drop(columns='RecordID')
    )


def windowize_predictors(fulldetail_faults, time_window='1d', faults_agg='max', windowize_diagnostics = True, diagnostics_agg='mean', train_records=None, test_records=None, drop_derate_rows=False):
    """Build rolling-window predictors and split them into train and test sets.

    The input frame is not modified; all work happens on a copy.

    Parameters
    ----------
    fulldetail_faults : DataFrame
        Merged faults + diagnostics table. Must contain 'RecordID', 'spn',
        'fmi', 'EquipmentID' and 'EventTimeStamp'; every other column is
        treated as a diagnostic reading.
    time_window : str
        Length of the trailing window (e.g. '1d', '7d'), applied per truck.
    faults_agg : str
        Aggregation applied to the one-hot encoded spn_fmi columns inside the
        window ('max' means "did this code appear in the window").
    windowize_diagnostics : bool
        If True the diagnostics are aggregated over the same window; if False
        the diagnostics of the current row are used as they are.
    diagnostics_agg : str
        Aggregation for the diagnostics window (only used when
        windowize_diagnostics is True).
    train_records, test_records : collection of RecordID, optional
        RecordIDs belonging to each split. When omitted, the notebook-level
        `records_train` / `records_test` are used.
    drop_derate_rows : bool
        If True, rows that are themselves derate events (spn 5246) are removed
        from the predictors. Use this only with targets in which the derate
        rows are not tagged.

    Returns
    -------
    (predictor_train, predictor_test) : tuple of DataFrame
        Both ordered by RecordID, with the RecordID column removed.
    """
    # fall back to the split defined at notebook level
    if train_records is None:
        train_records = records_train
    if test_records is None:
        test_records = records_test

    # pull out the diagnostics table columns for later ('spn_fmi' is excluded
    # so that a frame already carrying that helper column is handled correctly)
    diagnostics_cols = [
        col for col in fulldetail_faults.columns
        if col not in ('RecordID', 'spn', 'fmi', 'EquipmentID', 'spn_fmi')
    ]

    # combined "<spn>_<fmi>" code; assign() returns a new frame, so the
    # caller's dataframe is left untouched
    frame = fulldetail_faults.assign(
        spn_fmi=fulldetail_faults['spn'].astype(str) + '_' + fulldetail_faults['fmi'].astype(str)
    )

    # make dummies (one hot encode)
    frame = pd.get_dummies(frame, columns=['spn_fmi'], prefix='spn_fmi')

    # the rolling window needs each truck's rows in chronological order
    frame = frame.sort_values(by=['EquipmentID', 'EventTimeStamp'])

    # all the Faults table columns (now one hot encoded)
    faults_cols = ['EventTimeStamp'] + [col for col in frame.columns if 'spn_fmi' in col]

    # rolling window over faults - by default: does a code appear in the past window
    faults_rolling = _rolling_by_truck(frame, faults_cols, time_window, faults_agg)

    if windowize_diagnostics:

        # rolling window over diagnostics, by default using mean
        diagnostics_rolling = _rolling_by_truck(frame, diagnostics_cols, time_window, diagnostics_agg)

        # join both rolled frames back to the source rows to recover RecordID
        # ('spn' comes along only so the derate rows can be filtered out)
        keys = frame[['RecordID', 'spn']]
        faults_rolling = _attach_record_ids(keys, faults_rolling)
        diagnostics_rolling = _attach_record_ids(keys, diagnostics_rolling)

        if drop_derate_rows:
            faults_rolling = faults_rolling.loc[faults_rolling['spn'] != 5246]
            diagnostics_rolling = diagnostics_rolling.loc[diagnostics_rolling['spn'] != 5246]

        # joining the two rolling windows: diagnostics first, then fault codes
        faults_diagnostics_rolling = pd.merge(
            diagnostics_rolling.drop(columns=['EquipmentID', 'EventTimeStamp', 'spn']),
            faults_rolling.drop(columns=['EquipmentID', 'EventTimeStamp', 'spn']),
            on='RecordID'
        )

    else:

        # NO rolling window for diagnostics: use the current row's readings
        # next to the rolled fault codes
        faults_diagnostics_rolling = _attach_record_ids(
            frame[['RecordID', 'spn'] + diagnostics_cols].drop(columns=['EventTimeStamp']),
            faults_rolling.drop(columns=['EquipmentID', 'EventTimeStamp']),
        )

        if drop_derate_rows:
            faults_diagnostics_rolling = faults_diagnostics_rolling.loc[faults_diagnostics_rolling['spn'] != 5246]

        faults_diagnostics_rolling = faults_diagnostics_rolling.drop(columns='spn')

    predictor_train = _select_records(faults_diagnostics_rolling, train_records)
    predictor_test = _select_records(faults_diagnostics_rolling, test_records)

    return predictor_train, predictor_test

In [101]:
X_train, X_test = windowize_predictors(faults_diagnostics, time_window='7d', faults_agg='max', windowize_diagnostics=False, diagnostics_agg='mean')

In [102]:
gbr = Pipeline(
    steps = [
        ('gb', GradientBoostingClassifier(verbose=True)) #n_estimators = 1000, learning_rate=0.01
    ]
)

In [103]:
gbr.fit(X_train, y_train)

      Iter       Train Loss   Remaining Time 
         1           0.0523            7.19m
         2           0.0484            7.07m
         3           0.0507            6.94m
         4        2410.8273            6.79m
         5        2410.8265            6.74m
         6        2410.8259            6.67m
         7        2410.8254            6.60m
         8        2410.8251            6.56m
         9        2410.8248            6.49m
        10        2410.8245            6.42m
        20        2410.8182            5.71m
        30        2410.8150            5.00m
        40        2410.8130            4.29m
        50        2410.8120            3.57m
        60        2410.8112            2.86m
        70        2410.8106            2.14m
        80        2410.8101            1.43m
        90        2410.8113           42.90s
       100        2410.8107            0.00s


In [104]:
# Score the training rows once and reuse the predictions for both reports.
train_predictions = gbr.predict(X_train)

print('confusion matrix')
print(confusion_matrix(y_train, train_predictions))
print('\n')
print('classification report')
print(classification_report(y_train, train_predictions))
print('\n')

# Pair every input feature name with the importance from the fitted 'gb' step.
importances = pd.DataFrame({
    'variable': gbr.feature_names_in_,
    'importance': gbr['gb'].feature_importances_,
})
top_importances = importances.sort_values('importance', ascending=False).head(20)

print('Variable Importances:')
display(top_importances)

# Held-out trucks: confusion matrix on hard predictions, ROC AUC on the
# predicted probability of the positive class.
print('------ TEST')
print(confusion_matrix(y_test, gbr.predict(X_test)))
print('ROC AUC Score')
test_scores = gbr.predict_proba(X_test)[:, 1]
print(roc_auc_score(y_true=y_test, y_score=test_scores))

confusion matrix
[[440508    307]
 [  1118    868]]


classification report
              precision    recall  f1-score   support

           0       1.00      1.00      1.00    440815
           1       0.74      0.44      0.55      1986

    accuracy                           1.00    442801
   macro avg       0.87      0.72      0.77    442801
weighted avg       1.00      1.00      1.00    442801



Variable Importances:


Unnamed: 0,variable,importance
840,spn_fmi_74_14,0.381216
16,LampStatus,0.097102
10,EngineTimeLtd,0.090937
624,spn_fmi_5246_0,0.076297
717,spn_fmi_5848_9,0.074982
622,spn_fmi_524287_31,0.074748
4,DistanceLtd,0.036382
628,spn_fmi_5246_19,0.035623
343,spn_fmi_3226_9,0.031183
638,spn_fmi_5394_17,0.015174


------ TEST
[[103334     91]
 [   332    116]]
ROC AUC Score
0.9493531264028453


In [105]:
oversampler = SMOTE(k_neighbors=5, random_state=42)

X_smote, y_smote = oversampler.fit_resample(X_train, y_train)

In [106]:
# this is going to re-fit from scratch, unless we set warm_start=True
# also, simply add this line to all X_ variables if you want to exclude 5246 influencing the model:
# .drop(columns=[col for col in X_smote.columns if 'spn_fmi_5246' in col])
gbr.fit(X_smote, y_smote)

      Iter       Train Loss   Remaining Time 
         1           1.2467           20.95m
         2           1.1325           20.47m
         3           1.0370           20.58m
         4           0.9576           20.58m
         5           0.8902           21.29m
         6           0.8325           21.14m
         7           0.7827           21.09m
         8           0.7391           20.91m
         9           0.7021           20.60m
        10           0.6696           20.18m
        20           0.4625           18.56m
        30           0.3744           16.10m
        40           0.3264           13.84m
        50           0.2955           11.00m
        60           0.2721            8.39m
        70           0.2553            6.06m
        80           0.2406            3.93m
        90           0.2273            1.92m
       100           0.2185            0.00s


In [108]:
# The model was fitted on the SMOTE-resampled data, but it is evaluated on the
# original (non-resampled) training rows and on the held-out test rows.
train_predictions = gbr.predict(X_train)

print('confusion matrix')
print(confusion_matrix(y_train, train_predictions))
print('\n')
print('classification report')
print(classification_report(y_train, train_predictions))
print('\n')

# Pair every input feature name with the importance from the fitted 'gb' step.
importances = pd.DataFrame({
    'variable': gbr.feature_names_in_,
    'importance': gbr['gb'].feature_importances_,
})
top_importances = importances.sort_values('importance', ascending=False).head(20)

print('Variable Importances:')
display(top_importances)

test_predictions = gbr.predict(X_test)
test_scores = gbr.predict_proba(X_test)[:, 1]

print('------ TEST')
print('confusion matrix')
print(confusion_matrix(y_test, test_predictions))
print('classification report')
print(classification_report(y_test, test_predictions))
print('ROC AUC Score')
print(roc_auc_score(y_true=y_test, y_score=test_scores))

confusion matrix
[[426454  14361]
 [   186   1800]]


classification report
              precision    recall  f1-score   support

           0       1.00      0.97      0.98    440815
           1       0.11      0.91      0.20      1986

    accuracy                           0.97    442801
   macro avg       0.56      0.94      0.59    442801
weighted avg       1.00      0.97      0.98    442801



Variable Importances:


Unnamed: 0,variable,importance
163,spn_fmi_1569_31,0.438912
14,FuelTemperature,0.234057
16,LampStatus,0.074303
0,activeTransitionCount,0.038777
300,spn_fmi_3031_9,0.035722
717,spn_fmi_5848_9,0.017069
343,spn_fmi_3226_9,0.011243
627,spn_fmi_5246_16,0.009862
389,spn_fmi_3362_31,0.009312
3,CruiseControlSetSpeed,0.00902


------ TEST
confusion matrix
[[100254   3171]
 [    69    379]]
classification report
              precision    recall  f1-score   support

           0       1.00      0.97      0.98    103425
           1       0.11      0.85      0.19       448

    accuracy                           0.97    103873
   macro avg       0.55      0.91      0.59    103873
weighted avg       1.00      0.97      0.98    103873

ROC AUC Score
0.960453755740875


In [109]:
# to load model
# gbr = load('../models/gbr_model_1.joblib') 

# to save model
#dump(gbr, '../models/gbr_model_17.joblib') 

['../models/gbr_model_17.joblib']

Besides saving the models, I will construct a json file that describes how they were obtained.

In [29]:
import json

In [110]:
def _confusion_counts(y_true, y_hat):
    """Return the binary confusion matrix as a dict of plain ints (JSON friendly)."""
    matrix = confusion_matrix(y_true, y_hat)
    cells = {'TN': (0, 0), 'FP': (0, 1), 'FN': (1, 0), 'TP': (1, 1)}
    return {label: int(matrix[r][c]) for label, (r, c) in cells.items()}


# Hand-written description of how this model was obtained.
# NOTE: the key spellings below are part of the emitted JSON and are kept as is.
to_dump = {
    'file_path' : '../models/gbr_model_17.joblib',
    'targets' : 'any row where a derate (5246) happens in the next 24 hours',
    'diagnostics_file' : 'used imputer to average data per truck and then simple mean to average any remaining nulls',
    'train_test_split' : 'using trucks and assuring same ratio of derate and nonderate',
    'windowize_predictors': {
        'dataframe': 'merged faults and diagnostics',
        'how far in the past to aggregate' : '7 days',
        'how to aggregate the one-hot encoded spn_fmi': 'max (default)',
        'use rolling window on diagnostics?' : 'False ',
        'how to aggregate diagnostics data' : 'mean',
    },
    'pipeline' : {'step 1': 'GradientBoostingClassifier (default values)'},
    'rebalancing' : {
        'over or under fitting': 'used SMOTE(k_neighbors=5, random_state=42)',
        'variables used': 'all (including derate columns)',
    },
}

# Measured results, appended in this order: train matrix, test matrix, test AUC.
to_dump['train_confusion_matrix'] = _confusion_counts(y_train, gbr.predict(X_train))
to_dump['test_confusion_matrix'] = _confusion_counts(y_test, gbr.predict(X_test))
to_dump['test_rocaouc_score'] = roc_auc_score(y_true=y_test, y_score=gbr.predict_proba(X_test)[:,1])

# The 20 most important features, stored as {feature name: importance}.
importances = pd.DataFrame({
    'variable': gbr.feature_names_in_,
    'importance': gbr['gb'].feature_importances_,
})
importances = importances.sort_values('importance', ascending=False).head(20)

to_dump['top20_fature_importances'] = dict(zip(importances['variable'], importances['importance']))

json_object = json.dumps(to_dump, indent=4)

In [111]:
# with open('../models/gbr_model_17.json', 'w') as outfile:
#     outfile.write(json_object)

## Looking into the most promising model

First few steps are straightforward as outlined below:

Michael's suggestion was to check if any of the false positives actually happen to have a derate within 24 hours. Adding that to the dataframe from above

In [130]:
# Model whose predictions are inspected in the rest of this section.
gbr_best = load('../models/gbr_model_11.joblib')

# Reference targets. The model may have been trained on a different time
# window, but predicting a derate further down the road is perfectly fine, so
# (Michael's suggestion) every model is compared against the 24 hr window.
y_derate = pd.read_pickle('../data/target_derate24h.pkl')

# Predictions for the test predictors
# (to leave the derate columns out, add to X_test:
#  .drop(columns=[col for col in X_smote.columns if 'spn_fmi_5246' in col]) )
y_pred = gbr_best.predict(X_test)

# Test targets ordered by RecordID; the predictions are attached by position,
# which relies on X_test holding the same records in the same order.
in_test = y_derate['RecordID'].isin(records_test)
y_comparison = y_derate.loc[in_test].sort_values('RecordID')
y_comparison['predicted'] = y_pred

# Bring back the complete fault information for every scored row.
test_results = faults.merge(y_comparison, on='RecordID', how='inner')

This is the confusion Matrix for the model 5:
- "TN": 100535
- "FP": 3099
- "FN": 37
- "TP": 202

This is the confusion Matrix for the model 16:
- "TN": 99488
- "FP": 3937
- "FN": 66
- "TP": 382

In [131]:
test_results

Unnamed: 0,RecordID,EventTimeStamp,spn,fmi,activeTransitionCount,EquipmentID,target,predicted
0,4,2015-02-21 11:35:33,1807,2,127,1369,0,0
1,6,2015-02-21 11:40:22,111,17,1,1417,0,0
2,15,2015-02-21 11:14:38,1067,2,127,309,0,0
3,35,2015-02-21 12:01:10,111,17,1,1585,0,0
4,50,2015-02-21 12:13:47,1807,2,127,1369,0,0
...,...,...,...,...,...,...,...,...
103868,1248425,2020-03-06 12:00:41,829,3,126,1853,0,0
103869,1248426,2020-03-06 12:00:41,96,3,126,1853,0,0
103870,1248431,2020-03-06 12:20:36,96,3,126,1853,0,0
103871,1248432,2020-03-06 12:20:36,829,3,126,1853,0,0


In [132]:
# select the false positives
false_positive = test_results.loc[(test_results['target'] == 0) & (test_results['predicted'] == 1)]


The logic here is that if I use the rolling window again, I can sum up on the "predicted" values. Any sums that are more than 1 indicate repeated values. I.e. they show that those predictions occur within 24 hours and therefore, they were not actually separate predictions of the model.

In order to get the unique false predictions, we count how many times 'predicted' was 1.

**results**:
- model 5: 819 out of 3099 are false positives
- model 16: 709 out of 3937 are false positives

In [133]:
# Per truck, count the false-positive predictions in each trailing 24 hours.
fp_sorted = false_positive.sort_values(['EquipmentID', 'EventTimeStamp'])
fp_windows = (
    fp_sorted
    .groupby('EquipmentID')[['EventTimeStamp', 'predicted']]
    .rolling(window='1d', on='EventTimeStamp')
)
false_positive = fp_windows.sum()

# A sum of exactly 1 means no other false positive for that truck in the
# preceding 24 hours, i.e. the row starts a separate false alarm.
starts_new_alarm = false_positive['predicted'] == 1.
false_positive.loc[starts_new_alarm]

Unnamed: 0_level_0,Unnamed: 1_level_0,EventTimeStamp,predicted
EquipmentID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
1329,635,2015-02-25 15:38:07,1.0
1339,7575,2015-06-12 16:25:19,1.0
1343,2593,2015-04-23 18:13:33,1.0
1343,2899,2015-04-27 16:18:02,1.0
1343,3060,2015-04-28 23:35:13,1.0
...,...,...,...
2171,87711,2018-04-29 16:20:09,1.0
2220,102837,2020-01-16 13:35:15,1.0
2338,102946,2020-01-20 12:46:31,1.0
309,86386,2018-03-12 11:31:45,1.0


Similar approach to get the false negatives, except now we invert target and predicted.

**results**:
- model 5: 11 out of 37 are false negatives
- model 16: 22 out of 66 are false negatives


In [134]:
# select the false negatives
false_negative = test_results.loc[(test_results['target'] == 1) & (test_results['predicted'] == 0)]


In [135]:
# Per truck, count the missed derate rows in each trailing 24 hours.
fn_sorted = false_negative.sort_values(['EquipmentID', 'EventTimeStamp'])
fn_windows = (
    fn_sorted
    .groupby('EquipmentID')[['EventTimeStamp', 'target']]
    .rolling(window='1d', on='EventTimeStamp')
)
false_negative = fn_windows.sum()

# A sum of exactly 1 means no other missed row for that truck in the preceding
# 24 hours, so the row is counted as a separate miss.
starts_new_miss = false_negative['target'] == 1.
len(false_negative.loc[starts_new_miss])

29

Finally, looking at the true positives

**results**: 
- model 5: 67 out of 202 are true positives, out of which 21 are predicted at least 2 hours in advance
- model 16: 67 out of 382 are true positives, out of which 45 are predicted at least 2 hours in advance

In [136]:
# select the true positive
true_positive = test_results.loc[(test_results['target'] == 1) & (test_results['predicted'] == 1)]


In [137]:
# Per truck, look at each trailing 24 hours of correct positive predictions:
# keep the RecordID of the current (last) row in the window and count how many
# predicted / target positives fall inside it.
true_positive = (
    true_positive
    .sort_values(['EquipmentID','EventTimeStamp'])
    .groupby('EquipmentID')[['EventTimeStamp', 'RecordID', 'predicted', 'target']]
    .rolling(window = '1d', on = "EventTimeStamp")
    # .iloc[-1] instead of x[-1]: the window is passed as a Series that is not
    # indexed by position, and treating an integer key as a position on such
    # a Series is deprecated in recent pandas.
    .agg({'RecordID': lambda x: x.iloc[-1], 'predicted': 'sum', 'target': 'sum'})
    .reset_index()
)

# the rolling aggregation returns floats; RecordID is an integer identifier
true_positive['RecordID'] = true_positive['RecordID'].astype(int)

# a sum of exactly 1 means this is the first positive prediction for the truck
# in the last 24 hours, i.e. a separate alert
true_positive = true_positive.loc[true_positive['predicted'] == 1.]

> NOTE: do not use iterrows to modify the dataframe it's being iterated over!! the results are not guaranteed

In [138]:
true_positive

Unnamed: 0,EquipmentID,EventTimeStamp,RecordID,predicted,target
0,1329,2015-02-25 13:53:08,5713,1.0,1.0
4,1339,2015-06-12 08:24:15,85259,1.0,1.0
6,1366,2015-06-11 10:08:58,84237,1.0,1.0
20,1366,2015-07-03 15:10:45,109732,1.0,1.0
22,1366,2015-09-23 04:36:59,214072,1.0,1.0
...,...,...,...,...,...
335,1919,2018-11-04 08:07:49,1075078,1.0,1.0
338,1922,2019-07-07 11:13:03,1176722,1.0,1.0
353,1970,2019-04-28 17:50:36,1153464,1.0,1.0
363,2004,2019-07-03 07:08:25,1176072,1.0,1.0


In [139]:
# All derate events (spn 5246), filtered once instead of on every iteration.
derate_events = faults.loc[faults['spn'] == 5246, ['EquipmentID', 'EventTimeStamp']]

# For every alert, find the time of the first derate of the same truck at or
# after the alert.
derate_times = []

for truck, alert_time in zip(true_positive['EquipmentID'], true_positive['EventTimeStamp']):
    later_derates = derate_events.loc[
        (derate_events['EquipmentID'] == str(truck))
        & (derate_events['EventTimeStamp'] >= alert_time),
        'EventTimeStamp',
    ]
    # min() rather than .iloc[0]: `faults` is not guaranteed to be in
    # chronological order, and min() yields NaT (instead of raising) when no
    # later derate exists for that truck.
    derate_times.append(later_derates.min())

In [140]:
true_positive['derateTimeStamp'] = derate_times

In [141]:
true_positive['timediff'] = true_positive['derateTimeStamp'] - true_positive['EventTimeStamp']

In [142]:
len(true_positive.loc[true_positive['timediff'] > timedelta(hours= 2)])

46