# Big G Express - Data Exploration

## Team: Elden Ring

<img src="https://eldenring.wiki.fextralife.com/file/Elden-Ring/mirel_pastor_of_vow.jpg" alt="PRAISE DOG" style="width:806px;height:600px;"/>

#### PRAISE THE DOG!

In [1]:
import pandas as pd
import numpy as np
from datetime import datetime, timedelta

In [2]:
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, accuracy_score, confusion_matrix

from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler

from sklearn.ensemble import GradientBoostingClassifier

from imblearn.over_sampling import SMOTE

from sklearn.impute import SimpleImputer

In [3]:
faults = pd.read_pickle('../data/faults_filtered.pkl')
y_derate = pd.read_pickle('../data/target_derate.pkl')
y_75derate = pd.read_pickle('../data/target_75derate.pkl')
diagnostics_imputed = pd.read_pickle('../data/diagnostics_imputed.pkl')

In [4]:
# this one is mostly NaNs, just 250 values or so
diagnostics_imputed = diagnostics_imputed.drop(columns='ServiceDistance')

Remember there are parts of columns (where a particular truck had no values)

In [5]:
# this was just a simple fill with mean..
diagnostics_imputed['AcceleratorPedal'] = diagnostics_imputed['AcceleratorPedal'].fillna(value=diagnostics_imputed['AcceleratorPedal'].mean())
diagnostics_imputed['CruiseControlSetSpeed'] = diagnostics_imputed['CruiseControlSetSpeed'].fillna(value=diagnostics_imputed['CruiseControlSetSpeed'].mean())
diagnostics_imputed['EngineTimeLtd'] = diagnostics_imputed['EngineTimeLtd'].fillna(value=diagnostics_imputed['EngineTimeLtd'].mean())
diagnostics_imputed['FuelLevel'] = diagnostics_imputed['FuelLevel'].fillna(value=diagnostics_imputed['FuelLevel'].mean())
diagnostics_imputed['FuelTemperature'] = diagnostics_imputed['FuelTemperature'].fillna(value=diagnostics_imputed['FuelTemperature'].mean())
diagnostics_imputed['SwitchedBatteryVoltage'] = diagnostics_imputed['SwitchedBatteryVoltage'].fillna(value=diagnostics_imputed['SwitchedBatteryVoltage'].mean())
diagnostics_imputed['Throttle'] = diagnostics_imputed['Throttle'].fillna(value=diagnostics_imputed['Throttle'].mean())

In [6]:
#diagnostics_imputed = SimpleImputer().fit_transform(diagnostics_imputed)

In [7]:
# this took 30 min and didn't stop ...
# from sklearn.experimental import enable_iterative_imputer
# from sklearn.impute import IterativeImputer, KNNImputer
# scaler = StandardScaler().fit(diagnostics_imputed)

# knn_filled = scaler.inverse_transform(KNNImputer().fit_transform(scaler.transform(diagnostics_imputed)))

# diagnostics_imputed = IterativeImputer().fit_transform(diagnostics_imputed)

## Merging and prepping the data

In [8]:
faults_diagnostics = faults.merge(diagnostics_imputed, left_on='RecordID', right_on='FaultId', how='inner')

In [9]:
faults_diagnostics = faults_diagnostics.drop(columns=['ESS_Id', 'eventDescription','ecuSoftwareVersion', 'ecuSerialNumber', 
    'ecuModel', 'ecuMake', 'ecuSource', 'MCTNumber', 'Latitude', 'Longitude', 'LocationTimeStamp'])

In [10]:
faults_diagnostics['spn_fmi'] = ['_'.join(i) for i in zip(faults_diagnostics['spn'].astype(str), faults_diagnostics['fmi'].astype(str))]

faults_diagnostics = pd.get_dummies(faults_diagnostics, columns=['spn_fmi'], prefix='spn_fmi')

faults_diagnostics = faults_diagnostics.sort_values(by=['EquipmentID', 'EventTimeStamp'])

In [11]:
# to obtain the one hot encoded columns since there are so many
faults_cols = ['EventTimeStamp'] + [col for col in faults_diagnostics.columns if 'spn_fmi' in col] 

diagnostics_cols = ['EventTimeStamp', 'activeTransitionCount', 'AcceleratorPedal',
         'BarometricPressure', 'CruiseControlSetSpeed', 'DistanceLtd',
         'EngineCoolantTemperature', 'EngineLoad', 'EngineOilPressure', 
        'EngineOilTemperature', 'EngineRpm', 'EngineTimeLtd', 'FuelLevel', 'FuelLtd', 
        'FuelRate', 'FuelTemperature', 'IntakeManifoldTemperature', 'LampStatus',
        'Speed', 'SwitchedBatteryVoltage', 'Throttle', 'TurboBoostPressure']

In [12]:
faults_rolling = (
    faults_diagnostics
    .groupby('EquipmentID')[faults_cols]
    .rolling(window = '1d', on = "EventTimeStamp")
    .max()
)

faults_rolling = faults_rolling.reset_index()

In [13]:
diagnostics_rolling = (
    faults_diagnostics
    .groupby('EquipmentID')[diagnostics_cols]
    .rolling(window = '1d', on = "EventTimeStamp")
    .mean()
)

diagnostics_rolling = diagnostics_rolling.reset_index()

In [14]:
faults_rolling = pd.merge(faults_diagnostics['RecordID'], #[['RecordID'] + diagnostics_cols]
                          faults_rolling,
                          left_index= True,
                          right_on = 'level_1').drop(columns='level_1')

diagnostics_rolling = pd.merge(faults_diagnostics['RecordID'],
                          diagnostics_rolling,
                          left_index= True,
                          right_on = 'level_1').drop(columns='level_1')

In [15]:
faults_diagnostics_rolling =  pd.merge(diagnostics_rolling.drop(columns=['EquipmentID', 'EventTimeStamp']),
            faults_rolling.drop(columns=['EquipmentID', 'EventTimeStamp']),
            on = 'RecordID')

In [19]:
faults_diagnostics_rolling = faults_diagnostics_rolling.sort_values('RecordID').drop(columns='RecordID')

In [None]:
#faults_diagnostics_rolling = faults_rolling.drop(columns=['RecordID', 'EquipmentID', 'EventTimeStamp_x', 'EventTimeStamp_y'])


## Training and test

In [20]:
# use stratify on target (with derate) and split based on trucks
X_train, X_test, y_train, y_test = train_test_split(faults_diagnostics_rolling, y_derate.sort_values('RecordID')['target'], train_size = 0.8, test_size = 0.2, random_state = 42)

In [21]:
y_train.value_counts()

0    436270
1      1069
Name: target, dtype: int64

In [22]:
y_test.value_counts()

0    109015
1       320
Name: target, dtype: int64

In [23]:
pipeline_lr = Pipeline(
    steps = [
        ('scaler', StandardScaler()),
        ('logreg', LogisticRegression())
        # note: more steps can be added here ...
    ]
)

In [24]:
pipeline_lr.fit(X_train, y_train)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [25]:
gbr = Pipeline(
    steps = [
        #('scaler', MinMaxScaler()),
        ('gb', GradientBoostingClassifier(verbose=True))#n_estimators = 1000, learning_rate=0.01
    ]
)

gbr.fit(X_train, y_train)

      Iter       Train Loss   Remaining Time 
         1           0.0441            6.95m
         2          97.3975            6.90m
         3          97.3973            6.85m
         4         145.8726            6.78m
         5         145.9625            6.73m
         6         145.9624            6.65m
         7         145.9623            6.57m
         8         145.9622            6.50m
         9         145.9622            6.41m
        10         145.9621            6.33m
        20         145.9619            5.56m
        30         145.9619            4.87m
        40         145.9618            4.18m
        50         145.9618            3.48m
        60         145.9618            2.79m
        70         145.9618            2.09m
        80         145.9618            1.40m
        90         145.9618           41.92s
       100         145.9618            0.00s


In [26]:
confusion_matrix(y_train, gbr.predict(X_train))

array([[436236,     34],
       [   878,    191]])

In [27]:
print(classification_report(y_train, gbr.predict(X_train)))

              precision    recall  f1-score   support

           0       1.00      1.00      1.00    436270
           1       0.85      0.18      0.30      1069

    accuracy                           1.00    437339
   macro avg       0.92      0.59      0.65    437339
weighted avg       1.00      1.00      1.00    437339



In [28]:
importances = pd.DataFrame({
    'variable': gbr.feature_names_in_,
    'importance': gbr['gb'].feature_importances_
})

importances.sort_values('importance', ascending = False).head(20)

Unnamed: 0,variable,importance
624,spn_fmi_5246_0,0.43237
8,EngineRpm,0.208056
628,spn_fmi_5246_19,0.147001
90,spn_fmi_111_17,0.14396
522,spn_fmi_4360_19,0.036558
89,spn_fmi_111_1,0.021812
467,spn_fmi_4094_18,0.004083
13,FuelTemperature,0.001049
7,EngineOilTemperature,0.000855
6,EngineOilPressure,0.000743


NOTE: during one of the trainings there was a particular spn-fmi code that got to the top, with importance of 0.8! It was the 46262 code and after inspecting, despite appearing only once in the dataset, since there are many events happening in a smlal timeframe around it, it got picked up as important!

In [29]:
oversampler = SMOTE(k_neighbors=5, random_state=42)

In [30]:
X_smote, y_smote = oversampler.fit_resample(X_train, y_train)

In [31]:
y_smote.value_counts()

0    436270
1    436270
Name: target, dtype: int64

In [32]:
gbr_smoted = Pipeline(
    steps = [
        #('scaler', MinMaxScaler()),
        ('gb', GradientBoostingClassifier(verbose=True))#n_estimators = 1000, learning_rate=0.01
    ]
)

gbr_smoted.fit(X_smote, y_smote)

      Iter       Train Loss   Remaining Time 
         1           1.2365           16.15m
         2           1.1140           16.11m
         3           1.0121           15.94m
         4           0.9289           15.82m
         5           0.8584           15.64m
         6           0.7933           15.46m
         7           0.7370           15.28m
         8           0.6895           15.11m
         9           0.6480           14.93m
        10           0.6007           14.77m
        20           0.3555           13.22m
        30           0.2656           11.73m
        40           0.2155           10.09m
        50           0.1900            8.42m
        60           0.1732            6.73m
        70           0.1588            5.05m
        80           0.1489            3.37m
        90           0.1404            1.69m
       100           0.1327            0.00s


In [33]:
y_predict = gbr_smoted.predict(X_train)

In [34]:
confusion_matrix(y_train, y_predict)

array([[426147,  10123],
       [    69,   1000]])

In [35]:
confusion_matrix(y_test, gbr.predict(X_test))

array([[109003,     12],
       [   270,     50]])

In [36]:
confusion_matrix(y_test, gbr_smoted.predict(X_test))

array([[106448,   2567],
       [    22,    298]])

In [37]:
print(classification_report(y_train, y_predict))

              precision    recall  f1-score   support

           0       1.00      0.98      0.99    436270
           1       0.09      0.94      0.16      1069

    accuracy                           0.98    437339
   macro avg       0.54      0.96      0.58    437339
weighted avg       1.00      0.98      0.99    437339



In [38]:
importances = pd.DataFrame({
    'variable': gbr_smoted.feature_names_in_,
    'importance': gbr_smoted['gb'].feature_importances_
})

importances.sort_values('importance', ascending = False).head(20)

Unnamed: 0,variable,importance
15,LampStatus,0.405861
13,FuelTemperature,0.300429
163,spn_fmi_1569_31,0.123809
624,spn_fmi_5246_0,0.059353
20,activeTransitionCount,0.033369
300,spn_fmi_3031_9,0.015914
830,spn_fmi_6802_31,0.008257
627,spn_fmi_5246_16,0.006828
90,spn_fmi_111_17,0.00614
2,CruiseControlSetSpeed,0.00539


In [41]:
from sklearn.metrics import roc_auc_score

In [39]:
gbr.predict_proba(X_test)[:,1]

array([0.00102048, 0.00102048, 0.00102048, ..., 0.00102048, 0.00102048,
       0.00102048])

In [42]:
roc_auc_score(y_true=y_test, y_score=gbr.predict_proba(X_test)[:,1])

0.42974725381828194

In [43]:
roc_auc_score(y_true=y_test, y_score=gbr_smoted.predict_proba(X_test)[:,1])

0.9929663205751502

## Better Train-Test split

The above is a good start. However, there are trucks whose events end up mixed between both train and test split. Instead, we want to make sure that each individual truck only appears in one.

In [46]:
print(faults['EquipmentID'].nunique())
print(faults.loc[faults['spn'] == 5246]['EquipmentID'].nunique())

1042
189


In [51]:
all_trucks = faults['EquipmentID'].unique()
derate_trucks = faults.loc[faults['spn'] == 5246]['EquipmentID'].unique()
no_derate_trucks = all_trucks[np.isin(all_trucks, derate_trucks, invert=True)]

In [58]:
trucks_df = pd.concat([
            pd.DataFrame({'EquipmentID': derate_trucks, 'target': 1}),
            pd.DataFrame({'EquipmentID': no_derate_trucks, 'target': 0}) 
            ])

trucks_df

Unnamed: 0,EquipmentID,target
0,1630,1
1,1487,1
2,1329,1
3,1419,1
4,1486,1
...,...,...
848,2358,0
849,2343,0
850,2356,0
851,2380,0
