# Big G Express - Data Exploration

## Team: Elden Ring

<img src="https://eldenring.wiki.fextralife.com/file/Elden-Ring/mirel_pastor_of_vow.jpg" alt="PRAISE DOG" style="width:806px;height:600px;"/>

#### PRAISE THE DOG!

In [1]:
import pandas as pd
import numpy as np
from datetime import datetime, timedelta

In [2]:
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, accuracy_score, confusion_matrix

from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler

from sklearn.ensemble import GradientBoostingClassifier

from imblearn.over_sampling import SMOTE

from sklearn.impute import SimpleImputer

In [3]:
faults = pd.read_pickle('../data/faults_filtered.pkl')
y_derate = pd.read_pickle('../data/target_derate.pkl')
y_75derate = pd.read_pickle('../data/target_75derate.pkl')
diagnostics_imputed = pd.read_pickle('../data/diagnostics_imputed.pkl')

In [4]:
# this one is mostly NaNs, just 250 values or so
diagnostics_imputed = diagnostics_imputed.drop(columns='ServiceDistance')

Remember there are parts of columns (where a particular truck had no values)

In [5]:
# this was just a simple fill with mean..
diagnostics_imputed['AcceleratorPedal'] = diagnostics_imputed['AcceleratorPedal'].fillna(value=diagnostics_imputed['AcceleratorPedal'].mean())
diagnostics_imputed['CruiseControlSetSpeed'] = diagnostics_imputed['CruiseControlSetSpeed'].fillna(value=diagnostics_imputed['CruiseControlSetSpeed'].mean())
diagnostics_imputed['EngineTimeLtd'] = diagnostics_imputed['EngineTimeLtd'].fillna(value=diagnostics_imputed['EngineTimeLtd'].mean())
diagnostics_imputed['FuelLevel'] = diagnostics_imputed['FuelLevel'].fillna(value=diagnostics_imputed['FuelLevel'].mean())
diagnostics_imputed['FuelTemperature'] = diagnostics_imputed['FuelTemperature'].fillna(value=diagnostics_imputed['FuelTemperature'].mean())
diagnostics_imputed['SwitchedBatteryVoltage'] = diagnostics_imputed['SwitchedBatteryVoltage'].fillna(value=diagnostics_imputed['SwitchedBatteryVoltage'].mean())
diagnostics_imputed['Throttle'] = diagnostics_imputed['Throttle'].fillna(value=diagnostics_imputed['Throttle'].mean())

In [6]:
#diagnostics_imputed = SimpleImputer().fit_transform(diagnostics_imputed)

In [7]:
# this took 30 min and didn't stop ...
# from sklearn.experimental import enable_iterative_imputer
# from sklearn.impute import IterativeImputer, KNNImputer
# scaler = StandardScaler().fit(diagnostics_imputed)

# knn_filled = scaler.inverse_transform(KNNImputer().fit_transform(scaler.transform(diagnostics_imputed)))

# diagnostics_imputed = IterativeImputer().fit_transform(diagnostics_imputed)

## Merging and prepping the data

In [8]:
faults_diagnostics = faults.merge(diagnostics_imputed, left_on='RecordID', right_on='FaultId', how='inner')

In [9]:
faults_diagnostics = faults_diagnostics.drop(columns=['ESS_Id', 'eventDescription','ecuSoftwareVersion', 'ecuSerialNumber', 
    'ecuModel', 'ecuMake', 'ecuSource', 'MCTNumber', 'Latitude', 'Longitude', 'LocationTimeStamp'])

In [10]:
test_diagnostics = faults_diagnostics.copy()

test_diagnostics['spn_fmi'] = ['_'.join(i) for i in zip(test_diagnostics['spn'].astype(str), test_diagnostics['fmi'].astype(str))]

test_diagnostics = pd.get_dummies(test_diagnostics, columns=['spn_fmi'], prefix='spn_fmi')

test_diagnostics = test_diagnostics.sort_values(by=['EquipmentID', 'EventTimeStamp'])

In [11]:
# to obtain the one hot encoded columns since there are so many
faults_cols = ['EventTimeStamp'] + [col for col in test_diagnostics.columns if 'spn_fmi' in col] 

diagnostics_cols = ['EventTimeStamp', 'activeTransitionCount', 'AcceleratorPedal',
         'BarometricPressure', 'CruiseControlSetSpeed', 'DistanceLtd',
         'EngineCoolantTemperature', 'EngineLoad', 'EngineOilPressure', 
        'EngineOilTemperature', 'EngineRpm', 'EngineTimeLtd', 'FuelLevel', 'FuelLtd', 
        'FuelRate', 'FuelTemperature', 'IntakeManifoldTemperature', 'LampStatus',
        'Speed', 'SwitchedBatteryVoltage', 'Throttle', 'TurboBoostPressure']

In [12]:
faults_rolling = (
    test_diagnostics
    .groupby('EquipmentID')[faults_cols]
    .rolling(window = '1d', on = "EventTimeStamp")
    .sum()
)

faults_rolling = faults_rolling.reset_index()

In [13]:
diagnostics_rolling = (
    test_diagnostics
    .groupby('EquipmentID')[diagnostics_cols]
    .rolling(window = '1d', on = "EventTimeStamp")
    .mean()
)

diagnostics_rolling = diagnostics_rolling.reset_index()

In [14]:
faults_rolling = pd.merge(faults_diagnostics['RecordID'],
                          faults_rolling,
                          left_index= True,
                          right_on = 'level_1').drop(columns='level_1')

diagnostics_rolling = pd.merge(faults_diagnostics['RecordID'],
                          diagnostics_rolling,
                          left_index= True,
                          right_on = 'level_1').drop(columns='level_1')

In [15]:
faults_diagnostics_rolling =  pd.merge(diagnostics_rolling.drop(columns=['EquipmentID', 'EventTimeStamp']),
            faults_rolling.drop(columns=['EquipmentID', 'EventTimeStamp']),
            on = 'RecordID')

In [16]:
faults_diagnostics_rolling = faults_diagnostics_rolling.drop(columns='RecordID')

## Training and test

In [17]:
# use stratify on target (with derate) and split based on trucks
X_train, X_test, y_train, y_test = train_test_split(faults_diagnostics_rolling, y_derate['target'], train_size = 0.8, test_size = 0.2, random_state = 42)

In [18]:
y_train.value_counts()

0    436214
1      1125
Name: target, dtype: int64

In [19]:
y_test.value_counts()

0    109071
1       264
Name: target, dtype: int64

In [20]:
pipeline_lr = Pipeline(
    steps = [
        ('scaler', StandardScaler()),
        ('logreg', LogisticRegression())
        # note: more steps can be added here ...
    ]
)

In [21]:
pipeline_lr.fit(X_train, y_train)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [22]:
gbr = Pipeline(
    steps = [
        #('scaler', MinMaxScaler()),
        ('gb', GradientBoostingClassifier(verbose=True))#n_estimators = 1000, learning_rate=0.01
    ]
)

gbr.fit(X_train, y_train)

      Iter       Train Loss   Remaining Time 
         1           0.0359            8.45m
         2           0.0357            8.45m
         3           0.0357            8.32m
         4           0.0359            8.22m
         5 10611194024910003372032.0000            8.16m
         6 10611194024910003372032.0000            8.06m
         7 10611194024910003372032.0000            7.96m
         8 10611194024910003372032.0000            7.86m
         9 10611194024910003372032.0000            7.77m
        10 10611194024910003372032.0000            7.68m
        20 10611194024910003372032.0000            6.77m
        30 10611194024910003372032.0000            5.93m
        40 1492340984998544128051864133412665180553216.0000            5.08m
        50 1492340984998544128051864133412665180553216.0000            4.23m
        60 1492340984998544128051864133412665180553216.0000            3.40m
        70 1492340984998544128051864133412665180553216.0000            2.55m
        80

In [23]:
confusion_matrix(y_train, gbr.predict(X_train))

array([[436210,      4],
       [  1113,     12]])

In [24]:
print(classification_report(y_train, gbr.predict(X_train)))

              precision    recall  f1-score   support

           0       1.00      1.00      1.00    436214
           1       0.75      0.01      0.02      1125

    accuracy                           1.00    437339
   macro avg       0.87      0.51      0.51    437339
weighted avg       1.00      1.00      1.00    437339



In [25]:
importances = pd.DataFrame({
    'variable': gbr.feature_names_in_,
    'importance': gbr['gb'].feature_importances_
})

importances.sort_values('importance', ascending = False).head(20)

Unnamed: 0,variable,importance
323,spn_fmi_3217_10,0.34156
14,IntakeManifoldTemperature,0.229634
4,EngineCoolantTemperature,0.063139
621,spn_fmi_524071_31,0.055921
5,EngineLoad,0.037434
3,DistanceLtd,0.036314
0,AcceleratorPedal,0.035327
941,spn_fmi_829_0,0.028072
16,Speed,0.02744
11,FuelLtd,0.019834


In [None]:
faults.loc[(faults['spn'] == 46262) & (faults['fmi'] == 0)]

In [None]:
faults.loc[(faults['EquipmentID'] == '1450') & (faults['EventTimeStamp'] >= '2015-07-10 09:05:55')].head(200)

In [None]:
X_train.loc[X_train['spn_fmi_46262_0'] > 0]

In [None]:
faults.loc[(faults['EquipmentID'] == '1450') & (faults['spn'] == 5246)]

In [None]:
oversampler = SMOTE(k_neighbors=5, random_state=42)

In [None]:
X_smote, y_smote = oversampler.fit_resample(X_train, y_train)

In [None]:
y_smote.value_counts()

In [None]:
gbr_smoted = Pipeline(
    steps = [
        #('scaler', MinMaxScaler()),
        ('gb', GradientBoostingClassifier(verbose=True))#n_estimators = 1000, learning_rate=0.01
    ]
)

gbr_smoted.fit(X_smote, y_smote)

In [None]:
y_predict = gbr_smoted.predict(X_train)

In [None]:
confusion_matrix(y_train, y_predict)

In [None]:
confusion_matrix(y_test, gbr.predict(X_test))

In [None]:
confusion_matrix(y_test, gbr_smoted.predict(X_test))

In [None]:
print(classification_report(y_train, y_predict))

In [None]:
importances = pd.DataFrame({
    'variable': gbr_smoted.feature_names_in_,
    'importance': gbr_smoted['gb'].feature_importances_
})

importances.sort_values('importance', ascending = False).head(20)

In [None]:
from sklearn.metrics import roc_auc_score

In [None]:
gbr.predict_proba(X_test)[:,1]

In [None]:
roc_auc_score(y_true=y_test, y_score=gbr.predict_proba(X_test)[:,1])

In [None]:
roc_auc_score(y_true=y_test, y_score=gbr_smoted.predict_proba(X_test)[:,1])