# Big G Express - Data Exploration

## Team: Elden Ring

<img src="https://eldenring.wiki.fextralife.com/file/Elden-Ring/mirel_pastor_of_vow.jpg" alt="PRAISE DOG" style="width:806px;height:600px;"/>

#### PRAISE THE DOG!

In [1]:
import pandas as pd
import numpy as np
from datetime import datetime, timedelta

from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, accuracy_score, confusion_matrix

In [2]:
# Feature sources: the filtered fault records and the imputed diagnostics.
# NOTE(review): read_pickle executes arbitrary code from the file; only load
# pickles produced by this project's own pipeline.
diagnostics_imputed = pd.read_pickle('../data/diagnostics_imputed.pkl')
faults = pd.read_pickle('../data/faults_filtered.pkl')

# Target frames (the cells below read a 'target' column from the split of y_derate).
y_75derate = pd.read_pickle('../data/target_75derate.pkl')
y_derate = pd.read_pickle('../data/target_derate.pkl')

In [3]:
# ServiceDistance is almost entirely NaN (only ~250 populated values),
# so it is removed rather than imputed.
diagnostics_imputed = diagnostics_imputed.drop(['ServiceDistance'], axis='columns')

In [4]:
# Temporary strategy: wherever a value is still missing (trucks that never
# reported the signal), fill it with the mean of that column taken over all rows.
for _column in (
    'AcceleratorPedal',
    'CruiseControlSetSpeed',
    'EngineTimeLtd',
    'FuelLevel',
    'FuelTemperature',
    'SwitchedBatteryVoltage',
    'Throttle',
):
    diagnostics_imputed[_column] = diagnostics_imputed[_column].fillna(
        value=diagnostics_imputed[_column].mean()
    )

## Merging and prepping the data

In [5]:
# Attach each fault's diagnostics row: faults.RecordID <-> diagnostics.FaultId.
# Inner join, so faults without a diagnostics row (and vice versa) are dropped.
faults_diagnostics = pd.merge(
    faults,
    diagnostics_imputed,
    how='inner',
    left_on='RecordID',
    right_on='FaultId',
)

In [6]:
# Remove the columns that are not carried forward into the modelling frame.
faults_diagnostics = faults_diagnostics.drop(
    labels=[
        'ESS_Id',
        'eventDescription',
        # ecu* columns
        'ecuSoftwareVersion',
        'ecuSerialNumber',
        'ecuModel',
        'ecuMake',
        'ecuSource',
        'MCTNumber',
        # position columns
        'Latitude',
        'Longitude',
        'LocationTimeStamp',
    ],
    axis='columns',
)

In [7]:
# Work on a copy so faults_diagnostics itself stays untouched.
test_diagnostics = faults_diagnostics.copy()

# Combined fault code "<spn>_<fmi>", built from the string form of both columns.
test_diagnostics['spn_fmi'] = (
    test_diagnostics['spn'].astype(str) + '_' + test_diagnostics['fmi'].astype(str)
)

# One-hot encode the combined code (columns named spn_fmi_<code>), then order
# the rows chronologically by EventTimeStamp.
test_diagnostics = (
    pd.get_dummies(test_diagnostics, columns=['spn_fmi'], prefix='spn_fmi')
    .sort_values(by='EventTimeStamp')
)

In [8]:
# The one-hot step produces too many columns to list by hand, so collect every
# column label that contains 'spn_fmi' (iterating a DataFrame yields its labels).
spnfmi_cols = [label for label in test_diagnostics if 'spn_fmi' in label]

# NOTE(review): fixed_cols is not referenced in the cells visible here; confirm
# whether it is still needed.
fixed_cols = ['RecordID', 'spn', 'fmi']

In [9]:
# For every EquipmentID, sum each one-hot fault-code column over a trailing
# one-day window keyed on EventTimeStamp.
# reset_index() then turns the (EquipmentID, original row label) index into
# ordinary columns; the row-label level comes out as 'level_1', which the
# following merge uses as its key.
faults_diagnostics_rolling = (
    test_diagnostics
    .groupby('EquipmentID')[['EventTimeStamp', *spnfmi_cols]]
    .rolling(window='1d', on='EventTimeStamp')
    .sum()
    .reset_index()
)

In [10]:
# Join the rolling fault-code counts back onto the per-fault rows: the row label
# of faults_diagnostics is matched against the 'level_1' column of the rolling
# frame, after which the helper key is discarded.
faults_diagnostics_rolling = (
    faults_diagnostics
    .merge(faults_diagnostics_rolling, left_index=True, right_on='level_1')
    .drop(columns='level_1')
)

In [12]:
# Strip the join key, raw fault codes, timestamps and equipment ids (including
# the _x/_y duplicates left behind by the merge) from the modelling frame.
# NOTE(review): RecordID is not in this list, so it stays in the frame that is
# passed to train_test_split unless it is removed elsewhere - confirm that an
# identifier column is intended to be a feature.
faults_diagnostics_rolling = faults_diagnostics_rolling.drop(
    labels=[
        'FaultId',
        'active',
        'EventTimeStamp_x',
        'EventTimeStamp_y',
        'spn',
        'fmi',
        'EquipmentID_x',
        'EquipmentID_y',
    ],
    axis='columns',
)

## Train/test split

In [14]:
# 80/20 shuffled split with a fixed seed so the partition is reproducible.
# NOTE(review): features and y_derate are paired row-by-row by position, so this
# assumes y_derate is in the same row order as faults_diagnostics_rolling - confirm.
# NOTE(review): the positive class is roughly 0.25% of the rows (see the counts
# printed below) and no `stratify=` is passed, so the positive rate differs
# slightly between train and test; consider stratifying on the target.
X_train, X_test, y_train, y_test = train_test_split(
    faults_diagnostics_rolling,
    y_derate,
    train_size=0.8,
    test_size=0.2,
    random_state=42,
)

In [15]:
# Class balance of the training labels: heavily skewed towards 0
# (436214 zeros vs 1125 ones in the recorded output below).
y_train['target'].value_counts()

0    436214
1      1125
Name: target, dtype: int64

In [16]:
# Class balance of the held-out labels: same skew as the training split
# (109071 zeros vs 264 ones in the recorded output below).
y_test['target'].value_counts()

0    109071
1       264
Name: target, dtype: int64