In [1]:
import pandas as pd
import datetime as dt
import matplotlib.pyplot as plt
import seaborn as sns
import plotly
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression, SGDClassifier
from sklearn import metrics
from sklearn.metrics import classification_report, f1_score
from sklearn.compose import ColumnTransformer
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report

# Read in csv

In [2]:
train = pd.read_csv('../data/training_data.csv')
train = train.drop(columns=['EquipmentID.1','EquipmentID.2','FaultId'],axis = 1)

  has_raised = await self.run_ast_nodes(code_ast.body, cell_name,


In [3]:
test = pd.read_csv('../data/testing_data.csv')
test = test.drop(columns=['EquipmentID.1','EquipmentID.2','FaultId'],axis = 1) #test needs same number of columns as train

# Further clean test/train data

In [4]:
# #dummy variables for the string/object data types
train = pd.get_dummies(train, columns = ['IgnStatus'], drop_first = True).drop(columns=['ecuMake','active'])
test = pd.get_dummies(test, columns = ['IgnStatus'], drop_first = True).drop(columns=['ecuMake','active'])

In [5]:
train['time_until_derate'] = pd.to_timedelta(train['time_until_derate']) 

In [6]:
#getting rid of the ones that would be too late to catch in our training data since it doesn't help us
train = train[~((train['time_until_derate'] >= dt.timedelta(hours=0)) & (train['time_until_derate'] <= dt.timedelta(hours=1)))] 

# Drop the fault data

In [7]:
train = train.iloc[:,:21]
test = test.iloc[:,:21]

# Standard Scaler 
* do before any model fitting

In [8]:
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler

In [9]:
#logistic regression
pipe = Pipeline(steps=[
    
    ('scale',StandardScaler()), 
    ('classifier',LogisticRegression(penalty='l1', solver='saga')) #can also use liblinear as solver
    
]) 

In [10]:
#sgd with balanced class weights (to be used EXCEPT when oversampling)
pipesgd = Pipeline(steps=[
    
    ('scale',StandardScaler()), 
    ('classifier',SGDClassifier(class_weight='balanced',penalty='l1', loss = 'log'))
    
]) 

In [11]:
#sgd without balanced weights, to be used when oversampling aka SMOTE
pipesgdnw = Pipeline(steps=[
    
    ('scale',StandardScaler()), 
    ('classifier',SGDClassifier(penalty='l1', loss = 'log'))
    
]) 

# Set predictor and target variables

In [12]:
# Predictors
X_train = train.drop(columns = ['derate_soon','time_until_derate','EventTimeStamp','EquipmentID'])
#Target Variable
y_train = train['derate_soon']

In [13]:
X_test = test.drop(columns = ['derate_soon','time_until_derate','EventTimeStamp','EquipmentID'])
y_test = test['derate_soon']

In [14]:
y_test = y_test.replace('True',1)
y_test = y_test.replace('False',0)

# Logistic Regression
* with l1 penalty

In [15]:
pipe.fit(X_train, y_train)



Pipeline(steps=[('scale', StandardScaler()),
                ('classifier',
                 LogisticRegression(penalty='l1', solver='saga'))])

In [16]:
y_pred = pipe.predict(X_test)

How well the model did

In [17]:
accuracy_score(y_test,y_pred)

0.9977141102515331

In [18]:
confusion_matrix(y_test,y_pred)

array([[116973,      0],
       [   268,      0]], dtype=int64)

In [19]:
print(classification_report(y_test, y_pred))

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


              precision    recall  f1-score   support

       False       1.00      1.00      1.00    116973
        True       0.00      0.00      0.00       268

    accuracy                           1.00    117241
   macro avg       0.50      0.50      0.50    117241
weighted avg       1.00      1.00      1.00    117241



  _warn_prf(average, modifier, msg_start, len(result))


# SGD
* weighting by class

In [20]:
pipesgd.fit(X_train,y_train)



Pipeline(steps=[('scale', StandardScaler()),
                ('classifier',
                 SGDClassifier(class_weight='balanced', loss='log',
                               penalty='l1'))])

In [21]:
y_pred = pipesgd.predict(X_test)

How well the model did

In [22]:
accuracy_score(y_test,y_pred)

0.5805733489137759

In [23]:
confusion_matrix(y_test,y_pred)

array([[67917, 49056],
       [  118,   150]], dtype=int64)

In [24]:
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

       False       1.00      0.58      0.73    116973
        True       0.00      0.56      0.01       268

    accuracy                           0.58    117241
   macro avg       0.50      0.57      0.37    117241
weighted avg       1.00      0.58      0.73    117241



# Undersampling

In [25]:
from imblearn.under_sampling import RandomUnderSampler

In [26]:
undersampler = RandomUnderSampler(random_state = 321)

In [27]:
X_resampled, y_resampled = undersampler.fit_resample(X_train, y_train)

### Logistic Regression

In [28]:
pipe.fit(X_resampled, y_resampled)



Pipeline(steps=[('scale', StandardScaler()),
                ('classifier',
                 LogisticRegression(penalty='l1', solver='saga'))])

In [29]:
y_pred = pipe.predict(X_test)

How well the model did

In [30]:
accuracy_score(y_test, y_pred)

0.6001995888810229

In [31]:
confusion_matrix(y_test, y_pred)


array([[70225, 46748],
       [  125,   143]], dtype=int64)

In [32]:
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

       False       1.00      0.60      0.75    116973
        True       0.00      0.53      0.01       268

    accuracy                           0.60    117241
   macro avg       0.50      0.57      0.38    117241
weighted avg       1.00      0.60      0.75    117241



### SGD

In [62]:
pipesgd.fit(X_resampled,y_resampled) 

Pipeline(steps=[('scale', StandardScaler()),
                ('classifier',
                 SGDClassifier(class_weight='balanced', loss='log',
                               penalty='l1'))])

In [63]:
y_pred = pipesgd.predict(X_test)

How well the model did

In [64]:
accuracy_score(y_test, y_pred)

0.14688547521771395

In [65]:
confusion_matrix(y_test, y_pred)

array([[16978, 99995],
       [   25,   243]], dtype=int64)

In [66]:
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

       False       1.00      0.15      0.25    116973
        True       0.00      0.91      0.00       268

    accuracy                           0.15    117241
   macro avg       0.50      0.53      0.13    117241
weighted avg       1.00      0.15      0.25    117241



# SMOTE
**S**ynthetic **M**inority **O**versampling **TE**chnique

In [38]:
from imblearn.over_sampling import SMOTE

In [39]:
oversampler = SMOTE(k_neighbors=5, n_jobs=-1, random_state=321)

In [40]:
X_smote, y_smote = oversampler.fit_resample(X_train, y_train)

### Logistic Regression

In [41]:
pipe.fit(X_smote, y_smote)

Pipeline(steps=[('scale', StandardScaler()),
                ('classifier',
                 LogisticRegression(penalty='l1', solver='saga'))])

In [42]:
y_pred = pipe.predict(X_test)

How well the model did

In [43]:
accuracy_score(y_test, y_pred)

0.6005834136522207

In [44]:
confusion_matrix(y_test, y_pred)

array([[70270, 46703],
       [  125,   143]], dtype=int64)

### SGD
* unweighted because SMOTE oversamples

In [45]:
pipesgdnw.fit(X_smote, y_smote)

Pipeline(steps=[('scale', StandardScaler()),
                ('classifier', SGDClassifier(loss='log', penalty='l1'))])

In [46]:
y_pred = pipesgdnw.predict(X_test)


How well the model did

In [47]:
accuracy_score(y_test, y_pred)

0.6107419759299221

In [48]:
confusion_matrix(y_test, y_pred)

array([[71462, 45511],
       [  126,   142]], dtype=int64)

# Lesser SMOTE

In [49]:
lessersmote = SMOTE(k_neighbors=5, n_jobs=-1, random_state=321, sampling_strategy=0.05)
# adjusts the ratio of derate:not derate so we have less fake data points

In [50]:
X_lsmote, y_lsmote = lessersmote.fit_resample(X_train, y_train)

### Logistic Regression

In [51]:
pipe.fit(X_lsmote, y_lsmote)

Pipeline(steps=[('scale', StandardScaler()),
                ('classifier',
                 LogisticRegression(penalty='l1', solver='saga'))])

In [52]:
y_pred = pipe.predict(X_test)


How well the model did

In [53]:
accuracy_score(y_test, y_pred)

0.9957523391987445

In [54]:
confusion_matrix(y_test, y_pred)

array([[116742,    231],
       [   267,      1]], dtype=int64)

### SGD
* unweighted because SMOTE oversamples

In [55]:
pipesgdnw.fit(X_lsmote, y_lsmote)

Pipeline(steps=[('scale', StandardScaler()),
                ('classifier', SGDClassifier(loss='log', penalty='l1'))])

In [56]:
y_pred = pipesgdnw.predict(X_test)


How well the model did

In [57]:
accuracy_score(y_test, y_pred)

0.9968952840729779

In [58]:
confusion_matrix(y_test, y_pred)

array([[116877,     96],
       [   268,      0]], dtype=int64)

# Pick your model
* SGD w/ undersampling had the best results (but still aren't great)

In [60]:
test['EventTimeStamp'] = pd.to_datetime(test['EventTimeStamp'])

In [67]:
y_pred = pipesgd.predict(X_test)#sgd w/ undersampling

In [68]:
test['prediction'] = y_pred

Unnamed: 0,BarometricPressure,EngineLoad,Speed,EquipmentID,EngineOilPressure,EngineOilTemperature,FuelLevel,IntakeManifoldTemperature,TurboBoostPressure,EngineCoolantTemperature,...,EngineRpm,FuelTemperature,SwitchedBatteryVoltage,DistanceLtd,FuelLtd,EngineTimeLtd,EventTimeStamp,time_until_derate,derate_soon,prediction


In [75]:
# This is how you evaluate the model you end up choosing:

success = 0
for i in range(len(test['time_until_derate']==0)):
    dr = test[test['time_until_derate']==0]
    truck = dr['EquipmentID']
    ts = dr['EventTimeStamp']
    if test[(test.EquipmentID == truck) &  
         (test.EventTimeStamp >= (ts - dt.timedelta(hours=24))) & 
         (test.EventTimeStamp <= (ts - dt.timedelta(hours=0.001)))]['prediction'].max() == True:
        success += 1
print(success)

ValueError: Can only compare identically-labeled Series objects

Features deemed important

In [76]:
coefficients = pd.DataFrame({'variable': X_train.columns, 'coefficient': pipesgd[1].coef_[0]})
coefficients[coefficients.coefficient > 0].variable.to_list()

['BarometricPressure',
 'Speed',
 'EngineOilTemperature',
 'IntakeManifoldTemperature',
 'FuelRate',
 'FuelLtd']

Setting up bootstrap to verify the above

In [83]:
from sklearn.model_selection import train_test_split
import random

In [78]:
X_boot, X_what, y_boot, y_what = train_test_split(X_train,y_train,test_size=0.9,stratify = y_train)

In [79]:
derate_index = y_train[y_train].index.to_list()

In [80]:
nonderate_index = y_train[~y_train].index.to_list()

In [84]:
bootstrap_index = random.sample(derate_index, k=50) + random.sample(nonderate_index, k=5000) #we're undersampling here

In [85]:
Var_boot = pd.concat([X_train , y_train],axis=1)

In [86]:
#to get a bootstrap sample
def bootstrapss():
    bootstrap_index = random.sample(derate_index, k=50) + random.sample(nonderate_index, k=5000)
    boot = Var_boot.loc[bootstrap_index]
    X_boot = boot.drop(columns='derate_soon', axis=1)
    y_boot = boot['derate_soon']
    return X_boot, y_boot

In [87]:
X_boot, y_boot = bootstrapss()

In [88]:
pipesgdboot = Pipeline(steps=[
    
    ('scale',StandardScaler()), 
    ('classifier',SGDClassifier(class_weight={0:1,1:5},penalty='l1', loss = 'log', alpha=0.001)) #let 1:5 run overnight  TBR
    
]) 

In [89]:
pipesgdboot.fit(X_boot,y_boot)

Pipeline(steps=[('scale', StandardScaler()),
                ('classifier',
                 SGDClassifier(alpha=0.001, class_weight={0: 1, 1: 5},
                               loss='log', penalty='l1'))])

In [90]:
y_pred = pipesgdboot.predict(X_test)

In [91]:
confusion_matrix(y_test,y_pred) #1:5

array([[115718,   1255],
       [   264,      4]], dtype=int64)

In [92]:
coefficients = pd.DataFrame({'variable': X_train.columns, 'coefficient': pipe[1].coef_[0]})

In [93]:
coefficients.sort_values('coefficient',ascending=False).head(15)

Unnamed: 0,variable,coefficient
2,Speed,0.6204
0,BarometricPressure,0.549204
12,FuelTemperature,0.204934
6,IntakeManifoldTemperature,0.185168
14,DistanceLtd,0.150617
9,FuelRate,0.122174
15,FuelLtd,0.078713
16,EngineTimeLtd,0.057065
5,FuelLevel,0.031765
11,EngineRpm,0.005081


In [94]:
from tqdm.notebook import tqdm

In [95]:
predictors = [] #up to possibly 100 iterations
for _ in tqdm(range(25)): #if it's a throwaway variable use an underscore
    X_boot, y_boot = bootstrapss()
    pipesgdboot.fit(X_boot,y_boot)
    coefficients = pd.DataFrame({'variable': X_train.columns, 'coefficient': pipesgdboot[1].coef_[0]})
    predictors.extend(coefficients[coefficients.coefficient > 0].variable.to_list())

  0%|          | 0/25 [00:00<?, ?it/s]

In [96]:
from collections import Counter

In [97]:
counter = Counter(predictors)

In [98]:
counter.most_common()

[('BarometricPressure', 25),
 ('Speed', 25),
 ('FuelTemperature', 19),
 ('DistanceLtd', 15),
 ('IntakeManifoldTemperature', 13),
 ('FuelLevel', 10),
 ('FuelLtd', 9),
 ('EngineRpm', 9),
 ('EngineLoad', 7),
 ('AcceleratorPedal', 6),
 ('EngineTimeLtd', 4),
 ('TurboBoostPressure', 3),
 ('FuelRate', 3),
 ('EngineOilTemperature', 1),
 ('EngineOilPressure', 1),
 ('EngineCoolantTemperature', 1)]