In [1]:
import pandas as pd
import datetime as dt
import matplotlib.pyplot as plt
import seaborn as sns
import plotly
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression, SGDClassifier
from sklearn import metrics
from sklearn.metrics import classification_report, f1_score
from sklearn.compose import ColumnTransformer
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report

# Read in CSVs

In [2]:
train = pd.read_csv('../data/training_data.csv', low_memory=False)
train = train.drop(columns=['EquipmentID.1','EquipmentID.2','FaultId'],axis = 1)

In [3]:
test = pd.read_csv('../data/testing_data.csv', low_memory=False)
test = test.drop(columns=['EquipmentID.1','EquipmentID.2','FaultId'],axis = 1) #test needs same number of columns as train

# Further clean test/train data
* want to drop ecuMake X
* throw out IgnStatus_False X
* drop active column X
* throw out 0-1hr derate instances X

In [4]:
# #dummy variables for the string/object data types
train = pd.get_dummies(train, columns = ['IgnStatus'], drop_first = True).drop(columns=['ecuMake','active'])
test = pd.get_dummies(test, columns = ['IgnStatus'], drop_first = True).drop(columns=['ecuMake','active'])

In [5]:
train['time_until_derate'].sort_values()

37        0 days 00:00:00.000000000
59308     0 days 00:00:00.000000000
59314     0 days 00:00:00.000000000
59354     0 days 00:00:00.000000000
59360     0 days 00:00:00.000000000
                    ...            
468951                          NaN
468952                          NaN
468953                          NaN
468954                          NaN
468955                          NaN
Name: time_until_derate, Length: 468956, dtype: object

In [6]:
train['time_until_derate'] = pd.to_timedelta(train['time_until_derate']) 

In [7]:
train = train[~((train['time_until_derate'] >= dt.timedelta(hours=0)) & (train['time_until_derate'] <= dt.timedelta(hours=1)))]

# Standard Scaler 
* do before any model fitting

In [8]:
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler

In [9]:
pipe = Pipeline(steps=[
    
    ('scale',StandardScaler()), 
    ('classifier',LogisticRegression(penalty='l1', solver='saga')) #can also use liblinear as solver
    
]) 

In [11]:
pipesgd = Pipeline(steps=[
    
    ('scale',StandardScaler()), 
    ('classifier',SGDClassifier(class_weight='balanced',penalty='l1', loss = 'log'))
    
]) 

In [12]:
pipesgdnw = Pipeline(steps=[
    
    ('scale',StandardScaler()), 
    ('classifier',SGDClassifier(penalty='l1', loss = 'log'))
    
]) 

# Logistic Regression
* with l1 penalty

In [16]:
# Predictors
X_train = train.drop(columns = ['derate_soon','time_until_derate','EventTimeStamp','EquipmentID'])
#Target Variable
y_train = train['derate_soon']

In [14]:
pipe.fit(X_train, y_train)



Pipeline(steps=[('scale', StandardScaler()),
                ('classifier',
                 LogisticRegression(penalty='l1', solver='saga'))])

In [17]:
X_test = test.drop(columns = ['derate_soon','time_until_derate','EventTimeStamp','EquipmentID'])
y_test = test['derate_soon']

In [16]:
y_pred = pipe.predict(X_test)

In [18]:
y_test = y_test.replace('True',1)
y_test = y_test.replace('False',0)

### How well the model did

In [18]:
accuracy_score(y_test,y_pred)

0.9976629336153735

In [19]:
confusion_matrix(y_test,y_pred)

array([[116965,      8],
       [   266,      2]], dtype=int64)

In [20]:
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

       False       1.00      1.00      1.00    116973
        True       0.20      0.01      0.01       268

    accuracy                           1.00    117241
   macro avg       0.60      0.50      0.51    117241
weighted avg       1.00      1.00      1.00    117241



# SGD
* weighting by class

In [None]:
pipesgd.fit(X_train,y_train)

In [None]:
y_pred = pipesgd.predict(X_test)

### How well the model did

In [None]:
accuracy_score(y_test,y_pred)

In [None]:
confusion_matrix(y_test,y_pred)

In [None]:
print(classification_report(y_test, y_pred))

# Undersampling

In [None]:
from imblearn.under_sampling import RandomUnderSampler

In [None]:
undersampler = RandomUnderSampler(random_state = 321)

In [None]:
X_resampled, y_resampled = undersampler.fit_resample(X_train, y_train)

### Logistic Regression

In [None]:
pipe.fit(X_resampled, y_resampled)

In [None]:
y_pred = pipe.predict(X_test)
accuracy_score(y_test, y_pred)

In [None]:
confusion_matrix(y_test, y_pred)

In [None]:
print(classification_report(y_test, y_pred))

### SGD

In [None]:
pipesgd.fit(X_resampled,y_resampled) #will have to rerun this chunk of code

In [None]:
y_pred = pipesgd.predict(X_test)

In [None]:
accuracy_score(y_test, y_pred)

In [None]:
confusion_matrix(y_test, y_pred)

In [None]:
print(classification_report(y_test, y_pred))

# SMOTE
**S**ynthetic **M**inority **O**versampling **TE**chnique

In [19]:
from imblearn.over_sampling import SMOTE

In [22]:
oversampler = SMOTE(k_neighbors=5, n_jobs=-1, random_state=321)

In [23]:
X_smote, y_smote = oversampler.fit_resample(X_train, y_train)

### Logistic Regression 

In [24]:
pipe.fit(X_smote, y_smote)



Pipeline(steps=[('scale', StandardScaler()),
                ('classifier',
                 LogisticRegression(penalty='l1', solver='saga'))])

In [25]:
y_pred = pipe.predict(X_test)
accuracy_score(y_test, y_pred)

0.9643810612328452

In [26]:
confusion_matrix(y_test, y_pred)

array([[112935,   4038],
       [   138,    130]], dtype=int64)

### SGD

In [43]:
pipesgdnw.fit(X_smote, y_smote)

Pipeline(steps=[('scale', StandardScaler()),
                ('classifier', SGDClassifier(loss='log', penalty='l1'))])

In [44]:
y_pred = pipesgdnw.predict(X_test)
accuracy_score(y_test, y_pred)

0.9625813495278955

In [45]:
confusion_matrix(y_test, y_pred)

array([[112727,   4246],
       [   141,    127]], dtype=int64)

# Lesser SMOTE

In [20]:
lessersmote = SMOTE(k_neighbors=5, n_jobs=-1, random_state=321, sampling_strategy=0.05)
# adjusts the ratio of derate:not derate so we have less fake data points

In [21]:
X_lsmote, y_lsmote = lessersmote.fit_resample(X_train, y_train)

### Logstic Regression

In [22]:
pipe.fit(X_lsmote, y_lsmote)



Pipeline(steps=[('scale', StandardScaler()),
                ('classifier',
                 LogisticRegression(penalty='l1', solver='saga'))])

In [23]:
y_pred = pipe.predict(X_test)
accuracy_score(y_test, y_pred)

0.9912829129741302

In [24]:
confusion_matrix(y_test, y_pred)

array([[116193,    780],
       [   242,     26]], dtype=int64)

### SGD

In [51]:
pipesgdnw.fit(X_lsmote, y_lsmote)

Pipeline(steps=[('scale', StandardScaler()),
                ('classifier', SGDClassifier(loss='log', penalty='l1'))])

In [52]:
y_pred = pipesgdnw.predict(X_test)
accuracy_score(y_test, y_pred)

0.9883317269555872

In [53]:
confusion_matrix(y_test, y_pred)

array([[115834,   1139],
       [   229,     39]], dtype=int64)

# Digging into Logreg w/ SMOTE
* needs cleaning up but using as a resource for later

In [27]:
coefficients = pd.DataFrame({ 'variable': X_train.columns, 'coefficient': pipe[1].coef_[0]})

In [28]:
coefficients.sort_values('coefficient',ascending=False).head(25)

Unnamed: 0,variable,coefficient
2,Speed,0.398646
0,BarometricPressure,0.230548
6,IntakeManifoldTemperature,0.182826
12,FuelTemperature,0.143533
14,DistanceLtd,0.128309
1059,IgnStatus_True,0.092937
5,FuelLevel,0.070448
16,EngineTimeLtd,0.047633
883,74-14,0.047055
15,FuelLtd,0.034312


In [None]:
coefficients[coefficients.variable == 'IgnStatus_False']

# Bootstrap Experimenting

In [None]:
# random subset bootstrap

In [29]:
from sklearn.model_selection import train_test_split

In [30]:
X_boot, X_what, y_boot, y_what = train_test_split(X_train,y_train,test_size=0.9,stratify = y_train)

In [57]:
y_train.value_counts()

False    467684
True        481
Name: derate_soon, dtype: int64

In [60]:
X_train.sample(frac=0.1)

Unnamed: 0,BarometricPressure,EngineLoad,Speed,EngineOilPressure,EngineOilTemperature,FuelLevel,IntakeManifoldTemperature,TurboBoostPressure,EngineCoolantTemperature,FuelRate,...,96-4,96-9,97-15,97-16,97-3,97-4,976-9,98-18,98-5,IgnStatus_True
175834,14.065000,8.000000,6.912755,37.12,203.78750,73.2,87.8,1.16,179.6,1.307656,...,0,0,0,0,0,0,0,0,0,1
405986,14.282500,12.000000,2.951513,19.14,209.46880,40.4,127.4,0.58,183.2,0.554763,...,0,0,0,0,0,0,0,0,0,1
444127,14.210000,23.000000,0.000000,40.02,100.79370,35.6,71.6,0.00,105.8,0.924605,...,0,0,0,0,0,0,0,0,0,1
321639,14.427500,14.000000,5.640885,30.74,205.41870,100.0,136.4,2.03,185.0,0.317007,...,1,0,0,0,0,0,0,0,0,1
425457,14.210000,12.000000,0.165052,19.72,213.74370,48.4,78.8,0.29,181.4,0.581180,...,0,0,0,0,0,0,0,0,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
458823,14.355000,74.000000,60.311840,32.48,219.14380,45.2,80.6,16.24,176.0,12.574630,...,0,0,0,0,0,0,0,0,0,1
234771,14.717500,16.000000,0.000000,24.36,182.97500,22.8,138.2,0.29,165.2,0.752893,...,0,0,0,0,0,0,0,0,0,1
269724,3.553493,32.865604,25.075825,46.98,205.67185,68.8,120.2,2.32,176.0,1.017066,...,0,0,0,0,0,0,0,0,0,0
214373,14.282500,17.000000,9.815723,37.70,199.40000,48.4,91.4,1.16,170.6,2.298304,...,0,0,0,0,0,0,0,0,0,1


In [64]:
inspect = pd.concat([X_train,y_train], axis=1).groupby('derate_soon').apply(lambda x: x.sample(frac=0.1))

In [31]:
import random

In [32]:
derate_index = y_train[y_train].index.to_list()

In [33]:
nonderate_index = y_train[~y_train].index.to_list()

In [34]:
bootstrap_index = random.sample(derate_index, k=50) + random.sample(nonderate_index, k=5000) #we're undersampling here

In [84]:
X_train.loc[bootstrap_index]

Unnamed: 0,BarometricPressure,EngineLoad,Speed,EngineOilPressure,EngineOilTemperature,FuelLevel,IntakeManifoldTemperature,TurboBoostPressure,EngineCoolantTemperature,FuelRate,...,96-4,96-9,97-15,97-16,97-3,97-4,976-9,98-18,98-5,IgnStatus_True
448302,14.7175,12.0,46.360120,35.38,209.9187,54.0,100.4,2.61,186.8,1.452951,...,0,0,0,0,0,0,0,0,0,1
321504,14.7900,12.0,0.000000,23.78,187.9250,100.0,127.4,0.29,179.6,0.581180,...,0,0,0,0,0,0,0,0,0,1
462307,14.4275,11.0,0.000000,57.42,118.9063,90.8,71.6,1.16,118.4,0.739684,...,0,0,0,0,0,0,0,0,0,1
452383,14.2825,77.0,61.807010,34.80,223.6438,80.4,98.6,17.11,181.4,13.287900,...,0,0,0,0,0,0,0,0,0,1
20490,14.4275,0.0,57.321490,35.38,213.7437,77.2,86.0,1.16,186.8,0.000000,...,0,0,0,0,0,0,0,0,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
413198,14.4275,17.0,0.000000,24.94,192.2563,35.2,138.2,0.29,172.4,0.752893,...,0,0,0,0,0,0,0,0,0,1
25133,14.5000,81.0,66.661480,40.60,188.4312,36.0,91.4,24.07,192.2,17.897710,...,0,0,0,0,0,0,0,0,0,1
363985,14.1375,6.0,45.758160,42.92,187.1375,100.0,86.0,0.29,181.4,0.501928,...,0,0,0,0,0,0,0,0,0,1
93223,14.2825,8.0,4.262218,17.40,211.1000,44.0,104.0,1.45,188.6,0.515137,...,0,0,0,0,0,0,0,0,0,1


In [35]:
Var_boot = pd.concat([X_train , y_train],axis=1)

In [91]:
Var_boot

Unnamed: 0,BarometricPressure,EngineLoad,Speed,EngineOilPressure,EngineOilTemperature,FuelLevel,IntakeManifoldTemperature,TurboBoostPressure,EngineCoolantTemperature,FuelRate,...,96-9,97-15,97-16,97-3,97-4,976-9,98-18,98-5,IgnStatus_True,derate_soon
0,14.2825,29.181818,15.130145,38.86,129.70625,59.2,86.0,0.00,102.2,1.585037,...,0,0,0,0,0,0,0,0,1,False
1,14.2825,29.181818,15.130145,38.86,129.70625,59.2,86.0,0.00,102.2,1.585037,...,0,0,0,0,0,0,0,0,1,False
2,14.2825,29.181818,15.130145,38.86,129.70625,59.2,86.0,0.00,102.2,1.585037,...,0,0,0,0,0,0,0,0,1,False
3,14.2825,29.181818,15.130145,38.86,129.70625,59.2,86.0,0.00,102.2,1.585037,...,0,0,0,0,0,0,0,0,1,False
4,14.0650,28.000000,15.130145,41.76,129.70625,59.2,66.2,0.00,64.4,1.585037,...,0,0,0,0,0,0,0,0,1,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
468951,14.4275,0.000000,2.553447,38.28,112.21250,30.0,100.4,2.90,113.0,0.000000,...,0,0,0,0,0,0,0,0,1,False
468952,14.4275,15.000000,1.048564,36.54,140.28130,32.4,96.8,0.87,140.0,0.647223,...,0,0,0,0,0,0,0,0,1,False
468953,14.0650,83.000000,65.962430,42.34,222.68750,38.0,82.4,16.82,185.0,14.727640,...,0,0,0,0,0,0,0,0,1,False
468954,14.1375,53.000000,66.836240,42.34,222.68750,22.0,82.4,8.99,179.6,9.166799,...,0,0,0,0,0,0,0,0,1,False


In [36]:
#to get a bootstrap sample
def bootstrapss():
    bootstrap_index = random.sample(derate_index, k=50) + random.sample(nonderate_index, k=5000)
    boot = Var_boot.loc[bootstrap_index]
    X_boot = boot.drop(columns='derate_soon', axis=1)
    y_boot = boot['derate_soon']
    return X_boot, y_boot

In [37]:
X_boot, y_boot = bootstrapss()

In [94]:
X_boot

Unnamed: 0,BarometricPressure,EngineLoad,Speed,EngineOilPressure,EngineOilTemperature,FuelLevel,IntakeManifoldTemperature,TurboBoostPressure,EngineCoolantTemperature,FuelRate,...,96-4,96-9,97-15,97-16,97-3,97-4,976-9,98-18,98-5,IgnStatus_True
105362,14.065000,67.000000,51.117490,36.54,188.15000,84.0,107.6,18.56,186.8,9.338511,...,0,0,0,0,0,0,0,0,0,1
373931,14.572500,70.000000,54.088420,39.44,197.03750,60.8,91.4,4.93,185.0,11.676440,...,0,0,0,0,0,0,0,0,0,1
284187,14.427500,37.000000,6.475853,31.90,224.09380,63.6,140.0,2.03,186.8,3.130449,...,0,0,0,0,0,0,0,0,0,1
452382,14.282500,9.000000,0.000000,23.78,198.55620,81.2,149.0,0.00,179.6,0.475511,...,0,0,0,0,0,0,0,0,0,1
408761,14.500000,0.000000,0.000000,0.00,111.65000,81.2,93.2,0.00,111.2,0.000000,...,0,0,0,0,0,0,0,0,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
195584,14.500000,16.000000,0.000000,31.32,181.62500,100.0,136.4,0.29,174.2,0.752893,...,0,0,0,0,0,0,0,0,0,1
261472,13.702500,19.000000,9.582709,43.50,187.98120,100.0,125.6,5.80,174.2,3.381413,...,0,0,0,0,0,0,0,0,0,1
343362,14.137500,25.000000,0.000000,31.32,199.79370,60.0,87.8,6.96,186.8,1.849210,...,0,0,0,0,0,0,0,0,0,1
306647,14.380825,24.839477,16.803331,26.68,197.71250,47.6,98.6,0.58,181.4,0.726475,...,0,0,0,0,0,0,0,0,0,1


In [95]:
y_boot

105362     True
373931     True
284187     True
452382     True
408761     True
          ...  
195584    False
261472    False
343362    False
306647    False
317713    False
Name: derate_soon, Length: 5050, dtype: bool

In [38]:
pipesgdboot = Pipeline(steps=[
    
    ('scale',StandardScaler()), 
    ('classifier',SGDClassifier(class_weight={0:1,1:5},penalty='l1', loss = 'log', alpha=0.001)) #let 1:5 run overnight  TBR
    
]) 

In [39]:
%%time
pipesgdboot.fit(X_boot,y_boot)

Wall time: 25.7 s


Pipeline(steps=[('scale', StandardScaler()),
                ('classifier',
                 SGDClassifier(alpha=0.001, class_weight={0: 1, 1: 5},
                               loss='log', penalty='l1'))])

In [40]:
y_pred = pipesgdboot.predict(X_test)

In [41]:
confusion_matrix(y_test,y_pred) #1:5

array([[114854,   2119],
       [   202,     66]], dtype=int64)

In [151]:
confusion_matrix(y_test,y_pred) #1:1

array([[115810,   1163],
       [   207,     61]], dtype=int64)

In [116]:
confusion_matrix(y_test,y_pred) #1:2

array([[114765,   2208],
       [   212,     56]], dtype=int64)

In [120]:
pipesgdboot[1].coef_[0]

array([19.43039447,  0.08167959,  1.67060909, ...,  0.        ,
        0.        , 52.97062997])

In [42]:
coefficients = pd.DataFrame({'variable': X_train.columns, 'coefficient': pipe[1].coef_[0]})

In [43]:
(coefficients.coefficient != 0).sum()

953

In [44]:
coefficients.sort_values('coefficient',ascending=False).head(15)

Unnamed: 0,variable,coefficient
2,Speed,0.398646
0,BarometricPressure,0.230548
6,IntakeManifoldTemperature,0.182826
12,FuelTemperature,0.143533
14,DistanceLtd,0.128309
1059,IgnStatus_True,0.092937
5,FuelLevel,0.070448
16,EngineTimeLtd,0.047633
883,74-14,0.047055
15,FuelLtd,0.034312


In [62]:
coefficients = pd.DataFrame({'variable': X_train.columns, 'coefficient': pipe[1].coef_[0]})
coefficients[coefficients.coefficient > 0].variable.to_list()

['BarometricPressure',
 'Speed',
 'FuelLevel',
 'IntakeManifoldTemperature',
 'FuelRate',
 'FuelTemperature',
 'DistanceLtd',
 'FuelLtd',
 'EngineTimeLtd',
 '6802-31',
 '74-14',
 'IgnStatus_True']

In [46]:
from tqdm.notebook import tqdm

In [47]:
predictors = [] #up to possibly 100 iterations
for _ in tqdm(range(25)): #if it's a throwaway variable use an underscore
    X_boot, y_boot = bootstrapss()
    pipesgdboot.fit(X_boot,y_boot)
    coefficients = pd.DataFrame({'variable': X_train.columns, 'coefficient': pipesgdboot[1].coef_[0]})
    predictors.extend(coefficients[coefficients.coefficient > 0].variable.to_list())

  0%|          | 0/25 [00:00<?, ?it/s]



In [48]:
from collections import Counter

In [49]:
counter = Counter(predictors)

In [60]:
counter.most_common()

[('111-17', 25),
 ('1569-31', 25),
 ('3362-31', 25),
 ('BarometricPressure', 24),
 ('5394-5', 22),
 ('IntakeManifoldTemperature', 21),
 ('IgnStatus_True', 21),
 ('1761-11', 19),
 ('929-9', 19),
 ('96-3', 19),
 ('1761-9', 18),
 ('3031-9', 18),
 ('3364-9', 18),
 ('1761-19', 17),
 ('EngineRpm', 16),
 ('FuelLevel', 15),
 ('1068-2', 15),
 ('1787-11', 15),
 ('4094-18', 15),
 ('4094-31', 15),
 ('6802-31', 14),
 ('3226-9', 13),
 ('3361-4', 13),
 ('5848-9', 13),
 ('4376-3', 13),
 ('3216-9', 12),
 ('4334-4', 12),
 ('4334-18', 11),
 ('3821-11', 11),
 ('Speed', 10),
 ('829-3', 10),
 ('DistanceLtd', 10),
 ('EngineTimeLtd', 9),
 ('AcceleratorPedal', 9),
 ('111-18', 9),
 ('3216-4', 9),
 ('5394-17', 9),
 ('5743-9', 9),
 ('639-2', 9),
 ('5742-9', 9),
 ('FuelLtd', 8),
 ('4342-5', 8),
 ('74-14', 7),
 ('96-9', 7),
 ('SwitchedBatteryVoltage', 7),
 ('4364-18', 7),
 ('84-9', 7),
 ('EngineOilPressure', 7),
 ('3031-18', 7),
 ('3226-20', 6),
 ('171-9', 6),
 ('4334-2', 6),
 ('5394-4', 6),
 ('168-4', 5),
 ('790-9

In [160]:
# cross reference this w/ oluchi, daniel, and courtney's list.
# throw out rows where we have a super common fault code, rerun the model TBR

In [162]:
test.shape

(117241, 1064)

# Matching test data to og data

In [168]:
X_test['5246-16'].value_counts()

0    117224
1        17
Name: 5246-16, dtype: int64

In [50]:
y_pred = pipe.predict(X_test) #lesser smote model

In [176]:
y_pred.shape

(117241,)

In [181]:
test.shape

(117241, 1064)

In [51]:
test['prediction'] = y_pred

In [52]:
test['EventTimeStamp'] = pd.to_datetime(test['EventTimeStamp'])

In [53]:
dr = test[test['5246-16']==1].iloc[3]

In [54]:
truck = dr['EquipmentID']

In [55]:
ts = dr['EventTimeStamp']

In [56]:
if test[(test.EquipmentID == truck) &  (test.EventTimeStamp >= (ts - dt.timedelta(hours=6))) & (test.EventTimeStamp <= (ts - dt.timedelta(hours=1)))]['prediction'].max():
    print('hooray')

hooray


In [57]:
ts - dt.timedelta(hours=6)

Timestamp('2016-11-30 06:31:09')

In [58]:
ts

Timestamp('2016-11-30 12:31:09')

# This is how you evaluate the model you end up choosing:

In [61]:
success = 0
for i in range(len(test[test['5246-16']==1])):
    dr = test[test['5246-16']==1].iloc[i]
    truck = dr['EquipmentID']
    ts = dr['EventTimeStamp']
    if test[(test.EquipmentID == truck) &  
         (test.EventTimeStamp >= (ts - dt.timedelta(hours=24))) & 
         (test.EventTimeStamp <= (ts - dt.timedelta(hours=0.001)))]['prediction'].max() == True:
        success += 1
print(success)

12


present with a disclaimer: lots of false positives but managed to catch about half of the derates. with finetuning, it could do better. look at how many false positives per truck per time period.