In [1]:
import pandas as pd
import datetime as dt
import matplotlib.pyplot as plt
import seaborn as sns
import plotly
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression, SGDClassifier
from sklearn import metrics
from sklearn.metrics import classification_report, f1_score
from sklearn.compose import ColumnTransformer
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report

# Read in CSVs

In [2]:
train = pd.read_csv('../data/training_data.csv', low_memory=False)
train = train.drop(columns=['Unnamed: 0','EquipmentID.1','EquipmentID.2','FaultId'],axis = 1)

In [3]:
test = pd.read_csv('../data/testing_data.csv', low_memory=False)
test = test.drop(columns=['Unnamed: 0','FaultId'],axis = 1) #test needs same number of columns as train

# Match the test data set to the training dataset so NaNs can be filled

In [4]:
# class GroupImputer(BaseEstimator, TransformerMixin):
#     def __init__(self,strategy='mean'):
#         self.strategy = strategy
#     def fit(self, df):
#         imputer ={}
#         if self.strategy == 'mean':
#             self.overall_ = df.mean()
#         if self.strategy == 'median':
#             self.overall_ = df.median()
#         if self.strategy == 'mode':
#             self.overall_ = df.mode().iloc[0]
#         for truck, columns in df.groupby('EquipmentID'):
#             if self.strategy == 'mean':
#                 imputer[truck] = columns.mean()
#             if self.strategy == 'median':
#                 imputer[truck] = columns.median()
#             if self.strategy == 'mode':
#                 imputer[truck] = columns.mode().iloc[0] #iloc is in case there's more than one mode
#         self.imputer_ = imputer
#         return self
#     def transform(self,df):
#         new_df = df.copy()
#         for truck in self.imputer_:
#             val = self.imputer_[truck]
#             new_df.loc[new_df.EquipmentID == truck] = new_df.loc[new_df.EquipmentID == truck].fillna(val)
#         new_df = new_df.fillna(self.overall_)
#         return new_df            

In [5]:
# gi_mean = GroupImputer(strategy='mean')
# gi_median = GroupImputer(strategy='median')
# gi_mode = GroupImputer(strategy='mode')

In [6]:
# mean_col = ['BarometricPressure','EngineLoad','Speed','EquipmentID']
# median_col = ['EngineOilPressure','EngineOilTemperature','FuelLevel','IntakeManifoldTemperature','TurboBoostPressure','EngineCoolantTemperature','FuelRate','EquipmentID']
# mode_col = ['AcceleratorPedal','EngineRpm','FuelTemperature','SwitchedBatteryVoltage','IgnStatus','DistanceLtd','FuelLtd','EngineTimeLtd','EquipmentID']

In [7]:
# ct = ColumnTransformer(transformers=[
#     ('mean',gi_mean,mean_col),
#     ('median',gi_median,median_col),
#     ('mode',gi_mode,mode_col)
# ], remainder = 'passthrough')

In [8]:
# ct.fit(train)

ColumnTransformer(remainder='passthrough',
                  transformers=[('mean', GroupImputer(),
                                 ['BarometricPressure', 'EngineLoad', 'Speed',
                                  'EquipmentID']),
                                ('median', GroupImputer(strategy='median'),
                                 ['EngineOilPressure', 'EngineOilTemperature',
                                  'FuelLevel', 'IntakeManifoldTemperature',
                                  'TurboBoostPressure',
                                  'EngineCoolantTemperature', 'FuelRate',
                                  'EquipmentID']),
                                ('mode', GroupImputer(strategy='mode'),
                                 ['AcceleratorPedal', 'EngineRpm',
                                  'FuelTemperature', 'SwitchedBatteryVoltage',
                                  'IgnStatus', 'DistanceLtd', 'FuelLtd',
                                  'EngineTimeLtd', 'EquipmentID'])]

In [9]:
# test = test[train.columns] #rearranges columns in test to match that of train

In [10]:
# imputed_cols = mean_col + median_col + mode_col
# remaining_cols = [x for x in train.columns if x not in imputed_cols]

In [11]:
# remaining_cols

['ecuMake',
 'active',
 'derate_soon',
 '0-0',
 '100-1',
 '100-18',
 '100-2',
 '100-3',
 '100-4',
 '100-5',
 '101-0',
 '101-14',
 '101-15',
 '101-16',
 '101-2',
 '101-3',
 '101-4',
 '101-5',
 '102-10',
 '102-14',
 '102-15',
 '102-16',
 '102-17',
 '102-18',
 '102-2',
 '102-20',
 '102-3',
 '102-31',
 '102-4',
 '102-5',
 '1023-5',
 '1024-0',
 '1028-9',
 '103-1',
 '103-10',
 '103-16',
 '103-18',
 '103-4',
 '103-7',
 '103-9',
 '1043-2',
 '1045-2',
 '1045-7',
 '105-0',
 '105-17',
 '105-18',
 '105-2',
 '105-3',
 '105-5',
 '1056-2',
 '1056-4',
 '1056-5',
 '1059-2',
 '1067-11',
 '1067-2',
 '1067-7',
 '1068-2',
 '107-3',
 '107-4',
 '1071-5',
 '1072-1',
 '1072-3',
 '1072-4',
 '1072-5',
 '1073-3',
 '1075-3',
 '1078-4',
 '108-3',
 '108-4',
 '1081-9',
 '110-0',
 '110-16',
 '110-18',
 '110-2',
 '110-3',
 '110-31',
 '110-4',
 '111-1',
 '111-17',
 '111-18',
 '111-3',
 '111-4',
 '111-5',
 '1127-14',
 '1127-16',
 '1127-18',
 '114863-11',
 '116-3',
 '116-31',
 '116-4',
 '1172-2',
 '1172-3',
 '1172-4',
 '1

In [12]:
# test2 = ct.transform(test)

In [13]:
# test2 = pd.DataFrame(test2, columns=imputed_cols + remaining_cols)
# test2.head()

Unnamed: 0,BarometricPressure,EngineLoad,Speed,EquipmentID,EngineOilPressure,EngineOilTemperature,FuelLevel,IntakeManifoldTemperature,TurboBoostPressure,EngineCoolantTemperature,...,96-3,96-4,96-9,97-15,97-16,97-3,97-4,976-9,98-18,98-5
0,14.2825,33.0,0.0,1327,39.44,91.7375,59.2,80.6,2.9,95.0,...,0,0,0,0,0,0,0,0,0,0
1,14.2825,29.1818,15.1301,1327,38.86,129.706,59.2,86.0,0.0,102.2,...,0,0,0,0,0,0,0,0,0,0
2,14.2825,29.1818,15.1301,1327,38.86,129.706,59.2,86.0,0.0,102.2,...,0,0,0,0,0,0,0,0,0,0
3,14.2825,29.1818,15.1301,1327,38.86,129.706,59.2,86.0,0.0,102.2,...,0,0,0,0,0,0,0,0,0,0
4,14.2825,29.1818,15.1301,1327,38.86,129.706,59.2,86.0,0.0,102.2,...,0,0,0,0,0,0,0,0,0,0


In [14]:
# list(test2.columns)

['BarometricPressure',
 'EngineLoad',
 'Speed',
 'EquipmentID',
 'EngineOilPressure',
 'EngineOilTemperature',
 'FuelLevel',
 'IntakeManifoldTemperature',
 'TurboBoostPressure',
 'EngineCoolantTemperature',
 'FuelRate',
 'EquipmentID',
 'AcceleratorPedal',
 'EngineRpm',
 'FuelTemperature',
 'SwitchedBatteryVoltage',
 'IgnStatus',
 'DistanceLtd',
 'FuelLtd',
 'EngineTimeLtd',
 'EquipmentID',
 'ecuMake',
 'active',
 'derate_soon',
 '0-0',
 '100-1',
 '100-18',
 '100-2',
 '100-3',
 '100-4',
 '100-5',
 '101-0',
 '101-14',
 '101-15',
 '101-16',
 '101-2',
 '101-3',
 '101-4',
 '101-5',
 '102-10',
 '102-14',
 '102-15',
 '102-16',
 '102-17',
 '102-18',
 '102-2',
 '102-20',
 '102-3',
 '102-31',
 '102-4',
 '102-5',
 '1023-5',
 '1024-0',
 '1028-9',
 '103-1',
 '103-10',
 '103-16',
 '103-18',
 '103-4',
 '103-7',
 '103-9',
 '1043-2',
 '1045-2',
 '1045-7',
 '105-0',
 '105-17',
 '105-18',
 '105-2',
 '105-3',
 '105-5',
 '1056-2',
 '1056-4',
 '1056-5',
 '1059-2',
 '1067-11',
 '1067-2',
 '1067-7',
 '1068-2

In [15]:
# column_numbers = [x for x in range(test2.shape[1])]  # list of columns' integer indices

# column_numbers.remove(3)
# column_numbers.remove(11)
# test2 = test2.iloc[:, column_numbers] #return all columns except the 3,11th column


In [16]:
# test2.head()

Unnamed: 0,BarometricPressure,EngineLoad,Speed,EngineOilPressure,EngineOilTemperature,FuelLevel,IntakeManifoldTemperature,TurboBoostPressure,EngineCoolantTemperature,FuelRate,...,96-3,96-4,96-9,97-15,97-16,97-3,97-4,976-9,98-18,98-5
0,14.2825,33.0,0.0,39.44,91.7375,59.2,80.6,2.9,95.0,2.15301,...,0,0,0,0,0,0,0,0,0,0
1,14.2825,29.1818,15.1301,38.86,129.706,59.2,86.0,0.0,102.2,1.58504,...,0,0,0,0,0,0,0,0,0,0
2,14.2825,29.1818,15.1301,38.86,129.706,59.2,86.0,0.0,102.2,1.58504,...,0,0,0,0,0,0,0,0,0,0
3,14.2825,29.1818,15.1301,38.86,129.706,59.2,86.0,0.0,102.2,1.58504,...,0,0,0,0,0,0,0,0,0,0
4,14.2825,29.1818,15.1301,38.86,129.706,59.2,86.0,0.0,102.2,1.58504,...,0,0,0,0,0,0,0,0,0,0


# Further clean test/train data
* want to drop ecuMake

In [17]:
# train['ecuMake'] = train['ecuMake'].replace('?MMNS','CMMNS')
# train['ecuMake'] = train['ecuMake'].replace('??MNS','CMMNS')
# train['ecuMake'] = train['ecuMake'].replace('?CAR','PCAR')
# train['ecuMake'] = train['ecuMake'].replace('???R','PCAR')
# train['ecuMake'] = train['ecuMake'].replace('?ACCR','PACCR')
# train['ecuMake'] = train['ecuMake'].replace('????R','PACCR')
# train['ecuMake'] = train['ecuMake'].replace('???CR','PACCR')
# train['ecuMake'] = train['ecuMake'].replace('?ATON','EATON')
# train['ecuMake'] = train['ecuMake'].replace('?NDWS','BNDWS')
# train['ecuMake'] = train['ecuMake'].replace('??DWS','BNDWS')

# train = train[(train['ecuMake'] == 'CMMNS') | (train['ecuMake'] == 'PCAR') | (train['ecuMake'] == 'PACCR') | 
#               (train['ecuMake'] == 'EATON') | (train['ecuMake'] == 'BNDWS') | (train['ecuMake'] == 'VOLVO')]

# test2['ecuMake'] = test2['ecuMake'].replace('?MMNS','CMMNS')
# test2['ecuMake'] = test2['ecuMake'].replace('??MNS','CMMNS')
# test2['ecuMake'] = test2['ecuMake'].replace('?CAR','PCAR')
# test2['ecuMake'] = test2['ecuMake'].replace('???R','PCAR')
# test2['ecuMake'] = test2['ecuMake'].replace('?ACCR','PACCR')
# test2['ecuMake'] = test2['ecuMake'].replace('????R','PACCR')
# test2['ecuMake'] = test2['ecuMake'].replace('???CR','PACCR')
# test2['ecuMake'] = test2['ecuMake'].replace('?ATON','EATON')
# test2['ecuMake'] = test2['ecuMake'].replace('?NDWS','BNDWS')
# test2['ecuMake'] = test2['ecuMake'].replace('??DWS','BNDWS')

# test2 = test2[(test2['ecuMake'] == 'CMMNS') | (test2['ecuMake'] == 'PCAR') | (test2['ecuMake'] == 'PACCR') | 
#               (test2['ecuMake'] == 'EATON') | (test2['ecuMake'] == 'BNDWS') | (test2['ecuMake'] == 'VOLVO')]


In [18]:
# #dummy variables for the string/object data types
train = pd.get_dummies(train, columns = ['ecuMake', 'IgnStatus']).drop(columns='EquipmentID',axis=1)
test2 = pd.get_dummies(test2, columns = ['ecuMake', 'IgnStatus']).drop(columns='EquipmentID',axis=1)

In [19]:
# list(test.columns)

# Logistic Regression
* with l1 penalty

In [20]:
# Predictors
X_train = train.drop(columns = 'derate_soon')
#Target Variable
y_train = train['derate_soon']

In [21]:
log = LogisticRegression(penalty='l1', solver='liblinear')

In [22]:
log.fit(X_train, y_train)

LogisticRegression(penalty='l1', solver='liblinear')

In [23]:
X_test = test2.drop(columns = 'derate_soon')
y_test = test2['derate_soon']

In [24]:
y_pred = log.predict(X_test)

In [25]:
y_test = y_test.replace('True',1)
y_test = y_test.replace('False',0)

### How well the model did

In [26]:
accuracy_score(y_test,y_pred)

0.9975788798726425

In [27]:
confusion_matrix(y_test,y_pred)

array([[90235,     0],
       [  219,     0]], dtype=int64)

Not great considering it guessed 0 correct derates out of the 219 provided.

In [28]:
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

       False       1.00      1.00      1.00     90235
        True       0.00      0.00      0.00       219

    accuracy                           1.00     90454
   macro avg       0.50      0.50      0.50     90454
weighted avg       1.00      1.00      1.00     90454



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


# SGD
* weighting by class

In [29]:
sgd = SGDClassifier(class_weight='balanced',penalty='l1', loss = 'log')

In [30]:
sgd.fit(X_train,y_train)

SGDClassifier(class_weight='balanced', loss='log', penalty='l1')

In [31]:
y_pred = sgd.predict(X_test)

### How well the model did

In [32]:
accuracy_score(y_test,y_pred)

0.011707608287085149

In [33]:
confusion_matrix(y_test,y_pred)

array([[  841, 89394],
       [    1,   218]], dtype=int64)

This has the opposite problem: We caught 217 of the 219 derates but our accuracy is garbage.

# Undersampling

In [34]:
from imblearn.under_sampling import RandomUnderSampler

In [35]:
undersampler = RandomUnderSampler(random_state = 321)

In [36]:
X_resampled, y_resampled = undersampler.fit_resample(X_train, y_train)

### Logistic Regression

In [37]:
log.fit(X_resampled, y_resampled)

LogisticRegression(penalty='l1', solver='liblinear')

In [38]:
y_pred = log.predict(X_test)
accuracy_score(y_test, y_pred)

0.8320251177394035

In [39]:
confusion_matrix(y_test, y_pred)

array([[75087, 15148],
       [   46,   173]], dtype=int64)

This is getting closer from our earlier Logistic Regression model.

### SGD

In [40]:
sgd.fit(X_resampled,y_resampled)

SGDClassifier(class_weight='balanced', loss='log', penalty='l1')

In [41]:
y_pred = sgd.predict(X_test)

In [42]:
accuracy_score(y_test, y_pred)

0.9907245671833197

In [43]:
confusion_matrix(y_test, y_pred)

array([[89615,   620],
       [  219,     0]], dtype=int64)

It seems adding the undersampling gives us the same high accuracy because it didn't guess any of them.

# SMOTE
**S**ynthetic **M**inority **O**versampling **TE**chnique

In [44]:
from imblearn.over_sampling import SMOTE

In [45]:
oversampler = SMOTE(k_neighbors=5, n_jobs=-1, random_state=321)

In [46]:
X_smote, y_smote = oversampler.fit_resample(X_train, y_train)

### Logistic Regression **best model so far

In [47]:
log.fit(X_smote, y_smote)

LogisticRegression(penalty='l1', solver='liblinear')

In [48]:
y_pred = log.predict(X_test)
accuracy_score(y_test, y_pred)

0.967519402127048

In [49]:
confusion_matrix(y_test, y_pred)

array([[87417,  2818],
       [  120,    99]], dtype=int64)

### SGD

In [50]:
sgd.fit(X_smote, y_smote)

SGDClassifier(class_weight='balanced', loss='log', penalty='l1')

In [51]:
y_pred = sgd.predict(X_test)
accuracy_score(y_test, y_pred)

0.037145952638910386

In [52]:
confusion_matrix(y_test, y_pred)

array([[ 3146, 87089],
       [    5,   214]], dtype=int64)

### SGD with a lesser SMOTE

In [None]:
# lessersmote = SMOTE(k_neighbors=5, n_jobs=-1, random_state=321, sampling_strategy=0.05) TBR
# adjusts the ratio of derate:not derate so we have less fake data points

# Standard Scaler 
* do before any model fitting

In [54]:
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler

In [None]:
# pipe = Pipeline(steps=[
    
#     ('scale',StandardScaler()), 
#     ('classifier',LogisticRegression(penalty='l1', solver='saga'))
    
# ]) # TBR

# Digging into Logreg w/ SMOTE

In [55]:
#will also work for sgd models
log.coef_[0]

array([ 2.80900018e-01, -3.52946756e-03,  1.54155282e-02, ...,
       -2.58608187e+01, -3.11585118e+00,  1.71524322e+00])

In [57]:
coefficients = pd.DataFrame({ 'variable': X_train.columns, 'coefficient': log.coef_[0]})

In [67]:
coefficients.sort_values('coefficient',ascending=False).tail(25)

Unnamed: 0,variable,coefficient
1010,91-1,-12.626525
1037,94-1,-12.719919
799,630-12,-12.759499
462,3610-2,-12.81753
771,609-12,-12.837942
822,641-9,-12.861233
748,5848-4,-13.00786
52,103-4,-13.1012
676,5396-1,-13.168979
148,1327-1,-13.177743


In [65]:
coefficients[coefficients.variable == 'IgnStatus_False']

Unnamed: 0,variable,coefficient
1066,IgnStatus_False,-3.115851
