In [1]:
import pandas as pd
import regex as re
import pandas_profiling
import datetime
import numpy as np

In [2]:
pd.set_option('display.max_row', 1000)
pd.set_option('display.max_columns', 50)

# Reading in the csv as dataframes
naming conventions chosen by Matthew

In [3]:
# onboard_df = pd.read_csv('../data/VehicleDiagnosticOnboardData.csv')
# onboard_df.head()
# #in long form, may need to pivot remember the data is organized. faultid=recordid?

In [4]:
fault_df = pd.read_csv('../data/J1939Faults.csv', low_memory=False)
fault_df.head()

Unnamed: 0,RecordID,ESS_Id,EventTimeStamp,eventDescription,actionDescription,ecuSoftwareVersion,ecuSerialNumber,ecuModel,ecuMake,ecuSource,spn,fmi,active,activeTransitionCount,faultValue,EquipmentID,MCTNumber,Latitude,Longitude,LocationTimeStamp
0,1,990349,2015-02-21 10:47:13.000,Low (Severity Low) Engine Coolant Level,,unknown,unknown,unknown,unknown,0,111,17,True,2,,1439,105354361,38.857638,-84.626851,2015-02-21 11:34:25.000
1,2,990360,2015-02-21 11:34:34.000,,,unknown,unknown,unknown,unknown,11,629,12,True,127,,1439,105354361,38.857638,-84.626851,2015-02-21 11:35:10.000
2,3,990364,2015-02-21 11:35:31.000,Incorrect Data Steering Wheel Angle,,unknown,unknown,unknown,unknown,11,1807,2,False,127,,1369,105336226,41.42125,-87.767361,2015-02-21 11:35:26.000
3,4,990370,2015-02-21 11:35:33.000,Incorrect Data Steering Wheel Angle,,unknown,unknown,unknown,unknown,11,1807,2,True,127,,1369,105336226,41.421018,-87.767361,2015-02-21 11:36:08.000
4,5,990416,2015-02-21 11:39:41.000,,,22281684P01*22357957P01*22362082P01*,13063430,0USA13_13_0415_2238A,VOLVO,0,4364,17,False,2,,1674,105427130,38.416481,-89.442638,2015-02-21 11:39:37.000


# Cleaning fault data

##### From the readme: 
<br> "Remove faults occurring in the vicinity of the service locations at (36.0666667, -86.4347222), (35.5883333, -86.4438888), and (36.1950, -83.174722)" <br> <br>


We want to go with 3 decimal points to get within 111m.

In [5]:
fault_df[['Latitude', 'Longitude','EquipmentID']] = fault_df[['Latitude','Longitude','EquipmentID']].astype(str)
#Per tony's findings, EquipmentId needs to be a string for filtering it later to work

In [6]:
fault_df['Latitude'] = fault_df['Latitude'].str.extract(r'(\-?\d+.\d{3})')

In [7]:
fault_df['Longitude'] = fault_df['Longitude'].str.extract(r'(\-?\d+.\d{3})')

In [8]:
fault_df['Location'] = fault_df['Latitude'].map(str) + ' , ' + fault_df['Longitude'].map(str)

In [9]:
fault_df = fault_df.loc[(fault_df.Location != '36.066 , -86.434') & (fault_df.Location != '35.588 , -86.443') & (fault_df.Location != '36.195 , -83.174')]

In [10]:
fault_df.shape

(1139316, 21)

##### From the readme:
<br> "Remove faults where the EquipmentID has more than 5 characters." confirmed with Michael to also drop NaNs

In [11]:
fault_df['EquipmentID'] = fault_df['EquipmentID'].dropna()

In [12]:
fault_df = fault_df[fault_df['EquipmentID'].str.len()<=5] 

In [13]:
fault_df.shape

(1137322, 21)

In [14]:
fault_df['EquipmentID'].str.len().unique()

array([4, 3, 5], dtype=int64)

##### our own analysis:
<br> actionDescription,faultValue,ecuSource were all empty columns that we'll remove.

In [15]:
fault_df = fault_df.drop(['actionDescription','faultValue','ecuSource'],axis=1)

In [16]:
fault_df.columns

Index(['RecordID', 'ESS_Id', 'EventTimeStamp', 'eventDescription',
       'ecuSoftwareVersion', 'ecuSerialNumber', 'ecuModel', 'ecuMake', 'spn',
       'fmi', 'active', 'activeTransitionCount', 'EquipmentID', 'MCTNumber',
       'Latitude', 'Longitude', 'LocationTimeStamp', 'Location'],
      dtype='object')

## Data Dictionary
* RecordID: same as FaultId in onboard_df. is the ID associated with the specific fault issue
* ESS_Id: the event subscriber service event that contained the fault
* EventTimeStamp: when the above event took place
* eventDescription
* ecuSoftwareVersion: version string from reporting vehicle computer system
* ecuSerialNumber: serial number of the ECM (Engine Control Module)
* ecuModel: model of reporting ECM
* ecuMake: Manufacturer of reporting ECM
* spn: fault code being reported
* fmi: failure mode associated with the fault code
* active: whether code is being set or removed
* activeTransitionCount: number of times the code has been set or unset
* EquipmentID: assigned truck number
* MCTNumber: communications terminal assigned to the truck
* Latitude
* Longitude
* LocationTimeStamp: time location of time of event was recorded
* Location

In [17]:
fault_df['EquipmentID'].unique()

array(['1439', '1369', '1674', ..., '2380', '2378', '2381'], dtype=object)

### To do:
* did a deratement happen within the next 1-6 hours? create a true/false column for this
* get rid of columns we don't need
* figure out what to do with the onboard_df data: bring it into the fault_df and...
* figure out what to do with the missing values in it (blog post resource: https://leportella.com/missing-data/#:~:text=Interpolation%20is%20a%20mathematical%20method,data%20and%20the%20value%20after)

In [18]:
#From Michael: keep all trucks since we get an idea of what a not derated truck looks like. 

### Dealing with datetime data
Converting our timestamps to datetime as well as creating a loop to see if a deratement was done within 1-6 hours of the fault reported

In [19]:
fault_df['LocationTimeStamp'] = pd.to_datetime(fault_df['LocationTimeStamp'])

In [20]:
fault_df['EventTimeStamp'] = pd.to_datetime(fault_df['EventTimeStamp'])

In [21]:
#first create a column filling in the timestamp where the 5246 occurs
#fault_df['time_next_5246'] = pd.DataFrame.where(cond=(fault_df['spn'] == 5246), self=fault_df['EventTimeStamp'])

In [22]:
#checking to see if it worked
#fault_df[(fault_df['spn'] == 1569)|(fault_df['spn'] == 5246)]

In [23]:
#then back-fill with the spn 5246 grouped by EquipmentID
#fault_df['time_next_5246'] = fault_df[['time_next_5246','EquipmentID']].groupby('EquipmentID').fillna(fault_df['time_next_5246'])

In [24]:
#test our data
#fault_df[(fault_df['spn'] == 1569)|(fault_df['spn'] == 5246)]

In [25]:
#sort by EventTimeStamp
df = fault_df[fault_df.EquipmentID == '1490'].sort_values('EventTimeStamp') #this truck has a lot of full derates

In [26]:
#Make new column that has values for the rows 5246
# df['time_next_5246'] =  pd.DataFrame.where(cond=(df['spn'] == 5246), self=df['EventTimeStamp']).fillna(method='bfill')

In [27]:
#Use bfill on this column to fill in missing values
#df['time_next_5246'] = df['time_next_5246'].fillna(method='bfill')

In [28]:
#turn the above code into a function we can apply to a groupby on the full df
def derate_time(df):
    df['time_next_5246'] =  pd.DataFrame.where(cond=(df['spn'] == 5246), 
                                               self=df['EventTimeStamp']).fillna(method='bfill')
    return df   

In [29]:
#check if our function works
derate_time(df)

Unnamed: 0,RecordID,ESS_Id,EventTimeStamp,eventDescription,ecuSoftwareVersion,ecuSerialNumber,ecuModel,ecuMake,spn,fmi,active,activeTransitionCount,EquipmentID,MCTNumber,Latitude,Longitude,LocationTimeStamp,Location,time_next_5246
2865,2866,1024457,2015-02-23 14:43:04,,04993120*00063662*040213150018*07700044*I0*BBZ*,79488009,6X1u10D1500000000,CMMNS,4334,18,True,1,1490,105369865,40.391,-86.792,2015-02-23 14:43:40,"40.391 , -86.792",2015-07-24 16:39:41
4009,4010,1045409,2015-02-24 11:20:30,,04993120*00063662*040213150018*07700044*I0*BBZ*,79488009,6X1u10D1500000000,CMMNS,4334,18,False,1,1490,105369865,36.067,-86.434,2015-02-24 11:20:11,"36.067 , -86.434",2015-07-24 16:39:41
4361,4362,1050117,2015-02-24 14:52:39,,04993120*00063662*040213150018*07700044*I0*BBZ*,79488009,6X1u10D1500000000,CMMNS,4334,18,True,1,1490,105369865,37.59,-85.869,2015-02-24 14:36:00,"37.590 , -85.869",2015-07-24 16:39:41
5489,5490,1067152,2015-02-25 11:27:39,,04993120*00063662*040213150018*07700044*I0*BBZ*,79488009,6X1u10D1500000000,CMMNS,4334,18,False,1,1490,105369865,41.832,-87.749,2015-02-25 11:27:35,"41.832 , -87.749",2015-07-24 16:39:41
39378,40490,2204533,2015-05-01 15:18:01,,04993120*00063662*040213150018*07700044*I0*BBZ*,79488009,6X1u10D1500000000,CMMNS,4334,18,True,1,1490,105369865,38.509,-78.784,2015-05-01 15:18:37,"38.509 , -78.784",2015-07-24 16:39:41
39851,40963,2211666,2015-05-02 06:46:44,,04993120*00063662*040213150018*07700044*I0*BBZ*,79488009,6X1u10D1500000000,CMMNS,4334,18,False,1,1490,105369865,36.486,-80.747,2015-05-02 06:46:40,"36.486 , -80.747",2015-07-24 16:39:41
54742,56812,2475232,2015-05-17 16:53:22,,04993120*00063662*040213150018*07700044*I0*BBZ*,79488009,6X1u10D1500000000,CMMNS,4334,18,True,1,1490,105369865,38.295,-85.628,2015-05-17 16:53:58,"38.295 , -85.628",2015-07-24 16:39:41
55531,57601,2487690,2015-05-18 13:06:50,,04993120*00063662*040213150018*07700044*I0*BBZ*,79488009,6X1u10D1500000000,CMMNS,4334,18,False,1,1490,105369865,38.109,-87.554,2015-05-18 13:06:46,"38.109 , -87.554",2015-07-24 16:39:41
58293,60363,2533311,2015-05-20 15:00:15,,04993120*00063662*040213150018*07700044*I0*BBZ*,79488009,6X1u10D1500000000,CMMNS,4334,18,True,1,1490,105369865,34.054,-84.593,2015-05-20 15:00:51,"34.054 , -84.593",2015-07-24 16:39:41
58957,61027,2543638,2015-05-21 06:49:00,,04993120*00063662*040213150018*07700044*I0*BBZ*,79488009,6X1u10D1500000000,CMMNS,4334,18,False,1,1490,105369865,35.779,-83.987,2015-05-21 06:48:55,"35.779 , -83.987",2015-07-24 16:39:41


In [30]:
#running it on two trucks to see if the groupby works
df = fault_df[(fault_df.EquipmentID == '1490') | (fault_df.EquipmentID == '1585')].sort_values('EventTimeStamp') #this truck has a lot of full derates

In [31]:
#it works on the smaller df!
df.groupby('EquipmentID').apply(derate_time)

Unnamed: 0,RecordID,ESS_Id,EventTimeStamp,eventDescription,ecuSoftwareVersion,ecuSerialNumber,ecuModel,ecuMake,spn,fmi,active,activeTransitionCount,EquipmentID,MCTNumber,Latitude,Longitude,LocationTimeStamp,Location,time_next_5246
27,28,990636,2015-02-21 11:53:02,Low (Severity Low) Engine Coolant Level,04993120*00187677*082113134117*07700053*I0*BBZ*,79619763,6X1u10D1500000000,CMMNS,111,17,False,1,1585,105443412,39.202,-85.957,2015-02-21 11:40:35,"39.202 , -85.957",2017-03-19 05:22:04
34,35,990771,2015-02-21 12:01:10,Low (Severity Low) Engine Coolant Level,04993120*00187677*082113134117*07700053*I0*BBZ*,79619763,6X1u10D1500000000,CMMNS,111,17,True,1,1585,105443412,38.904,-85.823,2015-02-21 12:01:53,"38.904 , -85.823",2017-03-19 05:22:04
36,37,990818,2015-02-21 12:04:50,Low (Severity Low) Engine Coolant Level,04993120*00187677*082113134117*07700053*I0*BBZ*,79619763,6X1u10D1500000000,CMMNS,111,17,False,1,1585,105443412,38.870,-85.810,2015-02-21 12:04:45,"38.870 , -85.810",2017-03-19 05:22:04
85,86,991438,2015-02-21 12:10:34,Low (Severity Low) Engine Coolant Level,04993120*00187677*082113134117*07700053*I0*BBZ*,79619763,6X1u10D1500000000,CMMNS,111,17,True,1,1585,105443412,38.348,-85.735,2015-02-21 12:41:54,"38.348 , -85.735",2017-03-19 05:22:04
92,93,991537,2015-02-21 12:47:36,Low (Severity Low) Engine Coolant Level,04993120*00187677*082113134117*07700053*I0*BBZ*,79619763,6X1u10D1500000000,CMMNS,111,17,False,1,1585,105443412,38.353,-85.706,2015-02-21 12:47:32,"38.353 , -85.706",2017-03-19 05:22:04
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
815798,837162,21824033,2017-07-27 06:34:17,Low (Severity Low) Catalyst Tank Level,04993120*00165661*042216134005*07700071*I0*BBZ*,60815807,6X1u10D1500000000,CMMNS,1761,17,False,1,1585,105392547,,-83.478,2017-07-27 06:34:13,"nan , -83.478",NaT
827491,848855,23484498,2017-08-14 06:26:17,Low Voltage (Power Supply (obsolete)),BB41103* BB41104*,S211200763,EC60-adv,BNDWS,627,4,True,127,1585,105392547,36.160,-77.691,2017-08-14 06:36:12,"36.160 , -77.691",NaT
827548,848912,23489707,2017-08-14 07:43:32,Low Voltage (Power Supply (obsolete)),BB41103* BB41104*,S211200763,EC60-adv,BNDWS,627,4,False,127,1585,105392547,36.160,-77.691,2017-08-14 07:43:28,"36.160 , -77.691",NaT
833126,855463,24309226,2017-08-22 14:18:31,Low Voltage (Power Supply (obsolete)),,,,,627,4,True,127,1585,105392547,36.160,-77.691,2017-08-22 14:56:58,"36.160 , -77.691",NaT


In [32]:
#testing it out on the full df
fault_df = fault_df.sort_values('EventTimeStamp')

In [33]:
fault_df = fault_df.groupby('EquipmentID').apply(derate_time)

In [34]:
#make sure they're not all NaT
fault_df.time_next_5246.unique()

array([                          'NaT', '2019-02-14T13:46:15.000000000',
       '2019-04-29T05:02:21.000000000', ...,
       '2020-02-06T08:03:09.000000000', '2020-02-13T14:01:40.000000000',
       '2020-02-17T17:08:39.000000000'], dtype='datetime64[ns]')

### let's see if we can make a next column that inputs True/False for if a deratement occurs within the next x hours (time tbd)


In [35]:
#first subtracting datetime data types
testdf = fault_df[fault_df['time_next_5246'].notnull()]

In [36]:
testdf['time_until_derate'] = testdf['time_next_5246']-testdf['EventTimeStamp']

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  testdf['time_until_derate'] = testdf['time_next_5246']-testdf['EventTimeStamp']


In [37]:
testdf.head()

Unnamed: 0,RecordID,ESS_Id,EventTimeStamp,eventDescription,ecuSoftwareVersion,ecuSerialNumber,ecuModel,ecuMake,spn,fmi,active,activeTransitionCount,EquipmentID,MCTNumber,Latitude,Longitude,LocationTimeStamp,Location,time_next_5246,time_until_derate
1154225,1211449,108631659,2000-03-19 10:51:28,Low (Severity Medium) Engine Coolant Level,04358814*06075794*030816202706*09400153*G1*BDR*,79921284,6X1u13D1500000000,CMMNS,111,18,False,12,1961,105383198,36.698,-87.453,2000-03-19 10:51:23,"36.698 , -87.453",2019-02-14 13:46:15,6906 days 02:54:47
1154310,1211534,108663627,2000-03-19 18:56:09,High Voltage (Fuel Level),,,CECU3B-NAMUX4,PACCR,96,3,False,126,1970,105438602,40.966,-87.24,2000-03-19 18:56:04,"40.966 , -87.240",2019-04-29 05:02:21,6979 days 10:06:12
1154309,1211533,108663626,2000-03-19 18:56:09,High Voltage (Left Fuel Level Sensor),,,CECU3B-NAMUX4,PACCR,829,3,False,126,1970,105438602,40.966,-87.24,2000-03-19 18:56:04,"40.966 , -87.240",2019-04-29 05:02:21,6979 days 10:06:12
982227,1016466,49680016,2010-12-31 22:04:03,Incorrect Data J1939 Network #1 Primary Vehicl...,AAAI000032*AAAM000038*BB41275 *A82J140721A_9...,5W24155105,EC80ESP,BNDWS,639,2,False,4,1827,105464811,36.123,-86.398,2018-05-22 11:33:20,"36.123 , -86.398",2019-01-21 09:01:38,2942 days 10:57:35
501145,512094,10004179,2010-12-31 23:04:15,Incorrect Data J1939 Network #1 Primary Vehicl...,BB41103* BB41104*,S451311881,EC60-adv,BNDWS,639,2,True,127,302,105435601,38.348,-85.708,2016-06-25 15:01:54,"38.348 , -85.708",2020-01-06 10:13:57,3292 days 11:09:42


In [38]:
#we can get rid of eventDescription
fault_df[fault_df.spn == 5246]['eventDescription'].unique()

array([nan], dtype=object)

In [39]:
fault_df['time_until_derate'] = fault_df['time_next_5246']-fault_df['EventTimeStamp']

In [40]:
fault_df

Unnamed: 0,RecordID,ESS_Id,EventTimeStamp,eventDescription,ecuSoftwareVersion,ecuSerialNumber,ecuModel,ecuMake,spn,fmi,active,activeTransitionCount,EquipmentID,MCTNumber,Latitude,Longitude,LocationTimeStamp,Location,time_next_5246,time_until_derate
1154194,1211418,108604426,2000-03-18 19:14:10,High Voltage (Fuel Level),,,CECU3B-NAMUX4,PACCR,96,3,True,126,2015,105427130,36.935,-86.507,2000-03-18 19:14:46,"36.935 , -86.507",NaT,NaT
1154193,1211417,108604425,2000-03-18 19:14:10,High Voltage (Left Fuel Level Sensor),,,CECU3B-NAMUX4,PACCR,829,3,True,126,2015,105427130,36.935,-86.507,2000-03-18 19:14:46,"36.935 , -86.507",NaT,NaT
1154196,1211420,108604488,2000-03-18 19:20:47,High Voltage (Left Fuel Level Sensor),,,CECU3B-NAMUX4,PACCR,829,3,False,126,2015,105427130,36.929,-86.496,2000-03-18 19:20:43,"36.929 , -86.496",NaT,NaT
1154195,1211419,108604487,2000-03-18 19:20:47,High Voltage (Fuel Level),,,CECU3B-NAMUX4,PACCR,96,3,False,126,2015,105427130,36.929,-86.496,2000-03-18 19:20:43,"36.929 , -86.496",NaT,NaT
1154198,1211422,108608408,2000-03-19 02:59:58,Not Reporting Data Wheel Sensor ABS Axle 2 Right,AAAI000032*AAAM000038*BB41275 *A82J140721A_9...,5W26153559,EC80ESP,BNDWS,792,7,False,13,1849,105381862,36.758,-86.171,2000-03-19 02:59:53,"36.758 , -86.171",NaT,NaT
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1187333,1248457,123906113,2020-03-06 14:14:13,Low (Severity Medium) Engine Coolant Level,04384413*22544852*090619141107*60701756*G1*BGT*,,,,111,18,True,8,2377,108605700,35.030,-85.321,2020-03-06 14:14:49,"35.030 , -85.321",NaT,NaT
1187334,1248458,123906131,2020-03-06 14:15:34,Low (Severity Medium) Engine Coolant Level,04384413*22544852*090619141107*60701756*G1*BGT*,,,,111,18,False,8,2377,108605700,35.027,-85.323,2020-03-06 14:15:30,"35.027 , -85.323",NaT,NaT
1113250,1161753,87903706,2026-05-16 14:44:11,,unknown,unknown,unknown,unknown,5742,4,False,1,1744,105306493,35.586,-86.444,2019-05-23 07:44:25,"35.586 , -86.444",NaT,NaT
1113251,1161754,87903707,2026-05-16 14:44:11,,unknown,unknown,unknown,unknown,5743,4,False,1,1744,105306493,35.586,-86.444,2019-05-23 07:44:25,"35.586 , -86.444",NaT,NaT


### how much time will we need to get to a station?

In [41]:
#Josh said anything less than 1-2 hours is going to be too late and the truck will have to be towed, 
#let's give a bit more breathing room
fault_df.loc[(fault_df['time_until_derate'] > datetime.timedelta(hours=1)) & 
             (fault_df['time_until_derate'] < datetime.timedelta(hours=6)), 'derate_soon'] = True

In [42]:
#fill our NaT with 'False' as a derate hasn't happened at all yet for that truck
fault_df['derate_soon'] = fault_df['derate_soon'].fillna(False)

In [43]:
#checking to make sure it filled things in correctly
fault_df[(fault_df.derate_soon == False) & (fault_df.time_until_derate.notnull())]

Unnamed: 0,RecordID,ESS_Id,EventTimeStamp,eventDescription,ecuSoftwareVersion,ecuSerialNumber,ecuModel,ecuMake,spn,fmi,active,activeTransitionCount,EquipmentID,MCTNumber,Latitude,Longitude,LocationTimeStamp,Location,time_next_5246,time_until_derate,derate_soon
1154225,1211449,108631659,2000-03-19 10:51:28,Low (Severity Medium) Engine Coolant Level,04358814*06075794*030816202706*09400153*G1*BDR*,79921284,6X1u13D1500000000,CMMNS,111,18,False,12,1961,105383198,36.698,-87.453,2000-03-19 10:51:23,"36.698 , -87.453",2019-02-14 13:46:15,6906 days 02:54:47,False
1154310,1211534,108663627,2000-03-19 18:56:09,High Voltage (Fuel Level),,,CECU3B-NAMUX4,PACCR,96,3,False,126,1970,105438602,40.966,-87.240,2000-03-19 18:56:04,"40.966 , -87.240",2019-04-29 05:02:21,6979 days 10:06:12,False
1154309,1211533,108663626,2000-03-19 18:56:09,High Voltage (Left Fuel Level Sensor),,,CECU3B-NAMUX4,PACCR,829,3,False,126,1970,105438602,40.966,-87.240,2000-03-19 18:56:04,"40.966 , -87.240",2019-04-29 05:02:21,6979 days 10:06:12,False
982227,1016466,49680016,2010-12-31 22:04:03,Incorrect Data J1939 Network #1 Primary Vehicl...,AAAI000032*AAAM000038*BB41275 *A82J140721A_9...,5W24155105,EC80ESP,BNDWS,639,2,False,4,1827,105464811,36.123,-86.398,2018-05-22 11:33:20,"36.123 , -86.398",2019-01-21 09:01:38,2942 days 10:57:35,False
501145,512094,10004179,2010-12-31 23:04:15,Incorrect Data J1939 Network #1 Primary Vehicl...,BB41103* BB41104*,S451311881,EC60-adv,BNDWS,639,2,True,127,302,105435601,38.348,-85.708,2016-06-25 15:01:54,"38.348 , -85.708",2020-01-06 10:13:57,3292 days 11:09:42,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1182653,1242794,121404040,2020-02-17 17:08:37,,PC4_C1408P151H*,Y043718,MX16U15D13,PCAR,5625,13,False,1,302,105418777,38.192,-85.707,2020-02-17 17:08:33,"38.192 , -85.707",2020-02-17 17:08:39,0 days 00:00:02,False
1182655,1242796,121404042,2020-02-17 17:08:37,,PC4_C1408P151H*,Y043718,MX16U15D13,PCAR,4346,19,False,1,302,105418777,38.192,-85.707,2020-02-17 17:08:33,"38.192 , -85.707",2020-02-17 17:08:39,0 days 00:00:02,False
1182658,1242799,121404045,2020-02-17 17:08:37,,PC4_C1408P151H*,Y043718,MX16U15D13,PCAR,5113,9,False,1,302,105418777,38.192,-85.707,2020-02-17 17:08:33,"38.192 , -85.707",2020-02-17 17:08:39,0 days 00:00:02,False
1182657,1242798,121404044,2020-02-17 17:08:39,,,,,,5246,19,False,88,302,105418777,38.192,-85.707,2020-02-17 17:08:34,"38.192 , -85.707",2020-02-17 17:08:39,0 days 00:00:00,False


## Columns for model testing df
**Keep for sure**:
* EquipmentID: rename to _truck_
* derate_soon
* spn
* ecuMake
* fmi (?): fmi explained https://www.diesellaptops.com/blogs/news/truck-sae-codes-such-as-j1939-j1708-spn-fmi-mid-explained ('This code is set by the ECM detecting a variety of problems, such as too much voltage, not enough voltage, resistance is incorrect, and so on.'), we should filter specific values based off of this list
* EventTimeStamp (leaning keep)
* time_next_5246 (leaning keep)
* time_until_derate (leaning keep)
* RecordID (?): it's what we used to join the two datasets, but doesn't tell us much about the data/is arbitrary



**Unsure**:
* active (no idea): True means code is being removed/not removed?
* activeTransitionCount (no idea but I might have info regarding it)
 
**Get rid of**:
* eventDescription
* Latitude
* Longitude
* LocationTimeStamp
* ecuSoftwareVersion (leaning get rid of)--what's measuring the issues shouldn't affect the deratement likelihood, could _possibly_ backfill based off of other data but that sounds messy
* ecuModel (no idea)--what's measuring the issues shouldn't affect the deratement likelihood, could _possibly_ backfill based off of other data but that sounds messy
* ecuSerialNumber (leaning get rid of)--what's measuring the issues shouldn't affect the deratement likelihood, could _possibly_ backfill based off of other data but that sounds messy
* MCTNumber (no idea): communications device that I assume has nothing to do with the deratement
* ESS_Id (no idea): 'Event Subscribers - Sometimes called "Listeners", are callable methods or functions that react to an event being propagated throughout the Event Registry.'--from Michael: this is another identifier made redundant by RecordID



## Download as new csv
We want to combine this with the cleaned up onboard dataframe in another notebook.

In [44]:
#commenting out so it isn't run again 
#fault_df.to_csv('..\data\cleaned_fault_data.csv')