In [1]:
# 导入工具包
import pandas as pd

## 数据勘查

In [3]:
# Load both inputs: the fault-record log and the fault-code lookup table.
records = pd.read_csv(
    './datasets/machine_fault_records.csv',
    low_memory=False,
)  # machine fault records
codes = pd.read_csv(
    './datasets/machine_fault_codes.csv',
    encoding='gbk',  # this file is decoded as GBK text
)  # machine fault codes

In [4]:
# Summary of the records frame: column dtypes, non-null counts and memory usage
records.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 131134 entries, 0 to 131133
Data columns (total 19 columns):
 #   Column              Non-Null Count   Dtype  
---  ------              --------------   -----  
 0   ACCRUNTIMES         131134 non-null  int64  
 1   AREA                131134 non-null  int64  
 2   CD_VP_UNIQUENO      0 non-null       float64
 3   CITY                131134 non-null  int64  
 4   COLLECTTIME         131134 non-null  object 
 5   COOLANTTEMPERATURE  0 non-null       float64
 6   DIRECTION           0 non-null       float64
 7   ELCANDVOL           0 non-null       float64
 8   FAULTTIME           131134 non-null  object 
 9   FMI                 131131 non-null  float64
 10  FRID                131134 non-null  int64  
 11  LATITUDE            131134 non-null  float64
 12  LONGITUDE           131134 non-null  float64
 13  PROVINCE            131134 non-null  int64  
 14  READFLAG            131134 non-null  int64  
 15  ROTATIONALSPEED     0 non-null    

In [6]:
# Column dtypes, non-null counts and memory usage of the fault-code table
codes.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 473 entries, 0 to 472
Data columns (total 7 columns):
 #   Column        Non-Null Count  Dtype 
---  ------        --------------  ----- 
 0   CODE          473 non-null    int64 
 1   SPN           473 non-null    int64 
 2   FMI           473 non-null    int64 
 3   DESCRIPTION   471 non-null    object
 4   ENGINETYPEID  473 non-null    int64 
 5   FAULTSOURCE   473 non-null    object
 6   FAULTNAME     473 non-null    object
dtypes: int64(4), object(3)
memory usage: 26.0+ KB


In [5]:
# Data format: preview five randomly chosen rows of the fault records.
# random_state pins the draw so a fresh Restart & Run All shows the same rows.
# NOTE(review): in the output stored with this notebook, LATITUDE and LONGITUDE
# hold identical values in every previewed row -- confirm against the source CSV.
records.sample(5, random_state=42)

Unnamed: 0,ACCRUNTIMES,AREA,CD_VP_UNIQUENO,CITY,COLLECTTIME,COOLANTTEMPERATURE,DIRECTION,ELCANDVOL,FAULTTIME,FMI,FRID,LATITUDE,LONGITUDE,PROVINCE,READFLAG,ROTATIONALSPEED,SPN,SYSTEMDATE,UNIQUENO
2547,1182,330283,,330200,2018/4/1 12:35,,,,2018/4/1 12:35,15.0,2360830,29.766042,29.766042,330000,1,,8809.0,,2.02e+17
74264,289,130983,,130900,2018/4/10 17:20,,,,2018/4/10 17:20,15.0,2441691,38.32645,38.32645,130000,1,,8809.0,,2.02e+17
48111,691,500118,,500100,2018/4/7 16:08,,,,2018/4/7 16:08,15.0,2408492,29.365616,29.365616,500000,1,,8809.0,,2.02e+17
105774,62,130434,,130400,2018/4/15 17:41,,,,2018/4/15 17:41,31.0,2489810,36.293788,36.293788,130000,1,,524287.0,,2.02e+17
10005,1039,330109,,330100,2018/4/2 13:58,,,,2018/4/2 13:58,15.0,2370422,30.326263,30.326263,330000,1,,8809.0,,2.02e+17


In [8]:
# Preview five randomly chosen rows of the fault-code table.
# random_state pins the draw so the preview is reproducible across re-runs.
codes.sample(5, random_state=42)

Unnamed: 0,CODE,SPN,FMI,DESCRIPTION,ENGINETYPEID,FAULTSOURCE,FAULTNAME
336,1332,522043,19,CANTOTSC1VR超时错误,1958,潍柴国三,CANTOTSC1VR超时错误
166,1085,5490,3,"""这个阀检测到的电阻值超出限制范围，K2阀的电压值过高。?电缆/插接件是有缺陷的和电源电压有...",1958,变速箱,控制继电器K2对电源电压短路
113,1015,5080,5,"""TCU在这个输出针脚检测到一个错误的电压,像是这个针脚断路.?电缆有缺陷,到TCU没有连接...",1958,变速箱,在客户特定功能3断路.
0,1175,1576,5,油泵电磁阀1开路或对地短路,1958,上柴国三,油泵电磁阀1开路或对地短路
400,1396,1417,6,同Bank中5缸和其他缸开路或者与弱电源短路,1958,潍柴国三,同Bank中5缸和其他缸开路或者与弱电源短路


## 数据处理

In [9]:
# Drop rows where SPN or FMI is missing.
# Reassign instead of using inplace=True so the cell is an explicit
# "records -> filtered records" step rather than a hidden mutation.
records = records.dropna(subset=['SPN', 'FMI'])

In [10]:
# Drop columns in which every value is missing.
# isnull().all() is True for a column exactly when each of its entries is null.
empty_columns = records.columns[records.isnull().all()]

print('删除值全为空的%i列：' % len(empty_columns), empty_columns)

# Reassign rather than mutate in place; columns= names the axis explicitly.
records = records.drop(columns=empty_columns)

删除值全为空的6列 Index(['CD_VP_UNIQUENO', 'COOLANTTEMPERATURE', 'DIRECTION', 'ELCANDVOL',
       'ROTATIONALSPEED', 'SYSTEMDATE'],
      dtype='object')


In [11]:
# Drop feature columns that hold a single distinct value.
# nunique() skips NaN by default, so a column with one distinct value plus
# missing entries is selected here as well.
only_one_value_columns = records.columns[records.nunique() == 1]

print('删除只有同一个值的%i列：' % len(only_one_value_columns), only_one_value_columns)

# Reassign rather than mutate in place; columns= names the axis explicitly.
records = records.drop(columns=only_one_value_columns)

删除只有同一个值的1列： Index(['UNIQUENO'], dtype='object')


In [12]:
# Re-check the frame after the drops above: remaining columns, dtypes and non-null counts
records.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 131127 entries, 0 to 131133
Data columns (total 12 columns):
 #   Column       Non-Null Count   Dtype  
---  ------       --------------   -----  
 0   ACCRUNTIMES  131127 non-null  int64  
 1   AREA         131127 non-null  int64  
 2   CITY         131127 non-null  int64  
 3   COLLECTTIME  131127 non-null  object 
 4   FAULTTIME    131127 non-null  object 
 5   FMI          131127 non-null  float64
 6   FRID         131127 non-null  int64  
 7   LATITUDE     131127 non-null  float64
 8   LONGITUDE    131127 non-null  float64
 9   PROVINCE     131127 non-null  int64  
 10  READFLAG     131127 non-null  int64  
 11  SPN          131127 non-null  float64
dtypes: float64(4), int64(6), object(2)
memory usage: 13.0+ MB
