## 数据清洗代码

In [1]:
import pandas as pd
import os

In [2]:
def createTrainSet(readpath:str,savepath:str,labelfilepath:str="./data/train/outputs/主蒸汽流量.csv")->pd.DataFrame:
    """
    Merge every per-indicator CSV under ``readpath`` with the label CSV into one table.

    Each CSV is read with its '时间' column parsed as datetimes and used as the index.
    The frames are concatenated column-wise (label columns last) with pandas' default
    outer join, so a timestamp missing from one file shows up as NaN in that file's
    columns. The merged table is written to ``savepath`` and returned.

    :param readpath: directory holding the per-indicator CSV files (trailing separator optional)
    :param savepath: destination CSV path for the merged table (written as utf-8-sig)
    :param labelfilepath: CSV file holding the training label
    :return DataFrame: the merged table that was written to ``savepath``
    """
    read_options = dict(encoding='utf-8', index_col=['时间'], parse_dates=['时间'])

    # os.listdir returns entries in arbitrary order; sort so the column order of the
    # result is reproducible. Only regular *.csv files are read, so directories such as
    # .ipynb_checkpoints or stray non-CSV files do not break the merge.
    csv_names = sorted(
        name for name in os.listdir(readpath)
        if name.lower().endswith('.csv') and os.path.isfile(os.path.join(readpath, name))
    )
    # os.path.join works whether or not readpath ends with a separator.
    list_dfs = [pd.read_csv(os.path.join(readpath, name), **read_options) for name in csv_names]

    # The label goes last so its column(s) end up on the right-hand side.
    list_dfs.append(pd.read_csv(labelfilepath, **read_options))

    # Column-wise concat; the default join is 'outer'.
    df_train = pd.concat(objs=list_dfs, axis=1)
    df_train.to_csv(savepath, encoding='utf-8-sig')
    return df_train

In [3]:
# Build the training set: merge the input CSVs with the label and save the result.
# The trailing semicolon keeps the cell from echoing the value of the call.
createTrainSet(readpath='./data/train/inputs/', savepath="./data/train/trainset.csv");

In [5]:
# Reload the merged training set from disk and summarise its columns.
# low_memory=False makes pandas infer each column's dtype from the whole file rather
# than chunk by chunk; chunk-wise inference is what raises DtypeWarning on columns
# whose chunks disagree on type.
# NOTE(review): the saved output shows a warning raised on this read_csv call and
# several object columns with missing values — presumably that DtypeWarning; confirm
# on re-run.
df_trainSet = pd.read_csv(
    "./data/train/trainset.csv",
    encoding='utf-8-sig',
    index_col=['时间'],
    parse_dates=['时间'],
    low_memory=False,
)
df_trainSet.info()

  df_trainSet = pd.read_csv("./data/train/trainset.csv",encoding='utf-8-sig',index_col=['时间'],parse_dates=['时间'])


<class 'pandas.core.frame.DataFrame'>
DatetimeIndex: 257400 entries, 2021-12-20 00:00:00 to 2021-12-22 23:29:59
Data columns (total 22 columns):
 #   Column     Non-Null Count   Dtype  
---  ------     --------------   -----  
 0   CO含量       257400 non-null  float64
 1   HCL含量      257400 non-null  float64
 2   NOx含量      257400 non-null  float64
 3   SO2含量      257400 non-null  float64
 4   一次风调门      257400 non-null  float64
 5   一次风量       257400 non-null  float64
 6   主蒸汽流量设定值   257400 non-null  float64
 7   二次风调门      257400 non-null  float64
 8   二次风量       257400 non-null  float64
 9   引风机转速      257400 non-null  float64
 10  推料器启停      257384 non-null  object 
 11  推料器手动指令    257400 non-null  float64
 12  推料器自动投退信号  257384 non-null  object 
 13  推料器自动指令    257400 non-null  float64
 14  氧量设定值      257400 non-null  float64
 15  汽包水位       257400 non-null  float64
 16  炉排启停       257384 non-null  object 
 17  炉排实际运行指令   257400 non-null  float64
 18  炉排手动指令     257400 non-null  fl

## 一些测试

In [7]:
# Load two of the single-indicator files, each indexed by its parsed '时间' column,
# then inspect the first one.
df1, df2 = (
    pd.read_csv(csv_path, encoding='utf-8', index_col=['时间'], parse_dates=['时间'])
    for csv_path in (
        "./data/train/inputs/CO含量.csv",
        "./data/train/inputs/HCL含量.csv",
    )
)
df1.info()

<class 'pandas.core.frame.DataFrame'>
DatetimeIndex: 257400 entries, 2021-12-20 00:00:00 to 2021-12-22 23:29:59
Data columns (total 1 columns):
 #   Column  Non-Null Count   Dtype  
---  ------  --------------   -----  
 0   CO含量    257400 non-null  float64
dtypes: float64(1)
memory usage: 3.9 MB


In [10]:
# Print the shape of every per-indicator dataset in the inputs directory.
for file in os.listdir("./data/train/inputs/"):
    df_onefile = pd.read_csv(
        os.path.join("./data/train/inputs/", file),
        encoding='utf-8',
        index_col=['时间'],
        parse_dates=['时间'],
    )
    print(f"{file}'shape:{df_onefile.shape}")

CO含量.csv'shape:(257400, 1)
HCL含量.csv'shape:(257400, 1)
NOx含量.csv'shape:(257400, 1)
SO2含量.csv'shape:(257400, 1)
一次风调门.csv'shape:(257400, 1)
一次风量.csv'shape:(257400, 1)
主蒸汽流量设定值.csv'shape:(257400, 1)
二次风调门.csv'shape:(257400, 1)
二次风量.csv'shape:(257400, 1)
引风机转速.csv'shape:(257400, 1)
推料器启停.csv'shape:(257382, 1)
推料器手动指令.csv'shape:(257400, 1)
推料器自动投退信号.csv'shape:(257382, 1)
推料器自动指令.csv'shape:(257400, 1)
氧量设定值.csv'shape:(257400, 1)
汽包水位.csv'shape:(257400, 1)
炉排启停.csv'shape:(257382, 1)
炉排实际运行指令.csv'shape:(257400, 1)
炉排手动指令.csv'shape:(257400, 1)
炉排自动投退信号.csv'shape:(257382, 1)
给水流量.csv'shape:(257400, 1)


In [15]:
# Concat example: load the 炉排自动投退信号 and 给水流量 datasets,
# each indexed by its parsed '时间' column.
df1, df2 = (
    pd.read_csv(csv_path, encoding='utf-8', index_col=['时间'], parse_dates=['时间'])
    for csv_path in (
        "./data/train/inputs/炉排自动投退信号.csv",
        "./data/train/inputs/给水流量.csv",
    )
)

In [16]:
# Place the two frames side by side, aligned on their shared index (default outer join).
df3 = pd.concat(objs=[df1, df2], axis="columns")

In [19]:
# Number of non-missing values per column. DataFrame.count() is vectorised, whereas
# the builtin sum() over a boolean Series walks it element by element in Python.
df3.count()

炉排自动投退信号    257384
给水流量        257400
dtype: int64

In [20]:
# Show the rows where '炉排自动投退信号' is missing.
df3.loc[df3['炉排自动投退信号'].isna()]

Unnamed: 0_level_0,炉排自动投退信号,给水流量
时间,Unnamed: 1_level_1,Unnamed: 2_level_1
2021-12-20 00:00:00,,73.4054
2021-12-20 00:00:01,,73.5788
2021-12-20 00:00:02,,73.7589
2021-12-20 00:00:03,,73.9473
2021-12-20 00:00:04,,74.1118
2021-12-20 00:00:05,,74.2273
2021-12-21 00:00:01,,69.2164
2021-12-21 00:00:02,,69.35
2021-12-21 00:00:03,,69.4572
2021-12-21 00:00:04,,69.5473
