In [None]:
'''1.Get simulation list of leak data'''
import os
import math
import numpy as np
import pandas as pd
from tqdm import tqdm
# Step 1 script: for every pipe, draw random leak scenarios and save them as
# constraint tables, one "leak" file and one matching "noleak" file per pipe.
# NOTE(review): the global NumPy RNG is used without a seed, so every run
# generates a different scenario list.
path='D:\\0-Data\\9-stcleak3\\'
os.makedirs(path+'cons\\',exist_ok=True)#output folder of the constraint tables
#Prepare the node and pipeline files first
encoding='utf-8'
unit=pd.read_csv(path+'netcsv\\1.PipeData.csv',encoding=encoding)#pipeline
base_user_flow=np.array(pd.read_csv(path+'netcsv\\6.SinkData.csv',usecols=[6],encoding=encoding)).T#scenarios, one row holding one value per user
#Get the simulation list
n=5#sample quantity multiplier
error=0.1#scenario change rate
error_name='0100'
# Loop-invariant values, computed once instead of once per pipe.
user_count=len(base_user_flow[0])
base_total_flow=np.sum(base_user_flow)
user_columns=['UserFlow_'+str(k+1) for k in range(user_count)]
raw_columns=['SimuIndex','Loc','LocRatio','Leak','LeakRatio','TotalFlow']+user_columns
final_columns=['SimuIndex','Fault','Code','ID','Loc','LocRatio','Leak','LeakRatio','TotalFlow']+user_columns
for i in tqdm(range(len(unit))):#for each pipeline
    L=unit.loc[i,'长度']
    if not L>0:
        raise ValueError('pipe '+str(i)+' has a non-positive length: '+str(L))
    # The sample count grows with the natural log of the length:
    # (floor(ln L)+1)*n. Without the lower bound, a pipe shorter than 1/e
    # (about 0.37) would get zero or a negative number of samples.
    num=max(int(math.log(L))+1,1)*n
    loc_ratio=np.round(np.random.uniform(low=0.005,high=0.995,size=[num,1]),3)#leak location ratio
    loc=np.round(loc_ratio*L,4)#leak location, m
    user_flow=base_user_flow*np.random.uniform(low=1-error,high=1+error,size=[num,user_count])#various scenarios
    cycle_flow=np.sum(user_flow,axis=1).reshape([-1,1])#total user flow of each scenario
    leak_size=np.random.uniform(low=base_total_flow*0.001,high=base_total_flow*0.01,size=[num,1])#leak flow, kg/s
    leak_ratio=np.round(leak_size/cycle_flow,3)#leak ratio
    simu_index=np.arange(num).reshape([-1,1])
    t=pd.DataFrame(np.hstack([simu_index,loc,loc_ratio,leak_size,leak_ratio,cycle_flow,user_flow]),columns=raw_columns)

    t['Code']=int(unit.loc[i,'编号'])
    t['ID']=unit.loc[i,'名称']
    t['Fault']='pipeLeak'
    t['Loc']=t['Loc'].astype(float)
    t['SimuIndex']=t['SimuIndex'].astype(int)#hstack turned the index into floats
    t=t[final_columns]
    t.to_csv(path+'cons\\constraint_'+error_name+'_'+str(i)+'_leak.csv',index=False)#leak scenarios
    # Same scenarios with the leak switched off.
    t['Leak']=0
    t['LeakRatio']=0
    t.to_csv(path+'cons\\constraint_'+error_name+'_'+str(i)+'_noleak.csv',index=False)#normal scenarios

In [None]:
'''2.Leak data simulation'''
# See json_tool.py & leak_simu_json.py
# Run leak_simu_json.py

In [None]:
'''3.Collect leak data'''
import time
import pandas as pd
from tqdm import tqdm

def _read_result_columns(file_path,col_ids):
    """Read several positional columns of one result CSV with a single read_csv call.

    file_path -- CSV file to read; its first line is the header.
    col_ids   -- 0-based column positions to extract.

    Returns a list with one single-row DataFrame per entry of ``col_ids``, in
    the same order: the column transposed so that its values run along the
    columns, with the row index reset to 0.
    """
    wanted=sorted(col_ids)
    table=pd.read_csv(file_path,header=0,usecols=wanted)
    # read_csv returns the selected columns in file order whatever the order of
    # usecols, so a requested column sits at its rank within the sorted list.
    return [table.iloc[:,[wanted.index(c)]].T.reset_index(drop=True) for c in col_ids]


def main(i,error,leak,n_columns=4577):
    """Gather all simulation results of pipe ``i`` into one sample table and save it as CSV.

    i         -- pipe index used in the constraint file name and the pipe_<i> result folder.
    error     -- scenario tag used in file and folder names (the driver passes '0100').
    leak      -- result-set tag used in file and folder names (the driver passes
                 'leak'; its comment also lists 'noleak').
    n_columns -- number of result values expected per simulation. A simulation
                 whose concatenated row has another width is treated as failed.

    For every row j of the constraint table the eight result CSVs in
    result_<leak>\\<error>\\pipe_<i>\\<j>\\ are read and flattened into one row.
    A simulation that cannot be read yields an all-NaN row, so the rows stay
    aligned with the constraint table; the number of such rows is printed.
    NaN and the empty string used before are both written as empty CSV fields.

    Raises RuntimeError when not a single simulation could be read, because
    the column names are derived from the element counts of a readable one.

    NOTE(review): main_path is on drive E: while the other steps of this
    notebook use paths on drive D: -- confirm the files are where this expects.
    NOTE(review): a later cell defines another function that is also named main.
    """
    main_path='E:\\0-Work\\Data\\9-stcleak3\\'
    info=pd.read_csv(main_path+'cons\\constraint_'+error+'_'+str(i)+'_'+leak+'.csv',header=0)
    source_name='0.SourceDataResult.csv'
    pipe_name='1.PipeDataResult.csv'
    twoway_name='2.TwowaysDataResult.csv'
    tee_name='3.TeesDataResult.csv'
    cross_name='4.CrossDataResult.csv'
    plug_name='5.PlugDataResult.csv'
    sink_name='6.SinkDataResult.csv'
    valve_name='7.ValveDataResult.csv'

    rows=[]#one entry per simulation, None where the results could not be read
    dims=None#element counts, taken from the first readable simulation
    fail_count,first_failure=0,None
    for j in tqdm(range(len(info))):
        path=main_path+'result_'+leak+'\\'+error+'\\pipe_'+str(i)+'\\'+str(j)+'\\'
        try:
            # The suffixes follow the column names assigned further down:
            # _p pressure-type columns, _m flow-type columns.
            source_p,source_m=_read_result_columns(path+source_name,[2,4])
            sink_p,sink_m=_read_result_columns(path+sink_name,[2,4])
            pipe_p1,pipe_p2,pipe_m=_read_result_columns(path+pipe_name,[2,5,9])
            two_p,two_m=_read_result_columns(path+twoway_name,[2,5])
            tee_p,tee_m=_read_result_columns(path+tee_name,[2,5])
            cross_p,cross_m=_read_result_columns(path+cross_name,[2,5])
            plug_p,=_read_result_columns(path+plug_name,[2])
            valve_p1,valve_p2,valve_m=_read_result_columns(path+valve_name,[2,5,3])

            cat=pd.concat([source_m,source_p,sink_p,sink_m,two_p,tee_p,cross_p,valve_p1,valve_p2,valve_m,two_m,tee_m,cross_m,plug_p,pipe_p1,pipe_p2,pipe_m],axis=1) #horizontal
            if cat.shape[1]!=n_columns:
                raise ValueError('expected '+str(n_columns)+' result values, got '+str(cat.shape[1]))
            cat.columns=list(range(n_columns))
            if dims is None:
                dims=(source_p.shape[1],sink_p.shape[1],pipe_p1.shape[1],two_p.shape[1],tee_p.shape[1],cross_p.shape[1],plug_p.shape[1],valve_p1.shape[1])
            rows.append(cat)
        except Exception as e:
            # Best effort: remember the failure and keep going with the next simulation.
            fail_count+=1
            if first_failure is None:
                first_failure=(j,e)
            rows.append(None)

    if fail_count:
        print('pipe',i,leak,':',fail_count,'of',len(rows),'simulations could not be read; first failure at',first_failure[0],'->',repr(first_failure[1]))
    if dims is None:
        raise RuntimeError('no readable simulation result for pipe '+str(i)+' ('+leak+', '+error+')')

    blank=pd.DataFrame([[float('nan')]*n_columns])#placeholder row of a failed simulation
    col=pd.concat([blank if r is None else r for r in rows],axis=0) #vertical, built once instead of growing per row
    col=col.reset_index(drop=True)
    col=pd.concat([info,col],axis=1)#horizontal
    # dimension of each part
    source_num,sink_num,pipe_num,two_num,tee_num,cross_num,plug_num,valve_num=dims
    # rename
    Source=[str(k+1)+'_SourceM' for k in range(source_num)]+[str(k+1)+'_SourceP' for k in range(source_num)]# water source
    Sink=[str(k+1)+'_SinkP' for k in range(sink_num)]+[str(k+1)+'_SinkM' for k in range(sink_num)]# water user
    TwoP,TwoM=[str(k+1)+'_2P' for k in range(two_num)],[str(k+1)+'_2M' for k in range(two_num)]# twoway
    TeeP,TeeM=[str(k+1)+'_3P' for k in range(tee_num)],[str(k+1)+'_3M' for k in range(tee_num)]# tee-joint
    CrossP,CrossM=[str(k+1)+'_4P' for k in range(cross_num)],[str(k+1)+'_4M' for k in range(cross_num)]# cross
    Plug=[str(k+1)+'_PlugP' for k in range(plug_num)]# plug
    Valve=[str(k+1)+'_ValveP1' for k in range(valve_num)]+[str(k+1)+'_ValveP2' for k in range(valve_num)]+[str(k+1)+'_ValveM' for k in range(valve_num)]# valve
    Pipe=[str(k+1)+'_PipeP1' for k in range(pipe_num)]+[str(k+1)+'_PipeP2' for k in range(pipe_num)]+[str(k+1)+'_PM' for k in range(pipe_num)]# pipeline

    # The label columns come from the constraint table; the rest follows the concat order above.
    Label=['SimuIndex','Fault','Code','ID','Loc','LocRatio','Leak','LeakRatio','TotalFlow']+['UserFlow_'+str(k+1) for k in range(sink_num)]
    col.columns=Label+Source+Sink+TwoP+TeeP+CrossP+Valve+TwoM+TeeM+CrossM+Plug+Pipe

    col.to_csv(main_path+'\\result_'+leak+'\\'+error+'\\sample_'+error+'_pipe_'+str(i)+'_'+leak+'.csv',index=False)

# Run the collection for every scenario tag and every pipe, then report the elapsed time.
time1=time.time()
name_list=['0100']
for error_name in name_list:
    for i in range(603):
        print('error',error_name,'pipe:',i)
        # The last argument selects the result set: 'leak' or 'noleak'.
        main(i,error_name,'leak')
time2=time.time()
print('all done in:',round(time2-time1,3))

In [None]:
'''4.Get residual data'''
import os
import pandas as pd
from tqdm import tqdm

def main(i,error):
    """Write the residual table (leak run minus no-leak run) of pipe ``i``.

    i     -- pipe index used in the sample file names.
    error -- scenario tag used in file and folder names.

    Reads the 'noleak' and 'leak' sample tables of the pipe and, from column
    117 onwards, replaces the leak values by the difference leak - noleak.
    Columns 117, 118 and 229..336 are then restored to the raw leak values.
    The first 117 columns are copied unchanged from the leak table. The result
    is saved under dataset\\<error>\\, which is created when missing.
    """
    main_path='D:\\0-Data\\9-stcleak4\\'
    for folder in (main_path+'dataset\\',main_path+'dataset\\'+error+'\\'):
        if not os.path.exists(folder):
            os.makedirs(folder)

    def sample_file(leak):
        # Path of the collected sample table of this pipe for one result set.
        return main_path+'result_'+leak+'\\'+error+'\\sample_'+error+'_pipe_'+str(i)+'_'+leak+'.csv'

    info_0=pd.read_csv(sample_file('noleak'),header=0)
    info_1=pd.read_csv(sample_file('leak'),header=0)

    raw_columns=[117,118]+list(range(229,337))#origin data of flow, kept as simulated
    difference=info_1.iloc[:,117:]-info_0.iloc[:,117:]
    residual=info_1.copy()
    residual.iloc[:,117:]=difference
    residual.iloc[:,raw_columns]=info_1.iloc[:,raw_columns]

    residual.to_csv(main_path+'dataset\\'+error+'\\sample_'+error+'_pipe_'+str(i)+'_residual.csv',index=False)

# Build the residual table of every pipe for each scenario tag.
name_list=['0100']
for error_name in name_list:
    for i in tqdm(range(603)):
        main(i,error_name)

In [None]:
'''5.Add features of pressures of calibration points '''
import numpy as np
import pandas as pd

# Step 5 script: for every calibration point, find the pipe it sits on, the
# node at each end of that pipe and the position of that node's pressure
# column in the collected sample table. The result is saved as cali_info.csv.
path='D:\\0-Data\\9-stcleak4\\'
encoding='utf-8'#'GBK'
# Network description tables, one CSV per element type.
source=pd.read_csv(path+'netcsv\\0.SourceData.csv',header=0,encoding=encoding)
pipe=pd.read_csv(path+'netcsv\\1.PipeData.csv',header=0,encoding=encoding)
two=pd.read_csv(path+'netcsv\\2.TwowaysData.csv',header=0,encoding=encoding)
tee=pd.read_csv(path+'netcsv\\3.TeesData.csv',header=0,encoding=encoding)
cross=pd.read_csv(path+'netcsv\\4.CrossData.csv',header=0,encoding=encoding)
plug=pd.read_csv(path+'netcsv\\5.PlugData.csv',header=0,encoding=encoding)
sink=pd.read_csv(path+'netcsv\\6.SinkData.csv',header=0,encoding=encoding)
valve=pd.read_csv(path+'netcsv\\7.ValveData.csv',header=0,encoding=encoding)
#files with 1 for searching, origin files for modifying
source1=source.assign(attr='source')
pipe1=pipe.assign(attr='pipe')
two1=two.assign(attr='two')
tee1=tee.assign(attr='tee')
cross1=cross.assign(attr='cross')
plug1=plug.assign(attr='plug')
sink1=sink.assign(attr='sink')
valve1=valve.assign(attr='valve')
# All node types in one table; the 'attr' column tells which table a node came from.
nodes=pd.concat([source1,sink1,valve1,plug1,two1,tee1,cross1],axis=0).reset_index(drop=True)
files={'source':source,'pipe':pipe,'two':two,'tee':tee,'cross':cross,'plug':plug,'sink':sink,'valve':valve}
# Suffix of the pressure column of each node type, matching the column names
# written by the collection step ('<n>_2P', '<n>_4P', '<n>_SourceP', ...).
# cross, source and sink were missing here, so a calibration pipe attached to
# one of them raised KeyError.
column_name={'two':'_2P','tee':'_3P','cross':'_4P','plug':'_PlugP','source':'_SourceP','sink':'_SinkP','valve1':'_ValveP1','valve2':'_ValveP2'}

cali_info=pd.read_excel(path+'netcsv\\calipoint.xlsx')#from project data
pipe_info=pd.read_excel(path+'netcsv\\pipeinfo.xlsx')#from project data
data_info=pd.read_csv(path+'result_leak\\0100\\sample_0100_pipe_0_leak.csv')#only its column names are used

def _port_node_info(pipe_row,port_column):
    """Describe the node attached to one port of a pipe.

    pipe_row    -- row position of the pipe in the ``pipe`` table.
    port_column -- name of the ``pipe`` column holding the connected node code.

    Returns [node code, node type, row position of the node in the table of
    its type, position of the node's pressure column in data_info.columns].
    Raises IndexError when a lookup finds no match.
    """
    node_code=pipe.loc[pipe_row,port_column]
    node_attr=nodes.loc[np.where(nodes['编号']==node_code)[0][0],'attr']
    node_row=np.where(files[node_attr]['编号']==node_code)[0][0]
    if node_attr=='valve':
        # A valve has one pressure column per port; pick the port facing this pipe.
        # NOTE(review): this compares the pipe's row position with the port
        # columns of every valve and takes the first match. It is not limited
        # to the valve found above and presumably relies on the valve table
        # referring to pipes by row position -- confirm against the network files.
        valve_port=np.where(pipe_row==valve[['口1连接元件编号','口2连接元件编号']])[1][0]+1
        suffix=column_name[node_attr+str(valve_port)]
    else:
        suffix=column_name[node_attr]
    column_label=str(node_row+1)+suffix#sample columns are numbered from 1
    return [node_code,node_attr,node_row,np.where(data_info.columns==column_label)[0][0]]

get_info=[]
for i in range(len(cali_info)):
    # calibration point -> GIS code -> pipe name -> row of the pipe table
    gis_code=cali_info.loc[i,'管线编号']
    pipe_index=np.where(pipe_info['GIS编号']==gis_code)[0][0]
    pipe_code=pipe_info.loc[pipe_index,'管道编号']
    pipe_index2=np.where(pipe['名称']==pipe_code)[0][0]
    get_info.append(_port_node_info(pipe_index2,'口1连接元件编号')+_port_node_info(pipe_index2,'口2连接元件编号'))

new_info=pd.DataFrame(get_info)
new_info.columns=['口1连接元件编号','口1连接元件类型','口1连接元件序号','口1连接元件列号','口2连接元件编号','口2连接元件类型','口2连接元件序号','口2连接元件列号']
cali_info1=pd.concat([cali_info,new_info],axis=1)
cali_info1.to_csv(path+'netcsv\\cali_info.csv',index=False,encoding='GBK')

In [None]:
'''6.Extract features we need'''
import os
import numpy as np
import pandas as pd
from tqdm import tqdm
# Step 6 script: reduce every residual table to the label columns, the source
# and user columns and the pressure columns chosen for the calibration points.
path='D:\\0-Data\\9-stcleak4\\'
error='0100'
out_path=path+'dataset\\'+error+'\\feature\\'
if not os.path.exists(out_path):
    os.makedirs(out_path)
# feature index (column positions in the residual tables)
info_i=list(range(9))
source_m=[117,118]
source_p=[119,120]
user_p=list(range(121,229))
user_m=list(range(229,337))

# calibration index: one pressure column per calibration point, where a rule applies
cali_info1=pd.read_csv(path+'netcsv\\cali_info.csv',encoding='GBK')
column_index_list=[]
for kou1_attr,kou2_attr,kou1_column,kou2_column in zip(cali_info1['口1连接元件类型'],cali_info1['口2连接元件类型'],cali_info1['口1连接元件列号'],cali_info1['口2连接元件列号']):
    if kou1_attr=='valve':
        # a valve on port 1 always wins
        column_index_list.append(kou1_column)
        continue
    if kou1_attr=='plug':
        # nothing is selected when port 1 is a plug
        continue
    if kou2_attr=='plug':
        column_index_list.append(kou1_column)
    elif kou2_attr=='valve':
        column_index_list.append(kou2_column)
calib_p=np.sort(column_index_list).tolist()

feature_index=info_i+source_m+source_p+user_p+user_m+calib_p
for i in tqdm(range(603)):
    residual_file=path+'dataset\\'+error+'\\sample_'+error+'_pipe_'+str(i)+'_residual.csv'
    # Rows with a missing value in any column are dropped before the feature columns are picked.
    origin_dataset=pd.read_csv(residual_file,header=0).dropna()
    simple_dataset=origin_dataset.iloc[:,feature_index]
    simple_dataset.to_csv(out_path+'sample_'+error+'_pipe_'+str(i)+'_residual_feature.csv',index=False)