In [9]:
import h2o
# Initialise the H2O Python client. Later cells call h2o.upload_file / h2o.load_model /
# h2o.as_list, so this cell has to run first on a fresh kernel.
h2o.init()

In [None]:
# Box plots: one panel per column of the raw data set (rows containing NaN are dropped),
# with the individual observations overlaid as a jittered strip plot.
import seaborn as sns
import pandas as pd
from matplotlib import pyplot as plt

plt.rcParams.update({
    'font.sans-serif': ['Times New Roman'],
    'axes.unicode_minus': False,
})

# y-axis unit for each panel, listed in the same order as the CSV columns.
unit = ["( ℃ )", "( min )", "( mm )", "", "( ℃ )", "( d )", "( % )", "( % )", "( % )", "( mL CH$_4$/gVS )"]

data = pd.read_csv('data_origin.csv').dropna()
columns = data.columns

fig, axs = plt.subplots(2, 5, figsize=(28, 15))

# Alternative hand-picked palette, kept for reference:
# my_palette = ["blue", "orange", "green", "red", "purple"]
# colors = sns.color_palette(my_palette, n_colors=len(columns))
colors = sns.color_palette("RdYlGn", n_colors=len(columns))

# zip() stops at the shortest input, so at most ten panels (a-j) are drawn.
panels = zip(axs.flatten(), columns, colors, "abcdefghij", unit)
for ax, col, color, letter, uni in panels:
    series = data[col]
    sns.boxplot(data=series, color=color, ax=ax)
    sns.stripplot(data=series, color="black", ax=ax, jitter=True)
    # Panel letter, placed just outside the top-left corner in axes coordinates.
    ax.text(-0.25, 1.05, f"({letter})", transform=ax.transAxes, fontsize=30, fontweight='bold')
    ax.tick_params(axis='y', labelsize=19)
    ax.set_xticklabels('')
    ax.set_xlabel(col, fontweight='bold', fontsize=23)
    ax.set_ylabel(uni, fontweight='bold', fontsize=23, labelpad=10)

plt.subplots_adjust(wspace=0.5)
plt.savefig("./箱式图.png", dpi=600)
plt.show()

In [None]:
# Pearson correlation matrix of the raw data set, drawn as an annotated heat map.
import pandas as pd
import numpy as np
import sklearn
import seaborn as sns
# NOTE: later cells call sqrt() without importing it; the name comes from this star import.
from pylab import *
from matplotlib import pyplot as plt
plt.rcParams['font.sans-serif'] =['Times New Roman'] 
plt.rcParams['axes.unicode_minus'] = False
from pandas.plotting import scatter_matrix
from sklearn.preprocessing import StandardScaler, Normalizer, MinMaxScaler
from sklearn.metrics import r2_score, explained_variance_score, mean_squared_error,mean_absolute_error
from sklearn.model_selection import train_test_split

data = pd.read_csv('data_origin.csv')
# corr() already covers every column, so no positional slice is needed.
a = data.corr()

plt.subplots(figsize=(28, 18), dpi=600)

# Lower-triangle (incl. diagonal) mask. `dtype=bool` replaces the `np.bool` alias, which
# raises AttributeError on NumPy 1.24-1.26.
# The mask is NOT passed to heatmap(), exactly as before, so the full matrix is drawn.
mask = np.zeros_like(a, dtype=bool)
mask[np.tril_indices_from(mask)] = True

# cbar=False suppresses seaborn's own colour bar; one is attached manually below so that
# its tick styling can be controlled.
h = sns.heatmap(
    a,
    annot=True,
    vmax=1,
    square=True,
    cbar_kws={"shrink": 0.8},
    linecolor="black",
    annot_kws={'size': 27, "weight": 'bold'},
    linewidths=2,
    cbar=False,
)
cb = h.figure.colorbar(h.collections[0])
cb.ax.tick_params(labelsize=24, size=2, width=2)
plt.xticks(fontsize=30, rotation=90, weight='bold')
plt.yticks(fontsize=30, rotation=360, weight='bold')
plt.savefig("./Figure", dpi=600, transparent=True)
plt.show()

In [None]:
# Fill missing values with a LightGBM-driven IterativeImputer, then judge the filled data
# set by training a 60-second H2O AutoML run on it and reporting R^2 / RMSE.
from sklearn.experimental import enable_iterative_imputer 
from sklearn.impute import IterativeImputer
from lightgbm import LGBMRegressor
from h2o.automl import H2OAutoML
from sklearn.metrics import r2_score, explained_variance_score, mean_squared_error,mean_absolute_error

data = pd.read_csv('data_origin.csv')

# --- imputation -------------------------------------------------------------------------
LGBR = LGBMRegressor()
imputer_lightgbm = IterativeImputer(estimator=LGBR, max_iter=30, random_state=1)
imputer_lightgbm.fit(data)
imputer_lightgbm_data = pd.DataFrame(
    imputer_lightgbm.transform(data),
    columns=data.columns,
)
imputer_lightgbm_data.to_csv("./imputer_data/imputer_lightgbm_data.csv")

# --- hand-over to H2O -------------------------------------------------------------------
# to_csv() above also wrote the pandas index as the first column; [:, 1:] drops it again.
gujunmu = h2o.upload_file("./imputer_data/imputer_lightgbm_data.csv")[:, 1:]
gujunmu_split = gujunmu.split_frame(ratios=[0.8], seed=1)
gujunmu_train, gujunmu_test = gujunmu_split[0], gujunmu_split[1]

preditors = list(gujunmu.columns)
preditors.remove("Cumulative methane production")

aml60 = H2OAutoML(max_runtime_secs=60)
aml60.train(
    x=preditors,
    y="Cumulative methane production",
    training_frame=gujunmu_train,
    validation_frame=gujunmu_test,
)

# --- evaluation -------------------------------------------------------------------------
# Predictions and observed targets (last column of each frame) are pulled back into pandas.
preds_train = h2o.as_list(aml60.leader.predict(gujunmu_train))
preds = h2o.as_list(aml60.leader.predict(gujunmu_test))
y_train = h2o.as_list(gujunmu_train[:, -1])
y_valid = h2o.as_list(gujunmu_test[:, -1])

score训练集 = round(r2_score(y_train, preds_train), 2)
score测试集 = round(r2_score(y_valid, preds), 2)
rmse_test = round(sqrt(mean_squared_error(y_valid, preds)), 2)
rmse_train = round(sqrt(mean_squared_error(y_train, preds_train)), 2)

result = '''
LightGBM填补结果:
Train R$^2$:{}
Train RMSE: {}
Test R$^2$:{}
Test RMSE: {}
'''.format(score训练集, rmse_train, score测试集, rmse_test)
print(result)

In [None]:
# Fill missing values with a random-forest-driven IterativeImputer, then judge the filled
# data set by training a 60-second H2O AutoML run on it and reporting R^2 / RMSE.
from sklearn.experimental import enable_iterative_imputer 
from sklearn.impute import IterativeImputer
from sklearn.ensemble import RandomForestRegressor
from h2o.automl import H2OAutoML
from sklearn.metrics import r2_score, explained_variance_score, mean_squared_error,mean_absolute_error

data = pd.read_csv('data_origin.csv')

# --- imputation -------------------------------------------------------------------------
RF = RandomForestRegressor()
imputer_rf = IterativeImputer(estimator=RF, max_iter=30, random_state=1)
imputer_rf.fit(data)
imputer_rf_data = pd.DataFrame(
    imputer_rf.transform(data),
    columns=data.columns,
)
# This file is re-used further down as the input of the final AutoML runs.
imputer_rf_data.to_csv("./imputer_data/imputer_rf_data.csv")

# --- hand-over to H2O -------------------------------------------------------------------
# to_csv() above also wrote the pandas index as the first column; [:, 1:] drops it again.
gujunmu = h2o.upload_file("./imputer_data/imputer_rf_data.csv")[:, 1:]
gujunmu_split = gujunmu.split_frame(ratios=[0.8], seed=1)
gujunmu_train, gujunmu_test = gujunmu_split[0], gujunmu_split[1]

preditors = list(gujunmu.columns)
preditors.remove("Cumulative methane production")

aml60 = H2OAutoML(max_runtime_secs=60)
aml60.train(
    x=preditors,
    y="Cumulative methane production",
    training_frame=gujunmu_train,
    validation_frame=gujunmu_test,
)

# --- evaluation -------------------------------------------------------------------------
preds_train = h2o.as_list(aml60.leader.predict(gujunmu_train))
preds = h2o.as_list(aml60.leader.predict(gujunmu_test))
y_train = h2o.as_list(gujunmu_train[:, -1])
y_valid = h2o.as_list(gujunmu_test[:, -1])

score训练集 = round(r2_score(y_train, preds_train), 2)
score测试集 = round(r2_score(y_valid, preds), 2)
rmse_test = round(sqrt(mean_squared_error(y_valid, preds)), 2)
rmse_train = round(sqrt(mean_squared_error(y_train, preds_train)), 2)

result = '''
RF填补结果:
Train R$^2$:{}
Train RMSE: {}
Test R$^2$:{}
Test RMSE: {}
'''.format(score训练集, rmse_train, score测试集, rmse_test)
print(result)

In [None]:
# Fill missing values with a KNN-driven IterativeImputer, then judge the filled data set
# by training a 60-second H2O AutoML run on it and reporting R^2 / RMSE.
from sklearn.experimental import enable_iterative_imputer 
from sklearn.impute import IterativeImputer
from sklearn.neighbors import KNeighborsRegressor
from h2o.automl import H2OAutoML
from sklearn.metrics import r2_score, explained_variance_score, mean_squared_error,mean_absolute_error

data = pd.read_csv('data_origin.csv')

# --- imputation -------------------------------------------------------------------------
KNN = KNeighborsRegressor(n_neighbors=10, p=1)  # p=1 selects the Manhattan distance
imputer_knn = IterativeImputer(max_iter=30, random_state=1, estimator=KNN)
imputer_knn.fit(data)
imputer_knn_data = imputer_knn.transform(data)
imputer_knn_data = pd.DataFrame(imputer_knn_data, columns=data.columns)
# FIX: this used to write `imputer_rf_data` (copy/paste slip), so the "KNN" score below was
# really computed on the random-forest imputation.
imputer_knn_data.to_csv("./imputer_data/imputer_knn_data.csv")

# --- hand-over to H2O -------------------------------------------------------------------
gujunmu = h2o.upload_file("./imputer_data/imputer_knn_data.csv")
gujunmu = gujunmu[:, 1:]  # drop the pandas index column that to_csv() wrote first
gujunmu_split = gujunmu.split_frame(ratios=[0.8], seed=1)
gujunmu_train = gujunmu_split[0]
gujunmu_test = gujunmu_split[1]
preditors = list(gujunmu.columns)
preditors.remove("Cumulative methane production")

aml60 = H2OAutoML(max_runtime_secs=60)
aml60.train(x=preditors, y="Cumulative methane production",
            training_frame=gujunmu_train, validation_frame=gujunmu_test)

# --- evaluation -------------------------------------------------------------------------
preds_train = h2o.as_list(aml60.leader.predict(gujunmu_train))
preds = h2o.as_list(aml60.leader.predict(gujunmu_test))
y_train = h2o.as_list(gujunmu_train[:, -1])  # target is the last column
y_valid = h2o.as_list(gujunmu_test[:, -1])

score训练集 = round(r2_score(y_train, preds_train), 2)
score测试集 = round(r2_score(y_valid, preds), 2)
rmse_test = round(sqrt(mean_squared_error(y_valid, preds)), 2)
rmse_train = round(sqrt(mean_squared_error(y_train, preds_train)), 2)
result = '''
KNN填补结果:
Train R$^2$:{}
Train RMSE: {}
Test R$^2$:{}
Test RMSE: {}
'''.format(score训练集, rmse_train, score测试集, rmse_test)
print(result)

In [None]:
# Load the RF-imputed data set into H2O and build the 80/20 split that all of the
# AutoML runs below train and evaluate on.
gujunmu = h2o.upload_file("./imputer_data/imputer_rf_data.csv")[:, 1:]  # [:, 1:] skips the first (index) column
gujunmu.shape
gujunmu.describe()

gujunmu_split = gujunmu.split_frame(ratios=[0.8], seed=1)
gujunmu_train, gujunmu_test = gujunmu_split[0], gujunmu_split[1]
print(gujunmu_train.shape, gujunmu_test.shape)

# Every column except the target is a predictor.
preditors = list(gujunmu.columns)
preditors.remove("Cumulative methane production")
preditors

In [None]:
# AutoML with a 60 s time budget: train, show the leaderboard, report R^2 / RMSE.
from h2o.automl import H2OAutoML
from sklearn.metrics import r2_score, explained_variance_score, mean_squared_error,mean_absolute_error

aml60 = H2OAutoML(max_runtime_secs=60)
aml60.train(x=preditors, y="Cumulative methane production",
            training_frame=gujunmu_train, validation_frame=gujunmu_test)
print(aml60.leaderboard)
aml60.leader.model_performance(gujunmu_test)

# Predictions and observed targets (last column of each frame) as pandas objects.
preds = h2o.as_list(aml60.leader.predict(gujunmu_test))
preds_train = h2o.as_list(aml60.leader.predict(gujunmu_train))
y_valid = h2o.as_list(gujunmu_test[:, -1])
y_train = h2o.as_list(gujunmu_train[:, -1])

# FIX: the metrics are now computed from THIS run. Previously the cell printed whatever
# values an earlier cell had left behind, and `rmse` was undefined on a fresh kernel.
score训练集 = round(r2_score(y_train, preds_train), 2)
score测试集 = round(r2_score(y_valid, preds), 2)
rmse_train = round(mean_squared_error(y_train, preds_train) ** 0.5, 2)
rmse = round(mean_squared_error(y_valid, preds) ** 0.5, 2)

result = '''
Train R$^2$:{}
Train RMSE: {}
Test R$^2$:{}
Test RMSE: {}
'''.format(score训练集, rmse_train, score测试集, rmse)
print(result)

In [None]:
# AutoML with a 300 s time budget: train, show the leaderboard, report R^2 / RMSE.
from h2o.automl import H2OAutoML
from sklearn.metrics import r2_score, explained_variance_score, mean_squared_error,mean_absolute_error

aml300 = H2OAutoML(max_runtime_secs=300)
aml300.train(x=preditors, y="Cumulative methane production",
             training_frame=gujunmu_train, validation_frame=gujunmu_test)
print(aml300.leaderboard)
aml300.leader.model_performance(gujunmu_test)

# Predictions and observed targets (last column of each frame) as pandas objects.
preds = h2o.as_list(aml300.leader.predict(gujunmu_test))
preds_train = h2o.as_list(aml300.leader.predict(gujunmu_train))
y_valid = h2o.as_list(gujunmu_test[:, -1])
y_train = h2o.as_list(gujunmu_train[:, -1])

# FIX: the metrics are now computed from THIS run. Previously the cell printed whatever
# values an earlier cell had left behind, and `rmse` was undefined on a fresh kernel.
score训练集 = round(r2_score(y_train, preds_train), 2)
score测试集 = round(r2_score(y_valid, preds), 2)
rmse_train = round(mean_squared_error(y_train, preds_train) ** 0.5, 2)
rmse = round(mean_squared_error(y_valid, preds) ** 0.5, 2)

result = '''
Train R$^2$:{}
Train RMSE: {}
Test R$^2$:{}
Test RMSE: {}
'''.format(score训练集, rmse_train, score测试集, rmse)
print(result)

The cells for the remaining time budgets (600, 900, 1200, 1500, 1800 and 2100 s) use the same code and are therefore not repeated here.
The 1500 s run gave the best result, so the model obtained from that run is used in all subsequent steps.

In [None]:
# AutoML with a 1500 s time budget (the best-performing budget): train, show the
# leaderboard, report R^2 / RMSE.
from h2o.automl import H2OAutoML
from sklearn.metrics import r2_score, explained_variance_score, mean_squared_error,mean_absolute_error

aml1500 = H2OAutoML(max_runtime_secs=1500)
aml1500.train(x=preditors, y="Cumulative methane production",
              training_frame=gujunmu_train, validation_frame=gujunmu_test)
print(aml1500.leaderboard)
# Kept under its own name; it used to be stored in `result` and then overwritten below.
performance_1500 = aml1500.leader.model_performance(gujunmu_test)

# Predict with the leader explicitly, matching the 60 s and 300 s cells.
preds = h2o.as_list(aml1500.leader.predict(gujunmu_test))
preds_train = h2o.as_list(aml1500.leader.predict(gujunmu_train))
y_valid = h2o.as_list(gujunmu_test[:, -1])
y_train = h2o.as_list(gujunmu_train[:, -1])

# FIX: the metrics are now computed from THIS run. Previously the cell printed whatever
# values an earlier cell had left behind, and `rmse` was undefined on a fresh kernel.
score训练集 = round(r2_score(y_train, preds_train), 2)
score测试集 = round(r2_score(y_valid, preds), 2)
rmse_train = round(mean_squared_error(y_train, preds_train) ** 0.5, 2)
rmse = round(mean_squared_error(y_valid, preds) ** 0.5, 2)

result = '''
Train R$^2$:{}
Train RMSE: {}
Test R$^2$:{}
Test RMSE: {}
'''.format(score训练集, rmse_train, score测试集, rmse)
print(result)

In [2]:
# Saving the model (run once after the 1500 s AutoML cell; kept for reference):
# model_path = h2o.save_model(model=aml1500.leader, path="./Best_model/", force=True)
# print(model_path)

# Load the previously saved leader. The path is a raw string: the plain literal contained
# backslash sequences such as "\m" and "\B", which are invalid escapes and trigger a
# warning on current Python versions. The resulting path is unchanged.
# NOTE(review): absolute, machine-specific path; the optimisation cell loads the same model
# from "./Best_model/..." -- consider using the relative form here as well.
best_model = h2o.load_model(r"D:\machine learning\水热机器学习\Best_model\GBM_grid_1_AutoML_3_20230310_90755_model_125")
# # load the model
# saved_model = h2o.load_model(model_path)

In [None]:
# Residual analysis of the selected model, evaluated on the complete (train + test) frame.
ra_plot = best_model.residual_analysis_plot(gujunmu, figsize=(8, 5))

bold_label = {"fontsize": 15, "weight": "bold"}
plt.xticks(fontsize=15)
plt.yticks(fontsize=15)
plt.xlabel("Fitted", **bold_label)
plt.ylabel("Residuals", **bold_label)
plt.title("Residual Analysis for Best GBM", **bold_label)

plt.savefig("残差分析图", dpi=600, transparent=True)
plt.show()

In [None]:
# Learning curve of the selected model.
learning_curve_plot = best_model.learning_curve_plot(figsize=(8, 5))

bold_label = {"fontsize": 15, "weight": "bold"}
plt.xticks(fontsize=15)
plt.yticks(fontsize=15)
plt.xlabel("Number of trees", **bold_label)
plt.ylabel("RMSE", **bold_label)
plt.title("Learning Curve for Best GBM", **bold_label)

plt.savefig("学习曲线图", dpi=600, transparent=True)
plt.show()

In [None]:
# Fit graph, part 1: predictions of the selected model and the R^2 / RMSE summary text.
# All imports of this cell are grouped here, in their original relative order.
from sklearn.metrics import r2_score, explained_variance_score, mean_squared_error,mean_absolute_error
import math
import scipy.stats as stats
# `loguniform` used to come from sklearn.utils.fixes, a compatibility shim that is
# (to my knowledge) no longer shipped by recent scikit-learn releases; scipy.stats
# provides the same distribution.
from scipy.stats import loguniform
import pandas as pd
import numpy as np
import sklearn
import seaborn as sns
from pylab import *
from sklearn.neighbors import KNeighborsRegressor
from matplotlib import pyplot as plt
plt.rcParams['font.sans-serif'] =['Times New Roman'] 
plt.rcParams['axes.unicode_minus'] = False
from pandas.plotting import scatter_matrix
from sklearn.preprocessing import StandardScaler, Normalizer, MinMaxScaler
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import KFold
from sklearn.model_selection import train_test_split

# Predictions and observed targets (last column of each frame) as pandas objects.
preds = h2o.as_list(best_model.predict(gujunmu_test))
preds_train = h2o.as_list(best_model.predict(gujunmu_train))
y_valid = h2o.as_list(gujunmu_test[:, -1])
y_train = h2o.as_list(gujunmu_train[:, -1])

# Coefficient of determination, rounded to two decimals for display.
score训练集 = round(r2_score(y_train, preds_train), 2)
score测试集 = round(r2_score(y_valid, preds), 2)
print(score训练集, score测试集)

# Root-mean-square error on the test and training split.
mse = mean_squared_error(y_valid, preds)
rmse = round(sqrt(mse), 2)
mse_train = mean_squared_error(y_train, preds_train)
rmse_train = round(sqrt(mse_train), 2)

# Summary text; the plotting code further down in this cell draws it onto the test figure.
result = '''
Train R$^2$:{}
Train RMSE: {}
Test R$^2$:{}
Test RMSE: {}
'''.format(score训练集, rmse_train, score测试集, rmse)
print(result)
# Fit graph, part 2: observed-vs-predicted joint plots for the training and the test split.
from matplotlib import pyplot as plt

# Internal column names of the two plotting tables (the visible axis labels are set below).
actual_col = 'Actual Biogas Production (m$^3$/t VS)'
predicted_col = 'Predict Biogas Production (m$^3$/t VS)'
x_label = "Actual Cumulative methane production (mL CH$_4$/gVS)"
y_label = "Predict Cumulative methane production (mL CH$_4$/gVS)"

res = pd.concat([y_train, preds_train], axis=1)   # training split
res.columns = [actual_col, predicted_col]
ace1 = pd.concat([y_valid, preds], axis=1)        # test split
ace1.columns = [actual_col, predicted_col]

font2 = {'family' : 'Times New Roman',
'size'   : 20}

# jointplot() creates its own figure, so the empty 600-dpi plt.figure() that used to be
# opened here (and was shown as a blank figure) has been removed. Styling is applied to the
# joint axes explicitly instead of relying on pyplot's "current axes".
ax1 = sns.jointplot(data=res, x=actual_col, y=predicted_col,
                    kind='reg', color='red', xlim=(0, 600), ylim=(0, 600),
                    marker="o", scatter_kws={'s': 50, 'alpha': 0.5})
ax1.ax_joint.tick_params(labelsize=15)
ax1.set_axis_labels(x_label, y_label, fontsize=15, weight="bold")

ax2 = sns.jointplot(data=ace1, x=actual_col, y=predicted_col,
                    kind='reg', color='#20B2AA', xlim=(0, 600), ylim=(0, 600),
                    marker="o", scatter_kws={'s': 70, 'alpha': 0.5})
ax2.ax_joint.tick_params(labelsize=15)
ax2.set_axis_labels(x_label, y_label, fontsize=15, weight="bold")
# The metric summary is drawn on the test figure only; the two PNGs are blended afterwards.
ax2.ax_joint.text(50, 400, result, font2, weight="bold")

ax1.savefig("train.png", dpi=300)
ax2.savefig("test.png", dpi=300)
plt.show()
from PIL import Image
def blend_two_images(train_path="train.png", test_path="test.png",
                     out_path="预测图.png", alpha=0.7, show=True):
    """Overlay the training and test fit plots into a single image.

    Both images are converted to RGBA and mixed with ``Image.blend``; ``alpha`` is the
    weight of the test image (``0.0`` keeps only the training image, ``1.0`` only the
    test image). The result is written to ``out_path``.

    Parameters
    ----------
    train_path, test_path : str
        Input image files. ``Image.blend`` requires both to have the same size.
    out_path : str
        Destination file of the blended image.
    alpha : float
        Blend factor passed to ``Image.blend``.
    show : bool
        If true, open the blended image in the system image viewer, as before.

    Called without arguments, the function behaves exactly like the original
    hard-coded version.
    """
    # Context managers make sure both source files are closed again.
    with Image.open(train_path) as train_img, Image.open(test_path) as test_img:
        img = Image.blend(train_img.convert('RGBA'), test_img.convert('RGBA'), alpha)

    if show:
        img.show()
    img.save(out_path)


blend_two_images()

In [None]:
#calculate MAPE
from sklearn.experimental import enable_iterative_imputer 
from sklearn.impute import IterativeImputer
from sklearn.ensemble import RandomForestRegressor
import pandas as pd
import numpy as np

def mape(actual, predict):
    """Return the mean absolute percentage error between two arrays, in percent.

    Computes ``mean(|actual - predict| / |actual|) * 100`` over all elements.

    Parameters
    ----------
    actual, predict : array-like
        Observed and predicted values. If they hold the same number of elements but in
        different shapes (for example a column ``(n, 1)`` and a vector ``(n,)``), both are
        flattened first. Without this, NumPy would broadcast the pair to ``(n, n)`` and
        average n*n meaningless differences.

    Returns
    -------
    float
        MAPE in percent. Zeros in ``actual`` lead to ``inf``/``nan``, as before.
    """
    actual, predict = np.asarray(actual), np.asarray(predict)
    if actual.shape != predict.shape and actual.size == predict.size:
        actual, predict = actual.ravel(), predict.ravel()
    return np.mean(np.abs((actual - predict) / actual)) * 100

# --- MAPE on the external test file without missing values -------------------------------
software_test = h2o.upload_file("data_software_test.csv")
observed_frame = software_test[:, -1]     # last column: observed target
feature_frame = software_test[:, :-1]     # all other columns: model inputs
actual = np.array(h2o.as_list(observed_frame))
predict = np.array(h2o.as_list(best_model.predict(feature_frame)))
mape_value = mape(actual, predict)
print("MAPE(No Nan): {:.2f}%".format(mape_value))

# --- MAPE on the external test file that contained missing values ------------------------
# The gaps were filled beforehand with the RF imputer; the code that did so is kept
# (commented out) for reference:
# software_test_nan = h2o.upload_file("data_software_test_nan.csv")
# software_test_nan=h2o.as_list(software_test_nan)
# #Use RF filled the nan data
# data = pd.read_csv("./imputer_data/imputer_rf_data.csv")
# data = data.iloc[:,1:]
# RF = RandomForestRegressor()
# imputer_rf = IterativeImputer(max_iter = 30, 
#                            random_state = 1, 
#                            estimator = RF)
# imputer_rf.fit(data) 
# imputer_rf_data_nan = imputer_rf.transform(software_test_nan) 
# imputer_rf_data_nan = pd.DataFrame(imputer_rf_data_nan,columns=data.columns)
# imputer_rf_data_nan =  h2o.H2OFrame.from_python(imputer_rf_data_nan)
imputer_rf_data_nan = h2o.upload_file("imputer_rf_sofatware_nan_data.csv")
observed_nan_frame = imputer_rf_data_nan[:, -1]
feature_nan_frame = imputer_rf_data_nan[:, :-1]
actual_nan = np.array(h2o.as_list(observed_nan_frame))
predict_nan = np.array(h2o.as_list(best_model.predict(feature_nan_frame)))
mape_value_nan = mape(actual_nan, predict_nan)
print("MAPE(Conclude Nan): {:.2f}%".format(mape_value_nan))

In [None]:
# Variable-importance chart of the selected model.
ra_plot = best_model.varimp_plot()

tick_size = 15
plt.xticks(fontsize=tick_size)
plt.yticks(fontsize=tick_size, weight="bold")
plt.show()

In [None]:
# Calculate SHAP values based on the best model
class H2OWrapper:
    """Adapter that exposes an H2O model through a plain ``predict(X)`` method.

    The SHAP explainer in the rest of this cell is handed ``predict`` and calls it with
    array-like data; this class converts that input into an ``H2OFrame`` and returns the
    model's predictions as whatever ``h2o.as_list`` produces.

    Parameters
    ----------
    h2o_best_model : H2O model
        The fitted model to explain (stored as ``self.ag_model``).
    feature_names : sequence of str
        Column names, in order, used when the input arrives without column labels.
    """

    def __init__(self, h2o_best_model, feature_names):
        self.ag_model = h2o_best_model
        self.feature_names = feature_names

    def predict(self, X):
        """Return the wrapped model's predictions for ``X``.

        ``X`` may be a pandas Series (treated as one row), a pandas DataFrame, or any
        array-like accepted by ``pd.DataFrame`` together with ``feature_names``.
        """
        import h2o

        if isinstance(X, pd.Series):
            X = X.values.reshape(1, -1)
        if not isinstance(X, pd.DataFrame):
            X = pd.DataFrame(X, columns=self.feature_names)
        # FIX: every input is converted to an H2OFrame now (a DataFrame used to be passed
        # through unconverted), and the model stored on the instance is used instead of an
        # undefined global named `h2o_best_model`.
        return h2o.as_list(self.ag_model.predict(h2o.H2OFrame(X)))
    
import shap

# The names this cell originally used (`h2o_best_model`, `feature_names`) were never
# defined anywhere in the notebook, so it failed on a fresh kernel. They are bound here.
h2o_best_model = best_model

# NOTE(review): the original read a leftover global `data`. Its `iloc[:, 1:-1]` slice only
# yields the model's predictors if `data` is the RF-imputed CSV as written by pandas
# (index column first, target last), so that file is loaded explicitly -- please confirm.
shap_input = pd.read_csv("./imputer_data/imputer_rf_data.csv")
X_shap = shap_input.iloc[:, 1:-1]
feature_names = list(X_shap.columns)

# Summarise the background data with 10 k-means centroids to keep KernelExplainer tractable.
X_x_summary = shap.kmeans(X_shap, 10)
print("Baseline feature-values: \n", X_x_summary)

ag_wrapper = H2OWrapper(h2o_best_model, feature_names)
explainer = shap.KernelExplainer(ag_wrapper.predict, X_x_summary)
NSHAP_SAMPLES = 100  # how many samples to use to approximate each Shapley value, larger values will be slower
N_VAL = 100  # currently unused: SHAP values are computed for every row of X_shap
shap_values = explainer.shap_values(X_shap, nsamples=NSHAP_SAMPLES)
shap_df = pd.DataFrame(shap_values, columns=X_shap.columns)
shap_df.to_csv('shap_values.csv', index=False)

In [None]:
#Perform Bayesian optimization for directed conditional search
import time
import pandas as pd
import numpy as np
from bayes_opt import BayesianOptimization
def run_secs(n,
             model_path="./Best_model/GBM_grid_1_AutoML_3_20230310_90755_model_125/",
             output_prefix=r"D:\machine learning\水热机器学习\data\data",
             init_points=5,
             n_iter=400):
    """Run ``n`` independent Bayesian-optimisation searches over the process conditions.

    Each search maximises the saved model's prediction and writes its best observation to
    ``<output_prefix><i>.csv`` (``i`` = 0 .. n-1).

    Parameters
    ----------
    n : int
        Number of independent searches.
    model_path : str
        Location of the saved H2O model to optimise against.
    output_prefix : str
        Path prefix of the per-run result files.
    init_points, n_iter : int
        Random initial points and optimisation steps per search, passed to
        ``BayesianOptimization.maximize``.

    The defaults reproduce the original hard-coded behaviour. Relies on the notebook
    global ``preditors`` for the model's column names.
    """
    # Loop-invariant: load the model once instead of once per search.
    best_model = h2o.load_model(model_path)

    def black_box_function(Hydrothermal_temperature, Hydrothermal_time, Particle_size, Solid_to_liquid_ratio,
                           Anaerobic_temperature, Anaerobic_time, Lignin, Cellulose, Hemicellulose):
        """Objective: the model's prediction for one set of process conditions."""
        # The argument order must match the column order in `preditors`.
        X = pd.DataFrame(np.array([Hydrothermal_temperature, Hydrothermal_time, Particle_size, Solid_to_liquid_ratio,
                                   Anaerobic_temperature, Anaerobic_time, Lignin, Cellulose, Hemicellulose]).reshape(1, -1),
                         columns=preditors)
        X = h2o.H2OFrame.from_python(X)
        preds = h2o.as_list(best_model.predict(X))
        return np.squeeze(np.array(preds))

    # Search ranges, limited using the SHAP results and domain experience.
    # Intervals only 0.001 wide effectively pin that variable to a fixed value.
    pbounds = {'Hydrothermal_temperature': (130, 200),
               'Hydrothermal_time': (0, 250),
               "Particle_size": (0, 40),
               'Solid_to_liquid_ratio': (0, 0.2),
               'Anaerobic_temperature': (37.000, 37.001),
               'Anaerobic_time': (15, 40),
               'Lignin': (10.810, 10.811),
               'Cellulose': (34.940, 34.941),
               'Hemicellulose': (25.350, 25.351)
               }

    for i in range(n):
        start = time.time()

        optimizer_rf = BayesianOptimization(
            f=black_box_function,
            pbounds=pbounds,
            verbose=2,  # verbose = 1 prints only when a maximum is observed, verbose = 0 is silent
        )
        optimizer_rf.maximize(
            init_points=init_points,
            n_iter=n_iter,
        )

        print(optimizer_rf.max)
        end = time.time()
        print('Running time: %s Seconds'%(end-start))

        # Output format unchanged: `optimizer_rf.max` is wrapped as a single value and
        # reshaped into one column. TODO(review): consider flattening it into named columns.
        data_name = output_prefix + str(i) + ".csv"
        b = pd.DataFrame(np.array(optimizer_rf.max).reshape(-1, 1))
        b.to_csv(data_name)
run_secs(20)

# Collect the per-run result files written above into one table.
import os
import pandas as pd
file_dir = "D:/machine learning/水热机器学习/data/"

# Only CSV files, in sorted order: os.listdir() returns every directory entry in an
# unspecified order, so a stray file could break the merge or change the row order.
files = sorted(name for name in os.listdir(file_dir) if name.lower().endswith(".csv"))
if not files:
    raise FileNotFoundError("no CSV result files found in " + file_dir)

# Read everything first and concatenate once, instead of growing the frame inside a loop.
df1 = pd.concat(
    [pd.read_csv(os.path.join(file_dir, name)) for name in files],
    axis=0,
    join='inner',
)
print(df1) 
df1.to_csv("Bayes_optimize_data.csv")