In [1]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler,PolynomialFeatures
from sklearn.model_selection import train_test_split, GridSearchCV, KFold
from sklearn.pipeline import Pipeline

from sklearn.linear_model import Ridge, LinearRegression,ElasticNet#线性回归
from sklearn.neural_network import MLPRegressor #MLP
from sklearn.ensemble import RandomForestRegressor #RF
from xgboost.sklearn import XGBRegressor #XGB
from sklearn.svm import SVR

In [2]:
# Load the experimental dataset from an Excel workbook.
# The default is the original author's local path; set the HYDROTHERMAL_DATASET
# environment variable to point at the file when running on another machine
# (e.g. a cluster), instead of editing the notebook.
import os

DATA_PATH = os.environ.get(
    "HYDROTHERMAL_DATASET",
    r"C:\Users\yaoyao tang\Desktop\ML-hydrothermal\experiment_dataset.xlsx",
)
df=pd.read_excel(DATA_PATH)

In [3]:
# Preview the first rows to check which columns were read from the workbook.
df.head()

Unnamed: 0,邻苯二胺,络氨酸,浓度,体积,温度,时间,Ex,Em1,Em2,QY
0,0.6,0.4,2,15,200,10,562,624,676,15.5
1,0.8,0.8,4,15,140,12,562,630,678,15.0
2,1.0,1.0,2,15,140,10,562,626,678,19.3
3,0.8,0.8,2,10,180,4,562,624,678,14.6
4,1.0,1.0,6,15,160,12,562,630,678,14.5


In [5]:
# Keep the first six columns (the model inputs) plus the QY target; the remaining
# columns of df (Ex, Em1, Em2 in the preview) are dropped.
# .copy() gives new_df its own data: without it, adding "QY" writes into a slice
# of df, which pandas flags with SettingWithCopyWarning.
new_df=df.iloc[:,:6].copy()
new_df["QY"]=df["QY"]

In [6]:
# Preview the reduced frame: six input columns followed by the QY target.
new_df.head()

Unnamed: 0,邻苯二胺,络氨酸,浓度,体积,温度,时间,QY
0,0.6,0.4,2,15,200,10,15.5
1,0.8,0.8,4,15,140,12,15.0
2,1.0,1.0,2,15,140,10,19.3
3,0.8,0.8,2,10,180,4,14.6
4,1.0,1.0,6,15,160,12,14.5


In [7]:
# Replace the original headers with short aliases, assigned by position
# (column order as displayed by new_df.head()):
#   M1 <- 邻苯二胺 (o-phenylenediamine)
#   M2 <- 络氨酸 (presumably tyrosine, usually written 酪氨酸 - TODO confirm)
#   C  <- 浓度 (concentration), V <- 体积 (volume)
#   T  <- 温度 (temperature),   H <- 时间 (time)
#   QY stays QY. Units are not shown in this notebook.
# NOTE: this mutates new_df in place, so the earlier preview shows the old headers.
new_df.columns = ['M1','M2','C','V', 'T', 'H', 'QY']

In [10]:
# Feature matrix: the label slice M1..H (both ends inclusive with .loc).
# Target vector: the QY column.
feature_frame = new_df.loc[:, "M1":"H"]
target_series = new_df.loc[:, "QY"]
x = np.array(feature_frame)
y = np.array(target_series)

In [12]:
# Sanity check on the target: first five values, then the array shape.
print(y[:5], y.shape, sep="\n")

[15.5 15.  19.3 14.6 14.5]
(200,)


In [13]:
# Evaluation metrics
from sklearn.metrics import r2_score,mean_absolute_error,mean_squared_error
from math import sqrt
def test(y_pred,y_true):
    """Score a set of predictions against the reference values.

    Note the argument order: predictions first, reference values second.

    Parameters
    ----------
    y_pred : array-like
        Values predicted by the model.
    y_true : array-like
        Reference (observed) values.

    Returns
    -------
    numpy.ndarray
        Three values in the order ``[r2, rmse, mae]``.
    """
    scores = (
        r2_score(y_true, y_pred),
        # RMSE is the square root of the mean squared error.
        sqrt(mean_squared_error(y_pred, y_true)),
        mean_absolute_error(y_pred, y_true),
    )
    return np.array(scores)

In [15]:
# Upper-case aliases used by the cross-validation cell; they refer to the same
# arrays as x and y (no copy is made).
X, Y = x, y

In [16]:
import gc

# ---------------------------------------------------------------------------
# Run settings
# ---------------------------------------------------------------------------
save_csv= True
verbose = False
n_jobs=4
# Base seed for the outer splits and the stochastic estimators (XGBoost, MLP),
# so that a rerun produces the same partitions.
RANDOM_SEED = 0

# Cross-validation setup: Ntrials repetitions of an outter_nsplit-fold outer CV,
# each outer training fold being tuned with an inner_nsplit-fold grid search.
Ntrials = 6
outter_nsplit = 4
inner_nsplit = 4
tot_count = Ntrials * outter_nsplit

# ---------------------------------------------------------------------------
# Result stores: one row per (trial, outer fold); columns are [r2, rmse, mae].
# Rows start as NaN rather than 0 so that an interrupted run leaves rows that
# are visibly missing instead of rows that look like real (zero) scores.
# ---------------------------------------------------------------------------
lr_test = np.full((tot_count,3), np.nan)
mlp_test = np.full((tot_count,3), np.nan)
svr_test = np.full((tot_count,3), np.nan)
xgb_test = np.full((tot_count,3), np.nan)
lr_train = np.full((tot_count,3), np.nan)
mlp_train = np.full((tot_count,3), np.nan)
svr_train = np.full((tot_count,3), np.nan)
xgb_train = np.full((tot_count,3), np.nan)

# ---------------------------------------------------------------------------
# Hyper-parameter grids (loop-invariant, so defined once)
# ---------------------------------------------------------------------------
xgb_grid = dict(learning_rate=[0.001,0.01,0.1],
                n_estimators=[10,20, 30,40, 50,60],
                colsample_bylevel = [0.5,0.7,0.9],
                gamma=[0,0.2,0.4],
                max_depth =[3,5,7],
                reg_lambda = [0.1,1,10],
                subsample=[0.4,0.7,1])

# NOTE(review): scikit-learn documents early_stopping as only effective for the
# 'sgd'/'adam' solvers, so it presumably has no effect with 'lbfgs' - confirm.
# The captured output also shows lbfgs hitting its iteration limit; raising
# reg__max_iter may be needed, but the grid is left as the author defined it.
mlp_grid = dict(reg__hidden_layer_sizes=[[6],[13],[6,6],[13,13],[6,13]],
                reg__alpha=[1e-4,1e-3, 1e-2, 1e-1,1],  # L2 penalty (regularization term)
                reg__early_stopping=[True],
                reg__solver= ['lbfgs'])

svr_grid = dict(reg__kernel=['rbf'],
                reg__tol= [1e-3,1e-2,1e-1],
                reg__C=[0.9,1,1.1],
                reg__epsilon=[0,0.1,0.2],
                reg__gamma=[1e-3,1e-2,1e-1,1/6])  # 1/6: presumably 1/n_features - confirm

# Polynomial expansion + ElasticNet (combined L1/L2 regularization).
lr_grid = {'poly__degree':[2,3,4,5,6,7,8],
           'en__alpha':[0.01,0.1,0.3,0.5,0.8],
           'en__l1_ratio':[0.01,0.1,0.3,0.5,0.8]}


def _make_xgb():
    """Return a fresh, seeded XGBoost regressor."""
    return XGBRegressor(random_state=RANDOM_SEED)


def _make_mlp():
    """Return a fresh scaler + seeded MLP pipeline."""
    return Pipeline([
        ('sc', StandardScaler()),
        ('reg', MLPRegressor(random_state=RANDOM_SEED)),
    ])


def _make_svr():
    """Return a fresh scaler + SVR pipeline."""
    return Pipeline([
        ('sc', StandardScaler()),
        ('reg', SVR()),
    ])


def _make_lr():
    """Return a fresh polynomial-features + scaler + ElasticNet pipeline."""
    return Pipeline([
        ('poly', PolynomialFeatures()),
        ('std_scaler', StandardScaler()),
        ('en', ElasticNet()),
    ])


def _tune_and_score(estimator, param_grid, X_train, Y_train, X_test, Y_test):
    """Grid-search `estimator` on the training fold and score the best model.

    Reads the notebook-level settings ``inner_nsplit``, ``verbose`` and
    ``n_jobs``. The search selects by R^2; the refit best estimator is then
    evaluated with ``test`` on both the training and the held-out fold.

    Returns
    -------
    (numpy.ndarray, numpy.ndarray)
        ``[r2, rmse, mae]`` on the training fold and on the test fold.
    """
    search = GridSearchCV(estimator, param_grid=param_grid, cv=inner_nsplit,
                          scoring='r2', verbose=verbose, n_jobs=n_jobs)
    search.fit(X_train, Y_train)
    best_model = search.best_estimator_

    train_scores = test(best_model.predict(X_train), Y_train)
    test_scores = test(best_model.predict(X_test), Y_test)

    # Release the search object (it holds every fitted candidate's results)
    # before the next model is tuned.
    del search
    del best_model
    gc.collect()
    return train_scores, test_scores


# Models are evaluated in this order on every outer fold.
model_specs = [
    (_make_xgb, xgb_grid, xgb_train, xgb_test),
    (_make_mlp, mlp_grid, mlp_train, mlp_test),
    (_make_svr, svr_grid, svr_train, svr_test),
    (_make_lr, lr_grid, lr_train, lr_test),
]

for j in range(Ntrials):
    print("trial = ",j)

    # A different but fixed seed per trial: trials use different partitions,
    # yet the whole experiment is repeatable.
    outer_cv = KFold(n_splits=outter_nsplit, shuffle=True, random_state=RANDOM_SEED + j)

    # split() yields the row indices of the training and test parts of each fold.
    for k, (train_ind, test_ind) in enumerate(outer_cv.split(X, Y)):
        count = j * outter_nsplit + k
        print(str(count), " / ",str(tot_count))

        X_train, Y_train = X[train_ind], Y[train_ind]
        X_test, Y_test = X[test_ind], Y[test_ind]

        for make_estimator, grid, train_store, test_store in model_specs:
            train_store[count], test_store[count] = _tune_and_score(
                make_estimator(), grid, X_train, Y_train, X_test, Y_test)

# X and Y are intentionally NOT deleted here: they are only aliases of x and y,
# so deleting them frees no memory and would make this cell fail when re-run.
gc.collect()

trial =  0
0  /  24


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
  self.n_iter_ = _check_optimize_result("lbfgs", opt_res, self.max_iter)
  model = cd_fast.enet_coordinate_descent(


1  /  24


KeyboardInterrupt: 

In [None]:
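# The local machine is too slow for the full grid search; the computation can be moved to a supercomputer (HPC cluster).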

In [None]:
# Wrap each model's training-fold scores in a DataFrame with named metric columns.
metric_columns = ["r2","rmse","mae"]
lr_results, mlp_results, svr_results, xgb_results = (
    pd.DataFrame(data=scores, columns=metric_columns)
    for scores in (lr_train, mlp_train, svr_train, xgb_train)
)

In [None]:
# Write the training-fold scores to ./result, one CSV per model.
# to_csv does not create missing directories, so make sure the target exists first.
import os

os.makedirs('./result', exist_ok=True)
lr_results.to_csv('./result/rg_train_results.csv', header=True)
mlp_results.to_csv('./result/mlp_train_results.csv', header=True)
svr_results.to_csv('./result/svr_train_results.csv', header=True)
xgb_results.to_csv('./result/xgb_train_results.csv', header=True)

In [None]:
# Wrap each model's held-out (test-fold) scores in a DataFrame with named metric columns.
# This rebinds the *_results names that previously held the training scores.
metric_columns = ["r2","rmse","mae"]
lr_results, mlp_results, svr_results, xgb_results = (
    pd.DataFrame(data=scores, columns=metric_columns)
    for scores in (lr_test, mlp_test, svr_test, xgb_test)
)

In [None]:
# Write the test-fold scores to ./result, one CSV per model.
# to_csv does not create missing directories, so make sure the target exists first.
import os

os.makedirs('./result', exist_ok=True)
lr_results.to_csv('./result/rg_test_results.csv', header=True)
mlp_results.to_csv('./result/mlp_test_results.csv', header=True)
svr_results.to_csv('./result/svr_test_results.csv', header=True)
xgb_results.to_csv('./result/xgb_test_results.csv', header=True)