In [None]:
from sklearn.preprocessing import StandardScaler
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt 
import warnings
warnings.filterwarnings('ignore')

import xgboost as xgb
from sklearn.svm import SVR
import lightgbm as lgb
import seaborn as sns
from sklearn.ensemble import RandomForestRegressor as RFR
from sklearn.model_selection import KFold,cross_validate
from xgboost import XGBRegressor as xc
from hyperopt import hp,fmin,tpe,Trials,partial
from hyperopt.early_stop import no_progress_loss

In [None]:
## read trainset
# Load the training table, discard bookkeeping columns and duplicate compositions,
# then pull out the feature matrix (X) and the D_max target (Y).
df = pd.read_csv('../trainset/features_trainset_all.csv')
df = df.drop(columns=['Unnamed: 0', 'pretty_formula'])
print('data shape：',df.shape)

# Keep only the first row seen for each composition.
df = df.drop_duplicates(subset=['composition'])
print('data shape：',df.shape)

# Columns containing any missing value are removed first, so the row-wise pass
# that follows has nothing left to drop.
df = df.dropna(axis=1).dropna(axis=0)

col_list = df.columns
# Everything from the third column onward is used as a feature.
# NOTE(review): presumably the first two columns are 'composition' and 'D_max' -- confirm.
col_name = col_list[2:]
featur = df[col_name].values
target = df['D_max'].values
X, Y = featur, target

df = df.drop(columns=['composition'])
print('data shape：',df.shape)
df.head()

In [None]:
##normalization
# Target as a pandas Series; features standardised to zero mean / unit variance.
y = df['D_max']

# NOTE(review): the scaler is fit on every row before any cross-validation or
# train/test split below, so held-out rows contribute to the scaling statistics.
scaler = StandardScaler()
X_scaled = pd.DataFrame(scaler.fit_transform(X))
X_scaled.shape

In [None]:
## RF
# Hyperopt search space for the random forest. quniform yields floats on a
# fixed step, so integer-valued parameters are cast where the model is built.
param_grid = dict(
    n_estimators=hp.quniform('n_estimators', 50, 550, 5),
    max_features=hp.quniform('max_features', 3, 29, 1),
    max_depth=hp.quniform('max_depth', 8, 55, 1),
    min_samples_split=hp.quniform('min_samples_split', 2, 10, 1),
    min_impurity_decrease=hp.quniform("min_impurity_decrease", 0, 5, 0.1),
)

def hyperopt_objective(params):
    """Loss minimised by hyperopt for the random forest.

    Builds a forest from one sampled point of the search space and returns
    its mean RMSE over a fixed, shuffled 5-fold split of ``X_scaled`` / ``y``.

    Parameters
    ----------
    params : dict
        Sampled values for ``n_estimators``, ``max_depth``, ``max_features``,
        ``min_samples_split`` and ``min_impurity_decrease``.

    Returns
    -------
    float
        Mean cross-validated RMSE (lower is better).
    """
    forest = RFR(
        n_estimators=int(params['n_estimators']),
        max_depth=int(params['max_depth']),
        max_features=int(params['max_features']),
        min_samples_split=int(params['min_samples_split']),
        min_impurity_decrease=params['min_impurity_decrease'],
        random_state=12138,
        verbose=False,
        n_jobs=10,
    )
    folds = KFold(n_splits=5, shuffle=True, random_state=12138)
    fold_scores = cross_validate(
        forest, X_scaled, y,
        scoring='neg_root_mean_squared_error',
        cv=folds,
        verbose=False,
        n_jobs=10,
        error_score='raise',
    )['test_score']
    # The scorer reports negated RMSE; abs() turns it back into a positive loss.
    return np.abs(fold_scores).mean()


def param_hyperopt(max_evals=100):
    """Run the TPE search over ``param_grid`` using ``hyperopt_objective``.

    Parameters
    ----------
    max_evals : int, default 100
        Upper bound on the number of evaluated parameter sets.

    Returns
    -------
    tuple
        ``(params_best, trials)``: the best parameter dict reported by
        ``fmin`` and the ``Trials`` object holding the search history.
    """
    trials = Trials()
    params_best = fmin(
        hyperopt_objective,
        space=param_grid,
        algo=tpe.suggest,
        max_evals=max_evals,
        verbose=True,
        trials=trials,
        # Give up once 100 consecutive evaluations bring no improvement.
        early_stop_fn=no_progress_loss(100),
    )
    print('\n','best params:',params_best,'\n')
    return params_best, trials

def hyperopt_validation(params):
    """Re-score the tuned random forest with 5-fold cross-validated R^2.

    Parameters
    ----------
    params : dict
        Best hyper-parameters as returned by ``fmin`` (values arrive as
        floats, so integer-valued ones are cast here).

    Returns
    -------
    float
        Mean R^2 over the five folds of ``X_scaled`` / ``y``.
    """
    reg = RFR(n_estimators=int(params['n_estimators']),
              max_depth=int(params['max_depth']),
              max_features=int(params['max_features']),
              min_samples_split=int(params['min_samples_split']),
              # Continuous parameter: keep it as a float so the validated model
              # is the one the search optimised (int() would turn 0.3 into 0).
              min_impurity_decrease=float(params['min_impurity_decrease']),
              random_state=12138,
              verbose=False,
              n_jobs=-1)
    # Different shuffle seed from the search, so the folds are not the tuned ones.
    cv = KFold(n_splits=5, shuffle=True, random_state=2)
    validation_loss = cross_validate(reg, X_scaled, y, scoring='r2', cv=cv,
                                     verbose=False, n_jobs=-1, error_score='raise')
    # R^2 may be negative; average the signed scores so that a badly failing
    # fold is not flipped into a good one by abs().
    return np.mean(validation_loss['test_score'])

In [None]:
# Search with up to 500 evaluations, then display the cross-validated R^2 of
# the best parameters (uses the functions defined in the preceding cell).
params_best, trials = param_hyperopt(max_evals=500)
hyperopt_validation(params_best)

In [None]:
# Repeat of the search: a fresh Trials object is created inside
# param_hyperopt, and params_best / trials are overwritten.
params_best, trials = param_hyperopt(max_evals=500)
hyperopt_validation(params_best)

In [None]:
# Third repeat of the search; its printed result is the one kept below.
params_best, trials = param_hyperopt(max_evals=500)
hyperopt_validation(params_best)

26%|████████████▍                                  | 132/500 [04:45<13:14,  2.16s/trial, best loss: 0.778666724850233]

 best params: {'max_depth': 26.0, 'max_features': 10.0, 'min_impurity_decrease': 0.0, 'min_samples_split': 2.0, 'n_estimators': 280.0} 

0.7589432158826565

In [None]:
##SVM
# Hyperopt search space for the RBF support vector regressor.
param_grid = dict(
    C=hp.quniform('C', 1, 50, 1),
    gamma=hp.quniform('gamma', 0.1, 0.45, 0.005),
    epsilon=hp.quniform('epsilon', 0, 0.2, 0.002),
)

def hyperopt_objective(params):
    """Loss minimised by hyperopt for the RBF-kernel SVR.

    Parameters
    ----------
    params : dict
        Sampled ``C`` (cast to int), ``epsilon`` and ``gamma``.

    Returns
    -------
    float
        Mean RMSE over a fixed, shuffled 5-fold split of ``X_scaled`` / ``y``.
    """
    svr = SVR(
        kernel='rbf',
        C=int(params['C']),
        gamma=params['gamma'],
        epsilon=params['epsilon'],
        verbose=False,
    )
    folds = KFold(n_splits=5, shuffle=True, random_state=12138)
    fold_scores = cross_validate(
        svr, X_scaled, y,
        scoring='neg_root_mean_squared_error',
        cv=folds,
        verbose=False,
        n_jobs=-1,
        error_score='raise',
    )['test_score']
    # Scores are negated RMSE values; report them as a positive loss.
    return np.abs(fold_scores).mean()


def param_hyperopt(max_evals=100):
    """Run the TPE search over ``param_grid`` using ``hyperopt_objective``.

    Parameters
    ----------
    max_evals : int, default 100
        Upper bound on the number of evaluated parameter sets.

    Returns
    -------
    tuple
        ``(params_best, trials)`` from the finished search.
    """
    trials = Trials()
    params_best = fmin(
        hyperopt_objective,
        space=param_grid,
        algo=tpe.suggest,
        max_evals=max_evals,
        verbose=True,
        trials=trials,
        # Stop after 100 evaluations in a row without a better loss.
        early_stop_fn=no_progress_loss(100),
    )
    print('\n','best params:',params_best,'\n')
    return params_best, trials

def hyperopt_validation(params):
    """Re-score the tuned SVR with 5-fold cross-validated R^2.

    Parameters
    ----------
    params : dict
        Best ``C``, ``epsilon`` and ``gamma`` as returned by ``fmin``.

    Returns
    -------
    float
        Mean R^2 over the five folds of ``X_scaled`` / ``y``.
    """
    reg = SVR(C=int(params['C']),
              epsilon=params['epsilon'],
              gamma=params['gamma'],
              kernel='rbf',
              verbose=False)
    # Different shuffle seed from the one used during the search.
    cv = KFold(n_splits=5, shuffle=True, random_state=2)
    validation_loss = cross_validate(reg, X_scaled, y, scoring='r2', cv=cv,
                                     verbose=False, n_jobs=-1, error_score='raise')
    # R^2 may be negative; abs() would report a failing fold as a good one,
    # so the signed scores are averaged.
    return np.mean(validation_loss['test_score'])

In [None]:
# Search with up to 500 evaluations, then display the cross-validated R^2 of
# the best parameters (uses the functions defined in the preceding cell).
params_best, trials = param_hyperopt(max_evals=500)
hyperopt_validation(params_best)

In [None]:
# Repeat of the search: a fresh Trials object is created inside
# param_hyperopt, and params_best / trials are overwritten.
params_best, trials = param_hyperopt(max_evals=500)
hyperopt_validation(params_best)

In [None]:
# Third repeat of the search; its printed result is the one kept below.
params_best, trials = param_hyperopt(max_evals=500)
hyperopt_validation(params_best)

75%|██████████████████████████████████▋           | 377/500 [23:44<07:44,  3.78s/trial, best loss: 0.7807890462189855]

 best params: {'C': 5.0, 'epsilon': 0.03, 'gamma': 0.37} 

0.7524649155888021

In [None]:
##XGBoost
# Hyperopt search space for the XGBoost regressor. quniform returns floats;
# n_estimators and max_depth are cast to int where the model is built.
param_grid_simple = {
    'n_estimators': hp.quniform("n_estimators", 150, 450, 3),
    "learning_rate": hp.quniform("learning_rate", 0.05, 0.3, 0.002),
    "colsample_bytree": hp.quniform("colsample_bytree", 0.3, 1, 0.1),
    "colsample_bynode": hp.quniform("colsample_bynode", 0.1, 1, 0.1),
    "gamma": hp.quniform("gamma", 0, 15, 0.2),
    "reg_lambda": hp.quniform("reg_lambda", 0, 25, 0.5),
    "min_child_weight": hp.quniform("min_child_weight", 0, 50, 0.5),
    "max_depth": hp.quniform("max_depth", 5, 45, 1),
    "subsample": hp.quniform("subsample", 0.5, 1, 0.1),
}

def hyperopt_objective(params):
    """Loss minimised by hyperopt for the XGBoost regressor.

    Parameters
    ----------
    params : dict
        One sampled point of ``param_grid_simple``.

    Returns
    -------
    float
        Mean RMSE over a fixed, shuffled 5-fold split of ``X_scaled`` / ``y``.
    """
    booster = xc(
        objective='reg:squarederror',
        n_estimators=int(params['n_estimators']),
        max_depth=int(params['max_depth']),
        learning_rate=params['learning_rate'],
        subsample=params['subsample'],
        colsample_bytree=params['colsample_bytree'],
        colsample_bynode=params['colsample_bynode'],
        gamma=params['gamma'],
        reg_lambda=params['reg_lambda'],
        min_child_weight=params['min_child_weight'],
        random_state=12138,
        n_jobs=-1,
    )
    folds = KFold(n_splits=5, shuffle=True, random_state=12138)
    fold_scores = cross_validate(
        booster, X_scaled, y,
        scoring='neg_root_mean_squared_error',
        cv=folds,
        verbose=False,
        n_jobs=-1,
        error_score='raise',
    )['test_score']
    # Scores are negated RMSE values; report them as a positive loss.
    return np.abs(fold_scores).mean()

def param_hyperopt(max_evals=100):
    """Run the TPE search over ``param_grid_simple`` using ``hyperopt_objective``.

    Parameters
    ----------
    max_evals : int, default 100
        Upper bound on the number of evaluated parameter sets.

    Returns
    -------
    tuple
        ``(params_best, trials)`` from the finished search.
    """
    trials = Trials()
    params_best = fmin(
        hyperopt_objective,
        space=param_grid_simple,
        algo=tpe.suggest,
        max_evals=max_evals,
        verbose=True,
        trials=trials,
        # Stop after 100 evaluations in a row without a better loss.
        early_stop_fn=no_progress_loss(100),
    )
    print("\n","\n","best params: ", params_best,
          "\n")
    return params_best, trials

def hyperopt_validation(params):
    """Re-score the tuned XGBoost regressor with 5-fold cross-validated R^2.

    Parameters
    ----------
    params : dict
        Best hyper-parameters as returned by ``fmin`` (floats; the
        integer-valued ones are cast here).

    Returns
    -------
    float
        Mean R^2 over the five folds of ``X_scaled`` / ``y``.
    """
    reg = xc(n_estimators=int(params['n_estimators']),
             max_depth=int(params['max_depth']),
             reg_lambda=params['reg_lambda'],
             learning_rate=params['learning_rate'],
             subsample=params['subsample'],
             colsample_bytree=params['colsample_bytree'],
             colsample_bynode=params['colsample_bynode'],
             gamma=params['gamma'],
             min_child_weight=params['min_child_weight'],
             objective='reg:squarederror',
             random_state=12138,
             n_jobs=-1)
    # Different shuffle seed from the one used during the search.
    cv = KFold(n_splits=5, shuffle=True, random_state=2)
    validation_loss = cross_validate(reg, X_scaled, y, scoring='r2', cv=cv,
                                     verbose=False, n_jobs=-1, error_score='raise')
    # R^2 may be negative; abs() would report a failing fold as a good one,
    # so the signed scores are averaged.
    return np.mean(validation_loss['test_score'])

In [None]:
# Search with up to 500 evaluations, then display the cross-validated R^2 of
# the best parameters (uses the functions defined in the preceding cell).
params_best, trials = param_hyperopt(max_evals=500)
hyperopt_validation(params_best)

In [None]:
# Repeat of the search: a fresh Trials object is created inside
# param_hyperopt, and params_best / trials are overwritten.
params_best, trials = param_hyperopt(max_evals=500)
hyperopt_validation(params_best)

In [None]:
# Third repeat of the search; its printed result is the one kept below.
params_best, trials = param_hyperopt(max_evals=500)
hyperopt_validation(params_best)

 24%|██████████▋                                 | 121/500 [26:35<1:23:17, 13.19s/trial, best loss: 0.7304879093621787]

 
 best params:  {'colsample_bynode': 0.9, 'colsample_bytree': 0.9, 'gamma': 0.4, 'learning_rate': 0.056, 'max_depth': 30.0, 'min_child_weight': 1.0, 'n_estimators': 351.0, 'reg_lambda': 10.5, 'subsample': 0.8} 

0.783826239174134

In [None]:
##LightGBM
# Hyperopt search space for the LightGBM regressor. quniform returns floats;
# count-like parameters are cast to int where the model is built.
# NOTE(review): LightGBM appears to apply `subsample` only when
# `subsample_freq` > 0, which is never set in this notebook -- confirm that
# this dimension actually influences the model.
param_grid = dict(
    n_estimators=hp.quniform('n_estimators', 100, 800, 5),
    num_leaves=hp.quniform('num_leaves', 10, 400, 5),
    learning_rate=hp.quniform('learning_rate', 0.1, 0.5, 0.02),
    min_child_samples=hp.quniform('min_child_samples', 1, 40, 1),
    reg_alpha=hp.quniform("reg_alpha", 0, 10, 0.5),
    reg_lambda=hp.quniform("reg_lambda", 0, 100, 2),
    subsample=hp.quniform("subsample", 0.5, 1, 0.1),
)

def hyperopt_objective(params):
    """Loss minimised by hyperopt for the LightGBM regressor.

    Parameters
    ----------
    params : dict
        One sampled point of ``param_grid``.

    Returns
    -------
    float
        Mean RMSE over a fixed, shuffled 5-fold split of ``X_scaled`` / ``y``.
    """
    gbm = lgb.LGBMRegressor(
        n_estimators=int(params['n_estimators']),
        num_leaves=int(params['num_leaves']),
        min_child_samples=int(params['min_child_samples']),
        learning_rate=params['learning_rate'],
        reg_alpha=params['reg_alpha'],
        reg_lambda=params['reg_lambda'],
        subsample=params['subsample'],
        random_state=12138,
        verbose=0,
        n_jobs=-1,
    )
    folds = KFold(n_splits=5, shuffle=True, random_state=12138)
    fold_scores = cross_validate(
        gbm, X_scaled, y,
        scoring='neg_root_mean_squared_error',
        cv=folds,
        verbose=False,
        n_jobs=-1,
        error_score='raise',
    )['test_score']
    # Scores are negated RMSE values; report them as a positive loss.
    return np.abs(fold_scores).mean()

def param_hyperopt(max_evals=100):
    """Run the TPE search over ``param_grid`` using ``hyperopt_objective``.

    Parameters
    ----------
    max_evals : int, default 100
        Upper bound on the number of evaluated parameter sets.

    Returns
    -------
    tuple
        ``(params_best, trials)`` from the finished search.
    """
    trials = Trials()
    params_best = fmin(
        hyperopt_objective,
        space=param_grid,
        algo=tpe.suggest,
        max_evals=max_evals,
        verbose=True,
        trials=trials,
        # Stop after 100 evaluations in a row without a better loss.
        early_stop_fn=no_progress_loss(100),
    )
    print('\n','best params:',params_best,'\n')
    return params_best, trials

def hyperopt_validation(params):
    """Re-score the tuned LightGBM regressor with 5-fold cross-validated R^2.

    Parameters
    ----------
    params : dict
        Best hyper-parameters as returned by ``fmin`` (floats; the
        integer-valued ones are cast here).

    Returns
    -------
    float
        Mean R^2 over the five folds of ``X_scaled`` / ``y``.
    """
    reg = lgb.LGBMRegressor(
             n_estimators=int(params['n_estimators']),
             num_leaves=int(params['num_leaves']),
             min_child_samples=int(params['min_child_samples']),
             learning_rate=params['learning_rate'],
             reg_alpha=params['reg_alpha'],
             reg_lambda=params['reg_lambda'],
             subsample=params['subsample'],
             random_state=12138,
             verbose=int(False),
             n_jobs=-1)
    # Different shuffle seed from the one used during the search.
    cv = KFold(n_splits=5, shuffle=True, random_state=2)
    validation_loss = cross_validate(reg, X_scaled, y, scoring='r2', cv=cv,
                                     verbose=False, n_jobs=-1, error_score='raise')
    # R^2 may be negative; abs() would report a failing fold as a good one,
    # so the signed scores are averaged.
    return np.mean(validation_loss['test_score'])

In [None]:
# Search with up to 500 evaluations, then display the cross-validated R^2 of
# the best parameters (uses the functions defined in the preceding cell).
params_best, trials = param_hyperopt(max_evals=500)
hyperopt_validation(params_best)

In [None]:
# Repeat of the search: a fresh Trials object is created inside
# param_hyperopt, and params_best / trials are overwritten.
params_best, trials = param_hyperopt(max_evals=500)
hyperopt_validation(params_best)

In [None]:
# Third repeat of the search; its printed result is the one kept below.
params_best, trials = param_hyperopt(max_evals=500)
hyperopt_validation(params_best)

 33%|███████████████▎                              | 167/500 [13:52<27:40,  4.99s/trial, best loss: 0.7369564970850893]

 best params: {'learning_rate': 0.12, 'min_child_samples': 6.0, 'n_estimators': 775.0, 'num_leaves': 385.0, 'reg_alpha': 0.5, 'reg_lambda': 92.0, 'subsample': 1.0} 

0.7768789919876828

In [None]:
##Fusion
from sklearn.ensemble import VotingRegressor
from sklearn.model_selection import train_test_split

# Fixed 80/20 hold-out split used to compare the single models with the ensemble.
X_train, X_test, Y_train, Y_test = train_test_split(
    X_scaled, y, train_size=0.8, random_state=12138
)

# Base learners with hard-coded hyper-parameters; the values match the
# "best params" lines printed by the four searches above.
reg_svm = SVR(C=5, epsilon=0.03, gamma=0.37)
reg_rf = RFR(
    n_estimators=280,
    max_depth=26,
    max_features=10,
    min_samples_split=2,
    min_impurity_decrease=0,
    random_state=12138,
)
reg_xgb = xc(
    n_estimators=351,
    max_depth=30,
    learning_rate=0.056,
    gamma=0.4,
    min_child_weight=1,
    reg_lambda=10.5,
    subsample=0.8,
    colsample_bytree=0.9,
    colsample_bynode=0.9,
    random_state=12138,
)
reg_gbm = lgb.LGBMRegressor(
    n_estimators=775,
    num_leaves=385,
    learning_rate=0.12,
    min_child_samples=6,
    reg_alpha=0.5,
    reg_lambda=92,
    subsample=1,
    random_state=12138,
)

# (name, estimator) pairs in the order expected by the ensemble weights.
estimators = [('svm', reg_svm), ('rf', reg_rf), ('xgb', reg_xgb), ('lightgbm', reg_gbm)]

In [None]:
# Unweighted ensemble (plain average of the four base predictions).
VC_hard = VotingRegressor(estimators)
VC_hard.fit(X_train, Y_train)
print('Test',VC_hard.score(X_test,Y_test))
print('Train',VC_hard.score(X_train,Y_train))

# The base learners are fitted explicitly here so they can be scored on their own.
for fitted in (reg_rf, reg_xgb, reg_svm, reg_gbm):
    fitted.fit(X_train, Y_train)

# Train R^2 followed by test R^2 for every single model.
for label, fitted in (('SVM', reg_svm), ('RF', reg_rf), ('XGB', reg_xgb), ('LightGBM', reg_gbm)):
    print(label, fitted.score(X_train, Y_train), fitted.score(X_test, Y_test))

In [None]:
# Search space for the four ensemble weights, each in [0, 1] on a 0.01 grid.
# weight1..weight4 follow the order of `estimators`.
params_space = {
    'weight%d' % idx: hp.quniform('weight%d' % idx, 0, 1, 0.01)
    for idx in range(1, 5)
}

def hyperopt_objective_weight(params):
    """Loss minimised by hyperopt when tuning the ensemble weights.

    Parameters
    ----------
    params : dict
        Sampled ``weight1`` .. ``weight4``, applied to ``estimators`` in order.

    Returns
    -------
    float
        Negated mean R^2 of the weighted ``VotingRegressor`` over a fixed,
        shuffled 5-fold split of ``X_scaled`` / ``y`` (lower is better).
    """
    weights = [params['weight1'], params['weight2'],
               params['weight3'], params['weight4']]

    reg = VotingRegressor(estimators=estimators, n_jobs=-1, weights=weights)
    cv = KFold(n_splits=5, shuffle=True, random_state=12138)
    validation_loss = cross_validate(reg, X_scaled, y, scoring='r2', cv=cv,
                                     verbose=False, n_jobs=-1, error_score='raise')
    # Use the signed R^2: with abs(), a weight vector producing strongly
    # negative R^2 folds would look like an excellent candidate.
    return -np.mean(validation_loss['test_score'])

def param_hyperopt(max_evals=100):
    """Run the TPE search over ``params_space`` using ``hyperopt_objective_weight``.

    Parameters
    ----------
    max_evals : int, default 100
        Upper bound on the number of evaluated weight vectors.

    Returns
    -------
    tuple
        ``(params_best, trials)`` from the finished search.
    """
    trials = Trials()
    params_best = fmin(
        hyperopt_objective_weight,
        space=params_space,
        algo=tpe.suggest,
        max_evals=max_evals,
        verbose=True,
        trials=trials,
        # Stop after 50 evaluations in a row without a better loss.
        early_stop_fn=no_progress_loss(50),
    )
    print("\n","\n","best params: ", params_best,
          "\n")
    return params_best, trials

def hyperopt_validation(params):
    """Score the weighted ensemble with 5-fold cross-validated R^2.

    Parameters
    ----------
    params : tuple or dict
        Either the ``(params_best, trials)`` tuple returned by
        ``param_hyperopt`` or the bare ``params_best`` dict holding
        ``weight1`` .. ``weight4``.

    Returns
    -------
    float
        Mean R^2 over the five folds.
    """
    # The callers pass the whole return value of param_hyperopt; accept the
    # bare dict as well so unpacking at the call site does not break this.
    best = params if isinstance(params, dict) else params[0]
    weights = [best['weight1'], best['weight2'], best['weight3'], best['weight4']]

    reg = VotingRegressor(estimators=estimators, n_jobs=-1, weights=weights)
    cv = KFold(n_splits=5, shuffle=True, random_state=2)
    # NOTE(review): this cross-validates on the 20% hold-out (X_test / Y_test)
    # only, whereas the other validation helpers use X_scaled / y -- confirm
    # that the smaller sample is intended.
    validation_loss = cross_validate(reg, X_test, Y_test, scoring='r2', cv=cv,
                                     verbose=False, n_jobs=-1, error_score='raise')
    # R^2 may be negative; average the signed scores instead of abs().
    return np.mean(validation_loss['test_score'])

In [None]:
# Weight search with up to 300 evaluations. params_best holds the whole
# (best_params, trials) tuple, which hyperopt_validation takes as-is.
params_best = param_hyperopt(max_evals=300)
hyperopt_validation(params_best)

In [None]:
# Repeat of the weight search; params_best (the (best_params, trials) tuple)
# is overwritten.
params_best = param_hyperopt(max_evals=300)
hyperopt_validation(params_best)

In [None]:
# Third repeat of the weight search; its printed result is the one kept below.
params_best = param_hyperopt(max_evals=300)
hyperopt_validation(params_best)

42%|█████████████████▎                       | 127/300 [1:32:10<2:05:33, 43.55s/trial, best loss: -0.8007935717455845]

 
 best params:  {'weight1': 0.47000000000000003, 'weight2': 0.01, 'weight3': 0.8300000000000001, 'weight4': 0.5} 

0.6483590637281165

In [None]:
# Ensemble weights copied (rounded) from the "best params" line printed by the
# weight search; order follows `estimators` (svm, rf, xgb, lightgbm).
weight1, weight2, weight3, weight4 = 0.47, 0.01, 0.83, 0.5
weights = [weight1, weight2, weight3, weight4]
weight_sum = sum(weights)

# Same 80/20 split (same seed) as in the fusion set-up cell.
X_train, X_test, Y_train, Y_test = train_test_split(
    X_scaled, y, train_size=0.8, random_state=12138
)

Voting_soft_weight = VotingRegressor(estimators=estimators, n_jobs=-1, weights=weights)
Voting_soft_weight.fit(X_train, Y_train)

# R^2 on the training part, then on the hold-out part.
print(Voting_soft_weight.score(X_train, Y_train))
print(Voting_soft_weight.score(X_test, Y_test))

|Models|train_score|test_score|
|:--:|:--:|:--:|
|SVM|0.8757|0.7879|
|RF|0.9670|0.7933|
|XGB|0.9908|0.8064|
|LightGBM|0.9994|0.8102|
|avg|0.9802|0.8218|
|opt|0.9851|0.8219|

In [None]:
# Bar chart comparing the hold-out R^2 of each base model with the weighted
# fusion model; the figure is written to 'all_comparision_test_r2.tif'.

# Matplotlib 3.6 renamed the bundled seaborn style sheets to 'seaborn-v0_8-*'
# and later removed the old names, so use whichever one this install provides.
style_name = 'seaborn-v0_8-white' if 'seaborn-v0_8-white' in plt.style.available else 'seaborn-white'
# Style and font settings are applied before the figure is created so that
# they affect the whole figure.
plt.style.use(style_name)
plt.rcParams['font.family'] = 'Times New Roman'
plt.rcParams['font.sans-serif']=['SimHei']
plt.rcParams['axes.unicode_minus'] = False

plt.figure(figsize=(8,8),dpi=200)

# Test-set R^2 in plotting order: SVM, RF, XGBoost, LightGBM, Fusion.
Test=[reg_svm.score(X_test,Y_test),reg_rf.score(X_test,Y_test),reg_xgb.score(X_test,Y_test),
      reg_gbm.score(X_test,Y_test),Voting_soft_weight.score(X_test, Y_test)]

x = [0,0.8,1.6,2.4,3.2]
# `width` was used without ever being defined in this notebook (NameError on a
# fresh kernel). 0.4 is half the 0.8 bar spacing and keeps every bar inside
# the x-limits set below -- adjust if a different look is wanted.
width = 0.4

plt.bar(x, Test, width=width, label='$Test-R^{2}$',color='blue')

plt.ylabel("$R^{2}$",fontsize=15)
plt.xticks(x,['SVM','RF','XGBoost','LightGBM','Fusion'],fontsize=15)
plt.xlim([-0.5,3.6])
# The y-axis is deliberately zoomed in on the range where the scores differ.
plt.ylim([0.75,0.83])
plt.tick_params(axis="y", direction="out", which="major", labelsize=15, length=5)
plt.legend(fontsize=15,loc=2)
plt.savefig('all_comparision_test_r2.tif',bbox_inches='tight',dpi=330)