In [None]:
from sklearn import preprocessing
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt 
import warnings
warnings.filterwarnings('ignore')

import xgboost as xgb
from sklearn.svm import SVR
import lightgbm as lgb
import seaborn as sns
from sklearn.ensemble import RandomForestRegressor as RFR
from sklearn.model_selection import KFold,cross_validate
from xgboost import XGBRegressor as xc
from hyperopt import hp,fmin,tpe,Trials,partial
from hyperopt.early_stop import no_progress_loss

In [None]:
## read trainset
# Load the training table and discard the CSV index column and the
# human-readable formula column.
df = pd.read_csv('../../data/features_trainset_NoLaAlC3.csv')
df = df.drop(columns=['Unnamed: 0', 'pretty_formula'])
print('data shape：',df.shape)

# Keep one row per composition string.
df = df.drop_duplicates(subset=['composition'])
print('data shape：',df.shape)

# Drop every column containing a missing value, then any row that still has one.
df = df.dropna(axis=1).dropna(axis=0)

# Everything from the third column onwards is used as a feature.
# NOTE(review): this presumes the first two columns are `composition` and
# `D_max` — confirm against the CSV layout.
col_list = df.columns
col_name = col_list[2:]
featur = df[col_name].values
target = df['D_max'].values
X, Y = featur, target

df = df.drop(columns=['composition'])
print('data shape：',df.shape)
df.head()

In [None]:
##normalization
# Target column, kept as a pandas Series.
y = df['D_max']

# Fit a StandardScaler on the full feature matrix and apply it to the same rows.
# NOTE(review): the scaler is fitted before any cross-validation split, so
# validation folds contribute to the scaling statistics — confirm this is intended.
scaler = preprocessing.StandardScaler()
scaler.fit(X)
X_scaled = pd.DataFrame(scaler.transform(X))
X_scaled.shape

In [None]:
##RF
# Random-forest search space. hp.quniform(label, low, high, q) samples floats
# on a grid of step q; the integer-valued settings are cast with int() where
# the regressor is constructed.
param_grid = dict(
    n_estimators=hp.quniform('n_estimators', 50, 550, 5),
    max_features=hp.quniform('max_features', 3, 29, 1),
    max_depth=hp.quniform('max_depth', 8, 55, 1),
    min_samples_split=hp.quniform('min_samples_split', 2, 10, 1),
    min_impurity_decrease=hp.quniform("min_impurity_decrease", 0, 5, 0.1),
)

def hyperopt_objective(params):
    """Loss for the random-forest search: mean 5-fold RMSE.

    ``params`` is one sample from ``param_grid``. The integer-valued entries
    arrive as floats and are cast before the regressor is built. The scorer
    returns negated RMSE, so the absolute value is averaged to obtain a
    positive loss for ``fmin`` to minimise.
    """
    integer_keys = ('n_estimators', 'max_depth', 'max_features', 'min_samples_split')
    forest_kwargs = {key: int(params[key]) for key in integer_keys}
    reg = RFR(min_impurity_decrease=params['min_impurity_decrease'],
              random_state=12138,
              verbose=False,
              n_jobs=10,
              **forest_kwargs)

    folds = KFold(n_splits=5, shuffle=True, random_state=12138)
    cv_result = cross_validate(reg, X_scaled, y,
                               scoring='neg_root_mean_squared_error',
                               cv=folds, verbose=False, n_jobs=10,
                               error_score='raise')
    fold_rmse = np.abs(cv_result['test_score'])
    return np.mean(fold_rmse)

def param_hyperopt(max_evals=100):
    """Run a TPE search over ``param_grid`` minimising ``hyperopt_objective``.

    The search runs for at most ``max_evals`` trials and stops early after
    100 consecutive trials without improvement. Prints the best setting and
    returns ``(best_params, trials)``.
    """
    history = Trials()
    search_options = dict(
        space=param_grid,
        algo=tpe.suggest,
        max_evals=max_evals,
        verbose=True,
        trials=history,
        early_stop_fn=no_progress_loss(100),
    )
    best = fmin(hyperopt_objective, **search_options)

    print('\n','best params:',best,'\n')
    return best, history

def hyperopt_validation(params):
    """Re-score a tuned random-forest configuration with 5-fold R^2.

    Parameters
    ----------
    params : dict
        Hyperparameters as returned by ``fmin`` (all floats). The
        integer-valued ones are cast with ``int`` before use.

    Returns
    -------
    float
        Mean R^2 over a shuffled 5-fold split seeded with 2, i.e. a different
        partition from the one used by the search objective (seed 12138).
    """
    reg = RFR(n_estimators=int(params['n_estimators']),
              max_depth=int(params['max_depth']),
              max_features=int(params['max_features']),
              min_samples_split=int(params['min_samples_split']),
              # Passed as a float, exactly as the search objective does.
              # int() would truncate e.g. 0.3 to 0 and validate a different
              # model from the one that was tuned.
              min_impurity_decrease=params['min_impurity_decrease'],
              random_state=12138,
              verbose=False,
              n_jobs=-1)
    cv = KFold(n_splits=5, shuffle=True, random_state=2)
    validation_loss = cross_validate(reg, X_scaled, y, scoring='r2', cv=cv,
                                     verbose=False, n_jobs=-1, error_score='raise')
    # R^2 is signed: average the raw fold scores so that a negative (worse
    # than constant) fold is not reported as a good one.
    return np.mean(validation_loss['test_score'])

In [None]:
# Search for up to 500 trials, then display the validation score of the best setting.
params_best, trials = param_hyperopt(max_evals=500)
hyperopt_validation(params_best)

In [None]:
# Repeat of the 500-trial search; displays the validation score of the best setting.
params_best, trials = param_hyperopt(max_evals=500)
hyperopt_validation(params_best)

In [None]:
# Repeat of the 500-trial search; displays the validation score of the best setting.
params_best, trials = param_hyperopt(max_evals=500)
hyperopt_validation(params_best)

 81%|█████████████████████████████████████▎        | 406/500 [15:26<03:34,  2.28s/trial, best loss: 0.7537608826909155]

 best params: {'max_depth': 31.0, 'max_features': 6.0, 'min_impurity_decrease': 0.0, 'min_samples_split': 2.0, 'n_estimators': 190.0} 

0.7689841267336975

In [None]:
##SVM
# SVR search space: each entry is sampled on a uniform grid (last argument = step).
param_grid = dict(
    C=hp.quniform('C', 1, 50, 1),
    gamma=hp.quniform('gamma', 0.1, 0.45, 0.005),
    epsilon=hp.quniform('epsilon', 0, 0.2, 0.002),
)

def hyperopt_objective(params):
    """Loss for the SVR search: mean 5-fold RMSE of an RBF-kernel SVR.

    ``C`` is cast to ``int``; ``epsilon`` and ``gamma`` are used as sampled.
    The scorer returns negated RMSE, so absolute values are averaged.
    """
    svr_settings = {
        'C': int(params['C']),
        'epsilon': params['epsilon'],
        'gamma': params['gamma'],
        'kernel': 'rbf',
        'verbose': False,
    }
    reg = SVR(**svr_settings)

    folds = KFold(n_splits=5, shuffle=True, random_state=12138)
    cv_result = cross_validate(reg, X_scaled, y,
                               scoring='neg_root_mean_squared_error',
                               cv=folds, verbose=False, n_jobs=-1,
                               error_score='raise')
    fold_rmse = np.abs(cv_result['test_score'])
    return np.mean(fold_rmse)

def param_hyperopt(max_evals=100):
    """Run a TPE search over ``param_grid`` minimising ``hyperopt_objective``.

    Stops after ``max_evals`` trials, or earlier once 100 consecutive trials
    bring no improvement. Prints the best setting and returns
    ``(best_params, trials)``.
    """
    history = Trials()
    search_options = dict(
        space=param_grid,
        algo=tpe.suggest,
        max_evals=max_evals,
        verbose=True,
        trials=history,
        early_stop_fn=no_progress_loss(100),
    )
    best = fmin(hyperopt_objective, **search_options)
    print('\n','best params:',best,'\n')
    return best, history

def hyperopt_validation(params):
    """Re-score a tuned SVR configuration with 5-fold R^2.

    Parameters
    ----------
    params : dict
        ``C``, ``epsilon`` and ``gamma`` as returned by ``fmin``; ``C`` is
        cast to ``int``.

    Returns
    -------
    float
        Mean R^2 over a shuffled 5-fold split seeded with 2 (a different
        partition from the one used by the search objective).
    """
    reg = SVR(C=int(params['C']),
              epsilon=params['epsilon'],
              gamma=params['gamma'],
              kernel='rbf',
              verbose=False)
    cv = KFold(n_splits=5, shuffle=True, random_state=2)
    validation_loss = cross_validate(reg, X_scaled, y, scoring='r2', cv=cv,
                                     verbose=False, n_jobs=-1, error_score='raise')
    # R^2 is signed: average the raw fold scores so a negative fold is not
    # counted as a positive one.
    return np.mean(validation_loss['test_score'])

In [None]:
# Search for up to 500 trials, then display the validation score of the best setting.
params_best, trials = param_hyperopt(max_evals=500)
hyperopt_validation(params_best)

In [None]:
# Repeat of the 500-trial search; displays the validation score of the best setting.
params_best, trials = param_hyperopt(max_evals=500)
hyperopt_validation(params_best)

In [None]:
# Repeat of the 500-trial search; displays the validation score of the best setting.
params_best, trials = param_hyperopt(max_evals=500)
hyperopt_validation(params_best)

34%|████████████████                               | 171/500 [08:54<17:09,  3.13s/trial, best loss: 0.753735273678075]

 best params: {'C': 7.0, 'epsilon': 0.052000000000000005, 'gamma': 0.375} 

0.7700961282829023

In [None]:
##XGBoost
def hyperopt_objective(params):
    """Loss for the XGBoost search: mean 5-fold RMSE.

    ``n_estimators`` and ``max_depth`` are cast to ``int``; every other
    sampled value is forwarded unchanged. The scorer returns negated RMSE,
    so absolute values are averaged.
    """
    forwarded = ('reg_lambda', 'learning_rate', 'subsample', 'colsample_bytree',
                 'colsample_bynode', 'gamma', 'min_child_weight')
    booster_kwargs = {name: params[name] for name in forwarded}
    reg = xc(n_estimators=int(params['n_estimators']),
             max_depth=int(params['max_depth']),
             objective='reg:squarederror',
             random_state=12138,
             n_jobs=-1,
             **booster_kwargs)

    folds = KFold(n_splits=5, shuffle=True, random_state=12138)
    cv_result = cross_validate(reg, X_scaled, y,
                               scoring='neg_root_mean_squared_error',
                               cv=folds, verbose=False, n_jobs=-1,
                               error_score='raise')
    fold_rmse = np.abs(cv_result['test_score'])
    return np.mean(fold_rmse)

# XGBoost search space: every entry is sampled on a uniform grid
# (last argument = step size).
param_grid_simple = dict(
    n_estimators=hp.quniform("n_estimators", 150, 450, 3),
    learning_rate=hp.quniform("learning_rate", 0.05, 0.3, 0.002),
    colsample_bytree=hp.quniform("colsample_bytree", 0.3, 1, 0.1),
    colsample_bynode=hp.quniform("colsample_bynode", 0.1, 1, 0.1),
    gamma=hp.quniform("gamma", 0, 15, 0.2),
    reg_lambda=hp.quniform("reg_lambda", 0, 25, 0.5),
    min_child_weight=hp.quniform("min_child_weight", 0, 50, 0.5),
    max_depth=hp.quniform("max_depth", 5, 45, 1),
    subsample=hp.quniform("subsample", 0.5, 1, 0.1),
)

def param_hyperopt(max_evals=100):
    """Run a TPE search over ``param_grid_simple`` minimising ``hyperopt_objective``.

    Stops after ``max_evals`` trials, or earlier once 100 consecutive trials
    bring no improvement. Prints the best setting and returns
    ``(best_params, trials)``.
    """
    history = Trials()
    search_options = dict(
        space=param_grid_simple,
        algo=tpe.suggest,
        max_evals=max_evals,
        verbose=True,
        trials=history,
        early_stop_fn=no_progress_loss(100),
    )
    best = fmin(hyperopt_objective, **search_options)
    print("\n","\n","best params: ", best,
          "\n")
    return best, history

def hyperopt_validation(params):
    """Re-score a tuned XGBoost configuration with 5-fold R^2.

    Parameters
    ----------
    params : dict
        Hyperparameters as returned by ``fmin``; ``n_estimators`` and
        ``max_depth`` are cast to ``int``, the rest are forwarded unchanged.

    Returns
    -------
    float
        Mean R^2 over a shuffled 5-fold split seeded with 2 (a different
        partition from the one used by the search objective).
    """
    reg = xc(n_estimators=int(params['n_estimators']),
             max_depth=int(params['max_depth']),
             reg_lambda=params['reg_lambda'],
             learning_rate=params['learning_rate'],
             subsample=params['subsample'],
             colsample_bytree=params['colsample_bytree'],
             colsample_bynode=params['colsample_bynode'],
             gamma=params['gamma'],
             min_child_weight=params['min_child_weight'],
             objective='reg:squarederror',
             random_state=12138,
             n_jobs=-1)
    cv = KFold(n_splits=5, shuffle=True, random_state=2)
    validation_loss = cross_validate(reg, X_scaled, y, scoring='r2', cv=cv,
                                     verbose=False, n_jobs=-1, error_score='raise')
    # R^2 is signed: average the raw fold scores so a negative fold is not
    # counted as a positive one.
    return np.mean(validation_loss['test_score'])

In [None]:
# Search for up to 500 trials, then display the validation score of the best setting.
params_best, trials = param_hyperopt(max_evals=500)
hyperopt_validation(params_best)

In [None]:
# Repeat of the 500-trial search; displays the validation score of the best setting.
params_best, trials = param_hyperopt(max_evals=500)
hyperopt_validation(params_best)

In [None]:
# Repeat of the 500-trial search; displays the validation score of the best setting.
params_best, trials = param_hyperopt(max_evals=500)
hyperopt_validation(params_best)

49%|██████████████████████▋                       | 246/500 [52:43<54:26, 12.86s/trial, best loss: 0.7039801801565011]

 
 best params:  {'colsample_bynode': 0.5, 'colsample_bytree': 0.7000000000000001, 'gamma': 0.2, 'learning_rate': 0.08600000000000001, 'max_depth': 20.0, 'min_child_weight': 6.5, 'n_estimators': 378.0, 'reg_lambda': 17.0, 'subsample': 0.7000000000000001} 

0.7973721494134384

In [None]:
##LightGBM
# LightGBM search space: every entry is sampled on a uniform grid
# (last argument = step size).
param_grid = dict(
    n_estimators=hp.quniform('n_estimators', 100, 800, 5),
    num_leaves=hp.quniform('num_leaves', 10, 400, 5),
    learning_rate=hp.quniform('learning_rate', 0.1, 0.5, 0.02),
    min_child_samples=hp.quniform('min_child_samples', 1, 40, 1),
    reg_alpha=hp.quniform("reg_alpha", 0, 10, 0.5),
    reg_lambda=hp.quniform("reg_lambda", 0, 100, 2),
    subsample=hp.quniform("subsample", 0.5, 1, 0.1),
)

def hyperopt_objective(params):
    """Loss for the LightGBM search: mean 5-fold RMSE.

    ``n_estimators``, ``num_leaves`` and ``min_child_samples`` are cast to
    ``int``; the remaining sampled values are forwarded unchanged. The scorer
    returns negated RMSE, so absolute values are averaged.
    """
    integer_keys = ('n_estimators', 'num_leaves', 'min_child_samples')
    float_keys = ('learning_rate', 'reg_alpha', 'reg_lambda', 'subsample')
    model_kwargs = {key: int(params[key]) for key in integer_keys}
    model_kwargs.update({key: params[key] for key in float_keys})
    # NOTE(review): LightGBM documents that bagging (`subsample`) only takes
    # effect when `subsample_freq` > 0. No `subsample_freq` is passed here, so
    # this hyperparameter may be inert — confirm.
    reg = lgb.LGBMRegressor(random_state=12138,
                            verbose=0,
                            n_jobs=-1,
                            **model_kwargs)

    folds = KFold(n_splits=5, shuffle=True, random_state=12138)
    cv_result = cross_validate(reg, X_scaled, y,
                               scoring='neg_root_mean_squared_error',
                               cv=folds, verbose=False, n_jobs=-1,
                               error_score='raise')
    fold_rmse = np.abs(cv_result['test_score'])
    return np.mean(fold_rmse)

def param_hyperopt(max_evals=100):
    """Run a TPE search over ``param_grid`` minimising ``hyperopt_objective``.

    Stops after ``max_evals`` trials, or earlier once 100 consecutive trials
    bring no improvement. Prints the best setting and returns
    ``(best_params, trials)``.
    """
    history = Trials()
    search_options = dict(
        space=param_grid,
        algo=tpe.suggest,
        max_evals=max_evals,
        verbose=True,
        trials=history,
        early_stop_fn=no_progress_loss(100),
    )
    best = fmin(hyperopt_objective, **search_options)
    print('\n','best params:',best,'\n')
    return best, history

def hyperopt_validation(params):
    """Re-score a tuned LightGBM configuration with 5-fold R^2.

    Parameters
    ----------
    params : dict
        Hyperparameters as returned by ``fmin``; ``n_estimators``,
        ``num_leaves`` and ``min_child_samples`` are cast to ``int``.

    Returns
    -------
    float
        Mean R^2 over a shuffled 5-fold split seeded with 2 (a different
        partition from the one used by the search objective).
    """
    reg = lgb.LGBMRegressor(
        n_estimators=int(params['n_estimators']),
        num_leaves=int(params['num_leaves']),
        min_child_samples=int(params['min_child_samples']),
        learning_rate=params['learning_rate'],
        reg_alpha=params['reg_alpha'],
        reg_lambda=params['reg_lambda'],
        subsample=params['subsample'],
        random_state=12138,
        verbose=int(False),
        n_jobs=-1)
    cv = KFold(n_splits=5, shuffle=True, random_state=2)
    validation_loss = cross_validate(reg, X_scaled, y, scoring='r2', cv=cv,
                                     verbose=False, n_jobs=-1, error_score='raise')
    # R^2 is signed: average the raw fold scores so a negative fold is not
    # counted as a positive one.
    return np.mean(validation_loss['test_score'])

In [None]:
# Search for up to 500 trials, then display the validation score of the best setting.
params_best, trials = param_hyperopt(max_evals=500)
hyperopt_validation(params_best)

In [None]:
# Repeat of the 500-trial search; displays the validation score of the best setting.
params_best, trials = param_hyperopt(max_evals=500)
hyperopt_validation(params_best)

In [None]:
# Repeat of the 500-trial search; displays the validation score of the best setting.
params_best, trials = param_hyperopt(max_evals=500)
hyperopt_validation(params_best)

53%|████████████████████████▎                     | 264/500 [18:45<16:46,  4.26s/trial, best loss: 0.7086424032980618]

 best params: {'learning_rate': 0.12, 'min_child_samples': 21.0, 'n_estimators': 765.0, 'num_leaves': 85.0, 'reg_alpha': 0.5, 'reg_lambda': 100.0, 'subsample': 0.8} 

0.7972314395038

___

In [None]:
from sklearn.ensemble import VotingRegressor
from sklearn.model_selection import train_test_split
import ternary
# 80/20 hold-out split of the scaled data, with a fixed seed.
X_train, X_test, Y_train, Y_test = train_test_split(
    X_scaled, y, train_size=0.8, random_state=12138)

# Base regressors with hard-coded hyperparameters (the values printed as
# "best params" by the searches in this notebook).
reg_svm = SVR(
    C=7,
    epsilon=0.052,
    gamma=0.375,
)
reg_rf = RFR(
    max_depth=31,
    max_features=6,
    min_impurity_decrease=0,
    min_samples_split=2,
    n_estimators=190,
    random_state=12138,
)
reg_xgb = xc(
    colsample_bynode=0.5,
    colsample_bytree=0.7,
    gamma=0.2,
    learning_rate=0.086,
    max_depth=20,
    min_child_weight=6.5,
    n_estimators=378,
    reg_lambda=17,
    subsample=0.7,
    random_state=12138,
)
reg_gbm = lgb.LGBMRegressor(
    learning_rate=0.12,
    min_child_samples=21,
    n_estimators=765,
    num_leaves=85,
    reg_alpha=0.5,
    reg_lambda=100,
    subsample=0.8,
    random_state=12138,
)

# (name, estimator) pairs; voting weights are matched to this order.
estimators = list(zip(('svm', 'rf', 'xgb', 'lightgbm'),
                      (reg_svm, reg_rf, reg_xgb, reg_gbm)))

In [None]:
# Unweighted VotingRegressor over all four base models (no weights are passed,
# despite the "hard" in the variable name).
VC_hard = VotingRegressor(estimators)
VC_hard.fit(X_train, Y_train)
print('Test',VC_hard.score(X_test,Y_test))
print('Train',VC_hard.score(X_train,Y_train))

# Fit each base model on the same training split for comparison.
for _model in (reg_rf, reg_xgb, reg_svm, reg_gbm):
    _model.fit(X_train, Y_train)

# Print the train score followed by the test score for every model.
for _label, _model in (('SVM', reg_svm), ('RF', reg_rf),
                       ('XGB', reg_xgb), ('LightGBM', reg_gbm)):
    print(_label,_model.score(X_train, Y_train),_model.score(X_test,Y_test))

In [None]:
# One weight in [0, 1] (step 0.01) per base estimator of the voting ensemble.
params_space = {
    label: hp.quniform(label, 0, 1, 0.01)
    for label in ('weight1', 'weight2', 'weight3', 'weight4')
}

def hyperopt_objective_weight(params):
    """Loss for the voting-weight search: negated mean 5-fold R^2.

    Parameters
    ----------
    params : dict
        Sample from ``params_space`` with keys ``weight1`` .. ``weight4``,
        applied in that order to the entries of ``estimators``.

    Returns
    -------
    float
        ``-mean(R^2)`` over the folds, so that minimising the loss maximises R^2.
    """
    weights = [params['weight1'], params['weight2'],
               params['weight3'], params['weight4']]

    reg = VotingRegressor(estimators=estimators, n_jobs=-1, weights=weights)
    cv = KFold(n_splits=5, shuffle=True, random_state=12138)
    validation_loss = cross_validate(reg, X_scaled, y, scoring='r2', cv=cv,
                                     verbose=False, n_jobs=8, error_score='raise')
    # Use the signed scores. With abs(), a strongly negative R^2 would look
    # like a good fit and could be selected by the optimiser.
    return -np.mean(validation_loss['test_score'])

def param_hyperopt(max_evals=100):
    """Run a TPE search over ``params_space`` minimising ``hyperopt_objective_weight``.

    Stops after ``max_evals`` trials, or earlier once 50 consecutive trials
    bring no improvement. Prints the best weights and returns
    ``(best_params, trials)``.
    """
    history = Trials()
    search_options = dict(
        space=params_space,
        algo=tpe.suggest,
        max_evals=max_evals,
        verbose=True,
        trials=history,
        early_stop_fn=no_progress_loss(50),
    )
    best = fmin(hyperopt_objective_weight, **search_options)

    print("\n","\n","best params: ", best,
          "\n")
    return best, history

def hyperopt_validation(params):
    """Re-score a set of voting weights with 5-fold R^2.

    Parameters
    ----------
    params : tuple, list or dict
        Either the ``(best_params, trials)`` pair returned by
        ``param_hyperopt`` (only the first element is used) or the
        ``best_params`` dict itself, with keys ``weight1`` .. ``weight4``.

    Returns
    -------
    float
        Mean R^2 over a shuffled 5-fold split seeded with 2 (a different
        partition from the one used by the weight search).
    """
    # Callers in this notebook pass the whole (best, trials) tuple; a bare
    # dict is accepted as well so the function matches its siblings.
    best = params[0] if isinstance(params, (tuple, list)) else params
    weights = [best['weight1'], best['weight2'], best['weight3'], best['weight4']]

    reg = VotingRegressor(estimators=estimators, n_jobs=-1, weights=weights)
    cv = KFold(n_splits=5, shuffle=True, random_state=2)
    validation_loss = cross_validate(reg, X_scaled, y, scoring='r2', cv=cv,
                                     verbose=False, n_jobs=-1, error_score='raise')
    # R^2 is signed: average the raw fold scores so a negative fold is not
    # counted as a positive one.
    return np.mean(validation_loss['test_score'])

In [None]:
# params_best holds the full (best_params, trials) pair returned by the search.
params_best = param_hyperopt(max_evals=300)
hyperopt_validation(params_best)

In [None]:
# Repeat of the 300-trial weight search; params_best is the (best_params, trials) pair.
params_best = param_hyperopt(max_evals=300)
hyperopt_validation(params_best)

In [None]:
# Repeat of the 300-trial weight search; params_best is the (best_params, trials) pair.
params_best = param_hyperopt(max_evals=300)
hyperopt_validation(params_best)

39%|███████████████▊                         | 116/300 [1:27:28<2:18:45, 45.25s/trial, best loss: -0.8055057048625247]

 
 best params:  {'weight1': 0.44, 'weight2': 0.0, 'weight3': 0.6, 'weight4': 0.72} 

0.8022380089884512

In [None]:
# Hard-coded voting weights, listed in the order of `estimators`
# (svm, rf, xgb, lightgbm).
weights = [0.44, 0, 0.6, 0.72]
weight1, weight2, weight3, weight4 = weights

weight_sum = sum(weights)

# Same 80/20 split (same seed) as used for the individual models.
X_train, X_test, Y_train, Y_test = train_test_split(
    X_scaled, y, train_size=0.8, random_state=12138)

Voting_soft_weight = VotingRegressor(estimators=estimators, n_jobs=-1, weights=weights)
Voting_soft_weight.fit(X_train, Y_train)

# Train score first, then test score.
for _features, _target in ((X_train, Y_train), (X_test, Y_test)):
    print(Voting_soft_weight.score(_features, _target))

In [None]:
# Feature table for the compositions to be predicted.
df_feature = pd.read_csv('../../data/features_LaAlC.csv')

# Keep the composition labels aside before dropping the non-feature columns.
compositions = df_feature['composition']
df_feature = df_feature.drop(columns=['Unnamed: 0', 'composition', 'pretty_formula'])

print('data shape：',df_feature.shape)
df_feature.head()

In [None]:
# Select the training feature columns, in training order, and reuse the
# scaler fitted on the training set.
pred_X = df_feature.loc[:, col_name]
pred_X_transformed = scaler.transform(pred_X)
pred_X_transformed.shape

In [None]:
# Hard-coded voting weights, listed in the order of `estimators`
# (svm, rf, xgb, lightgbm).
weight1 = 0.44
weight2 = 0
weight3 = 0.6
weight4 = 0.72
weights = [weight1, weight2, weight3, weight4]

# Refit the weighted ensemble on every training row before predicting.
Voting_soft_weight = VotingRegressor(estimators=estimators, n_jobs=-1,
                                      weights=weights).fit(X_scaled, y)

# Sum over all four weights, consistent with the hold-out evaluation cell.
weight_sum = sum(weights)

xxx = Voting_soft_weight.predict(pred_X_transformed)
# The 'pretty_formula' column holds the composition labels read from the feature file.
df_pred_LaAlC = pd.DataFrame({'pretty_formula': compositions, 'pred_D_max': xxx})
# Printed explicitly: a bare expression in the middle of a cell is not displayed.
print(df_pred_LaAlC['pred_D_max'].describe())
df_pred_LaAlC.to_csv('LaAlC3_fusion_D_max.csv')

# Enumerate every integer composition (i, j, k) with i + j + k = 100.
comp_1 = []
comp_2 = []
comp_3 = []
points = []
for i in range(101):
    for j in range(101 - i):
        k = 100 - i - j
        comp_1.append(i)
        comp_2.append(j)
        comp_3.append(k)
        points.append((i, j, k))

D_max = df_pred_LaAlC['pred_D_max'].values
# Predictions are matched to grid points purely by position.
# NOTE(review): this assumes the rows of features_LaAlC.csv follow the same
# (i, j, k) enumeration order as the loops above — confirm.
data = {points[idx]: value for idx, value in enumerate(D_max)}

scale = 100
figure, tax = ternary.figure(scale=scale)
figure.set_size_inches((10, 8))
figure.set_facecolor('w')
tax.boundary(linewidth=1.5)
tax.gridlines(color='blue', multiple=10, linewidth=0.5, alpha=0.7)
tax.ticks(axis='lbr', linewidth=1, multiple=20, fontsize=20, offset=0.02)

# Hide the default matplotlib frame and ticks; the ternary axes draw their own.
tax.clear_matplotlib_ticks()
tax.get_axes().axis('off')

tax.heatmap(data, scale=scale, style="h", vmin=min(D_max), vmax=max(D_max),
            cmap='coolwarm', use_rgba=False, colorbar=True)
tax.left_axis_label(r"$\leftarrow$ C", fontsize=30, offset=0.12)
tax.right_axis_label(r"$\leftarrow$ Al", fontsize=30, offset=0.12)
tax.bottom_axis_label("La "+r"$\rightarrow$", fontsize=30, offset=0.04)
tax.savefig('LaAlC3_fusion_D_max.jpg', bbox_inches='tight')