In [1]:
#Load needed libraries
#import pychemia as pc
import numpy as np
import re
import os
from ase.io import read, write
import mlmd.tools.readers as readers
import mlmd.tools.builders as builders
import mlmd.tools.storage as storage
import mlmd.machine_learning.mlt as mlt
import mlmd.machine_learning.mlp as mlp
from sklearn.externals import joblib

In [2]:
# Training input file that drives the rest of this notebook
path_to_training_file = 'training_features_fireball.in'

# Read the training file. The loader returns eight values, unpacked here in
# the order it provides them: the path to the DFT data, the code and potential
# names, the feature parameters, and the parameters of the GBR and nn
# energy (E) / force (F) models.
(path_to_dft_data, code, pote_name, feature_parameters,
 GBR_E_parameters, GBR_F_parameters,
 nn_E_parameters, nn_F_parameters) = readers.load_training(path_to_training_file)

In [3]:
# Load the parameters used for the feature calculation.
#
# trans  -> translation dictionary {'chemical symbol': nuclear number Z},
#           e.g. trans={'C': 6, 'Si': 14} for a potential with C and Si
# eta2b  -> values of the eta 2-body parameter in the Filter Behler Parrinello
#           features
# Rp     -> values of the Rp parameter (gaussian centers of the 2-body
#           interaction) in the Filter Behler Parrinello features
# eta3b  -> values of the eta 3-body parameter in the Filter Behler Parrinello
#           features
# cos_p  -> values of the cos(\theta_P) parameter (gaussian centers of the
#           3-body interaction) in the Filter Behler Parrinello features
# validation_percentage -> its first entry is handed to the preprocessing
#           step that creates the training and validation sets
(trans,
 eta2b,
 Rp,
 eta3b,
 cos_p,
 validation_percentage) = (feature_parameters[key] for key in (
    'trans',
    'eta2b',
    'Rp',
    'eta3b',
    'cos_p',
    'validation_percentage',
))

In [4]:
# Load the information stored in the directory with the DFT calculations.
# The path read from the training file is stripped of surrounding whitespace
# before it is handed to the reader.
(stru_symb, stru_name, stru_posi,
 stru_forc, stru_ener) = readers.load_structures_from_xyz_log(
    path_to_dft_data.strip())

# Arrays holding the information needed for training:
#
# stru_symb    -> array of length (number of structures)
# stru_symb[i] -> array with the composition of structure i
#
# stru_posi    -> numpy array of length (number of structures)
# stru_posi[i] -> numpy array with the positions of the atoms in structure i,
#                 shape (number_of_atoms, 3 (xyz coordinates))
#
# stru_forc    -> numpy array of length (number of structures)
# stru_forc[i] -> numpy array with the forces on the atoms in structure i,
#                 shape (number_of_atoms, 3 (xyz coordinates))
#
# stru_ener    -> numpy array of length (number of structures)
# stru_ener[i] -> energy of structure i

In [5]:
# Build the features and their derivatives for every loaded structure.
#
# X       -> Filtered Behler & Parrinello (FBP) feature representation of the
#            structures, dimensions (numb_struc, numb_of_features)
# DX      -> derivative of the FBP features, dimensions
#            (structures, atoms_in_structure, number_of_features, xyz_components)
# feat_2b -> number of 2-body features
# feat_3b -> number of 3-body features
feat_2b, feat_3b, X, DX = builders.build_SIFF_DSIFF(
    trans, eta2b, Rp, eta3b, cos_p,
    stru_symb, stru_name, stru_posi)



In [6]:
# Directory of this experiment; everything saved below goes under it.
dire_expe_name = 'training_data_test_1'

# Store the features, their derivatives, the energies and the forces ...
storage.save_features_E_F(
    feat_2b, feat_3b, dire_expe_name, stru_name,
    eta2b, Rp, eta3b, cos_p,
    stru_forc, stru_ener, X, DX)

# ... and keep the feature parameters next to them.
np.save('%s/feature_parameters' % dire_expe_name, feature_parameters)

# To load features that were already calculated use instead:
#stru_ener, X, stru_forc, DX= readers.load_features_from_file(dire_expe_name)

In [7]:
# GBR energy (E) models: load their parameters from the training file.
# GBR_E_models_to_train[0] is the number of models to train; every other
# entry is indexed by the model number in the training loop.
(GBR_E_models_to_train,
 GBR_E_n_estimators,
 GBR_E_max_depth,
 GBR_E_min_samples_split,
 GBR_E_min_samples_leaf,
 GBR_E_learning_rate) = (GBR_E_parameters[key] for key in (
    'GBR_E_models_to_train',
    'GBR_E_n_estimators',
    'GBR_E_max_depth',
    'GBR_E_min_samples_split',
    'GBR_E_min_samples_leaf',
    'GBR_E_learning_rate',
))

In [8]:
# Train the GBR energy (E) models, select the best one and save it to disk.
#
# GBR_E_models_to_train[0] models are trained, one per parameter set. Each
# candidate gets its own training object, so each one creates its own
# training/validation sets and returns its own energy scaler.
E_mode = []  # trained GBR E models
E_eval = []  # mse validation of each model
E_scal = []  # energy scaler returned for each model
for GBR_i in range(GBR_E_models_to_train[0]):
    # training object for energy and forces
    # (for energy only use: mlt.mlt(stru_ener, X))
    training = mlt.mlt(stru_ener, X, stru_forc, DX)

    # create the training and validation sets and keep the returned scaler
    E_scal.append(training.preprocessing_X_SIFF_return_scaler(validation_percentage[0]))

    # parameters of model number GBR_i
    n_estimators = GBR_E_n_estimators[GBR_i]
    max_depth = GBR_E_max_depth[GBR_i]
    min_samples_split = GBR_E_min_samples_split[GBR_i]
    min_samples_leaf = GBR_E_min_samples_leaf[GBR_i]
    learning_rate = GBR_E_learning_rate[GBR_i]
    parameters_dict = {
        'n_estimators': n_estimators,
        'max_depth': max_depth,
        'min_samples_split': min_samples_split,
        'learning_rate': learning_rate,
        'loss': 'ls',
        'min_samples_leaf': min_samples_leaf,
    }

    mse, GBR_E_mode = training.GBR_train_evaluate_E_model(parameters_dict)
    E_eval.append(mse)
    E_mode.append(GBR_E_mode)

# select the model with the lowest validation mse, together with its scaler
E_sorted = np.argsort(np.array(E_eval))
GBR_E_mode = E_mode[E_sorted[0]]
E_scal = E_scal[E_sorted[0]]

# Save the best GBR model and its scaler.
# The output directory is created only when it is missing, so re-running the
# cell does not fail on a directory left behind by a previous run.
E_dire = '%s/E' % dire_expe_name
if not os.path.isdir(E_dire):
    os.makedirs(E_dire)

filename = '%s/GBR_E.sav' % E_dire
joblib.dump(GBR_E_mode, filename)

filename = '%s/scaler_E.sav' % E_dire
joblib.dump(E_scal, filename)

['training_data_test_1/E/scaler_E.sav']

In [9]:
# GBR force (F) models: load their parameters from the training file.
# GBR_F_models_to_train[0] is the number of models to train; every other
# entry is indexed by the model number in the training loop.
(GBR_F_models_to_train,
 GBR_F_n_estimators,
 GBR_F_max_depth,
 GBR_F_min_samples_split,
 GBR_F_min_samples_leaf,
 GBR_F_learning_rate) = (GBR_F_parameters[key] for key in (
    'GBR_F_models_to_train',
    'GBR_F_n_estimators',
    'GBR_F_max_depth',
    'GBR_F_min_samples_split',
    'GBR_F_min_samples_leaf',
    'GBR_F_learning_rate',
))

In [10]:
# Train the GBR force (F) models, select the best set and save it to disk.
#
# GBR_F_models_to_train[0] candidates are trained, one per parameter set.
# Each candidate consists of three models, one per force component (x, y, z),
# and is scored by the mean of the three validation mse values.
F_mode = []  # trained GBR F models, stored as [x, y, z] per candidate
F_eval = []  # mean mse validation of each candidate
F_scal = []  # force scaler returned for each candidate
for GBR_i in range(GBR_F_models_to_train[0]):
    # training object for energy and forces
    training = mlt.mlt(stru_ener, X, stru_forc, DX)

    # create the training and validation sets and keep the returned scaler
    F_scal.append(training.preprocessing_DX_DSIFF_return_scaler(validation_percentage[0]))

    # parameters of candidate number GBR_i
    n_estimators = GBR_F_n_estimators[GBR_i]
    max_depth = GBR_F_max_depth[GBR_i]
    min_samples_split = GBR_F_min_samples_split[GBR_i]
    min_samples_leaf = GBR_F_min_samples_leaf[GBR_i]
    learning_rate = GBR_F_learning_rate[GBR_i]
    parameters_dict = {
        'n_estimators': n_estimators,
        'max_depth': max_depth,
        'min_samples_split': min_samples_split,
        'learning_rate': learning_rate,
        'loss': 'ls',
        'min_samples_leaf': min_samples_leaf,
    }

    # one model per force component
    mse1, GBR_F_0 = training.GBR_train_evaluate_F_model(0, parameters_dict)  # x
    mse2, GBR_F_1 = training.GBR_train_evaluate_F_model(1, parameters_dict)  # y
    mse3, GBR_F_2 = training.GBR_train_evaluate_F_model(2, parameters_dict)  # z

    F_eval.append((mse1 + mse2 + mse3)/3.0)
    F_mode.append([GBR_F_0, GBR_F_1, GBR_F_2])

# select the candidate with the lowest mean validation mse, with its scaler
F_sorted = np.argsort(np.array(F_eval))
GBR_F_mode = F_mode[F_sorted[0]]
F_scal = F_scal[F_sorted[0]]

# Save the best GBR models and their scalers.
# The output directory is created only when it is missing, so re-running the
# cell does not fail on a directory left behind by a previous run.
F_dire = '%s/F' % dire_expe_name
if not os.path.isdir(F_dire):
    os.makedirs(F_dire)

for comp in range(3):
    filename = '%s/GBR_F_comp_%d.sav' % (F_dire, comp)
    joblib.dump(GBR_F_mode[comp], filename)

# entries 0, 1 and 2 of the selected scaler go to the x, y and z files
filename = '%s/scaler_Fx.sav' % F_dire
joblib.dump(F_scal[0], filename)

filename = '%s/scaler_Fy.sav' % F_dire
joblib.dump(F_scal[1], filename)

filename = '%s/scaler_Fz.sav' % F_dire
joblib.dump(F_scal[2], filename)

['training_data_test_1/F/scaler_Fz.sav']