The main result of this notebook was recomputing all the descriptors for the training data and saving them to a JLD2 file. I also computed a qhat at the end, just to re-familiarize myself with the calibration workflow.

In [None]:
using Revise
using Pkg; Pkg.activate(".")

In [None]:
using Unitful
using PotentialLearning
using Random: randperm
using JLD2
using InteratomicPotentials
using AtomsBase, AtomsCalculators
using Statistics
using CairoMakie, ColorSchemes

In [None]:
# Load the saved ACE committee file and pull out the two entries used in this notebook.
# `ensemble_members` is later passed to `CommitteePotential`; `base_train_idxs` is looked
# up with `(tag, i)` keys to slice each trajectory file down to its training subset.
ace_cmte_dict = load("../cesmix_prez/ace_cmte1.jld2")
ensemble_members = ace_cmte_dict["members"]
base_train_idxs = ace_cmte_dict["base_train_idxs"]

For the PCA and to fit the extensivity model, I need access to the initial training set.
So I need to read in these files again, slice out the appropriate portion of each, and recompute the training descriptors.

In [None]:
"""
    load_base_train(tag, idxs; nfiles=5)

Read `./data/<tag>_<i>.xyz` for `i in 1:nfiles` and return a `Vector{DataSet}` that
holds, for each file, only the configurations selected by `idxs[(tag, i)]`.

# Arguments
- `tag::Symbol`: file-name prefix and first component of the index key
  (`:pristine` or `:frenkel` in this notebook).
- `idxs`: lookup keyed by `(tag, i)` giving the indices to keep from file `i`.

# Keywords
- `nfiles`: number of numbered files to read (defaults to the 5 used here).
"""
function load_base_train(tag::Symbol, idxs; nfiles::Integer=5)
    subsets = Vector{DataSet}()
    for i in 1:nfiles
        println(i)  # progress indicator, one line per file
        configs = load_data("./data/$(tag)_$(i).xyz", ExtXYZ(u"eV", u"Å"))
        push!(subsets, configs[idxs[(tag, i)]])
    end
    return subsets
end

# Both families go through the same helper, so the two loads cannot drift apart.
pristine_base_train = load_base_train(:pristine, base_train_idxs)
frenkel_base_train = load_base_train(:frenkel, base_train_idxs)

In [None]:
# from subsampling_dpp.jl in PL.jl examples
"""
    concat_dataset(confs::Vector{DataSet})

Merge several `DataSet`s into a single `DataSet` containing every configuration of
every input, in input order.
"""
function concat_dataset(confs::Vector{DataSet})
    # Pull each dataset's entries out by integer index, one plain vector per dataset.
    per_dataset = [[ds[k] for k in 1:length(ds)] for ds in confs]
    merged = reduce(vcat, per_dataset)
    return DataSet(merged)
end

In [None]:
# Flatten the per-file training subsets into one DataSet per family (frenkel / pristine).
concat_frenkel_base_train = concat_dataset(frenkel_base_train)
concat_pristine_base_train = concat_dataset(pristine_base_train)

In [None]:
# ACE basis used below to recompute descriptors for the Hf training configurations.
# NOTE(review): these hyperparameters presumably match the basis the committee members
# in ace_cmte1.jld2 were fit with — confirm before mixing these descriptors with them.
ace = ACE(species            = [:Hf],
          body_order         = 4,
          polynomial_degree  = 10,
          wL                 = 1.5,
          csp                = 1.0,
          r0                 = 2.15,
          rcutoff            = 5.0)

In [None]:
# Local (energy) descriptors and force descriptors for the Frenkel training set,
# both evaluated with the `ace` basis.
edescr_frenkel_train = compute_local_descriptors(concat_frenkel_base_train,ace)
fdescr_frenkel_train = compute_force_descriptors(concat_frenkel_base_train,ace)

In [None]:
# Local (energy) descriptors and force descriptors for the pristine training set,
# both evaluated with the `ace` basis.
edescr_pristine_train = compute_local_descriptors(concat_pristine_base_train,ace)
fdescr_pristine_train = compute_force_descriptors(concat_pristine_base_train,ace)

Really need to investigate why the energy descriptors are so much slower than the force descriptors. Something is very obviously wrong here. 

In [None]:
# Broadcast `+` across each dataset and its two descriptor collections, then wrap the
# result in a DataSet.
# NOTE(review): presumably `+` attaches the descriptors to each configuration, so every
# entry carries its energy and force descriptors — confirm against PotentialLearning.
pristine_train_ds = DataSet(concat_pristine_base_train .+ edescr_pristine_train .+ fdescr_pristine_train)
frenkel_train_ds = DataSet(concat_frenkel_base_train .+ edescr_frenkel_train .+ fdescr_frenkel_train)

In [None]:
# Write both descriptor-augmented training sets to a single JLD2 file, keyed by name.
save("training_data.jld2", Dict("frenkel_train_ds"  => frenkel_train_ds,
                                "pristine_train_ds" => pristine_train_ds))

In [None]:
# Pull in the project-local helper files. `includet` comes from Revise, so later edits
# to these files are tracked and picked up without re-including them by hand.
includet("../files/conformal_prediction_utils.jl")
includet("../files/committee_potentials.jl")
includet("../files/committee_qois.jl")

In [None]:
# Wrap the loaded ensemble members as a single committee potential working in eV / Å.
my_cmte = CommitteePotential(ensemble_members; energy_units=u"eV", length_units=u"Å")
# Energy uncertainty metric built on `Statistics.std`, with units stripped.
# NOTE(review): presumably the std of the members' energy predictions — confirm in
# committee_qois.jl.
cmte_energy = CmteEnergy(Statistics.std, strip_units=true)

In [None]:
# Load the previously saved calibration and test datasets for both families.
# The file name suggests they already carry descriptors — TODO confirm.
calibtest_datasets = load("../cesmix_prez/datasets_with_descriptors.jld2")
pristine_base_calib_ds = calibtest_datasets["pristine_base_calib_ds"]
pristine_base_test_ds = calibtest_datasets["pristine_base_test_ds"]
frenkel_base_calib_ds = calibtest_datasets["frenkel_base_calib_ds"]
frenkel_base_test_ds = calibtest_datasets["frenkel_base_test_ds"]

In [None]:
# Second `includet` of conformal_prediction_utils.jl; the same file is already included
# in an earlier cell.
# NOTE(review): likely redundant because Revise tracks the file — confirm before removing.
includet("../files/conformal_prediction_utils.jl")

In [None]:
# Pool the pristine and Frenkel data into one calibration DataSet and one test DataSet.
# `[a; b]` must produce a `Vector{DataSet}` here, since that is what `concat_dataset`
# accepts.
combined_calib_ds = concat_dataset([pristine_base_calib_ds; frenkel_base_calib_ds])
combined_test_ds = concat_dataset([pristine_base_test_ds; frenkel_base_test_ds])

In [None]:
# Calibration-set inputs: committee energy predictions, reference energies, and the
# committee uncertainty metric. All three are passed through `ustrip`, matching how the
# test-set quantities are built, so `calibrate` receives plain numbers in every argument.
ecalib_pred = [
    ustrip(PotentialLearning.potential_energy(config, my_cmte)) for
    config in combined_calib_ds
]
ecalib_ref = [ustrip(get_values(get_energy(config))) for config in combined_calib_ds]
calib_uq = [ustrip(compute(cmte_energy, config, my_cmte)) for config in combined_calib_ds]

# NOTE(review): the last argument (0.1) is presumably the target miscoverage level,
# i.e. a nominal 90% interval — confirm in conformal_prediction_utils.jl.
qhat = calibrate(ecalib_pred, ecalib_ref, calib_uq, 0.1)

In [None]:
# Test-set counterparts of the calibration quantities: committee energy predictions,
# reference energies, and the committee uncertainty metric, all as unit-free numbers.
etest_pred = map(combined_test_ds) do config
    ustrip(PotentialLearning.potential_energy(config, my_cmte))
end
etest_ref = map(config -> ustrip(get_values(get_energy(config))), combined_test_ds)
test_uq = map(config -> ustrip(compute(cmte_energy, config, my_cmte)), combined_test_ds)

num_test = length(etest_pred)
# Absolute prediction error for every test configuration.
test_abs_residuals = @. abs(etest_pred - etest_ref)

In [None]:
# Calibrated error bound for each test configuration: qhat scaled by its uncertainty.
qhat_scores = qhat .* test_uq
# Empirical coverage is the fraction of test points whose absolute residual lies WITHIN
# the bound. Counting residuals that exceed the bound gives the miscoverage rate
# (1 - coverage) instead, so the comparison must be `<=`.
coverage = count(test_abs_residuals .<= qhat_scores) / num_test