In [None]:
using Revise
using Pkg; Pkg.activate(".")

In [None]:
using Unitful
using PotentialLearning
using Random: randperm
using JLD2
using InteratomicPotentials

From all the data, I'm taking the first five runs as training data. Moreover, I want to subsample further, keeping a random 300 of the 1001 configs in each run, i.e. roughly 1 out of every 3 configs (on average about 1 every 900 fs).

In [None]:
# Per-run training subsets for each trajectory type, plus a record of which
# configuration indices were drawn from every run.
pristine_base_train = Vector{DataSet}()
frenkel_base_train = Vector{DataSet}()
base_train_idxs = Dict{Tuple{Symbol, Int64}, Vector{Int64}}()

# Each entry is (dictionary label, destination vector, run number -> file path).
# Pristine runs are handled before Frenkel runs, so the random draws are made in
# that order.
for (label, store, path_for) in (
    (:pristine, pristine_base_train, i -> "./data/pristine_$(i).xyz"),
    (:frenkel, frenkel_base_train, i -> "./data/frenkel_$(i).xyz"),
)
    for i in 1:5
        println(i)
        run_configs = load_data(path_for(i), ExtXYZ(u"eV", u"Å"))

        # Keep a random 300 of the 1001 possible configuration indices.
        picked = randperm(1001)[1:300]

        push!(store, run_configs[picked])
        base_train_idxs[(label, i)] = picked
    end
end

In [None]:
"""
    concat_dataset(confs::Vector{DataSet})

Flatten several `DataSet`s into a single `DataSet`. Configurations keep their
order: every entry of `confs[1]` first, then every entry of `confs[2]`, and so on.

Taken from `subsampling_dpp.jl` in the PL.jl examples.
"""
function concat_dataset(confs::Vector{DataSet})
    # Extract each dataset's entries by index into a plain vector, one per dataset.
    per_dataset = [[ds[k] for k in 1:length(ds)] for ds in confs]
    return DataSet(reduce(vcat, per_dataset))
end

In [None]:
# Merge every per-run subset (pristine runs first, then Frenkel runs) into one DataSet.
base_train = concat_dataset(vcat(pristine_base_train, frenkel_base_train))

Let's first figure out what a reasonable ACE basis is.

In [None]:
# Candidate ACE basis for the single-species (Hf) system. These hyperparameters
# are the trial values evaluated in the cells that follow.
ace = ACE(;
    species = [:Hf],
    body_order = 4,
    polynomial_degree = 10,
    wL = 1.5,
    csp = 1.0,
    r0 = 2.15,
    rcutoff = 5.0,
)

In [None]:
# Shuffle every index of `base_train` once, then carve out disjoint trial subsets.
# The permutation length comes from the dataset itself, so the split stays valid
# if the subsampling that builds `base_train` changes (currently 2 × 5 × 300 = 3000).
trial_indxs = randperm(length(base_train))
# First 1000 shuffled indices: trial training set.
trial_train_indxs = trial_indxs[1:1000]
# All remaining shuffled indices: pool for trial validation.
trial_val_indxs = trial_indxs[1001:end]

Something was very weird about this calculation: the local descriptor calculation was roughly an order of magnitude slower than the force descriptor calculation...

In [None]:
# Local (energy) and force descriptors of the trial training configurations,
# both computed with the `ace` basis.
edescr_train_trial = compute_local_descriptors(base_train[trial_train_indxs], ace)
fdescr_train_trial = compute_force_descriptors(base_train[trial_train_indxs], ace)

# Attach both descriptor sets to their configurations and wrap them as a DataSet.
trial_train_ds = DataSet(
    base_train[trial_train_indxs] .+ edescr_train_trial .+ fdescr_train_trial,
)

In [None]:
# Sanity check: display the first entry of field `b` on the first element of the
# first force-descriptor object computed above.
fdescr_train_trial[1][1].b[1]

In [None]:
# Trial linear-basis potential built on the `ace` basis; its coefficients are
# filled in after the fit below.
trial_lb = LBasisPotential(ace)

# changed to fitting to just energies
# (presumably `ws` is [energy weight, force weight] — confirm against `learn!`)
ws = [100.0, 0.0]
int = false

lp = PotentialLearning.LinearProblem(trial_train_ds)
learn!(lp, ws, int; λ = 0.01)

# Copy the fitted coefficients into the potential; β is resized first so the
# in-place broadcast assignment has matching lengths.
resize!(trial_lb.β, length(lp.β))
trial_lb.β .= lp.β
trial_lb.β0 .= lp.β0


In [None]:
# Shrink the validation pool to its first 500 (already shuffled) indices.
# NOTE: this rebinds `trial_val_indxs`, so the larger pool is gone after this cell runs.
trial_val_indxs = trial_val_indxs[1:500]
# Local (energy) and force descriptors of the validation configurations, computed
# with the same `ace` basis.
edescr_trial_val = compute_local_descriptors(base_train[trial_val_indxs], ace)
fdescr_trial_val = compute_force_descriptors(base_train[trial_val_indxs], ace)

In [None]:
# Attach both descriptor sets to the validation configurations and wrap them as a DataSet.
trial_val_ds = DataSet(
    base_train[trial_val_indxs] .+ edescr_trial_val .+ fdescr_trial_val,
)

In [None]:
# Number of atoms in each validation configuration, counted from its positions.
natoms = map(sys -> length(position(sys)), get_system.(trial_val_ds))

# Reference energies stored in the dataset, and the same values per atom.
e_val_trial_ref = get_all_energies(trial_val_ds)
epa_val_trial_ref = e_val_trial_ref ./ natoms

# Reference forces stored in the dataset.
f_val_trial_ref = get_all_forces(trial_val_ds)

In [None]:
# Energies predicted by the trial potential for the validation set, and per atom.
e_val_trial_pred = get_all_energies(trial_val_ds, trial_lb)
epa_val_trial_pred = e_val_trial_pred ./ natoms

# Forces predicted by the trial potential for the validation set.
f_val_trial_pred = get_all_forces(trial_val_ds, trial_lb)

In [None]:
# Compare predicted and reference per-atom energies on the validation set.
# `@show` prints the whole assignment expression and its value, and the three
# results are bound to `epa_mae`, `epa_rmse`, `epa_rsq` (presumably MAE, RMSE and
# R², going by the names — confirm against `calc_metrics`).
@show epa_mae, epa_rmse, epa_rsq = calc_metrics(epa_val_trial_pred, epa_val_trial_ref) # reran after just fitting to energies

In [None]:
# Same comparison for forces: predicted vs. reference values on the validation set.
# The three results are bound to `f_mae`, `f_rmse`, `f_rsq` and printed by `@show`.
@show f_mae, f_rmse, f_rsq = calc_metrics(f_val_trial_pred, f_val_trial_ref)

In [None]:
# Display the first 10 predicted force values for a quick visual check.
f_val_trial_pred[1:10]

In [None]:
# Display the first 10 reference force values, to compare against the predictions.
f_val_trial_ref[1:10]

OK, let's push forward: we're going to fit 10 ensemble members.

In [None]:
# Local (energy) and force descriptors for the entire base training set.
edescr_base_train = compute_local_descriptors(base_train, ace)
fdescr_base_train = compute_force_descriptors(base_train, ace)

# Attach both descriptor sets to their configurations and wrap them as a DataSet.
full_base_train = DataSet(
    base_train .+ edescr_base_train .+ fdescr_base_train,
)

In [None]:
# Fit a committee of 10 linear-basis potentials, each on its own random subset of
# 750 configurations drawn from `full_base_train`.
members = Vector{LBasisPotential}()
# Member number => indices into `full_base_train` used to fit that member.
relative2base_cmte_indxs = Dict{Int64, Vector{Int64}}()
for i in 1:10
    # Declare the per-member temporaries `local`. In a notebook, assigning to
    # `lp`, `ws` or `int` inside this loop would otherwise overwrite the globals of
    # the same name left by the trial fit.
    local rand_idxs, lp, lb, ws, int

    # Draw from the actual dataset size instead of a hard-coded 3000, so the loop
    # stays valid if the training set changes size.
    rand_idxs = randperm(length(full_base_train))[1:750]
    lp = PotentialLearning.LinearProblem(full_base_train[rand_idxs])

    lb = LBasisPotential(ace)
    ws, int = [1.0, 0.0], false # changed to fitting to just energies
    learn!(lp, ws, int; λ=0.01)

    # Copy the fitted coefficients into the potential; β is resized first so the
    # in-place broadcast assignment has matching lengths.
    resize!(lb.β, length(lp.β))
    lb.β .= lp.β
    lb.β0 .= lp.β0

    push!(members, lb)
    relative2base_cmte_indxs[i] = rand_idxs
end


In [None]:
# Persist the committee together with the index bookkeeping: which configurations
# each member was fitted on, and which configurations were drawn from each run.
save(
    "ace_cmte1.jld2",
    Dict(
        "members" => members,
        "relative2base_cmte_indxs" => relative2base_cmte_indxs,
        "base_train_idxs" => base_train_idxs,
    ),
)