In [None]:
using Revise

using PotentialLearning, InteratomicPotentials
using Unitful
using Random
using AtomsBase
using DelimitedFiles
using Statistics: mean
using StatsBase
using Clustering, Distances
using Trapz

using CairoMakie

In [None]:
# ACE descriptor basis for the four species kept in this study.
# Disabled alternatives from earlier experiments: species = [:C, :H, :O, :N, :F],
# body_order = 4, polynomial_degree = 16.
ace = ACE(;
    species = [:C, :H, :O, :N],
    body_order = 3,
    polynomial_degree = 10,
    wL = 2.0,
    csp = 1.0,
    r0 = 1.43,
    rcutoff = 4.4,
)
# Size of the basis (presumably the number of descriptor components).
length(ace)

In [None]:
# Read the QM9 extended-XYZ file (eV / Å units) and drop every configuration whose
# system contains a fluorine atom.
qm9_file = "../files/QM9/qm9_fullset_alldata.xyz"
raw_data = load_data(qm9_file, ExtXYZ(u"eV", u"Å"))
raw_data = [cfg for cfg in raw_data if :F ∉ atomic_symbol(get_system(cfg))]

Removing structures with Fluorine results in 1,1923 fewer configs

In [None]:
# Re-apply the fluorine filter and wrap the surviving configurations in a `DataSet`.
raw_data = DataSet([cfg for cfg in raw_data if :F ∉ atomic_symbol(get_system(cfg))])

In [None]:
# Fixed-seed shuffle of all configuration indices; every later split is a slice of it.
master_perm_idxs = randperm(Xoshiro(1), length(raw_data))

In [None]:
# The head of the shuffled indices is the pool training sets are drawn from;
# everything after it is held out.
max_num_train = 120_001
possible_training_idxs = master_perm_idxs[begin:max_num_train]
possible_test_idxs = master_perm_idxs[(max_num_train + 1):end]

In [None]:
# Training-set size for this run (120_000 is the disabled alternative).
num_train = 40_000
train_idxs = possible_training_idxs[begin:num_train]

In [None]:
# Display the training subset selected by `train_idxs`.
raw_data[train_idxs]

Only fitted this once, then I skip the `ooc_learn_eonly!` routine in future runs of this notebook


In [None]:
# Linear basis potential built on the ACE basis defined earlier in the notebook.
lb = LBasisPotential(ace)
# One-off energy-only fit on the training subset, left disabled after the first run:
#_AtWA, _AtWb = PotentialLearning.ooc_learn_eonly!(lb, raw_data[train_idxs];symmetrize=false, λ=0.01, pbar=false)

In [None]:
# One-off export of the fitted coefficients, left disabled:
#open("qm9_4elem_3body_poly10_fit40K.txt", "w") do io
#    writedlm(io, lb.β)
#end

# Overwrite the coefficients of `lb` in place with the values stored in the text file.
lb.β .= readdlm("qm9_4elem_3body_poly10_fit40K.txt", Float64)
#open("qm9_4elem_3body_poly10_fit40K.txt", "r") do io
#    readdlm(io, lb.β)
#end

In [None]:
# Display the coefficient vector of `lb`.
lb.β

In [None]:
# Reference energies of the held-out pool, ordered like `possible_test_idxs`.
etest_ref = get_all_energies(raw_data[possible_test_idxs])

In [None]:
# Local descriptors of every held-out configuration under the basis of `lb` ...
etest_local_descrs = compute_local_descriptors(raw_data[possible_test_idxs],lb.basis)
# ... attached to the configurations so predictions can be evaluated from the dataset.
ds_test = DataSet(raw_data[possible_test_idxs] .+ etest_local_descrs)

In [None]:
# Energies predicted by `lb` for the held-out pool (same order as `etest_ref`).
etest_pred = get_all_energies(ds_test,lb)

In [None]:
# Error metrics of predicted vs. reference energies (by their names, presumably MAE,
# RMSE and R² — confirm against `calc_metrics`). `@show` prints the whole tuple
# assignment together with its value.
@show e_mae, e_rmse, e_rsq = calc_metrics(etest_pred,etest_ref)

(e_mae, e_rmse, e_rsq) = calc_metrics(etest_pred, etest_ref) = (0.04422691459021048, 0.06343019540788734, 0.9999609908582189)

In [None]:
# System of the second training configuration, used below as a quick example.
test_sys = get_system(raw_data[train_idxs[2]])

In [None]:
# Mean of the local descriptors of `test_sys` (presumably one descriptor vector per
# atom, so this is the per-configuration feature used for the distance metric).
mean(InteratomicPotentials.compute_local_descriptors(test_sys, lb.basis))

Some notes about computing the distance vector: 
- feature vector for each config is averaged over atoms (not summed)
- feature vector is standardized when generating the k-means cluster
- k-means does appear to use the Euclidean distance (they pass "Minkowski", which seems to default to p=2)
- when getting the final distance metric, they take the average distance between all cluster centers

In [None]:
# One feature vector per training configuration: the mean of its local descriptors.
# A progress counter is printed every 100 configurations.
mean_feature_perconfig = Vector{Float64}[]
for (i, config) in enumerate(raw_data[train_idxs])
    i % 100 == 0 && println(i)
    descriptors = InteratomicPotentials.compute_local_descriptors(
        get_system(config), lb.basis
    )
    push!(mean_feature_perconfig, mean(descriptors))
end

In [None]:
# Stack the per-configuration feature vectors as columns (features × configurations).
mean_train_features = reduce(hcat,mean_feature_perconfig)


In [None]:
# Fit a z-score standardization on the training features. With `dims=2` the statistics
# are taken row-wise, i.e. per feature, since configurations are stored as columns.
dt = StatsBase.fit(ZScoreTransform, mean_train_features, dims=2)
# Standardized training features, used to build the k-means clusters.
std_mean_train_features = StatsBase.transform(dt,mean_train_features)

In [None]:
# Display the per-feature means stored in the fitted transform.
dt.mean

In [None]:
# NOTE: despite its name, `num_neighbors` is passed to `kmeans` as the number of clusters.
num_neighbors = 10
# Seeded k-means with Euclidean distance on the standardized training features.
km = kmeans(std_mean_train_features, num_neighbors, distance=Distances.Euclidean(), rng=Xoshiro(1))

In [None]:
# Display the cluster centers (used column-wise by the distance computations below).
km.centers

In [None]:
# One feature vector per held-out configuration: the mean of its local descriptors.
# A progress counter is printed every 100 configurations.
mean_test_feature_perconfig = Vector{Float64}[]
for (i, config) in enumerate(raw_data[possible_test_idxs])
    i % 100 == 0 && println(i)
    descriptors = InteratomicPotentials.compute_local_descriptors(
        get_system(config), lb.basis
    )
    push!(mean_test_feature_perconfig, mean(descriptors))
end


In [None]:
# Stack the held-out feature vectors as columns (features × configurations).
mean_test_features = reduce(hcat, mean_test_feature_perconfig)
# Standardize with the transform fitted on the TRAINING features, not a fresh fit.
std_mean_test_features = StatsBase.transform(dt,mean_test_features)

In [None]:
# Worked example on the first held-out configuration: mean Euclidean distance from its
# standardized feature vector to every cluster center (columns of `km.centers`).
example_test_vec = std_mean_test_features[:, 1]
mean([Distances.euclidean(example_test_vec, center) for center in eachcol(km.centers)])

In [None]:
"""
    my_distance(test_vec)

Return the mean Euclidean distance between `test_vec` and the columns of the global
`km.centers`.

Taking the `minimum` over the centers instead of the `mean` is an alternative that is
not used here.
"""
function my_distance(test_vec)
    center_distances = [
        Distances.euclidean(test_vec, center) for center in eachcol(km.centers)
    ]
    dist = mean(center_distances)
    return dist
end

In [None]:
# Apply `my_distance` to every column: one distance per held-out configuration, in
# `possible_test_idxs` order. Later cells index the result linearly.
test_feature_distances = mapslices(my_distance, std_mean_test_features, dims=1)

In [None]:
# Split the held-out pool: the leading fraction calibrates the conformal quantile and
# the remainder is the final test set.
fraction_calib = 0.1
num_calib = floor(Int64, fraction_calib * length(possible_test_idxs))
num_test = length(possible_test_idxs) - num_calib

# These index `raw_data`. `etest_*` and `test_feature_distances` are already ordered like
# `possible_test_idxs`, so those are sliced with `1:num_calib` / `num_calib+1:end` instead.
calib_idxs = possible_test_idxs[begin:num_calib]
test_idxs = possible_test_idxs[(num_calib + 1):end]

# Atom count of every calibration / test configuration.
calib_num_atoms = [length(get_system(cfg)) for cfg in raw_data[calib_idxs]]
test_num_atoms = [length(get_system(cfg)) for cfg in raw_data[test_idxs]]

In [None]:
# Display the held-out indices (into `raw_data`).
possible_test_idxs

The Medford paper takes the energy normalized by the number of atoms, rather than the raw energy, as its quantity of interest. I suspect that it doesn't make much of a difference for this dataset, since the numbers of atoms are pretty similar across configurations, but for very large differences in system size it probably starts to matter.

In [None]:
# Conformal score of each calibration configuration: absolute energy residual divided
# by its feature-space distance.
calib_scores = let calib = 1:num_calib
    residuals = abs.(etest_pred[calib] .- etest_ref[calib])
    residuals ./ test_feature_distances[calib]
end

In [None]:
# Target miscoverage rate.
alpha = 0.05
# Finite-sample corrected quantile level ceil((n + 1)(1 - alpha)) / n. It can exceed 1 for
# a small calibration set or a small alpha, which would make `quantile` throw, so it is
# clamped to [0, 1] exactly as in the coverage sweep later in the notebook.
q_hat = quantile(
    calib_scores, clamp(ceil((num_calib + 1) * (1 - alpha)) / num_calib, 0.0, 1.0)
)

In [None]:
# Absolute residuals of the final test split (everything after the calibration part) ...
test_abs_residuals = map(
    abs, etest_pred[(num_calib + 1):end] .- etest_ref[(num_calib + 1):end]
)
# ... and the matching interval half-widths: calibrated quantile times feature distance.
qhat_scores = q_hat .* test_feature_distances[(num_calib + 1):end]

In [None]:
# Observed miscoverage: fraction of test residuals that exceed their interval half-width.
count(test_abs_residuals .> qhat_scores) / num_test

In [None]:
# Histogram of the feature-space distances of the final test split (100 bins).
hist(test_feature_distances[num_calib+1:end],bins=100)
# I'm not sure, quantitatively, what constitutes being sufficiently adaptive.
# Ultimately it seems to depend on the dataset in addition to the score function.

In [None]:
# Display the calibration scores.
calib_scores

In [None]:
# Sweep nominal coverage levels from 1% to 99% and record, for each, the observed
# miscoverage on the test split.
alpha_complements = collect(range(0.01, 0.99, step=0.01))
alpha_refs = 1 .- alpha_complements

# Loop-invariant: feature-space distances of the final test split.
heldout_distances = test_feature_distances[(num_calib + 1):end]

predicted_alphas = Float64[]
for ac in alpha_complements
    # Deliberately NOT named `alpha`: in a notebook (soft scope) assigning `alpha` here
    # would silently overwrite the global `alpha` used for `q_hat`.
    miscoverage = 1 - ac
    level = clamp(ceil((num_calib + 1) * (1 - miscoverage)) / num_calib, 0.0, 1.0)
    qh = quantile(calib_scores, level)

    # Fraction of test residuals that fall outside the interval of half-width qh * distance.
    push!(predicted_alphas, sum(test_abs_residuals .> qh .* heldout_distances) / num_test)
end


In [None]:
"""
    make_calibration_plot(expected_ps, observed_ps; width=600)

Build a square calibration figure of `observed_ps` against `expected_ps`.

Both inputs are fractions and are shown as percentages. The figure contains the observed
curve, a dashed diagonal reference and a shaded band between the two. Both axes are fixed
to 0–100 with a tick every 10%.

# Keywords
- `width`: side length of the (square) figure.

# Returns
The `Figure` holding the plot.
"""
function make_calibration_plot(expected_ps, observed_ps; width=600)
    expected_pct = expected_ps .* 100
    observed_pct = observed_ps .* 100

    # Tick labels such as "10%"; ticks are integers because they are fixed to 0:10:100.
    percent_labels(ticks) = ["$(Int(t))%" for t in ticks]

    fig = Figure(resolution=(width, width))
    ax = Axis(
        fig[1, 1];
        aspect=DataAspect(),
        xlabel="Expected conf. level",
        ylabel="Observed conf. level",
        limits=(0, 100, 0, 100),
        xticks=0:10:100,
        yticks=0:10:100,
        xtickformat=percent_labels,
        ytickformat=percent_labels,
    )

    # Observed curve, then the ideal diagonal, then the area between them.
    lines!(ax, expected_pct, observed_pct)
    lines!(ax, expected_pct, expected_pct; linestyle=:dash, alpha=0.4)
    band!(ax, expected_pct, expected_pct, observed_pct; color=(:blue, 0.2))

    return fig
end

In [None]:
"""
    compute_miscalibration_area(expected_ps, observed_ps)

Return the area between the observed calibration curve and the ideal diagonal.

The curve is split at consecutive points of `expected_ps`. On each interval the
trapezoid-rule area under the observed curve is compared with the area under the
diagonal, and the absolute differences are summed. `expected_ps` may be increasing or
decreasing.

Each interval is visited exactly once. The earlier version integrated over overlapping
three-point windows, which counted interior intervals twice, let deviations of opposite
sign cancel inside a window, and returned 0 for any two-point input.

# Throws
- `DimensionMismatch` if the two inputs differ in length.
"""
function compute_miscalibration_area(expected_ps, observed_ps)
    length(expected_ps) == length(observed_ps) || throw(
        DimensionMismatch("expected_ps and observed_ps must have the same length")
    )

    area = 0.0
    for i in 2:length(expected_ps)
        dx = expected_ps[i] - expected_ps[i-1]
        # Trapezoid rule on one interval: width times the mean of the two end heights.
        mean_observed = (observed_ps[i-1] + observed_ps[i]) / 2
        mean_expected = (expected_ps[i-1] + expected_ps[i]) / 2
        area += abs(dx * (mean_observed - mean_expected))
    end
    return area
end

In [None]:
# Area between the observed and expected miscoverage curves over the sweep.
compute_miscalibration_area(alpha_refs, predicted_alphas)

In [None]:
# The plot's axes are labelled as confidence (coverage) levels, so pass coverage rather
# than miscoverage: expected coverage is `alpha_complements` and observed coverage is
# 1 - observed miscoverage. Passing the alphas directly would mirror the curve about
# (50%, 50%) and swap which side of the diagonal reads as over- or under-confident.
make_calibration_plot(alpha_complements, 1 .- predicted_alphas)