My first attempt here was hierarchical clustering, but I didn't realize that it needs the whole pairwise distance matrix. Without an out-of-core method this is entirely infeasible for a dataset of this size, so I switched to DBSCAN.

In [None]:
using Revise

using PotentialLearning, InteratomicPotentials
using Unitful
using Random
using AtomsBase
using DelimitedFiles
using Statistics: mean, var
using StatsBase
using Clustering, Distances, NearestNeighbors
using Trapz
using LinearAlgebra: Symmetric, eigen, mul!, svd, cond, dot, norm

using MultivariateStats, StatsAPI

using JLD2

#using CairoMakie CairoMakie.activate!()
using GLMakie; GLMakie.activate!(inline=false)

Set up the basis, then read in and organize the data

In [None]:
# ACE descriptor basis for the four species kept from QM9 (C, H, O, N).
ace = ACE(species           = [:C,:H,:O,:N],
          body_order        = 3,
          polynomial_degree = 10,
          wL                = 2.0,
          csp               = 1.0,
          r0                = 1.43,
          rcutoff           = 4.4 )
# Linear-basis potential built on top of the ACE basis; its coefficients are loaded below.
lb = LBasisPotential(ace)
# NOTE(review): this sits mid-cell, so its value is not displayed by the notebook.
length(ace)

# Load the data set with energies in eV and lengths in Å, then drop every
# configuration that contains fluorine (the basis above has no :F species).
qm9_file = "../files/QM9/qm9_fullset_alldata.xyz"
raw_data = load_data(qm9_file, ExtXYZ(u"eV", u"Å"))
raw_data = DataSet([config for config in raw_data if !(:F in atomic_symbol(get_system(config)))])

# A stored permutation of configuration indices defines the split: the first
# `max_num_train` entries are the training pool, everything after is the test pool.
max_num_train = 120_001
master_perm_idxs = readdlm("./primary_permutation.txt", Int64)
possible_training_idxs = master_perm_idxs[1:max_num_train]
possible_test_idxs = master_perm_idxs[max_num_train+1:end]

# The actual training set is the first `num_train` entries of the training pool.
num_train = 40_000
train_idxs = possible_training_idxs[1:num_train]

# Overwrite the potential's coefficients in place with previously fitted values read
# from disk (presumably the fit on these 40K training configurations — confirm).
lb.β .= readdlm("qm9_4elem_3body_poly10_fit40K.txt", Float64)

Compute test descriptors and metrics (sanity check)

In [None]:
# Sanity check: evaluate the fitted potential on the whole test pool.
test_configs = raw_data[possible_test_idxs]

# Reference energies stored with the configurations.
etest_ref = get_all_energies(test_configs)

# Attach freshly computed local descriptors so the potential can predict energies.
etest_local_descrs = compute_local_descriptors(test_configs, lb.basis)
ds_test = DataSet(test_configs .+ etest_local_descrs)
etest_pred = get_all_energies(ds_test, lb)

# Number of atoms per configuration, used to compare energies per atom.
num_atoms_test = length.(get_system.(test_configs))

@show e_mae, e_rmse, e_rsq = calc_metrics(etest_pred./num_atoms_test,etest_ref./num_atoms_test)

In [None]:
"""
    compute_mean_features(ds; basis=lb.basis, progress_every=100)

Return a matrix with one column per configuration in `ds`, where each column is the
mean of that configuration's local descriptors computed with `basis`.

# Keywords
- `basis`: descriptor basis passed to `compute_local_descriptors`. Defaults to the
  basis of the notebook-level potential `lb`, which is what this function always used
  before the keyword existed.
- `progress_every`: print the running configuration count every this many
  configurations.
"""
function compute_mean_features(ds; basis = lb.basis, progress_every = 100)
    mean_feature_perconfig = Vector{Float64}[]
    for (i, config) in enumerate(ds)
        # Lightweight progress indicator; this loop is slow on large data sets.
        if i % progress_every == 0
            println(i)
        end
        # Collection of local descriptor vectors for this configuration
        # (presumably one per atom — confirm); `mean` averages them into one vector.
        local_descriptors = InteratomicPotentials.compute_local_descriptors(get_system(config), basis)
        push!(mean_feature_perconfig, mean(local_descriptors))
    end

    # Stack the per-configuration mean vectors as columns.
    return reduce(hcat, mean_feature_perconfig)
end

"""
    normdists2centers(feature_vec, km)

Compute the Euclidean distance from `feature_vec` to every column of `km.centers`
and return those distances divided by their sum (so the entries add up to one).
The result keeps the `1 × n_centers` shape produced by `mapslices`.
"""
function normdists2centers(feature_vec, km)
    center_dists = mapslices(c -> Distances.euclidean(feature_vec, c), km.centers, dims=1)
    total_dist = sum(center_dists)
    return center_dists ./ total_dist
end

In [None]:
# Per-configuration mean descriptors (one column per configuration) for the training
# set and for the whole test pool.
mean_train_features = compute_mean_features(raw_data[train_idxs])
mean_test_features  = compute_mean_features(raw_data[possible_test_idxs])

# Fit the z-score transform on the training features only, then apply that same
# transform to both sets so the test features are scaled consistently.
dt = StatsBase.fit(ZScoreTransform, mean_train_features, dims=2)
std_mean_train_features = StatsBase.transform(dt,mean_train_features)
std_mean_test_features = StatsBase.transform(dt,mean_test_features)

In [None]:
# Cache the standardized training features on disk.
save("ace_qm9_train_features.jld2", Dict("std_mean_train_features" => std_mean_train_features))

In [None]:
# PCA fitted on the standardized training features. `mean=0` tells the fit not to
# re-center the data (it was already z-scored above — see MultivariateStats docs).
M1 = StatsAPI.fit(MultivariateStats.PCA, std_mean_train_features; mean=0)

# Project both sets with the model fitted on the training data.
pca_std_train_features = StatsAPI.predict(M1, std_mean_train_features)
pca_std_test_features = StatsAPI.predict(M1, std_mean_test_features)

In [None]:
# Cache the PCA-projected training features on disk.
save("ace_qm9_215pca_train_features.jld2", Dict("pca_std_train_features" => pca_std_train_features))

In [None]:
# Not feasible: the full pairwise distance matrix is too much data to hold.
#pca_train_distances = Distances.pairwise(Distances.Euclidean, pca_std_test_features)

In [None]:
# Same PCA as before but truncated to at most two output dimensions, for
# low-dimensional clustering and plotting.
Msmall = StatsAPI.fit(MultivariateStats.PCA, std_mean_train_features; mean=0,maxoutdim=2)

small_pca_std_train_features = StatsAPI.predict(Msmall, std_mean_train_features)
small_pca_std_test_features = StatsAPI.predict(Msmall, std_mean_test_features)

In [None]:
# k-means with 5 clusters on the 2-D PCA training features; fixed RNG seed for reproducibility.
km_5  = kmeans(small_pca_std_train_features, 5, distance=Distances.Euclidean(), rng=Xoshiro(1))

In [None]:
# Normalized distance from every training point (column) to each of the 5 k-means
# centers; one row per center.
small_train_dist2centers_5 = mapslices(x->reshape(normdists2centers(x,km_5),:,1), small_pca_std_train_features; dims=1)
# A point belongs to its NEAREST center, i.e. the smallest distance. The previous
# `argmax` picked the farthest center, which leaves central clusters empty.
small_train_assignments_5 = vec(mapslices(argmin, small_train_dist2centers_5; dims=1))
# Cluster populations.
num_inclusters_5 = [count(==(i), small_train_assignments_5) for i in 1:5]

In [None]:
# Column positions (within the training feature matrices) of the points assigned to cluster 1.
first_cluster = findall(==(1),small_train_assignments_5)

In [None]:
# Draw 100 members of the first cluster and use the first draw as the reference point.
trial_indices = StatsBase.sample(first_cluster,100)
trial_idx = trial_indices[1]

# Distances (in the full PCA space) from the reference point to every other member of
# the cluster. Written as a comprehension rather than a top-level loop that mutates a
# global counter, so it behaves the same in a notebook and when run as a script.
trial_point = pca_std_train_features[:, trial_idx]
dists = Float64[euclidean(trial_point, pca_std_train_features[:, idx])
                for idx in first_cluster if idx != trial_idx]

In [None]:
# Distribution of distances from the trial point to the rest of its cluster.
hist(dists, bins=100)

In [None]:
# DBSCAN on the full PCA training features with radius 15.0 and min_neighbors=10.
my_clusters = dbscan(pca_std_train_features, 15.0, min_neighbors=10)

In [None]:
# Distribution of DBSCAN cluster sizes.
hist([cluster.size for cluster in my_clusters.clusters], bins=10)

In [None]:
# Total size of all DBSCAN clusters relative to the number of training points (columns).
sum([c.size for c in my_clusters.clusters]) / size(pca_std_train_features, 2)

In [None]:
# Inspect the individual DBSCAN clusters.
my_clusters.clusters

I manually varied `min_neighbors` and the radius many times, and honestly I ended up with either 1-2 big clusters that account for most points, or 10-20 reasonably balanced groups with most points excluded. I'm not sure there really is a clean separation in the data. I suspect that if I could compare atom-level features, I'd get more relevant atomic clusters.

Also, it would help if there were a way to cluster by both locality and systemic bias.

In [None]:
# k-means with 4 clusters on the 2-D PCA training features; fixed RNG seed for reproducibility.
km_4  = kmeans(small_pca_std_train_features, 4, distance=Distances.Euclidean(), rng=Xoshiro(1))

OK let's go with the 2-dimensional PCA clusters obtained with kmeans. 

In [None]:
# Normalized distance from every training point (column) to each of the 4 k-means
# centers; one row per center.
small_train_dist2centers_4 = mapslices(x->reshape(normdists2centers(x,km_4),:,1), small_pca_std_train_features; dims=1)
# A point belongs to its NEAREST center, i.e. the smallest distance. The previous
# `argmax` picked the farthest center, which leaves central clusters empty.
small_train_assignments_4 = vec(mapslices(argmin, small_train_dist2centers_4; dims=1))
# Cluster populations.
num_inclusters_4 = [count(==(i), small_train_assignments_4) for i in 1:4]

In [None]:
"""
    compute_cluster_residuals(idxs, raw_ds, lb)

Return the per-atom energy residuals (predicted minus reference) of the potential
`lb` for the configurations `raw_ds[idxs]`.

`idxs` index directly into `raw_ds`. Local descriptors are recomputed with
`lb.basis` for the selected configurations before predicting.
"""
function compute_cluster_residuals(idxs, raw_ds, lb)
    selected = raw_ds[idxs]

    # Reference energies stored with the configurations.
    reference_energies = get_all_energies(selected)

    # Predicted energies need the local descriptors attached to each configuration.
    descriptors = compute_local_descriptors(selected, lb.basis)
    predicted_energies = get_all_energies(DataSet(selected .+ descriptors), lb)

    # Normalize by system size so residuals are comparable across molecules.
    atoms_per_config = length.(get_system.(selected))

    return (predicted_energies .- reference_energies) ./ atoms_per_config
end

In [None]:
check_idx = 4
# Positions of the cluster members within the training subset (columns of the
# training feature matrices), NOT indices into `raw_data`.
check_idxs = findall(==(check_idx), small_train_assignments_4)
# Map the positions back to data-set indices through `train_idxs` before looking up
# configurations; indexing `raw_data` with `check_idxs` directly selects the wrong molecules.
residuals = compute_cluster_residuals(train_idxs[check_idxs], raw_data, lb)

In [None]:
# Mean and standard deviation of the per-atom residuals, scaled by 1000
# (presumably eV -> meV, given the data were loaded in eV).
1000*mean(residuals), 1000*sqrt(var(residuals))

In [None]:
# Distribution of the per-atom residuals for the checked cluster.
hist(residuals, bins=1000)

In [None]:
"""
    plot_pca_clusters(features, clusters; residuals=nothing, clims=(-0.001, 0.001))

Scatter the first two rows of `features` (PC1 vs PC2, one column per point) and
return the figure.

# Arguments
- `features`: matrix whose rows 1 and 2 are plotted against each other.
- `clusters`: cluster assignments; accepted for interface compatibility but
  currently unused (points are drawn in black when no residuals are given).

# Keywords
- `residuals`: optional vector with one value per column of `features`; when given,
  points are colored by it using the `:viridis` colormap and a colorbar is added.
- `clims`: color range applied to `residuals` and to the colorbar.

# Throws
- `DimensionMismatch` if `residuals` does not have one entry per column of `features`.
"""
function plot_pca_clusters(features, clusters; residuals=nothing, clims=(-0.001, 0.001))
    if !isnothing(residuals) && length(residuals) != size(features, 2)
        throw(DimensionMismatch(
            "got $(length(residuals)) residuals for $(size(features, 2)) feature columns"))
    end

    # `size` is the keyword already used by `marginal_plots` in this notebook.
    fig = Figure(size = (800, 600))
    ax = Axis(fig[1, 1],
        xlabel = "PC1",
        ylabel = "PC2",
        title = "PCA Scatter Plot"
    )

    if isnothing(residuals)
        # No residuals: plain black scatter, so no colorbar is drawn.
        scatter!(ax,
            features[1, :], features[2, :],
            color = :black,
            markersize = 3,
            alpha = 0.6
            )
    else
        # Scatter plot with points colored by residual.
        scatter!(ax,
            features[1, :], features[2, :],
            color = residuals,
            colormap = :viridis,
            colorrange = clims,
            markersize = 3,
            alpha = 0.6
            )

        # The colorbar only makes sense when colors encode the residuals.
        Colorbar(fig[1, 2],
            colormap = :viridis,
            limits = clims)
    end

    return fig
end

In [None]:
# 2-D PCA scatter of the training features without residual coloring.
fig = plot_pca_clusters(small_pca_std_train_features, small_train_assignments_4)
display(fig)

In [None]:
# Earlier mistake, kept for the record: indexing with 1:40_000 instead of train_idxs.
#all_residuals = compute_cluster_residuals(1:40_000, raw_data, lb)
# Per-atom residuals for every training configuration, in the same order as the
# columns of the training feature matrices.
all_residuals = compute_cluster_residuals(train_idxs, raw_data, lb)

In [None]:
# Same 2-D PCA scatter, now colored by the per-atom residual of each training configuration.
fig = plot_pca_clusters(small_pca_std_train_features, small_train_assignments_4; residuals=all_residuals)

In [None]:
# Distribution of per-atom residuals over the whole training set.
hist(all_residuals; bins=5000)

As Danny has pointed out, I should really be plotting more than one pair of PCA axes, in one of those grids of marginal plots with histograms on the diagonal. It also turns out that Danny and Ayoub have explicitly been working on this problem of empty clusters in high-dimensional kmeans clustering. (Also, apparently, if you initialize the kmeans centers suitably, it is provable that you should not end up with empty clusters.)

In [None]:
# PCA truncated to at most ten output dimensions, used for the marginal plots below.
M10 = StatsAPI.fit(MultivariateStats.PCA, std_mean_train_features; mean=0,maxoutdim=10)

pca10_std_train_features = StatsAPI.predict(M10, std_mean_train_features)
pca10_std_test_features = StatsAPI.predict(M10, std_mean_test_features)

In [None]:
"""
    marginal_plots(samples; N_plots=5, residuals=nothing, clims=(-0.001, 0.001))

Draw an upper-triangular grid of marginal plots for the first `N_plots` rows of
`samples` (size `(N_dim, N_data)`) and return the figure. Diagonal cells show a
histogram of one row; cell `(i, j)` with `j > i` scatters row `j` against row `i`.

# Keywords
- `N_plots`: number of leading dimensions to include; must not exceed `N_dim`.
- `residuals`: optional vector with one value per column of `samples`; when given,
  scatter points are colored by it with the `:viridis` colormap.
- `clims`: color range applied to `residuals`.

# Throws
- `ArgumentError` if `N_plots` exceeds the number of rows of `samples`.
- `DimensionMismatch` if `residuals` does not have one entry per column of `samples`.
"""
function marginal_plots(samples; N_plots=5, residuals=nothing, clims=(-0.001, 0.001))
    n_dim, n_data = size(samples, 1), size(samples, 2)
    N_plots <= n_dim ||
        throw(ArgumentError("N_plots = $N_plots exceeds the $n_dim rows of `samples`"))
    if !isnothing(residuals) && length(residuals) != n_data
        throw(DimensionMismatch("got $(length(residuals)) residuals for $n_data samples"))
    end

    fig = Figure(size=(500,500))
    for i in 1:N_plots
        # Diagonal: 1-D marginal of dimension i.
        axii = fig[i, i] = Axis(fig, aspect = 1)
        hidedecorations!(axii)
        hist!(axii, samples[i,:], bins = 50)
        # Upper triangle: pairwise scatter, dimension j on x and dimension i on y.
        for j in i+1:N_plots
            ax = fig[i, j] = Axis(fig, aspect = 1)
            hidedecorations!(ax)
            if isnothing(residuals)
                scatter!(ax, samples[[j,i],:], markersize = 2, color = (:black,0.2))
            else
                scatter!(ax, samples[[j,i],:], markersize = 1, color = residuals,
                         colormap = :viridis, colorrange = clims)
            end
        end
    end
    # Pack the panels tightly.
    rowgap!(fig.layout, 0)
    colgap!(fig.layout, 0)
    return fig
end

In [None]:
# Color by the residuals of ALL training configurations: `all_residuals` has one entry
# per column of `pca10_std_train_features`, whereas `residuals` only covers the single
# cluster checked earlier.
marginal_plots(pca10_std_train_features, N_plots=10, residuals=all_residuals)