In [None]:
using Revise

using PotentialLearning, InteratomicPotentials
using Unitful
using Random
using AtomsBase
using DelimitedFiles
using Statistics: mean, var
using StatsBase
using Clustering, Distances
using Trapz
using LinearAlgebra: Symmetric, eigen, mul!, svd, cond, dot, norm

using MultivariateStats, StatsAPI

#using CairoMakie CairoMakie.activate!()
using GLMakie; GLMakie.activate!(inline=false)

In [None]:
# ACE basis over the four species C, H, O and N. The remaining keywords are the
# basis hyperparameters handed straight to the `ACE` constructor.
ace = ACE(;
    species = [:C, :H, :O, :N],
    body_order = 3,
    polynomial_degree = 10,
    wL = 2.0,
    csp = 1.0,
    r0 = 1.43,
    rcutoff = 4.4,
)
# Linear-basis potential built on top of `ace`.
lb = LBasisPotential(ace)
# Last expression of the cell: shows the length of the basis
# (presumably the number of basis functions - confirm against InteratomicPotentials).
length(ace)

In [None]:
# Read the QM9 extended-XYZ file; eV and Å are the units handed to the reader.
qm9_file = "../files/QM9/qm9_fullset_alldata.xyz"
raw_data = load_data(qm9_file, ExtXYZ(u"eV", u"Å"))
# Keep only the configurations whose system contains no fluorine atom.
raw_data = DataSet([
    config for config in raw_data if :F ∉ atomic_symbol(get_system(config))
])

# A stored permutation of configuration indices is split into a pool of candidate
# training indices (the first `max_num_train` entries) and a test pool (the rest).
max_num_train = 120_001
master_perm_idxs = readdlm("./primary_permutation.txt", Int64)
possible_training_idxs = master_perm_idxs[begin:max_num_train]
possible_test_idxs = master_perm_idxs[(max_num_train + 1):end]

# The training set actually used is the first `num_train` entries of the pool.
num_train = 40_000
train_idxs = possible_training_idxs[begin:num_train]

In [None]:
# Overwrite the coefficients of `lb` in place with the values stored in the text file
# (parsed as Float64 by `readdlm`). Presumably these are the coefficients of a
# previous fit on the 40K training set, judging by the file name - confirm.
lb.β .= readdlm("qm9_4elem_3body_poly10_fit40K.txt", Float64)

In [None]:
# Reference energies of the configurations indexed by `possible_test_idxs`.
etest_ref = get_all_energies(raw_data[possible_test_idxs])

# Local descriptors of the same configurations under the basis of `lb`, combined with
# the configurations via `.+` into a new DataSet (presumably this attaches the
# descriptors to each configuration - confirm against PotentialLearning).
etest_local_descrs = compute_local_descriptors(raw_data[possible_test_idxs],lb.basis)
ds_test = DataSet(raw_data[possible_test_idxs] .+ etest_local_descrs)
# Energies obtained from `ds_test` using the potential `lb`.
etest_pred = get_all_energies(ds_test,lb)

# `length` of every system in the same subset (presumably the atom count per
# configuration - confirm).
num_atoms_test = length.(get_system.(raw_data[possible_test_idxs]))

# Compare predicted against reference energies; the three returned values are bound
# to `e_mae`, `e_rmse` and `e_rsq`, and the result is printed by `@show`.
@show e_mae, e_rmse, e_rsq = calc_metrics(etest_pred,etest_ref)

In [None]:
"""
    compute_mean_features(ds[, basis])

Return a matrix whose `i`-th column is the mean local descriptor of the `i`-th
configuration in `ds`.

For every configuration, the local descriptors of its system are computed with `basis`
and averaged with `mean`; the resulting per-configuration vectors are concatenated
horizontally.

# Arguments
- `ds`: iterable of configurations; each one is passed to `get_system`.
- `basis`: basis handed to `InteratomicPotentials.compute_local_descriptors`.
  Defaults to `lb.basis`, the basis of the global potential `lb`, so existing
  one-argument calls behave as before.

The running index is printed to stdout every 100 configurations as a progress marker.
"""
function compute_mean_features(ds, basis = lb.basis)
    mean_feature_perconfig = Vector{Float64}[]
    for (i, config) in enumerate(ds)
        if i % 100 == 0
            println(i)
        end
        local_descriptors = InteratomicPotentials.compute_local_descriptors(
            get_system(config), basis
        )
        push!(mean_feature_perconfig, mean(local_descriptors))
    end

    # One column per configuration.
    return reduce(hcat, mean_feature_perconfig)
end

In [None]:
# Per-configuration mean descriptors for the training subset and for the test pool;
# `compute_mean_features` returns one column per configuration.
mean_train_features = compute_mean_features(raw_data[train_idxs])
mean_test_features  = compute_mean_features(raw_data[possible_test_idxs])

# Fit the transform on the training features only (`scale=false`, so no rescaling is
# requested) and apply that same transform to both the training and the test features.
dt = StatsBase.fit(ZScoreTransform, mean_train_features, dims=2, scale=false)
central_mean_train_features = StatsBase.transform(dt,mean_train_features)
central_mean_test_features = StatsBase.transform(dt,mean_test_features)

In [None]:
# Fit a PCA model on the centered training features.
# NOTE(review): `mean=0` should tell MultivariateStats the data are already centered -
# confirm against the PCA documentation.
M1 = StatsAPI.fit(MultivariateStats.PCA, central_mean_train_features; mean=0)

# Project both sets with the model that was fitted on the training features only.
pca_central_train_features = StatsAPI.predict(M1, central_mean_train_features)
pca_central_test_features = StatsAPI.predict(M1, central_mean_test_features)

In [None]:
# K-means on the PCA-projected training features for k = 5, 10, 20 and 50. Every run
# uses the Euclidean distance and its own freshly seeded `Xoshiro(1)` generator.
train_features = pca_central_train_features
km_5 = kmeans(train_features, 5; distance = Distances.Euclidean(), rng = Xoshiro(1))
km_10 = kmeans(train_features, 10; distance = Distances.Euclidean(), rng = Xoshiro(1))
km_20 = kmeans(train_features, 20; distance = Distances.Euclidean(), rng = Xoshiro(1))
km_50 = kmeans(train_features, 50; distance = Distances.Euclidean(), rng = Xoshiro(1))

In [None]:
"""
    normdists2centers(feature_vec, km)

Return the Euclidean distances from `feature_vec` to every column of `km.centers`,
divided by their sum so that the entries add up to one.

The result is a `1 × k` row, where `k` is the number of columns of `km.centers`.
"""
function normdists2centers(feature_vec, km)
    # One distance per center, laid out as a single row.
    center_dists = permutedims([
        Distances.euclidean(feature_vec, center) for center in eachcol(km.centers)
    ])
    return center_dists ./ sum(center_dists)
end

In [None]:
# Normalized distances from every training point to the 20 centers of `km_20`:
# column j holds the 20 distances for training point j.
train_dist2centers_20 = mapslices(
    x -> reshape(normdists2centers(x, km_20), :, 1), pca_central_train_features; dims=1
)
# Assign each point to its nearest center. The entries are distances, so the nearest
# center is the smallest entry (`argmin`), not `argmax`, which picks the farthest one.
train_assignments_20 = [argmin(col) for col in eachcol(train_dist2centers_20)]
# Number of training points assigned to each of the 20 clusters.
num_inclusters_20 = [count(==(i), train_assignments_20) for i in 1:20]

In [None]:
# Normalized distances from every training point to the 50 centers of `km_50`:
# column j holds the 50 distances for training point j.
train_dist2centers_50 = mapslices(
    x -> reshape(normdists2centers(x, km_50), :, 1), pca_central_train_features; dims=1
)
# Assign each point to its nearest center. The entries are distances, so the nearest
# center is the smallest entry (`argmin`), not `argmax`, which picks the farthest one.
train_assignments_50 = [argmin(col) for col in eachcol(train_dist2centers_50)]
# Number of training points assigned to each of the 50 clusters.
num_inclusters_50 = [count(==(i), train_assignments_50) for i in 1:50]

In [None]:
# Normalized distances from every training point to the 10 centers of `km_10`:
# column j holds the 10 distances for training point j.
train_dist2centers_10 = mapslices(
    x -> reshape(normdists2centers(x, km_10), :, 1), pca_central_train_features; dims=1
)
# Assign each point to its nearest center. The entries are distances, so the nearest
# center is the smallest entry (`argmin`), not `argmax`, which picks the farthest one.
train_assignments_10 = [argmin(col) for col in eachcol(train_dist2centers_10)]
# Number of training points assigned to each of the 10 clusters.
num_inclusters_10 = [count(==(i), train_assignments_10) for i in 1:10]

In [None]:
# Normalized distances from every training point to the 5 centers of `km_5`:
# column j holds the 5 distances for training point j.
train_dist2centers_5 = mapslices(
    x -> reshape(normdists2centers(x, km_5), :, 1), pca_central_train_features; dims=1
)
# Assign each point to its nearest center. The entries are distances, so the nearest
# center is the smallest entry (`argmin`), not `argmax`, which picks the farthest one.
train_assignments_5 = [argmin(col) for col in eachcol(train_dist2centers_5)]
# Number of training points assigned to each of the 5 clusters.
num_inclusters_5 = [count(==(i), train_assignments_5) for i in 1:5]

In [None]:
# Second PCA fit on the same centered training features, this time with the extra
# limits `pratio=0.8` and `maxoutdim=5` passed to the fit.
# NOTE(review): `mean=0` should tell MultivariateStats the data are already centered;
# see its PCA documentation for how `pratio` and `maxoutdim` interact - confirm.
alt_M = StatsAPI.fit(MultivariateStats.PCA, central_mean_train_features; mean=0, pratio=0.8, maxoutdim=5)
# Project both sets with the model that was fitted on the training features only.
alt_pca_central_train_features = StatsAPI.predict(alt_M, central_mean_train_features)
alt_pca_central_test_features = StatsAPI.predict(alt_M, central_mean_test_features)

In [None]:
# K-means on the reduced PCA projection for k = 4, 5, 10, 20 and 50. Every run uses the
# Euclidean distance and its own `Xoshiro` generator seeded with `rng_idx`.
rng_idx = 7
alt_train_features = alt_pca_central_train_features
alt_km_4 = kmeans(
    alt_train_features, 4; distance = Distances.Euclidean(), rng = Xoshiro(rng_idx)
)
alt_km_5 = kmeans(
    alt_train_features, 5; distance = Distances.Euclidean(), rng = Xoshiro(rng_idx)
)
alt_km_10 = kmeans(
    alt_train_features, 10; distance = Distances.Euclidean(), rng = Xoshiro(rng_idx)
)
alt_km_20 = kmeans(
    alt_train_features, 20; distance = Distances.Euclidean(), rng = Xoshiro(rng_idx)
)
alt_km_50 = kmeans(
    alt_train_features, 50; distance = Distances.Euclidean(), rng = Xoshiro(rng_idx)
)

In [None]:
# Normalized distances from every training point to the 4 centers of `alt_km_4`:
# column j holds the 4 distances for training point j.
alt_train_dist2centers_4 = mapslices(
    x -> reshape(normdists2centers(x, alt_km_4), :, 1),
    alt_pca_central_train_features;
    dims=1,
)
# Assign each point to its nearest center. The entries are distances, so the nearest
# center is the smallest entry (`argmin`), not `argmax`, which picks the farthest one.
alt_train_assignments_4 = [argmin(col) for col in eachcol(alt_train_dist2centers_4)]
# Number of training points assigned to each of the 4 clusters.
alt_num_inclusters_4 = [count(==(i), alt_train_assignments_4) for i in 1:4]

In [None]:
# Normalized distances from every training point to the 5 centers of `alt_km_5`:
# column j holds the 5 distances for training point j.
alt_train_dist2centers_5 = mapslices(
    x -> reshape(normdists2centers(x, alt_km_5), :, 1),
    alt_pca_central_train_features;
    dims=1,
)
# Assign each point to its nearest center. The entries are distances, so the nearest
# center is the smallest entry (`argmin`), not `argmax`, which picks the farthest one.
alt_train_assignments_5 = [argmin(col) for col in eachcol(alt_train_dist2centers_5)]
# Number of training points assigned to each of the 5 clusters.
alt_num_inclusters_5 = [count(==(i), alt_train_assignments_5) for i in 1:5]

In [None]:
# Normalized distances from every training point to the 10 centers of `alt_km_10`:
# column j holds the 10 distances for training point j.
alt_train_dist2centers_10 = mapslices(
    x -> reshape(normdists2centers(x, alt_km_10), :, 1),
    alt_pca_central_train_features;
    dims=1,
)
# Assign each point to its nearest center. The entries are distances, so the nearest
# center is the smallest entry (`argmin`), not `argmax`, which picks the farthest one.
alt_train_assignments_10 = [argmin(col) for col in eachcol(alt_train_dist2centers_10)]
# Number of training points assigned to each of the 10 clusters.
alt_num_inclusters_10 = [count(==(i), alt_train_assignments_10) for i in 1:10]

So, unlike the standardized approach, where with the second PCA I was able to reliably get 4 clusters, here I seem to be getting only two populated clusters (with every other cluster empty).

In [None]:
# Normalized distances from every training point to the 50 centers of `alt_km_50`:
# column j holds the 50 distances for training point j. The distances are taken on
# `alt_pca_central_train_features`, the same features `alt_km_50` was fitted on
# (`alt_pca_std_train_features` is not defined anywhere in this notebook).
alt_train_dist2centers_50 = mapslices(
    x -> reshape(normdists2centers(x, alt_km_50), :, 1),
    alt_pca_central_train_features;
    dims=1,
)
# Assign each point to its nearest center. The entries are distances, so the nearest
# center is the smallest entry (`argmin`), not `argmax`, which picks the farthest one.
alt_train_assignments_50 = [argmin(col) for col in eachcol(alt_train_dist2centers_50)]
# Number of training points assigned to each of the 50 clusters.
alt_num_inclusters_50 = [count(==(i), alt_train_assignments_50) for i in 1:50]

In [None]:
# Print the full per-cluster counts to stdout using the plain-text representation.
show(stdout, MIME("text/plain"), alt_num_inclusters_50)