In [None]:
!pip install numpy==1.26
!pip install quippy-ase
!pip install graph-pes chemiscope

In [None]:
# run the graph-pes-train command-line tool with -h (presumably prints its
# usage/help text); also a quick check that the install above succeeded
!graph-pes-train -h

In [None]:
!pip install ipywidgets


# Part 1


## Working with atomic structures in Python


In [None]:
# load dataset
# the path is relative, so "structures_filt.xyz" must be in the working directory

from load_atoms import load_dataset
import ase.io

# `structures` is the dataset object used throughout the rest of the notebook
structures = load_dataset("structures_filt.xyz")

In [None]:
# explore the contents of the dataset
# (a bare expression on the last line of a cell is shown as the cell output)
structures

In [None]:
# visualise some of the structures in the dataset by changing the index
from load_atoms import view

# index of the structure to display; edit this value and re-run the cell
idx = 30
view(structures[idx], show_bonds=True)

In [None]:
# write the structures to a file and inspect its content
# you can use the write function from ase.io

...

## Generate descriptors


In [None]:
from ase.neighborlist import neighbor_list

In [None]:
# use the first structure of the dataset as a worked example
structure_0 = structures[0]

# "ijd" asks for three arrays with one entry per neighbour pair within the
# cutoff: first-atom index (i), second-atom index (j) and their distance (d)
i, j, d = neighbor_list("ijd", structure_0, cutoff=3.7)
for pair_array in (i, j, d):
    print(pair_array)

In [None]:
import numpy as np

# count how many neighbour pairs each atom starts, i.e. its coordination number.
# minlength guarantees one entry per atom: without it the result is too short
# whenever the highest-index atoms have no neighbour inside the cutoff, and the
# array could then no longer be used as a per-atom property.
coordination_num = np.bincount(i, minlength=len(structure_0))

In [None]:
# entry k is the number of neighbours of atom k within the 3.7 cutoff used above
coordination_num

In [None]:
import matplotlib.pyplot as plt

# explicit figure/axes so the plot carries its own labels and stands alone
fig, ax = plt.subplots()
ax.hist(coordination_num, bins=20)
ax.set_xlabel("Coordination number")
ax.set_ylabel("Number of atoms")
ax.set_title("Coordination numbers in structure_0");

In [None]:
# plot the radial distribution function – look in the ASE docs for how to do this https://wiki.fysik.dtu.dk/ase/ase/neighborlist.html#ase.neighborlist.neighbor_list

In [None]:
# plot the angular distribution function – look at the ASE docs for how to do this https://wiki.fysik.dtu.dk/ase/ase/geometry.html#ase.geometry.analysis.Analysis.get_angles

from ase.geometry.analysis import Analysis

analysis = Analysis(structure_0)
# index triplets of all unique C-C-C angles, then the angle value of each triplet
CCCAngles = analysis.get_angles("C", "C", "C", unique=True)
CCCAngleValues = analysis.get_values(CCCAngles)

# explicit figure/axes so the plot carries its own labels and stands alone
fig, ax = plt.subplots()
ax.hist(CCCAngleValues, bins=40)
ax.set_xlabel("C-C-C angle (degrees)")
ax.set_ylabel("Count")
ax.set_title("Angular distribution in structure_0");

# Part 2


In [None]:
from quippy.descriptors import Descriptor

# SOAP settings are handed to quippy as one whitespace-separated string
desc = Descriptor("soap cutoff=3.7 n_max=4 l_max=4 atom_sigma=0.5")

# calc() returns a dict; the descriptor matrix is stored under "data"
# (presumably one row per atom of structure_0 - check the shape below)
soap_output = desc.calc(structure_0)
soaps = soap_output["data"]
soaps.shape

In [None]:
# Let's write a function to do dimensionality reduction for us
# Add other methods to the function below

from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler


def do_analysis(data, method, **kwargs):
    """
    Perform a dimensionality reduction analysis on the descriptors.

    Parameters
    ----------
    data : array-like of shape (n_samples, n_features)
        Descriptor matrix with one row per sample.
    method : str
        Name of the reduction method. Only ``"pca"`` is implemented so far.
    **kwargs
        Extra keyword arguments forwarded to the estimator of the chosen
        method (for ``"pca"``: ``sklearn.decomposition.PCA``). If
        ``n_components`` is not given it defaults to 2.

    Returns
    -------
    numpy.ndarray or None
        The reduced data of shape (n_samples, n_components), or ``None``
        (after printing an error message) when ``method`` is not recognised.
    """

    # choose the estimator first, so an unknown method is rejected before any
    # work is done on the data. To add a method, add an ``elif`` branch that
    # builds another estimator exposing ``fit_transform``.
    if method == "pca":
        reducer = PCA(**{"n_components": 2, **kwargs})

    else:
        print("Error: method not recognised.")
        return None

    # scale the data: each feature is standardised before the reduction.
    scaled_data = StandardScaler().fit_transform(data)

    # perform the analysis.
    return reducer.fit_transform(scaled_data)

In [None]:
# reduce the SOAP descriptors computed for structure_0 with PCA
pca_data = do_analysis(soaps, "pca")

In [None]:
import chemiscope

# per-atom quantities that chemiscope can plot and use for colouring;
# every entry needs a target, the values, and a human-readable description
properties = dict(
    PCA=dict(
        target="atom",
        values=pca_data,
        description="PCA of per-atom representation of the structures",
    ),
    coordination_num=dict(
        target="atom",
        values=coordination_num,
        description="Coordination number of each atom",
    ),
    local_energy=dict(
        target="atom",
        values=structure_0.arrays["local_energies"],
        description="Local energies predicted with C-GAP-17",
    ),
)

# a single frame, with atomic environments built using the same 3.7 cutoff
# as the SOAP descriptor
frames = [structure_0]
ats_envs = chemiscope.all_atomic_environments(frames, cutoff=3.7)
chemiscope.show(frames=frames, properties=properties, environments=ats_envs)

In [None]:
# try again but for structures with higher densities

In [None]:
from quippy.descriptors import Descriptor

# average=T asks quippy for a SOAP vector averaged over each structure
desc = Descriptor("soap cutoff=3.7 n_max=4 l_max=4 atom_sigma=0.5 average=T")
soaps = np.array([desc.calc(s)["data"] for s in structures])

# flatten everything after the structure axis: one feature row per structure
soaps = soaps.reshape(soaps.shape[0], -1)

# a bare expression is only displayed when it is the last line of the cell,
# so the shape check goes here (it had no effect in the middle of the cell)
soaps.shape

In [None]:
# same PCA as before, now on the averaged SOAP descriptors (one row per structure)
pca_data = do_analysis(soaps, "pca")

In [None]:
# inspect the per-structure metadata; keys such as "density", "energy" and
# "anneal_T" are read from .info in the next cell
structure_0.info

In [None]:
# per-structure quantities for chemiscope: one value (or PCA row) per structure
properties = {
    "PCA": {
        "target": "structure",
        "values": pca_data,
        # the descriptors here were computed with average=T, so this is a
        # per-structure (not per-atom) representation
        "description": "PCA of the structure-averaged SOAP descriptors",
    },
    "density": {
        "target": "structure",
        "values": [s.info["density"] for s in structures],
        "description": "Density of the structure",
    },
    "total_energy": {
        "target": "structure",
        "values": [s.info["energy"] for s in structures],
        "description": "Total energies predicted with C-GAP-17",
    },
    "anneal_T": {
        "target": "structure",
        "values": [s.info["anneal_T"] for s in structures],
        "description": "Annealing temperature of the structure",
    },
}

chemiscope.show(frames=structures, properties=properties)

# Part 3: Predicting local energies


In [None]:
# reload the full dataset, then split it 80/10/10 into training, validation
# and test sets; the fixed seed is passed so the split can be reproduced
structures = load_dataset("structures_filt.xyz")
split_fractions = [0.8, 0.1, 0.1]
train, val, test = structures.random_split(split_fractions, seed=42)

In [None]:
# target labels: the "local_energies" array of each split, in the order
# training, validation, test
energies_train, energies_val, energies_test = (
    subset.arrays["local_energies"] for subset in (train, val, test)
)

In [None]:
# SOAP descriptors for the training, validation and test sets.
# desc is rebuilt without average=T, so the un-averaged descriptors are used
desc = Descriptor("soap cutoff=3.7 n_max=4 l_max=4 atom_sigma=0.5")

soaps_train, soaps_val, soaps_test = (
    np.array([desc.calc(s)["data"] for s in subset])
    for subset in (train, val, test)
)

### linear model


You can implement your own linear model or use the implementation in `scikit-learn`. Either way, familiarise yourself with how the model is initialised, trained, and validated/tested.


The descriptors are reshaped so that their first dimension matches that of the labels (one descriptor row per local energy).


In [None]:
from sklearn.linear_model import LinearRegression

# collapse all leading axes so there is one descriptor row per label
X_train = soaps_train.reshape(-1, soaps_train.shape[-1])

model = LinearRegression()
model.fit(X_train, energies_train)

In [None]:
def _as_feature_rows(soap_array):
    """Collapse all leading axes, keeping the descriptor axis last."""
    return soap_array.reshape(-1, soap_array.shape[-1])


# predictions of the fitted linear model on each split
energies_train_pred = model.predict(_as_feature_rows(soaps_train))
energies_val_pred = model.predict(_as_feature_rows(soaps_val))
energies_test_pred = model.predict(_as_feature_rows(soaps_test))

In [None]:
# parity plot for the linear model: predicted vs reference local energies.
# The two series are the training and test sets (both are ML predictions),
# so they are labelled as such.
fig, ax = plt.subplots()
ax.scatter(energies_train, energies_train_pred, s=4, label="train")
ax.scatter(energies_test, energies_test_pred, s=4, label="test")
# y = x guide line: perfect predictions would fall on it
ax.axline((energies_train[0], energies_train[0]), slope=1, color="red", linestyle="--")

ax.set_xlabel("Reference local energy")
ax.set_ylabel("Predicted local energy")
ax.set_title("Linear regression")
ax.legend();

In [None]:
# evaluate the model's performance by computing the mean absolute error (MAE) and the root mean square error (RMSE) on the test set

### ridge regression


Ridge regression differs from ordinary linear regression by adding a regularisation term to the loss; its strength is denoted $\alpha$ (the `alpha` argument in `scikit-learn`).


In [None]:
from sklearn.linear_model import Ridge

# a numeric starting value keeps the notebook runnable from top to bottom
# (an Ellipsis placeholder is not a valid regularisation strength);
# 1.0 is scikit-learn's default for Ridge
alpha = 1.0  # experiment with different values of alpha
model = Ridge(alpha=alpha)
model.fit(soaps_train.reshape(-1, soaps_train.shape[-1]), energies_train)

In [None]:
# predictions of the ridge model on each split; every descriptor array is
# first collapsed to two dimensions with the descriptor axis kept last
n_features = soaps_train.shape[-1]
train_pred = model.predict(soaps_train.reshape(-1, n_features))

n_features = soaps_val.shape[-1]
val_pred = model.predict(soaps_val.reshape(-1, n_features))

n_features = soaps_test.shape[-1]
test_pred = model.predict(soaps_test.reshape(-1, n_features))

In [None]:
# parity plot for the ridge model. This must use the ridge predictions
# (train_pred / test_pred); energies_*_pred belong to the earlier linear model.
fig, ax = plt.subplots()
ax.scatter(energies_train, train_pred, s=4, label="train")
ax.scatter(energies_test, test_pred, s=4, label="test")
# y = x guide line: perfect predictions would fall on it
ax.axline((energies_train[0], energies_train[0]), slope=1, color="red", linestyle="--")

ax.set_xlabel("Reference local energy")
ax.set_ylabel("Predicted local energy")
ax.set_title("Ridge regression")
ax.legend();

In [None]:
# using the validation set, find the optimal value of the regularisation and evaluate the performance metrics of this model on the test set

...

In [None]:
# try the neural network model from scikit-learn
# implement a simple kernel model (or Gaussian Process Regression model) as detailed in the GPR review; you can also use kernel ridge regression from scikit-learn.