# Many-Body Tensor Representation (MBTR) fingerprints

## Load data

In [1]:
import pandas as pd 
import numpy as np 
import os 
from ase import Atoms 
import json 
from dscribe.descriptors import MBTR
import matplotlib.pyplot as plt 

In [2]:
# Load the train/test splits from JSON into DataFrames.
# The default location is an absolute, machine-specific directory; set the
# MDML_DATA_DIR environment variable to point at the data folder when running
# on another machine.
data_path = os.environ.get(
    "MDML_DATA_DIR",
    "/Users/rasmusniemann/Desktop/materials_with_AI/final_project/MDML_Project/data/",
)


def _read_json_frame(file_name):
    """Read ``file_name`` from ``data_path`` and return it as a DataFrame.

    The file is opened inside a ``with`` block so its handle is closed as
    soon as the JSON has been parsed.
    """
    with open(os.path.join(data_path, file_name), "rb") as handle:
        return pd.DataFrame(json.load(handle))


train_data = _read_json_frame("train.json")
test_data = _read_json_frame("test.json")


In [3]:
def _as_atoms(entry):
    """Return ``entry`` as an ``ase.Atoms`` object.

    Raw entries are mappings of ``Atoms`` keyword arguments and are expanded
    with ``Atoms(**entry)``. Entries that are already ``Atoms`` instances are
    returned unchanged, so re-running this cell does not try to unpack an
    ``Atoms`` object with ``**``.
    """
    return entry if isinstance(entry, Atoms) else Atoms(**entry)


# Replace the "atoms" column of both frames with ase.Atoms objects.
train_data["atoms"] = train_data["atoms"].apply(_as_atoms)
test_data["atoms"] = test_data["atoms"].apply(_as_atoms)

In [4]:
# Inspect the training frame (columns: id, formula, atoms, hform).
train_data

Unnamed: 0,id,formula,atoms,hform
0,5154,Zr2BO2,"(Atom('Zr', [1.6878604657, 0.892285133, 8.1345...",-2.037877
1,984,Ni2S6,"(Atom('Ni', [1.1571403137, 0.934725185, 12.234...",-0.091020
2,1634,Cd4K2Cl6O6Te2,"(Atom('Cd', [2.7840261751, 5.6922402759, 26.71...",-1.109751
3,1872,Hf4H2N3O2,"(Atom('Hf', [0.0, 1.8677827772, 12.2421466347]...",-1.609142
4,2977,Sc2Te2,"(Atom('Te', [0.0003850118, 0.0002469812, 7.461...",-0.854593
...,...,...,...,...
7995,2130,Hf3MoBr5I3,"(Atom('Br', [2.4481979468, 2.8854952652, 26.43...",-0.794205
7996,7940,AgCrAs2Te6,"(Atom('As', [5.9109525269, 1.6988849881, 17.97...",-0.040058
7997,14035,Cd4Rb2I6S8,"(Atom('Cd', [3.3569722126, 7.5099921369, 26.50...",-0.473134
7998,2842,Ba2CdCuTlS5,"(Atom('Cd', [4.427838878e-17, 3.263191886e-16,...",-0.769551


### MBTR

In [5]:
# Collect dataset-wide statistics from the training structures: the chemical
# symbols that occur, the atomic numbers that occur, and the size of each
# structure. Of these, `species` is what gets passed to the MBTR descriptor.
species_seen = set()
atomic_numbers_seen = set()
number_of_atoms = []
for structure in train_data.atoms:
    # Grow the sets in place instead of rebuilding a list on every pass.
    species_seen.update(structure.get_chemical_symbols())
    atomic_numbers_seen.update(int(z) for z in structure.get_atomic_numbers())
    number_of_atoms.append(len(structure))

# Sort so both lists come out in the same order on every run; iterating a
# plain set of strings can yield a different order between interpreter sessions.
species = sorted(species_seen)
atomic_numbers = sorted(atomic_numbers_seen)

max_number_of_atoms = np.max(number_of_atoms)
min_atomic_number = np.min(atomic_numbers)
max_atomic_number = np.max(atomic_numbers)
print('Max {} atoms'.format(max_number_of_atoms))

Max 20 atoms


In [6]:
# Configure the MBTR descriptor that is used to fingerprint every structure.
# See the dscribe MBTR documentation for the exact meaning of each option.
mbtr = MBTR(
    # Every chemical symbol collected from the training structures
    species=species,
    # Treat the structures as periodic
    periodic=True,
    # Geometry function evaluated on the atoms
    geometry={"function": "inverse_distance"},
    # Discretisation grid: 10 points between 0 and 1, sigma 0.1
    grid={
        "min": 0,
        "max": 1,
        "n": 10,
        "sigma": 0.1,
    },
    # Exponential weighting with scale 0.5 and cutoff threshold 1e-3
    weighting={
        "function": "exp",
        "scale": 0.5,
        "threshold": 1e-3,
    },
    # Normalisation mode applied to the output vector
    normalization="l2",
)

In [7]:
# Sanity check: fingerprint one training structure and look at the result
# before converting the whole dataset.

# .iloc picks the first row by position, independent of the index labels.
sample_structure = train_data.atoms.iloc[0]

sample_fingerprint = mbtr.create(sample_structure)

print(sample_fingerprint)
print(sample_fingerprint.shape)

[0. 0. 0. ... 0. 0. 0.]
(19530,)


In [8]:
def _fingerprint_all(descriptor, structures, label, report_every=1000):
    """Compute one fingerprint per structure with ``descriptor.create``.

    Parameters
    ----------
    descriptor : object
        Descriptor exposing ``create(structure)`` (here the MBTR instance).
    structures : iterable
        Structures to convert, processed in order.
    label : str
        Dataset name inserted into the progress message.
    report_every : int, optional
        A progress message is printed whenever the running index is a
        multiple of this value (including index 0).

    Returns
    -------
    list
        The ``descriptor.create`` outputs, in the same order as ``structures``.
    """
    fingerprints = []
    for position, structure in enumerate(structures):
        if position % report_every == 0:
            print("Converted {} data fingerprints:".format(label))
            print(position)
        fingerprints.append(descriptor.create(structure))
    return fingerprints


# Transform data: one fingerprint per structure for both splits.
training = _fingerprint_all(mbtr, train_data.atoms, "training")
testing = _fingerprint_all(mbtr, test_data.atoms, "test")


Converted training data fingerprints:
0
Converted training data fingerprints:
1000
Converted training data fingerprints:
2000
Converted training data fingerprints:
3000
Converted training data fingerprints:
4000
Converted training data fingerprints:
5000
Converted training data fingerprints:
6000
Converted training data fingerprints:
7000
Converted test data fingerprints:
0
Converted test data fingerprints:
1000
Converted test data fingerprints:
2000
Converted test data fingerprints:
3000


In [9]:
# Training target, taken from the "hform" column.
y = train_data["hform"]

# Feature frames: one row per fingerprint, re-using each source frame's index.
X_train1 = pd.DataFrame(training, index=train_data.index)
X_test1 = pd.DataFrame(testing, index=test_data.index)

In [10]:
# Number of entries in the first test fingerprint
testing[0].shape[0]

19530

In [11]:
# Report the shapes of the feature frames and the target.
for template, table in (
    ('X: {}', X_train1),
    ('y: {}', y),
    ('Xtest: {}', X_test1),
):
    print(template.format(table.shape))

X: (8000, 19530)
y: (8000,)
Xtest: (4000, 19530)


In [17]:
# Reduce fingerprint dimensionality with PCA.
# The projection is fitted on the training fingerprints only and then applied
# unchanged to the test fingerprints.
from sklearn.decomposition import PCA
n_comp_PCA = 450

# Fixed seed: with the default solver selection scikit-learn may use a
# randomized SVD for a matrix of this size, and that result varies between
# runs unless random_state is set.
pca = PCA(n_components=n_comp_PCA, random_state=0).fit(X_train1)
X_train = pca.transform(X_train1)
X_test = pca.transform(X_test1)

explained_pct = 100 * np.sum(pca.explained_variance_ratio_)
print("With {} PCA components {var:0.4f}% of the variance is explained".format(n_comp_PCA, var=explained_pct))
print('X_train: {}'.format(X_train.shape))
print('X_test: {}'.format(X_test.shape))

With 450 PCA components 91.3573% of the variance is explained
X_train: (8000, 450)
X_test: (4000, 450)


In [16]:
# Find the first principal component at which the running total of explained
# variance exceeds 90%. The printed value is the 0-based component index, so
# index + 1 components are needed. Nothing is printed if 90% is never reached.
running_total = np.cumsum(pca.explained_variance_ratio_)
past_threshold = np.flatnonzero(running_total > 0.90)
if past_threshold.size > 0:
    print(past_threshold[0])

414


In [14]:
# dist = []
# for i in range(len(X_train)):
#     dist.append(np.linalg.norm(X_train[0]-X_train[i]))
    

In [15]:
# dist = np.array(dist)
# np.mean(dist)

In [None]:
# Log test



