In [16]:
# Import glosim2 (https://github.com/epfl-cosmo/glosim2)
import os
import sys
sys.path.insert(0, '/home/azadoks/git/glosim2')
# In my docker, use this:
# sys.path.insert(0, '/home/app/glosim2')

# Import Atoms objects
from ase import Atoms as ase_Atoms
from quippy import Atoms as quippy_Atoms

# Data management
import numpy as np
import pandas as pd
# If data is too large for memory, I suggest replacing (or augmenting)
# numpy Arrays and pandas DataFrames with dask arrays and dataframes

# SOAP
from libmatch.soap import get_Soaps
from libmatch.utils import ase2qp

# PCA
from sklearn.decomposition import PCA

# Notebook-wide switch: cells guard their print/display calls with
# `if DISPLAY_DATA:` so intermediate results can be silenced in one place.
DISPLAY_DATA = True

### Structure Input
---

In [2]:
# Load structures here

# For example:
from pymatgen import MPRester, Structure
from pymatgen.io.ase import AseAtomsAdaptor
from collections import OrderedDict

# Prototype name -> Materials Project id (the formula is given in the trailing comment)
material_ids = OrderedDict([
    ('diamond', 'mp-66'),  # C
    ('rocksalt', 'mp-22862'),  # NaCl
    ('cubic_perovskite', 'mp-2998'),  # BaTiO3
    ('wurtzite', 'mp-10281'),  # ZnS
    ('fcc', 'mp-23'),  # Ni
    ('bcc', 'mp-13'),  # Fe
    ('hcp', 'mp-153'),  # Mg
    ('trigonal', 'mp-782'),  # Te2Pd
    ('tetragonal', 'mp-742'),  # Ti2Cu
    ('monoclinic', 'mp-684'),  # BaS2
    ('triclinic', 'mp-9122'),  # CaP3
    ('orthorhombic', 'mp-872')  # BaSn
])
# Convert material_ids dict into a DataFrame with a column for name and material_id.
# list(...) keeps this working on Python 3, where keys()/values() are views.
material_df = pd.DataFrame({
    'name': list(material_ids.keys()),
    'material_id': list(material_ids.values()),
})
if DISPLAY_DATA:
    print('Before applying mpr.get_structure_by_material_id')
    display(material_df)

# FIXME(security): an API key is committed in this notebook. Prefer setting the
#   PMG_MAPI_KEY environment variable; the literal below is kept only as a
#   fallback so existing runs keep working, and should be rotated and removed.
mp_api_key = os.environ.get('PMG_MAPI_KEY', '0WqdPfXxloze6T9N')

# .apply will call its argument (in this case, mpr.get_structure_by_material_id)
#   on each of the entries of the object on which it's called (in this case, the material_id Series)
#   (DataFrame columns are Series)
# Here, it retrieves a pymatgen Structure from Materials Project for each material_id
with MPRester(mp_api_key) as mpr:
    material_df['structure'] = material_df['material_id'].apply(mpr.get_structure_by_material_id)

if DISPLAY_DATA:
    print('=' * 80)
    print('After applying the query function and making a new "structure" column')
    display(material_df)

Before applying mpr.get_structure_by_material_id


Pymatgen will drop Py2k support from v2019.1.1. Pls consult the documentation
at https://www.pymatgen.org for more details.
  at https://www.pymatgen.org for more details.""")


Unnamed: 0,material_id,name
0,mp-66,diamond
1,mp-22862,rocksalt
2,mp-2998,cubic_perovskite
3,mp-10281,wurtzite
4,mp-23,fcc
5,mp-13,bcc
6,mp-153,hcp
7,mp-782,trigonal
8,mp-742,tetragonal
9,mp-684,monoclinic


After applying the query function and making a new "structure" column


Unnamed: 0,material_id,name,structure
0,mp-66,diamond,"[[-2.68028194 -2.68028194 -2.68028194] C, [0. ..."
1,mp-22862,rocksalt,"[[0. 0. 0.] Na, [2.32362417 1.64305041 4.02463..."
2,mp-2998,cubic_perovskite,"[[3.29508232 1.16714027 0.00434677] Ba, [2.508..."
3,mp-10281,wurtzite,"[[ 1.92625151 1.11212181 11.82194194] S, [-2...."
4,mp-23,fcc,[[0. 0. 0.] Ni]
5,mp-13,bcc,[[0. 0. 0.] Fe]
6,mp-153,hcp,[[-1.92521972e-08 1.83815209e+00 1.31235992e...
7,mp-782,trigonal,"[[0. 0. 0.] Pd, [-2.05816593e-08 2.37656531e+..."
8,mp-742,tetragonal,"[[1.74805434 1.87372632 0.88063532] Ti, [0.891..."
9,mp-684,monoclinic,"[[2.91949681 1.4086912 1.11515985] S, [4.1798..."


### Structure Conversion
---

In [3]:
# Turn every pymatgen Structure into an ase Atoms object (new 'ase' column)
pmg_structures = material_df['structure']
material_df['ase'] = pmg_structures.apply(AseAtomsAdaptor.get_atoms)
if DISPLAY_DATA:
    display(material_df)

Unnamed: 0,material_id,name,structure,ase
0,mp-66,diamond,"[[-2.68028194 -2.68028194 -2.68028194] C, [0. ...","(Atom('C', [-2.680281945, -2.680281945, -2.680..."
1,mp-22862,rocksalt,"[[0. 0. 0.] Na, [2.32362417 1.64305041 4.02463...","(Atom('Na', [0.0, 0.0, 0.0], index=0), Atom('C..."
2,mp-2998,cubic_perovskite,"[[3.29508232 1.16714027 0.00434677] Ba, [2.508...","(Atom('Ba', [3.2950823175, 1.167140265, 0.0043..."
3,mp-10281,wurtzite,"[[ 1.92625151 1.11212181 11.82194194] S, [-2....","(Atom('S', [1.9262515125958486, 1.112121808878..."
4,mp-23,fcc,[[0. 0. 0.] Ni],"(Atom('Ni', [0.0, 0.0, 0.0], index=0))"
5,mp-13,bcc,[[0. 0. 0.] Fe],"(Atom('Fe', [0.0, 0.0, 0.0], index=0))"
6,mp-153,hcp,[[-1.92521972e-08 1.83815209e+00 1.31235992e...,"(Atom('Mg', [-1.925219716980564e-08, 1.8381520..."
7,mp-782,trigonal,"[[0. 0. 0.] Pd, [-2.05816593e-08 2.37656531e+...","(Atom('Pd', [0.0, 0.0, 0.0], index=0), Atom('T..."
8,mp-742,tetragonal,"[[1.74805434 1.87372632 0.88063532] Ti, [0.891...","(Atom('Ti', [1.7480543427934925, 1.87372632353..."
9,mp-684,monoclinic,"[[2.91949681 1.4086912 1.11515985] S, [4.1798...","(Atom('S', [2.919496806907992, 1.4086911979637..."


In [4]:
# Turn every ase Atoms object into a quippy Atoms object (new 'quippy' column)
ase_structures = material_df['ase']
material_df['quippy'] = ase_structures.apply(ase2qp)
if DISPLAY_DATA:
    display(material_df)

Unnamed: 0,material_id,name,structure,ase,quippy
0,mp-66,diamond,"[[-2.68028194 -2.68028194 -2.68028194] C, [0. ...","(Atom('C', [-2.680281945, -2.680281945, -2.680...","(Atom('C', [-2.680281945, -2.680281945, -2.680..."
1,mp-22862,rocksalt,"[[0. 0. 0.] Na, [2.32362417 1.64305041 4.02463...","(Atom('Na', [0.0, 0.0, 0.0], index=0), Atom('C...","(Atom('Na', [0.0, 0.0, 0.0], index=0), Atom('C..."
2,mp-2998,cubic_perovskite,"[[3.29508232 1.16714027 0.00434677] Ba, [2.508...","(Atom('Ba', [3.2950823175, 1.167140265, 0.0043...","(Atom('Ba', [3.2950823175, 1.167140265, 0.0043..."
3,mp-10281,wurtzite,"[[ 1.92625151 1.11212181 11.82194194] S, [-2....","(Atom('S', [1.9262515125958486, 1.112121808878...","(Atom('S', [1.9262515125958486, 1.112121808878..."
4,mp-23,fcc,[[0. 0. 0.] Ni],"(Atom('Ni', [0.0, 0.0, 0.0], index=0))","(Atom('Ni', [0.0, 0.0, 0.0], index=0))"
5,mp-13,bcc,[[0. 0. 0.] Fe],"(Atom('Fe', [0.0, 0.0, 0.0], index=0))","(Atom('Fe', [0.0, 0.0, 0.0], index=0))"
6,mp-153,hcp,[[-1.92521972e-08 1.83815209e+00 1.31235992e...,"(Atom('Mg', [-1.925219716980564e-08, 1.8381520...","(Atom('Mg', [-1.925219716980564e-08, 1.8381520..."
7,mp-782,trigonal,"[[0. 0. 0.] Pd, [-2.05816593e-08 2.37656531e+...","(Atom('Pd', [0.0, 0.0, 0.0], index=0), Atom('T...","(Atom('Pd', [0.0, 0.0, 0.0], index=0), Atom('T..."
8,mp-742,tetragonal,"[[1.74805434 1.87372632 0.88063532] Ti, [0.891...","(Atom('Ti', [1.7480543427934925, 1.87372632353...","(Atom('Ti', [1.7480543427934925, 1.87372632353..."
9,mp-684,monoclinic,"[[2.91949681 1.4086912 1.11515985] S, [4.1798...","(Atom('S', [2.919496806907992, 1.4086911979637...","(Atom('S', [2.919496806907992, 1.4086911979637..."


### Soap Calculation
---
* `atoms` (`[quippy.Atoms]`): List of quippy Atoms structures
* `nocenters` (`[int]` or `None`): List of atomic numbers to ignore as centers
* `chem_channels` (`bool`): ??
* `centerweight` (`float`): Weight of gaussian on central atom
* `gaussian_width` (`float`): Width (sigma) of gaussian 
* `cutoff` (`float`): Distance (in units of input) to cut off overlap integration
* `cutoff_transition_width` (`float`): Width of sigmoid used to smooth integration cutoff
* `nmax` (`int`): Number of radial basis functions
* `lmax` (`int`): Number of spherical harmonics
* `spkitMax` (`dict`): "species kit maximum", `{Z: Nmax}` over all structures in `atoms`

        spkit = {}
        for atom in all:
            atomspecies = {}
            for z in atom.z:      
                if z in atomspecies: atomspecies[z]+=1
                else: atomspecies[z] = 1

            for (z, nz) in atomspecies.iteritems():
                if z in spkit:
                    if nz>spkit[z]: spkit[z] = nz
                else:
                    spkit[z] = nz

* `nprocess` (`int`): Number of subprocesses spawned (best to use the number of cores unless the data is very large)
* `chemicalProjection` (`None` or `???`)
* `dispbar` (`bool`)
* `is_fast_average` (`bool`): Return the fast average if true, the full SOAP (per site) if false. If true, will return `OrderedDict([('AVG', SOAP)])`

In [5]:
# For reference, get_Soaps with its default SOAP parameters:
# get_Soaps(atoms,
#           nocenters=None, chem_channels=False,
#           centerweight=1.0, gaussian_width=0.5,
#           cutoff=3.5, cutoff_transition_width=0.5,
#           nmax=8, lmax=6,
#           spkitMax=None,
#           nprocess=1,
#           chemicalProjection=None,
#           dispbar=False,
#           is_fast_average=False)

# Both SOAP flavours are computed from the same column of quippy Atoms
quippy_structures = material_df['quippy']
# Default call: the full (per-site) SOAPs of each structure
material_df['soaps'] = get_Soaps(quippy_structures)
# is_fast_average=True: the fast average, stored under the 'AVG' key
material_df['fast_average_soap'] = get_Soaps(quippy_structures, is_fast_average=True)
if DISPLAY_DATA:
    display(material_df)

# Keep the first structure's SOAPs around to inspect what a SOAP looks like
example_soap = material_df['soaps'][0]

HBox(children=(IntProgress(value=0, description=u'SOAP vectors', max=12), HTML(value=u'')))




HBox(children=(IntProgress(value=0, description=u'SOAP vectors', max=12), HTML(value=u'')))




Unnamed: 0,material_id,name,structure,ase,quippy,soaps,fast_average_soap
0,mp-66,diamond,"[[-2.68028194 -2.68028194 -2.68028194] C, [0. ...","(Atom('C', [-2.680281945, -2.680281945, -2.680...","(Atom('C', [-2.680281945, -2.680281945, -2.680...","[C0, C1]",[AVG]
1,mp-22862,rocksalt,"[[0. 0. 0.] Na, [2.32362417 1.64305041 4.02463...","(Atom('Na', [0.0, 0.0, 0.0], index=0), Atom('C...","(Atom('Na', [0.0, 0.0, 0.0], index=0), Atom('C...","[Na0, Cl0]",[AVG]
2,mp-2998,cubic_perovskite,"[[3.29508232 1.16714027 0.00434677] Ba, [2.508...","(Atom('Ba', [3.2950823175, 1.167140265, 0.0043...","(Atom('Ba', [3.2950823175, 1.167140265, 0.0043...","[Ba0, Ba1, Ti0, Ti1, O0, O1, O2, O3, O4, O5]",[AVG]
3,mp-10281,wurtzite,"[[ 1.92625151 1.11212181 11.82194194] S, [-2....","(Atom('S', [1.9262515125958486, 1.112121808878...","(Atom('S', [1.9262515125958486, 1.112121808878...","[S0, S1, S2, S3, Zn0, Zn1, Zn2, Zn3]",[AVG]
4,mp-23,fcc,[[0. 0. 0.] Ni],"(Atom('Ni', [0.0, 0.0, 0.0], index=0))","(Atom('Ni', [0.0, 0.0, 0.0], index=0))",[Ni0],[AVG]
5,mp-13,bcc,[[0. 0. 0.] Fe],"(Atom('Fe', [0.0, 0.0, 0.0], index=0))","(Atom('Fe', [0.0, 0.0, 0.0], index=0))",[Fe0],[AVG]
6,mp-153,hcp,[[-1.92521972e-08 1.83815209e+00 1.31235992e...,"(Atom('Mg', [-1.925219716980564e-08, 1.8381520...","(Atom('Mg', [-1.925219716980564e-08, 1.8381520...","[Mg0, Mg1]",[AVG]
7,mp-782,trigonal,"[[0. 0. 0.] Pd, [-2.05816593e-08 2.37656531e+...","(Atom('Pd', [0.0, 0.0, 0.0], index=0), Atom('T...","(Atom('Pd', [0.0, 0.0, 0.0], index=0), Atom('T...","[Pd0, Te0, Te1]",[AVG]
8,mp-742,tetragonal,"[[1.74805434 1.87372632 0.88063532] Ti, [0.891...","(Atom('Ti', [1.7480543427934925, 1.87372632353...","(Atom('Ti', [1.7480543427934925, 1.87372632353...","[Ti0, Ti1, Cu0]",[AVG]
9,mp-684,monoclinic,"[[2.91949681 1.4086912 1.11515985] S, [4.1798...","(Atom('S', [2.919496806907992, 1.4086911979637...","(Atom('S', [2.919496806907992, 1.4086911979637...","[S0, S1, S2, S3, Ba0, Ba1]",[AVG]


In [6]:
# Calculate average SOAPs
def get_average_soap(soaps):
    """Average the SOAP vectors of one structure element-wise.

    Args:
        soaps: Mapping whose values are the SOAP vectors to average
            (all of the same length).

    Returns:
        numpy.ndarray: The element-wise mean over all values of `soaps`.
    """
    # list(...) is required on Python 3, where dict.values() is a view that
    # numpy cannot interpret as a sequence of rows; on Python 2 it is a no-op copy.
    return np.mean(list(soaps.values()), axis=0)
# One averaged SOAP vector per structure, stored in a new 'average_soap' column
per_site_soaps = material_df['soaps']
material_df['average_soap'] = per_site_soaps.apply(get_average_soap)
if DISPLAY_DATA:
    display(material_df)

Unnamed: 0,material_id,name,structure,ase,quippy,soaps,fast_average_soap,average_soap
0,mp-66,diamond,"[[-2.68028194 -2.68028194 -2.68028194] C, [0. ...","(Atom('C', [-2.680281945, -2.680281945, -2.680...","(Atom('C', [-2.680281945, -2.680281945, -2.680...","[C0, C1]",[AVG],"[0.08117251119231211, 4.4023286446335976e-35, ..."
1,mp-22862,rocksalt,"[[0. 0. 0.] Na, [2.32362417 1.64305041 4.02463...","(Atom('Na', [0.0, 0.0, 0.0], index=0), Atom('C...","(Atom('Na', [0.0, 0.0, 0.0], index=0), Atom('C...","[Na0, Cl0]",[AVG],"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
2,mp-2998,cubic_perovskite,"[[3.29508232 1.16714027 0.00434677] Ba, [2.508...","(Atom('Ba', [3.2950823175, 1.167140265, 0.0043...","(Atom('Ba', [3.2950823175, 1.167140265, 0.0043...","[Ba0, Ba1, Ti0, Ti1, O0, O1, O2, O3, O4, O5]",[AVG],"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
3,mp-10281,wurtzite,"[[ 1.92625151 1.11212181 11.82194194] S, [-2....","(Atom('S', [1.9262515125958486, 1.112121808878...","(Atom('S', [1.9262515125958486, 1.112121808878...","[S0, S1, S2, S3, Zn0, Zn1, Zn2, Zn3]",[AVG],"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
4,mp-23,fcc,[[0. 0. 0.] Ni],"(Atom('Ni', [0.0, 0.0, 0.0], index=0))","(Atom('Ni', [0.0, 0.0, 0.0], index=0))",[Ni0],[AVG],"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
5,mp-13,bcc,[[0. 0. 0.] Fe],"(Atom('Fe', [0.0, 0.0, 0.0], index=0))","(Atom('Fe', [0.0, 0.0, 0.0], index=0))",[Fe0],[AVG],"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
6,mp-153,hcp,[[-1.92521972e-08 1.83815209e+00 1.31235992e...,"(Atom('Mg', [-1.925219716980564e-08, 1.8381520...","(Atom('Mg', [-1.925219716980564e-08, 1.8381520...","[Mg0, Mg1]",[AVG],"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
7,mp-782,trigonal,"[[0. 0. 0.] Pd, [-2.05816593e-08 2.37656531e+...","(Atom('Pd', [0.0, 0.0, 0.0], index=0), Atom('T...","(Atom('Pd', [0.0, 0.0, 0.0], index=0), Atom('T...","[Pd0, Te0, Te1]",[AVG],"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
8,mp-742,tetragonal,"[[1.74805434 1.87372632 0.88063532] Ti, [0.891...","(Atom('Ti', [1.7480543427934925, 1.87372632353...","(Atom('Ti', [1.7480543427934925, 1.87372632353...","[Ti0, Ti1, Cu0]",[AVG],"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
9,mp-684,monoclinic,"[[2.91949681 1.4086912 1.11515985] S, [4.1798...","(Atom('S', [2.919496806907992, 1.4086911979637...","(Atom('S', [2.919496806907992, 1.4086911979637...","[S0, S1, S2, S3, Ba0, Ba1]",[AVG],"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."


### PCA
---

In [7]:
# Build the input matrix for the PCA

# Target layout: one row per structure, one column per SOAP dimension
average_soaps = material_df['average_soap']
len_entries = len(average_soaps)
len_soap = len(average_soaps[0])
# concatenate first chains all vectors into a single flat array of
#  len_entries * len_soap values ...
flat_soaps = np.concatenate(average_soaps)
# ... which is then folded back into a len_entries x len_soap array
pca_array = flat_soaps.reshape(len_entries, len_soap)

In [11]:
# Exploratory PCA on the average SOAPs

# No n_components is given, so no component is discarded up front.
# The running total of explained_variance_ratio_ shows how much of the variance
# is explained when keeping the first 1, 2, 3, ... components.
pca = PCA()
pca.fit(pca_array)
cumulative_variance = np.cumsum(pca.explained_variance_ratio_)
display(list(enumerate(cumulative_variance)))

[(0, 0.12788960733894514),
 (1, 0.2557792146778902),
 (2, 0.383668822016835),
 (3, 0.4990234948363754),
 (4, 0.5966305909635524),
 (5, 0.6803930567811504),
 (6, 0.7568104243193866),
 (7, 0.8296958873987487),
 (8, 0.8955745468310818),
 (9, 0.959172653625048),
 (10, 1.0),
 (11, 1.0)]

In [9]:
# Do the final PCA

# Keep 10 components; for the example data the cumulative ratios displayed by
# the test PCA put this at roughly 96% of the variance explained
# (the full 100% is only reached with one more component).
pca = PCA(n_components=10)
# Fit on the average SOAPs and project them onto the retained components
pca_soap = pca.fit_transform(pca_array)
# Store the reduced vectors unconditionally: the Save step selects the
# 'pca_soap' column whether or not DISPLAY_DATA is set, so gating this
# assignment on DISPLAY_DATA made saving fail with a KeyError when it was False.
material_df['pca_soap'] = pca_soap.tolist()

### Save
---

In [12]:
# Persist only what is needed downstream:
#   - the material ids together with their reduced SOAP vectors, as JSON
#   - the reduced SOAP matrix itself, as a numpy binary file
columns_to_save = ['material_id', 'pca_soap']
material_df[columns_to_save].to_json('pca_soap_df.json')
np.save('pca_soap', pca_soap)  # numpy appends the extension -> pca_soap.npy

### Load
---

In [13]:
# A numpy array is required to calculate the distance matrix.
# The rows should represent structures, and the columns SOAP dimensions
# (or PCA-reduced SOAP dimensions).
# This reads back the file written by np.save('pca_soap', ...) in the Save step.
pca_soap = np.load('pca_soap.npy')

### Calculate Distance Matrix
---

In [18]:
# Compute the distance matrix and write it to disk, using the f2py build of
# distance.f90 found in structureREST/script/cluster/

# Make the folder holding the f2py module importable ...
sys.path.insert(0, '../../script/cluster')
# ... and pull in its average distance matrix routine
from distance import average_distance_matrix

# Destination file for the distance matrix
filename = 'distances.dat'
# Clear out the output of any previous run
if os.path.exists(filename):
    os.remove(filename)

# Format string handed to the routine: an `F8.6` field followed by a space,
# with a repeat count equal to the number of rows of pca_soap
row_format = "({}(F8.6, ' '))".format(len(pca_soap))
# The routine writes the matrix straight to `filename`
average_distance_matrix(pca_soap, filename, row_format)

In [26]:
# Read the distance matrix back from disk so it can be inspected
distances = np.loadtxt(filename)
if DISPLAY_DATA:
    display(distances)

array([[0.      , 1.466368, 1.474199, 1.480894, 1.488533, 1.488533,
        1.488533, 1.468915, 1.479973, 1.503104, 1.471892, 1.476229],
       [0.      , 0.      , 1.444946, 1.436276, 1.466368, 1.466368,
        1.466368, 1.438565, 1.454083, 1.485721, 1.442905, 1.436923],
       [0.      , 0.      , 0.      , 1.454669, 1.474199, 1.474199,
        1.474199, 1.448435, 1.444135, 1.474066, 1.452466, 1.433138],
       [0.      , 0.      , 0.      , 0.      , 1.480894, 1.480894,
        1.480894, 1.444159, 1.466583, 1.074856, 1.451397, 1.547975],
       [0.      , 0.      , 0.      , 0.      , 0.      , 1.488533,
        1.488533, 1.468915, 1.479973, 1.503104, 1.471892, 1.476229],
       [0.      , 0.      , 0.      , 0.      , 0.      , 0.      ,
        1.488533, 1.468915, 1.479973, 1.503104, 1.471892, 1.476229],
       [0.      , 0.      , 0.      , 0.      , 0.      , 0.      ,
        0.      , 1.468915, 1.479973, 1.503104, 1.471892, 1.476229],
       [0.      , 0.      , 0.      , 0. 