In [1]:
from sys import path
path.insert(0, '/home/azadoks/git/glosim2/')
path.insert(0, '/home/azadoks/git/structureREST/lib/')
from pymatgen import MPRester, Structure
from pymatgen.io.cif import CifParser
from libmatch.soap import get_Soaps
from multiprocessing import Pool, cpu_count
from sklearn.decomposition import PCA
import numpy as np
import pandas as pd
import json
import zoap

Pymatgen will drop Py2k support from v2019.1.1. Pls consult the documentation
at https://www.pymatgen.org for more details.
  at https://www.pymatgen.org for more details.""")


In [2]:
def structure2quippy_wrap(structure):
    '''
    Single-argument adapter around zoap.structure2quippy so that it can be
    handed to multiprocessing.Pool.map().

    The conversion options are fixed here and all switched on:
        anonymize   -- _greatly_ speeds up the SOAP calculation
        primitivize -- can speed up the SOAP calculation when the
                       structures are not already primitive
        scale, standardize
    '''
    conversion_options = {
        'anonymize': True,
        'scale': True,
        'standardize': True,
        'primitivize': True,
    }
    return zoap.structure2quippy(structure, **conversion_options)

def average_soap(soap_dict):
    '''
    Calculate the average SOAP vector from a SOAP dictionary.

    Args:
        soap_dict (dict): dictionary whose values are the SOAP vectors to
            average (the values must be stackable, i.e. of equal length)
    Returns:
        numpy.ndarray: element-wise mean over all values of soap_dict
    '''
    # On Python 3 dict.values() is a view, which numpy does not treat as a
    # sequence of rows; materialise it so axis=0 averages across the vectors.
    return np.mean(list(soap_dict.values()), axis=0)

In [3]:
# sources = ['matproj_query', 'matproj_query_json', 'matproj_nature_json']
data_source = 'matproj_query_json'

if data_source == 'matproj_query':
    # Query the Materials Project for every entry that has elasticity data.
    # No API key is passed on purpose: credentials must not live in the
    # notebook. MPRester() falls back to pymatgen's PMG_MAPI_KEY setting
    # (~/.pmgrc.yaml or environment). Any key that was previously committed
    # here should be treated as compromised and regenerated.
    with MPRester() as mpr:
        criteria = {'elasticity': {'$ne': None}}
        properties = ['pretty_formula', 'structure', 'elasticity', 'material_id']
        results = mpr.query(criteria, properties, mp_decode=True)  # structures come as Structures
    df = pd.DataFrame(results)
elif data_source == 'matproj_query_json':
    # Cached copy of the query results stored as JSON
    with open('../../data/elastic/ec_query.json', 'r') as f:
        results = json.load(f)
    df = pd.DataFrame(results)
    df['structure'] = df['structure'].apply(zoap.structure_from_dict_wrap)  # structures come as dictionaries
elif data_source == 'matproj_nature_json':
    with open('../../data/elastic/ec.json', 'r') as f:
        results = json.load(f)
    df = pd.DataFrame(results)
    # Keep the raw CIF text in its own column before replacing 'structure'
    df['cif'] = df['structure']
    df['structure'] = df['cif'].apply(zoap.cifparser_from_string_wrap)  # structures come as CIFs
else:
    # Fail here rather than leaving df undefined (or stale from an earlier
    # run of this cell) for the cells below.
    raise ValueError('Unknown data_source: %r' % (data_source,))

In [4]:
N_CPUS = cpu_count() # lower processes if memory gets out of hand -OR- implement chunking
pool = Pool(processes=N_CPUS)
try:
    # Convert every structure to its quippy representation in parallel
    df['quippy'] = pool.map(structure2quippy_wrap, df['structure'])
finally:
    # Always release the worker processes, even if a conversion raised;
    # otherwise they stay alive for the lifetime of the kernel.
    pool.close()
    pool.join()

In [5]:
# Settings forwarded unchanged, as keyword arguments, to get_Soaps
soap_settings = dict(
    nocenters=None,
    chem_channels=False,
    centerweight=1.0,
    gaussian_width=0.35,
    cutoff=2.0,
    cutoff_transition_width=0.25,
    nmax=12,
    lmax=10,
    spkitMax=None,
    nprocess=8,
    chemicalProjection=None,
    dispbar=False,
    is_fast_average=False,
)
quippy_frames = np.array(df['quippy'])
df['soap'] = get_Soaps(quippy_frames, **soap_settings)

# Reduce each entry of the 'soap' column to a single mean vector
df['average_soap'] = df['soap'].apply(average_soap)

HBox(children=(IntProgress(value=0, description=u'SOAP vectors', max=7122), HTML(value=u'')))




In [13]:
exp_var_condition = 0.999
# One row per structure, one column per component of the average SOAP vector
soap_array = np.concatenate(df['average_soap']).reshape(len(df['average_soap']), len(df['average_soap'][0]))

# First pass: full PCA, used only to read off the cumulative explained variance
pca = PCA()
pca.fit(soap_array)
exp_var = pca.explained_variance_ratio_.cumsum().tolist()
# exp_var[i] is the variance explained by the first i + 1 components, so the
# number of components to keep is the index of the first value above the
# threshold PLUS ONE. If rounding keeps the cumulative sum from ever
# exceeding the threshold, keep every component instead of raising.
n_components = next(
    (i + 1 for i, cum_var in enumerate(exp_var) if cum_var > exp_var_condition),
    len(exp_var),
)

# Second pass: refit with only the components needed and project the data
pca = PCA(n_components)
pca_soap = pca.fit_transform(soap_array)

In [16]:
# Write the PCA-projected average SOAP vectors (pca_soap) to a plain-text
# file; the path is relative, so it lands in the current working directory.
np.savetxt('pca_average_soap.dat', pca_soap)

In [None]:
def average_distance(average_soap1, average_soap2):
    '''
    Calculate distance between two averaged SOAP vectors
    using the average distance kernel from De et al. (2016)

    The average kernel is the dot product of the averaged vectors,
        K12 = <p1, p2>
    and the distance induced by the normalized kernel is
        d12 = sqrt(2 - 2 * K12 / sqrt(K11 * K22))

    Args:
        average_soap1 (numpy.ndarray): numpy array of average SOAP vector 1
        average_soap2 (numpy.ndarray): numpy array of average SOAP vector 2
    Returns:
        float: normalized distance between average_soap1 and average_soap2;
            0 for parallel vectors, sqrt(2) for orthogonal vectors.
            NaN if either vector is all zeros (the kernel cannot be
            normalized).
    '''
    v1 = np.asarray(average_soap1, dtype=float).ravel()
    v2 = np.asarray(average_soap2, dtype=float).ravel()
    # Kernel values are dot products, not norms: K11 = |p1|^2, K22 = |p2|^2
    k11 = np.dot(v1, v1)
    k22 = np.dot(v2, v2)
    k12 = np.dot(v1, v2)
    squared_distance = 2.0 - 2.0 * (k12 / np.sqrt(k11 * k22))
    # Rounding can push the normalized kernel marginally above 1; clamp so
    # the square root of a tiny negative number does not become NaN.
    # (A genuine NaN from zero vectors fails this comparison and is kept.)
    if squared_distance < 0.0:
        squared_distance = 0.0
    d12 = np.sqrt(squared_distance)
    return d12

def average_distance_matrix(vectors):
    '''
    Generate distance matrix for a set of average SOAP vectors
        using the average distance kernel

    Args:
        vectors: sequence of average SOAP vectors
    Returns:
        numpy.ndarray: square matrix of shape (len(vectors), len(vectors));
            entry [i, j] is average_distance(vectors[i], vectors[j]).
            The matrix is filled symmetrically and the diagonal is left
            at zero.
    '''
    # Materialise once so positional indexing works for any iterable
    vectors = list(vectors)
    n_vectors = len(vectors)
    # np.zeros takes the shape as ONE tuple; a second positional argument
    # would be interpreted as the dtype.
    distance_matrix = np.zeros((n_vectors, n_vectors))
    for i in range(n_vectors):
        # Only visit each unordered pair once and mirror the result
        for j in range(i + 1, n_vectors):
            d_ij = average_distance(vectors[i], vectors[j])
            distance_matrix[i, j] = d_ij
            distance_matrix[j, i] = d_ij
    return distance_matrix