In [2]:
from sys import path
path.insert(0, '/home/azadoks/git/glosim2/')
path.insert(0, '/home/azadoks/git/structureREST/lib/')
from collections import OrderedDict
from libmatch.soap import get_Soaps
from multiprocessing import Pool, cpu_count
from quippy.io import AtomsWriter
from json import loads
from os import remove
import numpy as np
import cStringIO
import gzip
import zoap
# import quippy_utils
import pymongo

In [3]:
client = pymongo.MongoClient('mongodb://127.0.0.1:27018/')
db = client['structureREST']

icsd_coll = db['icsd']
cod_coll = db['cod']
mpds_coll = db['mpds']

stidy_coll = db['stidy']
matminer_coll = db['matminer']
soap_coll = db['soap']

args_coll = db['args']

In [5]:
from pymatgen import Structure
for name, coll in {'icsd': icsd_coll, 'cod': cod_coll, 'mpds': mpds_coll}.items():
    print('collection', name)
    print('<=64', coll.find({'n_sites': {'$lte': 64}}).count())
    count = 0
    for doc in coll.find({'n_sites': {'$lte': 64}}):
        structure = Structure.from_dict(doc['structure'])
        if structure.is_ordered:
            count +=1 
    print('ordered', count)

('collection', 'icsd')
('<=64', 148531)
('ordered', 87584)
('collection', 'mpds')
('<=64', 51072)
('ordered', 41974)
('collection', 'cod')
('<=64', 62113)
('ordered', 42205)


In [18]:
N_CPUS = cpu_count()
# Anonymize _greatly_ speeds up SOAP calculations
#     at the loss of species
# If structures are not already primitive,
#     primitivize can speed up SOAP calculations by
#     reducing the number of sites in the cell
quippy_args = {'anonymize': True, 'scale': True,
               'standardize': True, 'primitivize': True,
               'symprec': 1e-3}

soap_args = {'nocenters': None, 'chem_channels': False,
             'centerweight': 1.0, 'gaussian_width': 0.5,
             'cutoff': 3.5, 'cutoff_transition_width': 0.5,
             'nmax': 8, 'lmax': 6,
             'spkitMax': None, 'chemicalProjection': None,
             'is_fast_average': False}

args_doc = {'structure2quippy': quippy_args,
            'soap': soap_args}
args_result = args_coll.insert_one(args_doc)
args_id = args_result.inserted_id

In [19]:
def structure2quippy_wrap(structure):
    '''Wrapper to use zoap.structure2qp with multiprocessing.Pool.map() or pandas.DataFrame.apply()'''
    return zoap.structure2quippy(structure, from_dict=True, **quippy_args)

In [10]:
chunk_size = 100
colls = {'icsd': icsd_coll, 'mpds': mpds_coll, 'cod': cod_coll}
for source_name, source_coll in colls.iteritems():
    # query database + get info
    source_find = source_coll.find({'n_sites': {'$lte': 128}}, 
                                   projection=['structure'],
                                   modifiers={'$snapshot': True},
                                   batch_size=chunk_size)
    source_count = source_find.count()
    # create persistent objects
    pool = Pool(processes=N_CPUS)
    quippy_writer = AtomsWriter('string', format='string')
    # get bounds indices for chunks
    chunk_indices = zoap.get_chunk_indices(source_count, chunk_size)
    for indices in chunk_indices:
        # grab source docs chunk
        source_docs = np.array(list(source_find[indices[0]: indices[1]]))
        # convert Structures to Atoms      
        quippies = np.array(pool.map(structure2quippy_wrap, [d['structure'] for d in source_docs]))
        # remove dud (probably disordered) structure entries
        source_docs = source_docs[np.where(quippies != None)]
        quippies = quippies[np.where(quippies != None)]
        # write quippy atoms to strings using extended xyz format
        quippy_strs = [quippy_writer.write(quippy) for quippy in quippies]
        # calculate soaps ([OrderedDict[(str, np.ndarray([float]))]])
        soaps = get_Soaps(quippies, nprocess=N_CPUS, dispbar=False, **soap_args)
        # prepare soaps for serialization
        soaps = [{key: value.tolist() for key, value in soap.iteritems()} for soap in soaps]
        # TODO: fix too large source documents (soaps)
        # create soap docs
        soap_docs = [{'soap': soap, 'quippy': quippy_str,
                      'source_id': source_doc['_id'],
                      'source_collection': source_name,
                      'args_id': args_id}
                     for soap, quippy_str, source_doc in 
                         zip(soaps, quippy_strs, source_docs)]
#         soap_docs = [{'soap': soap,
#                       'source_id': source_doc['_id'],
#                       'source_collection': source_name,
#                       'args_id': args_id}
#                      for soap, source_doc in 
#                          zip(soaps, source_docs)]  # create soap docs
        result = soap_coll.insert_many(soap_docs)
        print(len(result.inserted_ids))
        # del source_docs, quippies, soaps, soap_docs, result
        raise Exception('Done')
        pool.join()
        pool.close()
        quippy_writer.close()

HBox(children=(IntProgress(value=0, description=u'SOAP vectors'), HTML(value=u'')))


100


Exception: Done