In [None]:
from sys import path
path.insert(0, '/home/azadoks/git/glosim2/')
path.insert(0, '/home/azadoks/git/structureREST/zocrys/lib/')
from collections import OrderedDict
from libmatch.soap import get_Soaps
from multiprocessing import Pool, cpu_count
import numpy as np
import zoap
import pymongo

# multiprocessing has no `n_cpus`; the function is `cpu_count`.
# Keep the old name as an alias so existing cells that call n_cpus() still run.
n_cpus = cpu_count

In [1]:
def get_chunk_indices(total_size, chunk_size):
    '''
    Generate list of index tuples to chunk up a big set of data

    Each tuple is a half-open (start, stop) range. The stop of the final
    tuple is clamped to total_size, so no tuple points past the end of the
    data and the last chunk may be shorter than chunk_size.

    Args:
        total_size (int): number of items in big set
        chunk_size (int): number of items in all but the last chunk set
    Returns:
        [(int, int)]: list of index tuples (empty when total_size <= 0)
    Raises:
        ValueError: if chunk_size is not positive
    '''
    if chunk_size <= 0:
        # A non-positive step never advances `start`: the loop below would
        # spin forever while growing the list.
        raise ValueError('chunk_size must be positive, got {!r}'.format(chunk_size))
    indices = []
    start = 0
    while start < total_size:
        # Clamp so the last (possibly partial) chunk ends exactly at total_size.
        indices.append((start, min(start + chunk_size, total_size)))
        start += chunk_size
    return indices

In [None]:
# Open the 'structureREST' database on the MongoDB server at 127.0.0.1:27018.
client = pymongo.MongoClient('mongodb://127.0.0.1:27018/')
db = client['structureREST']

# Handles for the structure source collections.
icsd_coll, cod_coll, mpds_coll = (db[name] for name in ('icsd', 'cod', 'mpds'))

# Handles for the other collections kept in the same database.
stidy_coll, matminer_coll, soap_coll = (
    db[name] for name in ('stidy', 'matminer', 'soap'))

# Handle for the collection that stores argument sets.
args_coll = db['args']

In [None]:
# Anonymize _greatly_ speeds up SOAP calculation.
# If structures are not already primitive,
#     primitivize can speed up SOAP calculation.
quippy_args = {'anonymize': True, 'scale': True,
               'standardize': True, 'primitivize': True,
               'symprec': 1e-3}

# Keyword arguments forwarded to libmatch.soap.get_Soaps.
soap_args = {'nocenters': None, 'chem_channels': False,
             'centerweight': 1.0, 'gaussian_width': 0.5,
             'cutoff': 3.5, 'cutoff_transition_width': 0.5,
             'nmax': 8, 'lmax': 6,
             'spkitMax': None, 'chemicalProjection': None,
             'is_fast_average': False}

# Store both argument sets as one document so that results computed with them
# can point back to the exact settings through args_id.
# NOTE: every run of this cell inserts a new document into the args collection.
args_doc = {'structure2quippy': quippy_args,
            'soap': soap_args}
# insert_one returns an InsertOneResult; its inserted_id is the new document's _id.
args_result = args_coll.insert_one(args_doc)
args_id = args_result.inserted_id

In [None]:
def structure2quippy_wrap(structure):
    '''
    Single-argument wrapper around zoap.structure2quippy.

    Binds the module-level quippy_args and from_dict=True so the conversion can
    be handed to multiprocessing.Pool.map() or pandas.DataFrame.apply(), which
    call their function with one positional argument.

    Args:
        structure: the structure to convert. It is passed with from_dict=True,
            so presumably a dict-serialized structure -- TODO confirm in zoap.
    Returns:
        The value returned by zoap.structure2quippy for this structure.
    '''
    # Explicit keywords come before the ** expansion: a keyword placed after
    # **kwargs is a syntax error on Python 2 and this order is valid everywhere.
    return zoap.structure2quippy(structure, from_dict=True, **quippy_args)

In [None]:
# Compute SOAP descriptors for every structure of every source collection,
# chunk_size documents at a time, and store them in soap_coll.
chunk_size = 10000
collections = {'icsd': icsd_coll, 'mpds': mpds_coll, 'cod': cod_coll}
for source_name, source_coll in collections.items():
    # Total number of documents to page through (requires PyMongo >= 3.7).
    source_count = source_coll.count_documents({})
    pool = Pool(processes=cpu_count())
    try:
        for start, stop in get_chunk_indices(source_count, chunk_size):
            # A PyMongo cursor cannot be re-sliced once it has been iterated,
            # so open a fresh cursor for each chunk. Sorting on _id keeps the
            # skip/limit pages stable and non-overlapping.
            source_docs = list(source_coll.find(
                projection=['structure'],
                sort=[('_id', pymongo.ASCENDING)],
                skip=start, limit=stop - start))  # grab source docs chunk
            if not source_docs:
                # insert_many() rejects an empty list; nothing to do here.
                continue
            quippies = pool.map(structure2quippy_wrap,
                                [doc['structure'] for doc in source_docs])  # convert Structures to Atoms
            soaps = get_Soaps(quippies, nprocess=8, dispbar=False, **soap_args)  # calculate soaps
            # pool.map preserves input order; this zip also assumes get_Soaps
            # returns one entry per input in the same order -- TODO confirm.
            # NOTE(review): 'soap' and 'quippy' are stored as returned; confirm
            # that both are BSON-encodable.
            soap_docs = [{'soap': soap, 'quippy': quippy,
                          'source_id': source_doc['_id'],
                          'source_collection': source_name,
                          'args_id': args_id}
                         for source_doc, soap, quippy
                         in zip(source_docs, soaps, quippies)]  # create soap docs
            result = soap_coll.insert_many(soap_docs)
            print(len(result.inserted_ids))
            # Drop the chunk's large intermediates before loading the next one.
            del source_docs, quippies, soaps, soap_docs, result
    finally:
        # close() must come before join(); doing it in `finally` also releases
        # the worker processes when a chunk fails.
        pool.close()
        pool.join()