### TODO
* Get 4 sets of structures:
  * spacegroup A
  * spacegroup B
  * random structures from DB
  * random (ideal gas) structures from np.random
* Comparing structures to ideal gas
  * ensure consistent density distribution of random structures and ideal gases
  * optimize SOAP parameters so that HDBSCAN clustering does not cluster ideal
    gases with structures
* Ensuring cluster segregation between spacegroups
  * use sets of spacegroup A and spacegroup B (possibly more ~5 space groups)
  * fingerprint with SOAP and cluster with HDBSCAN
  * tune SOAP parameters and dimensionality reduction to ensure cluster segregation
  * optimize for wyckoff segregation within spacegroup clusters as well
* Try dimensionality reduction using PCA with Zimmermann / Matminer / Pymatgen fingerprints (1x48 vectors)
* Implement SOAP mean / std dev reduction and concatenation
  * reduce dimensionality with PCA trained on full set of individual soap vectors
  * ? include std dev in the training set ?

---

In [None]:
import pymongo
from pymatgen import Structure
import json

# Tunable limits for the export below.
# NOTE(review): N is not referenced by any cell visible in this section --
# confirm whether it is still needed.
N = 10000
MAX_SITES = 64  # max number of sites in structure

# Handles onto the local MongoDB instance and the three collections that the
# export loop reads from.
client = pymongo.MongoClient('mongodb://127.0.0.1:27018/')
db = client['structureREST']

source_collection, matminer_collection, stidy_collection = (
    db[collection_name] for collection_name in ('icsd', 'matminer', 'stidy')
)

In [None]:
soap_docs = []
k = 1
for source_doc in source_collection.find():
    if len(source_doc['structure']['sites']) <= MAX_SITES:
        structure = Structure.from_dict(source_doc['structure'])
        if structure.is_ordered:
            matminer_doc = matminer_collection.find_one({'source_id': source_doc['_id'],
                                                         'matminer_fingerprint': {'$ne': None}})
            stidy_doc = stidy_collection.find_one({'source_id': source_doc['_id'],
                                                   'stidy_fingerprint': {'$ne': None}})
            if matminer_doc and stidy_doc:
                matminer_doc['_id'] = str(matminer_doc['_id'])
                matminer_doc['source_id'] = str(matminer_doc['source_id'])

                stidy_doc['_id'] = str(stidy_doc['_id'])
                stidy_doc['source_id'] = str(stidy_doc['source_id'])

                soap_doc = {'matminer': matminer_doc,
                            'stidy_doc': stidy_doc,
                            'structure': source_doc['structure']}
                soap_docs.append(soap_doc)
                if not len(soap_docs) % 100:
                    print '{}'.format(len(soap_docs)),
                if len(soap_docs) == 1000:
                    print '\nDumping {}'.format(k)
                    with open('soap_docs_new_{}.json'.format(k), 'w') as f:
                        json.dump(soap_docs, f)
                    k += 1
                    soap_docs = []

In [None]:
# Imported here so the cell also works on a fresh kernel run top to bottom.
from glob import glob

# Merge the numbered batch files (soap_docs_new_<k>.json) into one list and
# write it to soap_docs_new.json.
# The pattern requires a digit after the underscore so that this cell's own
# output (soap_docs_new.json) and the derived soap_docs_new_mod.json are not
# re-read, which would duplicate records on a re-run.
# sorted() makes the merge order deterministic (lexicographic by file name).
soap_docs = []
for sj in sorted(glob('soap_docs_new_[0-9]*.json')):
    with open(sj, 'r') as f:
        soap_docs += json.load(f)

with open('soap_docs_new.json', 'w') as f:
    json.dump(soap_docs, f)

In [None]:
from glob import glob
# For each merged JSON file, add three top-level fields to every record
# ('space_group', 'n_sites', 'stidy_fingerprint') and write the result next to
# the input with a '_mod.json' suffix.
for soap_json in glob('soap_docs_new.json'):
    soap_docs_mod = []
    with open(soap_json, 'r') as f:
        soap_docs = json.load(f)
    for soap_doc in soap_docs:
        fingerprint = soap_doc['stidy_doc']['stidy_fingerprint']
        # The leading '_'-separated token of the fingerprint is parsed as the
        # integer space group.
        soap_doc['space_group'] = int(fingerprint.split('_')[0])
        soap_doc['n_sites'] = len(soap_doc['structure']['sites'])
        soap_doc['stidy_fingerprint'] = fingerprint
        soap_docs_mod.append(soap_doc)
    # Everything before the first '.' in the input name is kept as the stem.
    mod_json = soap_json.split('.')[0]+'_mod.json'
    with open(mod_json, 'w') as f:
        json.dump(soap_docs_mod, f)

In [None]:
# Imported here so the cell does not depend on the later import cell.
import pandas as pd

# Gather the records of every '*mod.json' file into one DataFrame and save it.
# The accumulator is created once, before the loop: resetting it per file kept
# only the last file's records and left it undefined when nothing matched.
all_docs = []
for soap_json in glob('soap_docs_new_*mod.json'):
    with open(soap_json, 'r') as f:
        soap_docs = json.load(f)
    all_docs += soap_docs
all_df = pd.DataFrame(all_docs)
all_df.to_json('all_soap_docs.json')

---

In [14]:
# soap_param = dict(nocenters=[], chem_channels=False, centerweight=1.0, 
#                   gaussian_width=0.4, cutoff=3., cutoff_transition_width=0.5, 
#                   nmax=20, lmax=8, nprocess=8, spkitMax=spkitMax,
#                   dispbar=True, is_fast_average=False)

In [15]:
from sys import path
path.insert(0,'/home/azadoks/git/glosim2/')
import libmatch.soap
import soap_utils
import zoap
import json
from pymatgen import Structure
import numpy as np
import multiprocessing as mp
from fractions import gcd
from itertools import chain, product, combinations
import hdbscan
import pandas as pd
import json

Pymatgen will drop Py2k support from v2019.1.1. Pls consult the documentation
at https://www.pymatgen.org for more details.
  at https://www.pymatgen.org for more details.""")


In [26]:
# Load the annotated SOAP records into a DataFrame.
# NOTE(review): the cells above write 'soap_docs_new_mod.json' to the working
# directory, while this reads from 'data/' -- presumably the file was moved by
# hand; confirm.
soap_docs_path = 'data/soap_docs_new_mod.json'
soap_df = pd.read_json(soap_docs_path)

In [28]:
# Keep only the rows whose 'space_group' value is one of the selected numbers.
space_groups = [2, 14, 62, 139, 194, 221, 225]
in_selected_groups = soap_df['space_group'].isin(space_groups)
soap_df_sg = soap_df.loc[in_selected_groups]

In [33]:
%%time
# Pass the 'structure' column of the space-group-filtered frame to zoap.zoap
# with nmax=10 and lmax=8; the result is stored as soap_vector_sets.
# NOTE(review): the saved output of this cell is
#   TypeError: 'NoneType' object has no attribute '__getitem__'
# so the cell did not complete as saved. The column values presumably are still
# serialized dicts rather than pymatgen Structure objects -- confirm what
# zoap.zoap expects and whether any entry is None.
soap_structures = soap_df_sg['structure'].tolist()
soap_vector_sets = zoap.zoap(soap_structures, nmax=10, lmax=8)

TypeError: 'NoneType' object has no attribute '__getitem__'

In [None]:
def dist_wrapper(vects):
    """Unpack one argument tuple and forward it to ``zoap.average_distance``.

    Lets a single-argument mapper such as ``Pool.map`` call a function that
    takes its operands as separate positional arguments.

    vects: iterable of positional arguments for ``zoap.average_distance``.
    Returns whatever ``zoap.average_distance`` returns.
    """
    distance = zoap.average_distance(*vects)
    return distance

In [None]:
# Build every ordered pair (x, y) of SOAP vector sets, including x == y and
# both (x, y) and (y, x), in row-major order so the flat result list can be
# reshaped straight into a square matrix.
svs_combis = list(product(soap_vector_sets, repeat=2))

pool = mp.Pool()
try:
    average_distances = pool.map(dist_wrapper, svs_combis)
finally:
    # Always release the worker processes, even if a distance evaluation
    # raises inside pool.map.
    pool.close()
    pool.join()

n_sets = len(soap_vector_sets)
distance_matrix = np.array(average_distances).reshape((n_sets, n_sets))

In [None]:
# pool = mp.Pool()
# svs_combis = [(i, j) for x, i in enumerate(soap_vector_sets) for y, j in enumerate(soap_vector_sets) if x != y]
# average_distances = pool.map(dist_wrapper, svs_combis)
# pool.close()
# pool.join()
# # =======
# svs_combis = []
# for i, x in enumerate(soap_vector_sets):
#     for j, y in enumerate(soap_vector_sets[i:]):
#         if i == j:
#             svs_combis.append((None, None))
#         else:
#             svs_combis.append((x, y))
# average_distances = pool.map()

---

In [None]:
from sklearn.decomposition import PCA
# Notes recorded for nmax = 20, lmax = 10 -- presumably the number of principal
# components needed to exceed each cumulative explained-variance level
# (TODO confirm; the zoap call in this notebook uses nmax=10, lmax=8):
#   37 > 0.999
#   19 > 0.99
#    6 > 0.9
#    3 > 0.8

# Stack the individual SOAP vectors of all structures into one 2-D array.
# Each set is converted to its own array before concatenation, so structures
# with different numbers of vectors do not pass through a ragged np.array, and
# .values() may return either a list or a view.
# Assumes each element of soap_vector_sets is a dict-like of equal-length
# vectors -- TODO confirm.
all_vectors = np.concatenate(
    [np.asarray(list(svs.values())) for svs in soap_vector_sets]
)
pca = PCA(n_components=19)
pc = pca.fit_transform(all_vectors)
# Last expression: cumulative explained variance per component, shown as the
# cell output.
pca.explained_variance_ratio_.cumsum()