# Main Aggregation Script - calls lab.py and crystals.py

In [5]:
# Reload the lab.py and crystals.py modules automatically so that saved changes are picked up.
# If a new method or object is created, autoreload does not work: save the
# notebook (make a 'checkpoint'), then close and halt the kernel and start it
# again.

%load_ext autoreload
%autoreload 2

In [6]:
import ipas 
import numpy as np
import dask
from dask_jobqueue import SLURMCluster
from dask.distributed import Client, progress
from dask import delayed
from dask import dataframe as dd
import functools
import sys
import ast
from struct import *
import pickle
import glob
import random
import pandas as pd
import time
from dask.distributed import as_completed
from joblib import Parallel, delayed, parallel_backend
import matplotlib.pyplot as plt

In [3]:
# Describe the SLURM job that backs each dask worker: one single-core process
# per job, submitted to the 'kratos' queue with the walltime and memory below.
cluster = SLURMCluster(
    queue='kratos',
    walltime='04-23:00:00',
    cores=1,
    memory='10000MiB', # 1 GiB = 1,024 MiB, so this is just under 10 GiB
    processes=1)

# Alternative (disabled): let dask grow/shrink the worker pool on demand.
#cluster.adapt(minimum=3, maximum=20)
# Request a fixed pool of 10 workers.
cluster.scale(10)

Port 8787 is already in use. 
Perhaps you already have a cluster running?
Hosting the diagnostics dashboard on a random port instead.


In [4]:
# Connect a distributed client to the SLURM cluster; compute() submits and
# gathers its work through this object.
client = Client(cluster)

In [5]:
# Display the client summary (scheduler address, dashboard link, worker/core/memory totals).
client

0,1
Client  Scheduler: tcp://169.226.65.160:37835  Dashboard: http://169.226.65.160:38031/status,Cluster  Workers: 0  Cores: 0  Memory: 0 B


# Initialize databases for queries

In [7]:
# Paths of every pickled database chunk. glob returns matches in arbitrary,
# filesystem-dependent order, so sort them to make the row order of the
# concatenated DataFrame reproducible from run to run.
files = sorted(glob.glob("../instance_files/createdb_iceagg_rand*"))

In [8]:
def shape(a,b,c):
    """Label a particle 'prolate' or 'oblate' from three axis values.

    Returns 'prolate' when the gap between b and c is no larger than the gap
    between a and b, and 'oblate' otherwise.
    NOTE(review): presumably a >= b >= c are the ordered axis lengths of the
    fitted ellipsoid -- confirm against the database columns.
    """
    return 'prolate' if (b - c) <= (a - b) else 'oblate'

In [9]:
%%time 
# Load each pickled database chunk (echoing its path as progress), then stack
# all chunks into one DataFrame with a fresh 0..N-1 index.
# NOTE: unpickling executes arbitrary code -- only load files you created.
data = []
for file in files:
    print(file)
    data.append(pd.read_pickle(file, compression=None))
datapd = list(map(pd.DataFrame, data))
df = pd.concat(datapd, axis=0, ignore_index=True)

../instance_files/createdb_iceagg_rand_r500_1000
../instance_files/createdb_iceagg_rand_r1_5
../instance_files/createdb_iceagg_rand_r6_10
../instance_files/createdb_iceagg_rand_r20_70
../instance_files/createdb_iceagg_rand_r80_400
CPU times: user 1min 12s, sys: 25.8 s, total: 1min 38s
Wall time: 3min 52s


In [10]:
# agg_r = (a^2 * c)^(1/3), computed from the 'a' and 'c' columns.
df['agg_r'] = np.power((np.power(df['a'], 2) * df['c']), (1./3.))
# Drop rows with agg_r >= 5000. The explicit .copy() makes the result an
# independent frame so the column assignments below do not write into a slice
# of the unfiltered data (pandas SettingWithCopy hazard).
df = df[df.agg_r < 5000].copy()
# Same rule as shape(a, b, c), evaluated on whole columns at once.
# np.vectorize is only a Python-level loop (and fails on empty input), so
# np.where is used instead.
df['shape'] = np.where((df['b'] - df['c']) <= (df['a'] - df['b']), 'prolate', 'oblate')
# Aspect ratio c/a.
df['agg_phi'] = df.c/df.a

In [None]:
# Largest and smallest value in the 'cplx' column, shown as a (max, min) pair.
df['cplx'].max(), df['cplx'].min()

In [None]:
# Paths of every pickled database chunk. glob returns matches in arbitrary,
# filesystem-dependent order, so sort them to make the row order of the
# concatenated DataFrame reproducible from run to run.
files = sorted(glob.glob("../instance_files/createdb_iceagg_rand*"))

In [None]:
%%time 
# Load each pickled database chunk (echoing its path as progress), then stack
# all chunks into one DataFrame with a fresh 0..N-1 index.
# NOTE: unpickling executes arbitrary code -- only load files you created.
data = []
for file in files:
    print(file)
    data.append(pd.read_pickle(file, compression=None))
datapd = [pd.DataFrame(i) for i in data]
df = pd.concat(datapd, axis=0, ignore_index=True)

# agg_r = (a^2 * c)^(1/3), computed from the 'a' and 'c' columns.
df['agg_r'] = np.power((np.power(df['a'], 2) * df['c']), (1./3.))
# Drop rows with agg_r >= 5000. Assigning the filtered Series back to the
# column would only turn those values into NaN and keep the rows, so they
# would still take part in the agg_phi quantile binning later on.
df = df[df.agg_r < 5000].copy()
# Column-wise prolate/oblate rule; avoids a slow row-by-row df.apply.
df['shape'] = np.where((df['b'] - df['c']) <= (df['a'] - df['b']), 'prolate', 'oblate')
# Aspect ratio c/a.
df['agg_phi'] = df.c/df.a

# Main 

In [11]:
# Run configuration read by main() and compute() as module-level globals.
ch_dist='gamma'         #anything other than gamma uses the characteristic from the best distribution pdf (lowest SSE)
rand_orient = True      #randomly orient the seed crystal and new crystal: uses first random orientation
save_plots = False      # NOTE(review): not referenced by any cell shown in this notebook -- confirm it is still needed
agg_phi_bins = 20       # number of quantile bins over df.agg_phi
agg_r_bins = 20         # number of quantile bins over agg_r inside each agg_phi bin
nclusters = 300         # rows sampled per (agg_phi, agg_r) bin, for each of the two samples

In [12]:
def main():
    """Sample aggregates from every (agg_phi, agg_r) bin and queue their collection.

    Reads the module-level ``df``, ``agg_phi_bins``, ``agg_r_bins``,
    ``nclusters`` and ``rand_orient``.

    ``df.agg_phi`` is split into ``agg_phi_bins`` quantile bins; the rows of
    each of those bins are split again into ``agg_r_bins`` quantile bins of
    ``agg_r``. From every resulting cell two independent random samples of
    ``nclusters`` rows are drawn, each row is wrapped in
    ``ipas.Cluster_Calculations``, and one lazy ``ipas.collect_clusters`` task
    is created for the pair of samples.

    Returns:
        output: object array (agg_phi_bins, agg_r_bins) holding one
            ``dask.delayed`` task per cell, ready for ``compute()``.
        hold_clusters1, hold_clusters2: object arrays
            (agg_phi_bins, agg_r_bins, nclusters) holding the sampled
            ``ipas.Cluster_Calculations`` objects.

    Raises:
        ValueError: from ``DataFrame.sample`` when a cell holds fewer than
            ``nclusters`` rows.
    """
    output = np.empty((agg_phi_bins, agg_r_bins), dtype=object)
    hold_clusters1 = np.empty((agg_phi_bins, agg_r_bins, nclusters), dtype=object)
    hold_clusters2 = np.empty((agg_phi_bins, agg_r_bins, nclusters), dtype=object)

    # Only the bin edges are needed, not the per-row bin labels.
    _, phi_bins = pd.qcut(df.agg_phi, agg_phi_bins, retbins=True)

    # Visit every aspect-ratio bin, including bin 0: skipping it would leave
    # row 0 of all three arrays as None and break the rectangular indexing
    # that compute() performs on the gathered results.
    for i in range(agg_phi_bins):
        # Rows inside this aspect-ratio bin. The inequalities are strict, so
        # rows lying exactly on a bin edge are excluded.
        df_phi = df[(df.agg_phi > phi_bins[i]) & (df.agg_phi < phi_bins[i+1])]

        # Break this aspect-ratio bin into equally populated agg_r bins.
        _, r_bins = pd.qcut(df_phi.agg_r, agg_r_bins, retbins=True)
        for r in range(agg_r_bins):
            df_r = df_phi[(df_phi.agg_r > r_bins[r]) & (df_phi.agg_r < r_bins[r+1])]

            # Two independent draws from the same cell.
            samples1 = df_r.sample(nclusters)
            samples2 = df_r.sample(nclusters)

            for n, agg in enumerate(samples1.itertuples()):
                hold_clusters1[i, r, n] = ipas.Cluster_Calculations(agg)
            for n, agg in enumerate(samples2.itertuples()):
                hold_clusters2[i, r, n] = ipas.Cluster_Calculations(agg)

            # Store a lazy task instead of calling eagerly and discarding the
            # result; compute() submits these tasks to the cluster.
            # NOTE(review): the recorded run failed with "collect_clusters()
            # missing 1 required positional argument: 'clusters'" -- confirm
            # this argument list against the current ipas.collect_clusters.
            output[i, r] = dask.delayed(ipas.collect_clusters)(
                hold_clusters1[i, r, :],
                hold_clusters2[i, r, :],
                rand_orient=rand_orient,
            )

    return output, hold_clusters1, hold_clusters2
    

In [13]:
def compute():
    """Run the queued tasks on the cluster and split the results by quantity.

    Reads the module-level ``output`` (built by ``main()``) and ``client``.
    The gathered results are stacked into one array that is indexed as
    [phi_bin, r_bin, quantity, :]; positions 0..5 of the quantity axis are
    returned as agg_as, agg_bs, agg_cs, phi2Ds, cplxs and dds.
    NOTE(review): the last axis is presumably one entry per sampled cluster
    pair -- confirm against what ipas.collect_clusters returns.

    Returns:
        Tuple ``(agg_as, agg_bs, agg_cs, phi2Ds, cplxs, dds)``, each a
        [phi_bin, r_bin, :] slice of the gathered array.
    """
    # Submit every task, then block until all results are back.
    futures = client.compute([*output.tolist()])
    gather = np.array(client.gather(futures))
    print(np.shape(gather))

    # No preallocation needed: each name is bound directly to its slice.
    agg_as = gather[:, :, 0, :]
    agg_bs = gather[:, :, 1, :]
    agg_cs = gather[:, :, 2, :]
    phi2Ds = gather[:, :, 3, :]
    cplxs = gather[:, :, 4, :]
    dds = gather[:, :, 5, :]

    print('DONE!')
    return agg_as, agg_bs, agg_cs, phi2Ds, cplxs, dds

In [14]:
if __name__ == '__main__':
    # Stage 1: sample the database and build the per-bin work items.
    output, hold_clusters1, hold_clusters2 = main()

    # Stage 2: gather the per-bin results and split them by quantity.
    agg_as, agg_bs, agg_cs, phi2Ds, cplxs, dds = compute()

    # Bundle everything under the keys used when saving/loading the results.
    results = dict(
        agg_as=agg_as,
        agg_bs=agg_bs,
        agg_cs=agg_cs,
        phi2Ds=phi2Ds,
        cplxs=cplxs,
        dds=dds,
    )


TypeError: collect_clusters() missing 1 required positional argument: 'clusters'

In [None]:
# Save the sampled cluster objects.
filename = '../instance_files/pulled_clusters_aggagg_rand'
# Join the two samples along the last axis so that each (phi, r) cell keeps
# its own clusters: shape (agg_phi_bins, agg_r_bins, 2*nclusters).
# np.append without an axis flattens both arrays first, so reshaping its
# result would scatter clusters across the wrong bins.
to_file = np.concatenate((hold_clusters1, hold_clusters2), axis=2)
# The context manager closes the file even if pickling fails.
with open(filename, 'wb') as filehandler:
    pickle.dump(to_file, filehandler)
print('finished!')

# Save the dictionary of per-bin result arrays.
filename = '../instance_files/instance_db_aggagg_rand'
with open(filename, 'wb') as filehandler:
    pickle.dump(results, filehandler)
print('finished!')

# LOAD DATA

In [None]:
# Reload the saved result arrays and sampled clusters from disk.
# NOTE: unpickling executes arbitrary code -- only load files you created.
# The context managers close each file even if unpickling fails.
with open('../instance_files/instance_db_aggagg_rand', 'rb') as f:
    results = pickle.load(f)
agg_as, agg_bs, agg_cs, phi2Ds, cplxs, dds = \
                results['agg_as'], results['agg_bs'], results['agg_cs'], results['phi2Ds'], results['cplxs'], results['dds']

with open('../instance_files/pulled_clusters_aggagg_rand', 'rb') as f:
    pulled_clus = pickle.load(f)