In [1]:
# NOTE(review): two matplotlib backends are requested back to back. The later
# `%matplotlib inline` presumably supersedes `%matplotlib widget` -- confirm
# which backend is actually intended and drop the other line.
%matplotlib widget
%matplotlib inline

In [2]:
import pandas as pd
import dask as dask
from dask import dataframe as dd
import numpy as np
from dask_jobqueue import SLURMCluster
from dask.distributed import Client
import dask.array as da
import functools
import sys
import ast
from struct import *
import pickle
import glob
import random
import seaborn as sns
import ipywidgets as widgets
import matplotlib
import matplotlib.cm as cm
import matplotlib.pyplot as plt

In [None]:
# Options handed to SLURMCluster (queue, walltime, cores, processes, memory).
slurm_job_options = dict(
    queue='kratos',
    walltime='04-23:00:00',
    cores=1,
    processes=1,
    memory='7000MiB',  # 1 GiB = 1,024 MiB
)
cluster = SLURMCluster(**slurm_job_options)

# Request a fixed scale of 10; adaptive scaling is left disabled below.
cluster.scale(10)
#cluster.adapt(minimum=3, maximum=20)

In [None]:
client = Client(cluster)

In [None]:
client

In [6]:
%%time
#READ DATABASE FOR RANDOM ORIENTATION
df = dd.read_parquet("../instance_files/parquet_files/createdb_iceagg_rand*", engine="pyarrow").compute()

CPU times: user 4.49 s, sys: 585 ms, total: 5.07 s
Wall time: 4.4 s


In [8]:
df.info(memory_usage='deep')

<class 'pandas.core.frame.DataFrame'>
Int64Index: 4872000 entries, 0 to 1043999
Data columns (total 8 columns):
 #   Column     Dtype  
---  ------     -----  
 0   ncrystals  int64  
 1   mono_phi   float64
 2   mono_r     int64  
 3   a          float64
 4   b          float64
 5   c          float64
 6   phi2D      float64
 7   cplx       float64
dtypes: float64(6), int64(2)
memory usage: 334.5 MB


In [None]:
%%time

# Old way of reading the database.
# Builds `df` as a list holding one concatenated Dask DataFrame per SQLite
# table, in the order of `tables` (index 0 = 'aggregates', index 1 = 'crystals'),
# combined over every file matched by the glob below.
# NOTE(review): this rebinds `df`, which an earlier cell assigned to the pandas
# DataFrame read from parquet -- cells using `df[0]` / `df[1]` need this cell.
files = ['sqlite:///'+f for f in glob.glob("../db_files/IPAS_*_flat.sqlite")]
tables = ['aggregates', 'crystals']
df=[]
for table in tables:
    
    # Read the table from each file in parallel on the client: every
    # dd.read_sql_table call is wrapped in dask.delayed so it runs remotely.
    read_files = [dask.delayed(dd.read_sql_table)(table=table, uri=file, index_col='id') for file in files]
    
    compute_read = client.compute(read_files)
    print('done with compute')
    # Collect the per-file results of the delayed reads.
    ddfs = client.gather(compute_read)
    print('done with gather')
    # Concatenate all sqlite files vertically (axis=0 is the default); the
    # files share the same columns. The per-file frames are scattered back to
    # the cluster and dd.concat is submitted there.
    gathered_reads = client.scatter(ddfs)
    ddf = client.submit(dd.concat, gathered_reads).result()
    print('done with submit')
    # Append the combined Dask DataFrame for this table.
    df.append(ddf)


## Memory info and df stats

In [None]:
df[1].info(memory_usage='deep')

In [None]:
#see partitions
print(df[0].npartitions)
print(df[1].npartitions)

In [None]:
len(df[0]), len(df[1]) 

In [None]:
df_agg = client.persist(df[0]) 

In [None]:
df_mono = client.persist(df[1]) 

In [None]:
agg_stats = df[0]['agg_phi'].describe().round(2).compute()
agg_stats

In [None]:
r_stats = df[0]['agg_r'].describe().round(2).compute()
r_stats

In [None]:
r_stats = df[1]['r'].describe().round(2).compute()
r_stats

In [None]:
phi_stats = df[1]['phi'].describe().round(2).compute()
phi_stats

In [None]:
df[1].columns
#phi_stats = df[1]['phi'].describe().round(2).compute()

In [None]:
#len(df_crys[df_crys.r == 50])
# Frequency count of the monomer radius `r` in the crystals table (df[1]).
# It gets its own name and is actually computed: previously it was stored in
# `count` and overwritten by the next statement before ever being evaluated.
mono_r_counts = df[1].r.value_counts().compute()
print(mono_r_counts)

# Per-column row counts for every distinct aggregate aspect ratio.
# Grouped on the aggregates table (df[0], which carries `agg_phi`) because
# `df_repart` is only created in a later cell and is unbound on a fresh
# top-to-bottom run.
# NOTE(review): if the repartitioned, concatenated frame was really intended,
# move this statement below the cell that builds `df_repart`.
count = df[0].groupby(['agg_phi']).count().compute()
print(count)

## Concat, Repartition, and Clean Up DFs

In [None]:
%time df_concat = dd.concat([df[0], df[1]], axis=1)

In [None]:
# Overwrite agg_r with (a**2 * c) ** (1/3).
# Item assignment is used instead of `df_concat.agg_r = ...`: attribute-style
# assignment only updates a column that already exists and otherwise silently
# creates a plain Python attribute, leaving the frame unchanged.
df_concat['agg_r'] = np.power(np.power(df_concat.a, 2) * df_concat.c, 1. / 3.)

# Replace the stored aspect ratio by its reciprocal.
# NOTE: not idempotent -- re-running this cell inverts agg_phi back again.
df_concat['agg_phi'] = 1 / df_concat.agg_phi

agg_stats = df_concat['agg_phi'].describe().round(2).compute()
agg_stats

In [None]:
def query_r_5000(df, min_r=5000):
    """Return the rows of ``df`` whose ``agg_r`` is strictly greater than ``min_r``.

    Parameters
    ----------
    df : DataFrame-like
        Any frame exposing an ``agg_r`` column and boolean-mask indexing
        (a pandas DataFrame, or a single partition passed in by
        ``map_partitions``).
    min_r : number, optional
        Exclusive lower bound on ``agg_r``. Defaults to 5000, the threshold
        the function name refers to, so existing one-argument callers keep
        their behaviour.

    Returns
    -------
    The filtered frame, of the same type as ``df``.
    """
    return df[df.agg_r > min_r]

# Apply the agg_r > 5000 filter to every partition of the concatenated frame,
# then count the surviving rows.
df_concat_query = df_concat.map_partitions(query_r_5000)
len(df_concat_query) #86% of dataset

In [None]:
df_repart = df_concat.repartition(partition_size="100MB").persist()
df_repart.npartitions

In [None]:
# NOTE(review): `res` is not assigned in any earlier cell shown here; it is
# first bound by the pd.qcut(...) calls in the binning cells further down, so
# this cell presumably relies on out-of-order execution -- confirm and move it.
res.value_counts()

In [None]:
agg_stats = df_repart['agg_phi'].describe().round(2).compute()
agg_stats

In [None]:
#df_concat._meta.dtypes
#df_concat.divisions
print(df_concat.npartitions)
print(df_concat.memory_usage(deep=True).sum().compute() / 1024**2)  #5.1 GB

## Test agg agg queries for collection 

In [None]:
# Histogram of the aggregate aspect ratios; the column is computed into local
# memory before being handed to matplotlib.
agg_phi_values = df_repart.agg_phi.compute()
plt.hist(agg_phi_values, bins=10)
plt.show()

In [None]:
#FLAT
# Load the pickled aggregate-aggregate results for the flat orientation and
# unpack them into arrays indexed [phi, r, cluster].
# SECURITY: pickle.load can execute arbitrary code -- only open files that
# were produced by this project.
# The context manager closes the handle, which was previously left open.
with open('../instance_files/instance_db_aggagg_flat_returnclus1', 'rb') as file:
    bflat = pickle.load(file)

#a=z b=y c=x

#dunnavan: a>= b >= c
#in our case z >= y >= x

nclusters = 300
grid_shape = (20, 20, nclusters)
rxs_flat = np.zeros(grid_shape)
rys_flat = np.zeros(grid_shape)
rzs_flat = np.zeros(grid_shape)
phi2Ds_flat = np.zeros(grid_shape)
cplxs_flat = np.zeros(grid_shape)
dds_flat = np.zeros(grid_shape)
cluster1_ncrystals_flat = np.zeros(grid_shape)
cluster2_ncrystals_flat = np.zeros(grid_shape)

# `bflat` is consumed as a flat sequence, one entry per (phi, r) pair with r
# varying fastest; `counter` is the running position in that sequence.
counter=0
for phi in range(20):
    for r in range(20):
        entry = bflat[counter]
        # Field 0 holds one row per cluster; columns 0, 1, 2 feed the
        # x, y and z radius arrays. Converted once instead of three times.
        radii = np.array(entry[0])
        rxs_flat[phi, r, :] = radii[:, 0]
        rys_flat[phi, r, :] = radii[:, 1]
        rzs_flat[phi, r, :] = radii[:, 2]
        phi2Ds_flat[phi, r, :] = entry[1]
        cplxs_flat[phi, r, :] = entry[2]
        dds_flat[phi, r, :] = entry[3]
        cluster1_ncrystals_flat[phi, r, :] = entry[4]
        cluster2_ncrystals_flat[phi, r, :] = entry[5]
        counter+=1

# Axis ratios relative to the z radius.
phiba_flat = rys_flat/rzs_flat
phica_flat = rxs_flat/rzs_flat

# Entries equal to -999.0 are excluded from the minimum.
print(np.max(cplxs_flat), np.min(cplxs_flat[cplxs_flat!=-999.0]))

In [None]:
#RAND
# Load the pickled aggregate-aggregate results for the random orientation and
# unpack them into arrays indexed [phi, r, cluster].
# SECURITY: pickle.load can execute arbitrary code -- only open files that
# were produced by this project.
# The context manager closes the handle, which was previously left open.
with open('../instance_files/instance_db_aggagg_rand_returnclus1', 'rb') as file:
    brand = pickle.load(file)

#a=z b=y c=x

#dunnavan: a>= b >= c
#in our case z >= y >= x

nclusters = 300
grid_shape = (20, 20, nclusters)
rxs_rand = np.zeros(grid_shape)
rys_rand = np.zeros(grid_shape)
rzs_rand = np.zeros(grid_shape)
phi2Ds_rand = np.zeros(grid_shape)
cplxs_rand = np.zeros(grid_shape)
dds_rand = np.zeros(grid_shape)
cluster1_ncrystals_rand = np.zeros(grid_shape)
cluster2_ncrystals_rand = np.zeros(grid_shape)

# `brand` is consumed as a flat sequence, one entry per (phi, r) pair with r
# varying fastest; `counter` is the running position in that sequence.
counter=0
for phi in range(20):
    for r in range(20):
        entry = brand[counter]
        # Field 0 holds one row per cluster; columns 0, 1, 2 feed the
        # x, y and z radius arrays. Converted once instead of three times.
        radii = np.array(entry[0])
        rxs_rand[phi, r, :] = radii[:, 0]
        rys_rand[phi, r, :] = radii[:, 1]
        rzs_rand[phi, r, :] = radii[:, 2]
        phi2Ds_rand[phi, r, :] = entry[1]
        cplxs_rand[phi, r, :] = entry[2]
        dds_rand[phi, r, :] = entry[3]
        cluster1_ncrystals_rand[phi, r, :] = entry[4]
        cluster2_ncrystals_rand[phi, r, :] = entry[5]
        counter+=1

# Axis ratios relative to the z radius.
phiba_rand = rys_rand/rzs_rand
phica_rand = rxs_rand/rzs_rand
#np.shape(phica_rand)
# Entries equal to -999.0 are excluded from the minimum.
print(np.max(cplxs_rand), np.min(cplxs_rand[cplxs_rand!=-999.0]))

In [None]:
#FLAT Ncrystals
# Read the pickled clusters for the flat orientation and record the
# `ncrystals` attribute of every cluster in each (phi, r) bin.
# SECURITY: pickle.load can execute arbitrary code -- only open files that
# were produced by this project.
# NOTE(review): this path has no leading '../', unlike the other
# instance_files reads in this notebook -- confirm the working directory.
with open('instance_files/pulled_clusters_flat1', 'rb') as file:
    bflat = pickle.load(file)

# NOTE: this rebinds cluster1_ncrystals_flat with 301 clusters per bin.
nclusters = 301
cluster1_ncrystals_flat = np.zeros((20,20,nclusters))

# One assignment per (phi, r) bin fills the whole cluster axis. The former
# `for n in range(301)` wrapper repeated this identical assignment 301 times,
# and the unused `counter` has been dropped.
for phi in range(20):
    for r in range(20):
        cluster1_ncrystals_flat[phi, r, :] = [clus.ncrystals for clus in bflat[phi, r, :]]


In [None]:
#RAND Ncrystals
# Read the pickled clusters for the random orientation and record the
# `ncrystals` attribute of every cluster in each (phi, r) bin.
# SECURITY: pickle.load can execute arbitrary code -- only open files that
# were produced by this project.
# NOTE(review): this path has no leading '../', unlike the other
# instance_files reads in this notebook -- confirm the working directory.
with open('instance_files/pulled_clusters_rand', 'rb') as file:
    brand = pickle.load(file)

# NOTE: this rebinds cluster1_ncrystals_rand with 301 clusters per bin.
nclusters = 301
cluster1_ncrystals_rand = np.zeros((20,20,nclusters))

# One assignment per (phi, r) bin fills the whole cluster axis. The former
# `for n in range(301)` wrapper repeated this identical assignment 301 times,
# and the unused `counter` has been dropped.
for phi in range(20):
    for r in range(20):
        cluster1_ncrystals_rand[phi, r, :] = [clus.ncrystals for clus in brand[phi, r, :]]


In [None]:
def query_ncrystals(df_phi, r_bins):
    """Return the mean of column ``r`` inside each consecutive ``agg_r`` bin.

    Parameters
    ----------
    df_phi : dask DataFrame
        Frame with ``agg_r`` and ``r`` columns.
    r_bins : sequence of numbers
        Bin edges; bin ``k`` spans ``r_bins[k]`` to ``r_bins[k + 1]``.

    Returns
    -------
    list
        One mean per bin, ``len(r_bins) - 1`` entries (empty when fewer than
        two edges are given).

    Notes
    -----
    Both bounds are exclusive, so rows whose ``agg_r`` equals an edge fall in
    no bin. Despite the function name, the averaged column is ``r``.
    """
    # Build every per-bin mean lazily, then evaluate them together in a single
    # dask.compute call instead of one .compute() round trip per bin.
    lazy_means = [
        df_phi[(df_phi.agg_r > lower) & (df_phi.agg_r < upper)].r.mean()
        for lower, upper in zip(r_bins[:-1], r_bins[1:])
    ]
    return list(dask.compute(*lazy_means))


In [None]:
def avg_ncrystals_afteragg(df_phi, r_bins, cluster1=None, cluster2=None):
    """Return, per radius bin, mean(cluster1) + mean(cluster2) crystal counts.

    Parameters
    ----------
    df_phi : int
        Index of the aspect-ratio bin. The name is historical: callers pass
        the bin index here, not a DataFrame.
    r_bins : sequence
        Radius bin edges; only its length is used (``len(r_bins) - 1`` bins).
    cluster1, cluster2 : array-like indexed [phi, r, cluster], optional
        Crystal counts of the two clusters. Default to the module-level
        ``cluster1_ncrystals_flat`` / ``cluster2_ncrystals_flat`` so existing
        two-argument calls behave as before; pass the ``*_rand`` arrays to
        summarise the random-orientation run.

    Returns
    -------
    list
        One value per radius bin.
    """
    # Use the argument itself: the body used to read a global `i`, which only
    # worked when the caller's loop variable happened to carry that name.
    phi_idx = df_phi
    if cluster1 is None:
        cluster1 = cluster1_ncrystals_flat
    if cluster2 is None:
        cluster2 = cluster2_ncrystals_flat
    return [
        np.mean(cluster1[phi_idx, r, :]) + np.mean(cluster2[phi_idx, r, :])
        for r in range(len(r_bins) - 1)
    ]

In [None]:
def avg_cplx(i, r_bins, cplxs=None):
    """Return the mean complexity of every radius bin in aspect-ratio bin ``i``.

    Parameters
    ----------
    i : int
        Index of the aspect-ratio bin.
    r_bins : sequence
        Radius bin edges; only its length is used (``len(r_bins) - 1`` bins).
    cplxs : array-like indexed [phi, r, cluster], optional
        Complexity values. Defaults to the module-level ``cplxs_flat`` so
        existing two-argument calls behave as before; pass ``cplxs_rand`` to
        summarise the random-orientation run.

    Returns
    -------
    list
        One mean per radius bin.
    """
    # NOTE(review): elsewhere in this notebook -999.0 is filtered out of
    # cplxs before taking a minimum; such entries are NOT filtered here and
    # would enter the mean -- confirm whether that is intended.
    if cplxs is None:
        cplxs = cplxs_flat
    return [np.mean(cplxs[i, r, :]) for r in range(len(r_bins) - 1)]


In [None]:
#FLAT
res, phi_bins_flat = pd.qcut(df_repart.agg_phi.compute(), 20, retbins=True)
%store phi_bins_flat
#print(phi_bins)
phi_bin_labs = []
avg_ncrystals=np.empty((len(phi_bins_flat)-1,len(phi_bins_flat)-1))
avg_cplxs=np.empty((len(phi_bins_flat)-1,len(phi_bins_flat)-1))
all_r_bins_flat = np.empty((len(phi_bins_flat),len(phi_bins_flat)))


for i in range(len(phi_bins_flat)-1):
    print('i = ', i)
    phi_bin_labs.append('[%.3f-%.3f]' %(phi_bins_flat[i],phi_bins_flat[i+1]))
    #return a df that only queries within an aspect ratio bin
    df_phi = df_repart[(df_repart.agg_phi > phi_bins_flat[i]) & (df_repart.agg_phi < phi_bins_flat[i+1])]
    #now break that aspect ratio bin into 20 equal r bins
    res, r_bins_flat = pd.qcut(df_phi.agg_r.compute(), 20, retbins=True)
    
    all_r_bins_flat[i,:]=r_bins_flat
    #now use those r bins from the output of queried r and phi to find # of monomers per bin
    avg_ncrystals[i,:] = query_ncrystals(df_phi, r_bins_flat)
    avg_ncrystals[i,:] = avg_ncrystals_afteragg(i, r_bins_flat)
    avg_cplxs[i,:] = avg_cplx(i, r_bins_flat)
    
%store all_r_bins_flat    

In [None]:
#RAND
res, phi_bins_rand = pd.qcut(df_repart.agg_phi.compute(), 20, retbins=True)
%store phi_bins_rand
#print(phi_bins)
phi_bin_labs = []
avg_ncrystals=np.empty((len(phi_bins_rand)-1,len(phi_bins_rand)-1))
avg_cplxs=np.empty((len(phi_bins_rand)-1,len(phi_bins_rand)-1))
all_r_bins_rand = np.empty((len(phi_bins_rand),len(phi_bins_rand)))


for i in range(len(phi_bins_rand)-1):
    print('i = ', i)
    phi_bin_labs.append('[%.3f-%.3f]' %(phi_bins_rand[i],phi_bins_rand[i+1]))
    #return a df that only queries within an aspect ratio bin
    df_phi = df_repart[(df_repart.agg_phi > phi_bins_rand[i]) & (df_repart.agg_phi < phi_bins_rand[i+1])]
    #now break that aspect ratio bin into 20 equal r bins
    res, r_bins_rand = pd.qcut(df_phi.agg_r.compute(), 20, retbins=True)
    
    all_r_bins_rand[i,:] = r_bins_rand
    #now use those r bins from the output of queried r and phi to find # of monomers per bin
    avg_ncrystals[i,:] = query_ncrystals(df_phi, r_bins_rand)
    avg_ncrystals[i,:] = avg_ncrystals_afteragg(i, r_bins_rand)
    avg_cplxs[i,:] = avg_cplx(i, r_bins_rand)
    
%store all_r_bins_rand    

In [None]:
# Bar chart of the radius bin edges per aspect-ratio bin for the flat
# orientation, coloured by the mean number of monomers in each bin.
fig, ax = plt.subplots(figsize=(10,8))
cmap = plt.cm.jet
# Previously observed value ranges (kept for reference):
#1.3914486210085406 907.974759288473   flat r max min
#1.2451487710219922 640.6597671410091  rand r

#flat
#norm = matplotlib.colors.LogNorm(vmin=1.2, vmax=900)
norm = matplotlib.colors.Normalize(vmin=4, vmax=100)

# Use the flat-orientation bin edges explicitly. The loop used to read
# `phi_bins`, `r_bins` and `all_r_bins`: the first two are only bound by later
# cells and `all_r_bins` is never defined in this notebook, so the cell could
# not run top to bottom.
phi_edges = phi_bins_flat
r_edges = all_r_bins_flat          # row i: radius bin edges of phi bin i
n_r_edges = r_edges.shape[1]

for i in range(len(phi_edges)-1):
    print('i= ', i)
    for r in range(n_r_edges-2):
        mean_monomers = np.mean(cluster1_ncrystals_flat[i,r,:]+cluster2_ncrystals_flat[i,r,:])
        # The first bar starts from the default baseline; later bars start at
        # the previous edge.
        bottom = r_edges[i, r-1] if r != 0 else None
        # One bar per bin: x used to be a list of n_r_edges identical
        # positions, which drew the same rectangle repeatedly on top of itself.
        # NOTE(review): the bar height is the edge value itself, not the bin
        # width (edge - previous edge) -- confirm that stacking is intended.
        plt.bar(i, r_edges[i,r], bottom=bottom, color=cmap(norm(mean_monomers)), edgecolor='k')

plt.yscale('log')
plt.xticks(np.arange(len(phi_bin_labs)), phi_bin_labs, rotation=90, ha="center",fontsize=16,family='serif')
plt.ylabel("Aggregate Radius Bins",fontsize=16,family='serif')
# Raw string: "\p" is not a valid escape sequence in a normal string literal.
plt.xlabel(r"Aggregate Aspect Ratio ($\phi$) bins",fontsize=16,family='serif')
# Give the colorbar the same norm as the bars (it used to span the default
# 0-1 range) and attach it to `ax` explicitly, since the mappable is not
# bound to any axes of its own.
cb = plt.cm.ScalarMappable(norm=norm, cmap=cmap)
cbar = plt.colorbar(cb, ax=ax, format='%.2f')
cbar.ax.set_ylabel('Average # of monomers per bin', fontsize=16, family='serif')
#plt.title('Quasi-Horizontal Orientation',fontsize=16, family='serif')
plt.title('Flat Orientation',fontsize=16, family='serif')
plt.tight_layout()
#plt.savefig('bins_rand_meanmono_r_5000rad_logy.pdf')

#17716 values per phi, per r

In [None]:
#switch a and c
col_list = list(df_repart)
print(col_list)
# Swap the two labels by name rather than by the hard-coded positions 3 and 5,
# which silently relabel the wrong columns if the column order ever differs.
# NOTE: not idempotent -- re-running this cell swaps the labels back.
df_repart = df_repart.rename(columns={'a': 'c', 'c': 'a'})

In [None]:
# Recompute the aspect ratio from the c and a columns.
df_repart['agg_phi'] = df_repart.c/df_repart.a


def _classify_shape(part):
    """Label every row of one partition 'prolate' or 'oblate'.

    A row is 'prolate' when (b - c) <= (a - b) and 'oblate' otherwise; rows
    where the comparison is false because of NaN therefore come out 'oblate',
    exactly as with the previous row-wise lambda.
    """
    is_prolate = (part.b - part.c) <= (part.a - part.b)
    return pd.Series(np.where(is_prolate, 'prolate', 'oblate'), index=part.index)


# Vectorised per partition instead of a Python-level row-by-row apply; `meta`
# declares the output name and dtype so Dask does not have to infer them.
df_repart['shape'] = df_repart.map_partitions(_classify_shape, meta=('shape', 'object'))

In [None]:
# For every (phi, r) bin, record whether oblate or prolate rows dominate.
N_R_BINS = 20  # number of quantile bins requested from pd.qcut for agg_r
res, phi_bins = pd.qcut(df_repart.agg_phi.compute(), 20, retbins=True)
# '<U8' is wide enough for 'prolates'. dtype=str allocates one-character
# cells, which truncated every label to 'o' or 'p'. The second dimension is
# now derived from N_R_BINS instead of an `r_bins` left over from another cell.
shape = np.empty((len(phi_bins), N_R_BINS + 1), dtype='<U8')
print(phi_bins)
for i in range(len(phi_bins)-1):
    print('i = ', phi_bins[i], phi_bins[i+1])
    # Keep only the rows inside this aspect-ratio bin (both bounds exclusive).
    df_phi = df_repart[(df_repart.agg_phi > phi_bins[i]) & (df_repart.agg_phi < phi_bins[i+1])]
    # Break that aspect-ratio bin into equally populated r bins.
    res, r_bins = pd.qcut(df_phi.agg_r.compute(), N_R_BINS, retbins=True)
    for r in range(len(r_bins)-1):
        print('r =', r_bins[r], r_bins[r+1])
        df_r = df_phi[(df_phi.agg_r > r_bins[r]) & (df_phi.agg_r < r_bins[r+1])].compute()

        oblates = df_r['shape'][df_r['shape'] == 'oblate'].count()
        prolates = df_r['shape'][df_r['shape'] == 'prolate'].count()

        print(oblates, prolates)
        # Ties are labelled 'prolates'.
        shape[i,r] = 'oblates' if oblates > prolates else 'prolates'
 

In [None]:
# Count how many rows carry each shape label across the whole frame.
shape_labels = df_repart['shape']
oblates = shape_labels[shape_labels == 'oblate'].count().compute()
prolates = shape_labels[shape_labels == 'prolate'].count().compute()
print(oblates, prolates)