In [1]:
import numpy as np
import pandas as pd
import glob
import seaborn as sns
import pickle
import matplotlib.pyplot as plt
import agg_properties
import sys
sys.path.append("../collection_from_db")
import ipas.cluster_calculations as cc
from pandarallel import pandarallel
pandarallel.initialize(progress_bar=False)
import shapely.geometry as geom
import shapely.ops as shops
from shapely.geometry import Point
from multiprocessing import Pool
import tables
from dask import dataframe as dd
import swifter
%load_ext memory_profiler
%load_ext autoreload
%autoreload 2

INFO: Pandarallel will run on 28 workers.
INFO: Pandarallel will use Memory file system to transfer data between the main process and workers.


In [2]:
%%time 
#read in database of aggs (all the same monomers) with points 
#need points to calculate area ratio
files = [f for f in glob.glob("../instance_files/createdb_iceagg_rand*")]
dfs = []
for file in files:
    print(file)
    dfs.append(pd.read_pickle(file, None))
dfs = [pd.DataFrame(i) for i in dfs]
df = pd.concat(dfs, axis=0, ignore_index=True)

../instance_files/createdb_iceagg_rand_r500_1000
../instance_files/createdb_iceagg_rand_r1_5
../instance_files/createdb_iceagg_rand_r6_10
../instance_files/createdb_iceagg_rand_r20_70
../instance_files/createdb_iceagg_rand_r80_400
CPU times: user 1min 12s, sys: 23.3 s, total: 1min 36s
Wall time: 3min 22s


In [3]:
def shape(a,b,c):
    """
    Label an aggregate as 'prolate' or 'oblate' using the axis
    lengths (a,b,c) of its fit ellipsoid.

    The gap between b and c is compared with the gap between a and b:
    'prolate' is returned when (b-c) does not exceed (a-b), ties
    included, and 'oblate' is returned otherwise.
    """
    upper_gap = a - b
    lower_gap = b - c
    return 'prolate' if lower_gap <= upper_gap else 'oblate'

In [4]:
# Post-process: keep only aggregates with agg_r < 5000 (the original note calls
# this 5 'mm', so lengths are presumably in microns -- TODO confirm), then
# append the shape label and aspect ratio to the db.
# agg_r is the cube root of a^2 * c.
df['agg_r'] = np.power((np.power(df['a'], 2) * df['c']), (1./3.))
# .copy() makes the filtered frame independent, so the column assignments below
# are not made on a slice of the unfiltered frame (avoids SettingWithCopyWarning).
df = df[df.agg_r < 5000].copy()
# np.vectorize lets shape() accept whole columns; it is a convenience wrapper
# and still loops in Python
vfunc = np.vectorize(shape)
df['shape'] = vfunc(df['a'], df['b'], df['c'])
# aspect ratio of the fit ellipsoid: c / a
df['agg_phi'] = df.c/df.a


In [5]:
def filled_circular_area_ratio(row,  dims=['x', 'z']):
    '''Return the projected area of an aggregate divided by the area of a
    circle enclosing that projection.

    The convex hull of each monomer's points, restricted to the coordinates
    in ``dims``, is merged into a single footprint. The circle is built from
    the (x, y, radius) triple returned by ``Cluster_Calculations.make_circle``
    for the vertices of the footprint's convex hull.

    Parameters
    ----------
    row : one row of the aggregate database; it must provide ``points``
        (indexed per monomer, then by ``dims``) and ``ncrystals``.
        Assumes ``row.points[n]`` supports selection with a list of
        coordinate names (e.g. a NumPy structured array) -- TODO confirm.
    dims : list of coordinate names selecting the projection plane
        (default x-z). It is only read, never modified.

    Returns
    -------
    float : footprint area divided by circle area.
    '''
    monomer_hulls = [geom.MultiPoint(row.points[n][dims]).convex_hull
                     for n in range(row.ncrystals)]
    # unary_union supersedes cascaded_union, which is deprecated in
    # Shapely 1.8 and removed in Shapely 2.0
    agg = shops.unary_union(monomer_hulls)
    area = agg.area
    # agg is already one merged geometry, so no second union is needed
    poly = agg.convex_hull
    x, y = poly.exterior.xy
    c = cc.Cluster_Calculations(row)
    # presumably the smallest circle enclosing the hull vertices -- TODO confirm
    circ = c.make_circle([x[i], y[i]] for i in range(len(x)))
    # buffer() yields a polygonal approximation of the circle, so Ac is
    # marginally smaller than pi * r**2
    circle = Point(circ[0], circ[1]).buffer(circ[2])
    Ac = circle.area

    return area/Ac

## Test the fastest way to calculate the area ratio

In [None]:
# Split the pandas frame into 8 dask partitions for the dask experiments below.
ddf = dd.from_pandas(df, npartitions=8)


In [18]:
%%time
ar = df.swifter.apply(lambda row:filled_circular_area_ratio(row), axis=1)

RayTaskError(ModuleNotFoundError): [36mray::deploy_ray_func()[39m (pid=4738, ip=169.226.65.50)
  File "python/ray/_raylet.pyx", line 425, in ray._raylet.execute_task
  File "python/ray/_raylet.pyx", line 447, in ray._raylet.execute_task
ray.exceptions.RayTaskError: [36mray::deploy_ray_func()[39m (pid=4738, ip=169.226.65.50)
  File "python/ray/_raylet.pyx", line 425, in ray._raylet.execute_task
  File "python/ray/_raylet.pyx", line 442, in ray._raylet.execute_task
  File "/network/rit/lab/sulialab/share/bin/miniconda3/envs/IPAS/lib/python3.7/site-packages/ray/serialization.py", line 310, in deserialize_objects
    self._deserialize_object(data, metadata, object_ref))
  File "/network/rit/lab/sulialab/share/bin/miniconda3/envs/IPAS/lib/python3.7/site-packages/ray/serialization.py", line 248, in _deserialize_object
    return self._deserialize_msgpack_data(data, metadata_fields)
  File "/network/rit/lab/sulialab/share/bin/miniconda3/envs/IPAS/lib/python3.7/site-packages/ray/serialization.py", line 226, in _deserialize_msgpack_data
    python_objects = self._deserialize_pickle5_data(pickle5_data)
  File "/network/rit/lab/sulialab/share/bin/miniconda3/envs/IPAS/lib/python3.7/site-packages/ray/serialization.py", line 216, in _deserialize_pickle5_data
    obj = pickle.loads(in_band)
  File "/network/rit/lab/sulialab/share/bin/miniconda3/envs/IPAS/lib/python3.7/site-packages/ray/cloudpickle/cloudpickle.py", line 562, in subimport
    __import__(name)
ModuleNotFoundError: No module named 'ipas'

In [6]:
%%time
ar = dd.map_partitions(lambda ddf: ddf.apply((lambda row: filled_circular_area_ratio(row)),\
                                           meta=pd.Series(dtype='float', name='area ratio'),\
                                           axis=1)).compute(scheduler='processes')

NameError: name 'ddf' is not defined

In [None]:
# Experiment: dask's own row-wise apply on `ddf`.
# - no extra positional args: the function takes only (row, dims), so the
#   placeholder 'col_1'/'col_2' values did not fit its signature
# - meta declares a float result because the function returns area/Ac
# - the scheduler is chosen by keyword; `get` is not defined in this notebook
# - the result is stored in `ar` so the dask frame `ddf` is not overwritten
ar = ddf.apply(
    filled_circular_area_ratio,
    axis=1,
    meta=('area ratio', 'f8'),
).compute(scheduler='processes')

In [None]:
# Baseline experiment: plain Python loop over the rows; the computed ratios are discarded.
for pos,row in df.iterrows():
    filled_circular_area_ratio(row)

In [None]:
#end tests 

In [None]:
%%time
#takes ~7 hours!
df_ar = df.apply(lambda x: filled_circular_area_ratio(x), axis=1)

In [None]:
# Cache the area ratios on disk so the long-running apply does not have to be rerun.
# mode='w' replaces any existing file of this name.
df_ar.to_hdf('df_rand_only_area_ratio.h5', key='area_ratio', mode='w')

In [33]:
# Reload the cached area ratios, renumber the rows from 0, and wrap them in a
# one-column frame whose unnamed column (label 0) is renamed to 'area_ratio'.
df_ar = (
    pd.DataFrame(pd.read_hdf('df_rand_only_area_ratio.h5').reset_index(drop=True))
    .rename(columns={0:'area_ratio'})
)

Unnamed: 0,area_ratio
0,0.502850
1,0.322119
2,0.513344
3,0.659826
4,0.430349
...,...
3820663,0.008185
3820664,0.013691
3820665,0.008017
3820666,0.017230


In [9]:
# Prepare the IPAS dataframe for concatenation with the area ratios: drop the
# points column (saves time when the result is read back in) and renumber the
# rows from 0 so they pair up with df_ar.
# errors='ignore' keeps this cell re-runnable once 'points' has already been dropped.
df = df.drop(columns='points', errors='ignore').reset_index(drop=True)

(<bound method DataFrame.info of          ncrystals             a             b            c      cplx  \
 0                2   4390.335660   2638.983911  1372.497469  0.284048   
 1                3   4215.110609   3840.583495  2691.720922  0.466069   
 2                4   4236.778797   4027.762791  3259.435449  0.500040   
 3                5   4476.428973   3949.552427  3443.726551  0.845821   
 4                6   5777.481021   4247.288069  3547.473542  0.570485   
 ...            ...           ...           ...          ...       ...   
 3820663          2  20050.293967   6003.404260   139.947231  0.851033   
 3820664          2  12594.735587  11735.112627   670.682840  0.877201   
 3820665          2  22472.644668   3606.589951   148.674056  0.824910   
 3820666          2  14494.748871   6256.676534   210.564535  0.910116   
 3820667          2  13943.301554   7232.848654   489.071254  0.909983   
 
             phi2D  mono_phi  mono_r        agg_r    shape   agg_phi  
 0     

In [34]:
# Attach the area ratios as an extra column. concat(axis=1) aligns on index
# labels, so both frames need the same 0..N-1 index for rows to pair up correctly.
dfc = pd.concat([df, df_ar], axis=1)
dfc

Unnamed: 0,ncrystals,a,b,c,cplx,phi2D,mono_phi,mono_r,agg_r,shape,agg_phi,area_ratio
0,2,4390.335660,2638.983911,1372.497469,0.284048,0.422614,0.01,500,2979.675712,prolate,0.312618,0.502850
1,3,4215.110609,3840.583495,2691.720922,0.466069,0.673506,0.01,500,3629.799700,oblate,0.638588,0.322119
2,4,4236.778797,4027.762791,3259.435449,0.500040,0.654913,0.01,500,3882.141148,oblate,0.769319,0.513344
3,5,4476.428973,3949.552427,3443.726551,0.845821,0.662195,0.01,500,4101.701186,prolate,0.769302,0.659826
4,6,5777.481021,4247.288069,3547.473542,0.570485,0.718758,0.01,500,4910.571889,prolate,0.614017,0.430349
...,...,...,...,...,...,...,...,...,...,...,...,...
3820663,2,20050.293967,6003.404260,139.947231,0.851033,0.148380,100.00,400,3831.792062,prolate,0.006980,0.008185
3820664,2,12594.735587,11735.112627,670.682840,0.877201,0.436470,100.00,400,4738.400525,oblate,0.053251,0.013691
3820665,2,22472.644668,3606.589951,148.674056,0.824910,0.065646,100.00,400,4218.724712,prolate,0.006616,0.008017
3820666,2,14494.748871,6256.676534,210.564535,0.910116,0.242220,100.00,400,3536.732437,prolate,0.014527,0.017230


In [35]:
# Save the IPAS attributes together with the area ratio (points column excluded).
# mode='w' replaces any existing file of this name.
dfc.to_hdf('df_IPAS_rand_area_ratio_no_points.h5', key='df_IPAS_att', mode='w')

In [12]:
#verify_IPAS.ipynb now concatenates with CPI data