In [None]:
import numpy as np
import pandas as pd
import glob
import seaborn as sns
import pickle
import matplotlib.pyplot as plt
import agg_properties
%load_ext autoreload
%autoreload 2

In [None]:
def read(file):
    """Unpickle ``file`` and return its contents wrapped in a DataFrame.

    The unpickled object is passed to ``pd.DataFrame``; a confirmation line
    naming the file is printed once that has succeeded.
    """
    # NOTE: unpickling can execute arbitrary code; only load trusted files.
    frame = pd.DataFrame(pd.read_pickle(file))
    print(file, " loaded")
    return frame

def shape(a,b,c):
    """Return 'prolate' or 'oblate' for the three values ``a``, ``b``, ``c``.

    The result is 'prolate' when ``b - c`` does not exceed ``a - b`` and
    'oblate' otherwise.  (a, b, c are presumably axis lengths -- TODO confirm.)
    """
    return 'prolate' if (b - c) <= (a - b) else 'oblate'

In [None]:
%%time 
#read in database of aggs (all the same monomers)
files = [f for f in glob.glob("../instance_files/createdb_iceagg_rand*")]
dfs = []
for file in files:
    print(file)
    dfs.append(pd.read_pickle(file, None))
dfs = [pd.DataFrame(i) for i in dfs]
df = pd.concat(dfs, axis=0, ignore_index=True)

In [None]:
# agg_r is the cube root of a^2 * c.
df['agg_r'] = np.power((np.power(df['a'], 2) * df['c']), (1./3.))
# Keep rows with agg_r below 5000.  .copy() makes the filtered frame
# independent, so the column assignments below do not write to a slice of the
# unfiltered frame (avoids SettingWithCopyWarning).
df = df[df.agg_r < 5000].copy()
# Element-wise wrapper around shape(); retained in case later cells use it.
vfunc = np.vectorize(shape)
# Same rule as shape(): 'prolate' when (b - c) <= (a - b), otherwise 'oblate'.
# np.where evaluates it on whole columns at once (np.vectorize is a
# Python-level loop) and also works when the filtered frame is empty.
df['shape'] = np.where((df['b'] - df['c']) <= (df['a'] - df['b']), 'prolate', 'oblate')
df['agg_phi'] = df.c/df.a

In [None]:
# Summary statistics of the agg_r column, rounded to two decimals.
df.agg_r.describe().round(2)

In [None]:
%%time 
#creates instance of Agg property class for each row in database
out = df.apply(lambda x: agg_properties.Agg(x).get_list(), axis=1) #returns dictionary of attributes

In [None]:
# Display the per-row results of the Agg(...).get_list() calls.
out

In [None]:
# Plain Python list with one entry per element of `out`.
out1 = out.tolist()

In [None]:
# Tabulate the collected attribute lists: one row per entry of out1, one
# column per attribute name listed below.
df_att = pd.DataFrame(
    out1,
    columns=[
        'area_ratio',
        'convex_perim',
        'circularity',
        'roundness',
        'perim_area_ratio',
        'convexity',
        'complexity',
        'hull_area',
        'solidity',
        'equiv_d',
    ],
)


In [None]:
#save df of IPAS attributes
# Writes df_att to an HDF5 file under the key 'df_rand'; mode='w' creates the
# file anew, replacing any existing file of the same name.
# NOTE(review): the read cell in the analysis section loads
# 'df_rand_attributes.h5', not this file -- confirm which file is intended.
df_att.to_hdf('df_rand_attributes_include_monomers.h5', key='df_rand', mode='w')

# ANALYZE DATAFRAME OF IPAS ATTRIBUTES

In [None]:
#read in IPAS attribute dataframe
# No key is passed, so this relies on the file holding a single pandas object.
# NOTE(review): the save cell earlier in the notebook writes
# 'df_rand_attributes_include_monomers.h5', a different file name -- confirm
# that this file is the intended input.
df_att = pd.read_hdf('df_rand_attributes.h5')

In [None]:
# Call info() to print the column/dtype/non-null summary; the bare attribute
# only displays the bound method's repr.
df_att.info()

In [None]:
# Read the CPI data (all campaigns are stored in one file) and keep only
# the rows classified as aggregates.
df_CPI = pd.read_csv('all_campaigns.csv')
df_CPI = df_CPI.loc[df_CPI['classification'] == 'agg']

In [None]:
# Call info() to print the column/dtype/non-null summary; the bare attribute
# only displays the bound method's repr.
df_CPI.info()

In [None]:
#gather columns that are the same as IPAS dataframe to merge
# Select and rename in one chained expression that returns a new frame:
# rename(inplace=True) on a column selection of a filtered frame can trigger
# SettingWithCopyWarning.
df_CPI = (
    df_CPI[['filled_circular_area_ratio', 'complexity']]
    .rename(columns={"filled_circular_area_ratio": "area ratio"})
)

In [None]:
# Give the IPAS area-ratio column the same label as the CPI frame.
# NOTE(review): rename leaves the frame unchanged when it has no
# "filled_circ_area_ratio" column -- confirm the loaded file uses that name.
df_att = df_att.rename(columns={"filled_circ_area_ratio": "area ratio"})

In [None]:
# Stack the CPI and IPAS frames for plotting.  The keys label each row's
# origin in an index level named "Source" ...
cdf = pd.concat([df_CPI, df_att], keys=['CPI', 'IPAS'], names=["Source"])
# ... which is then turned into a regular column; the leftover per-frame
# row index (level_1) is discarded.
cdf = cdf.reset_index().drop(columns='level_1')
cdf

In [None]:
# Font sizes used in the matplotlib defaults below.
xlarge = 24
large = 20
med = 16
small = 14

# Notebook-wide matplotlib defaults: figure size, font family and text sizes.
params = {
    'axes.titlesize': xlarge,
    'legend.fontsize': small,
    'figure.figsize': (7, 7),
    'axes.labelsize': med,
    'xtick.labelsize': med,
    'ytick.labelsize': med,
    'figure.titlesize': large,
    "font.family": "serif",
}
plt.rcParams.update(params)

In [None]:
from sklearn import preprocessing
#normalize data
# Separate the Source labels from the numeric columns before scaling.
source = pd.DataFrame(cdf['Source'])
features = cdf.drop(columns=['Source'])
x = features.values #returns a numpy array
# Min-max scale each column independently.
min_max_scaler = preprocessing.MinMaxScaler()
x_scaled = min_max_scaler.fit_transform(x)
# Take the column labels and row index from the frame that was scaled.  A
# hard-coded name list raises when the number of columns differs and silently
# mislabels the data when the column order differs.
norm = pd.DataFrame(x_scaled, columns=features.columns, index=features.index)

In [None]:
# Long format for seaborn: one row per (Source, variable) pair, with the
# measurement in 'value'.  Source is kept as an id column so it can be used
# for the hue argument.
# NOTE: this rebinds `df`, replacing the aggregate database loaded earlier.
df = cdf.melt(
    id_vars=['Source'],
    value_vars=['area ratio', 'complexity'],
    value_name='value',
)
df

In [None]:
# Box plot of each variable, split by data source (CPI vs IPAS).
fig, ax = plt.subplots(figsize=(13,7))
# y names the value column created by pd.melt (value_name='value'); column
# names are case-sensitive, so "Value" would not be found in df.
sns.boxplot(x="variable", y="value", hue="Source", data=df, palette="Set1", ax=ax)
ax.set_xticklabels(ax.get_xticklabels(),rotation=90);

In [None]:
#not normalized
# Same variables on a square figure, with outlier markers hidden and the
# y-axis limited to 0.0-1.1.
fig, ax = plt.subplots(figsize=(7, 7))
sns.boxplot(
    data=df,
    x="variable",
    y="value",
    hue="Source",
    palette="Set1",
    showfliers=False,
)
ax.set_xticklabels(labels=ax.get_xticklabels(), rotation=90);
ax.set_ylim(bottom=0.0, top=1.1)