In [None]:
import pandas as pd
import plotly.express as px
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from lh_sampling.sampler import lhs
from lh_sampling.util import inv_cdf, ecdf
from scipy.stats import norm, multivariate_normal
from lh_sampling import get_data

In [None]:
def str2float(val):
    """Convert a raw CSV cell to a float, mapping blank cells to NaN.

    Parameters
    ----------
    val : str
        Cell text as handed over by a ``pandas.read_csv`` converter.

    Returns
    -------
    float
        ``np.nan`` when the cell is empty or contains only whitespace,
        otherwise ``float(val)``.

    Raises
    ------
    ValueError
        If the cell holds non-blank text that is not a valid number.
    """
    # A cell holding only spaces is just as "missing" as an empty one;
    # without the strip, float(' ') raises and aborts the whole CSV load.
    if val.strip() == '':
        return np.nan
    return float(val)

def str2cat(val):
    """Flag cells whose text contains the marker ``'CAL'``.

    Returns 1 when the substring ``'CAL'`` occurs anywhere in ``val``
    (case-sensitive) and 0 otherwise.
    """
    return int('CAL' in val)
        
# Load the IVESPA spreadsheet, keeping only the columns listed in `usecols`.
# Columns 6, 8, 9 and 12 are parsed with str2float, columns 7, 11 and 13 with
# str2cat (a 0/1 flag that is 1 when the cell text contains 'CAL').
ivespa_csv = get_data('/data/SR2021-12 Spreadsheet_IVESPA.csv')
column_parsers = {idx: str2float for idx in (6, 8, 9, 12)}
column_parsers.update({idx: str2cat for idx in (7, 11, 13)})
df = pd.read_csv(ivespa_csv,
                 usecols=[1, 4, 5, 6, 7, 8, 9, 11, 12, 13],
                 converters=column_parsers)

In [None]:
# Drop the unit suffixes from the column names and give the magma type a display name.
df.rename(inplace=True,
          columns={'MER_kg/s': 'MER',
                   'Vent_elevation_km': 'Vent_elevation',
                   'Column_height_km': 'Column_height',
                   'Duration_hr': 'Duration',
                   'Magma_type': 'Magma type'})

# remove calculated values: wherever the matching *_note flag equals 1 the value becomes NaN
for value_col, note_col in (('Column_height', 'Column_height_note'),
                            ('MER', 'MER_note'),
                            ('Duration', 'Duration_note')):
    df[value_col] = np.where(df[note_col] == 1, np.nan, df[value_col])

# Add log columns
# NOTE(review): going by the original column names, MER is in kg/s and Duration in
# hours, so this product is not converted to a common time unit - confirm intended.
df['Volume'] = df['MER'] * df['Duration']
for source_col, log_col in (('Volume', 'log Volume'),
                            ('MER', 'log MER'),
                            ('Column_height', 'log Column height'),
                            ('Duration', 'log Duration')):
    df[log_col] = np.log(df[source_col])

In [None]:
# Number of rows without a usable log column height.
df['log Column height'].isna().sum()

In [None]:
# Show the prepared DataFrame as the cell output.
df

In [None]:
def _count_events(magma, eruption):
    """Number of rows of ``df`` with the given 'Magma type' and 'Eruption_type'."""
    selected = (df['Magma type'] == magma) & (df['Eruption_type'] == eruption)
    return len(df[selected])


# Event counts per magma type and eruption style.
print("mafic, steam-driven: ", _count_events('Mafic', 'Steam-driven'))
print("mafic, small-to-moderate: ", _count_events('Mafic', 'Magmatic small to moderate'))
print("mafic, large: ", _count_events('Mafic', 'Magmatic large'))
print("intermediate, steam-driven: ", _count_events('Intermediate', 'Steam-driven'))
# Intermediate magmas also appear under a label with a trailing '*'; both are counted.
len1 = _count_events('Intermediate', 'Magmatic small to moderate')
len2 = _count_events('Intermediate', 'Magmatic small to moderate*')
print("intermediate, small-to-moderate: ", len1 + len2)
print("intermediate, large: ", _count_events('Intermediate', 'Magmatic large'))
print("silicic, steam-driven: ", _count_events('Silicic', 'Steam-driven'))
print("silicic, small-to-moderate: ", _count_events('Silicic', 'Magmatic small to moderate'))
print("silicic, large: ", _count_events('Silicic', 'Magmatic large'))

In [None]:
# Total number of rows in the table.
df.shape[0]

In [None]:
# Interactive scatter matrix of the untransformed variables, coloured by magma type.
scatter_dims = ['Column_height', 'MER', 'Duration', 'Volume']
fig = px.scatter_matrix(df, color="Magma type", dimensions=scatter_dims)
fig.update_layout(height=1000, width=1000)

In [None]:
import matplotlib as mpl
def my_pair_grid_plot(data, hue):
    """Draw a 3x3 pair grid of log Duration, log MER and log Column height.

    The upper triangle holds scatter plots, the lower triangle kernel-density
    plots and the diagonal the empirical CDF of each variable.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame holding the three log variables, either under the unit-suffixed
        names used for plotting ('log Duration [h]', 'log MER [kg/s]',
        'log Column height [km]') or under the bare names ('log Duration',
        'log MER', 'log Column height'). The caller's frame is not modified.
    hue : str
        Name of the column of ``data`` used to colour the groups.

    Returns
    -------
    matplotlib.figure.Figure
        The 9 x 9 inch figure the grid was drawn on.
    """
    # Bare column name -> unit-suffixed name used for plotting / axis labels.
    labelled = {'log Duration': 'log Duration [h]',
                'log MER': 'log MER [kg/s]',
                'log Column height': 'log Column height [km]'}
    plot_vars = list(labelled.values())
    # The frames built in this notebook only carry the bare names, so asking
    # seaborn for the unit-suffixed ones raised KeyError. Rename on a copy when
    # (and only when) the suffixed column is missing and the bare one exists.
    renames = {bare: full for bare, full in labelled.items()
               if full not in data.columns and bare in data.columns}
    if renames:
        data = data.rename(columns=renames)

    g = sns.PairGrid(data, vars=plot_vars, hue=hue,
                     diag_sharey=False, height=0.1, layout_pad=5, despine=True, corner=False)
    # Figure created by PairGrid itself; it is only a throw-away placeholder.
    placeholder = getattr(g, '_figure', None)

    # Monkey patch the figure instance. Ugly but I couldn't find any other way
    # (PairGrid offers no switch for fully independent y axes).
    height = 3
    aspect = 1
    figsize = 3 * height * aspect, 3 * height
    with mpl.rc_context({"figure.autolayout": False}):
        fig = plt.figure(figsize=figsize)
    axes = fig.subplots(3, 3, sharex="col", sharey=False, squeeze=False)
    fig.tight_layout(pad=2)
    g.axes = axes
    g._figure = fig
    # Close the placeholder so it is neither rendered by the notebook nor kept
    # alive in pyplot's figure registry.
    if placeholder is not None and placeholder is not fig:
        plt.close(placeholder)

    g.map_upper(sns.scatterplot, s=15)
    g.map_lower(sns.kdeplot)
    g.map_diag(sns.ecdfplot, lw=2)

    g.axes[0, 0].set_ylabel('Cumulative Duration')
    g.axes[1, 1].set_ylabel('Cumulative MER')
    g.axes[2, 2].set_ylabel('Cumulative Column height')
    # The y axes are not shared, so copy ticks and limits between the
    # off-diagonal panels of each row by hand: (target, reference).
    for target, reference in (((1, 2), (1, 0)),
                              ((0, 2), (0, 1)),
                              ((2, 1), (2, 0))):
        g.axes[target].set_yticks(g.axes[reference].get_yticks())
        g.axes[target].set_ylim(g.axes[reference].get_ylim())
    g.add_legend()
    return fig

# Pair grid of the full dataset, coloured by magma type, saved at 300 dpi.
fig = my_pair_grid_plot(data=df, hue='Magma type')
fig.savefig('plots/scatter_matrix.png', dpi=300, bbox_inches='tight')

In [None]:
# Log-space variables to resample; their order fixes which LHS column maps to which variable.
log_columns = ['log Column height', 'log MER', 'log Duration']
# .where() keeps every row of df and sets the rows of other magma types to NaN.
df_m = df[log_columns].where(df['Magma type'] == 'Mafic')
df_i = df[log_columns].where(df['Magma type'] == 'Intermediate')
df_tmp = df_i  # subset resampled below (same object as df_i, not a copy)
# presumably 3 dimensions x 20 samples with values in (0, 1) - confirm against lh_sampling
lsmp = lhs(3, 20, centered=True)
# Cholesky factor of the sample covariance, used to correlate the independent draws.
U = np.linalg.cholesky(df_tmp.cov().values)
mean = df_tmp.mean()

if False:
    # Disabled alternative: resample each variable on its own through inv_cdf.
    def resample(df, key, samples):
        observed = df[key].dropna().values
        # NOTE(review): the ECDF result is computed but not passed on - confirm intended.
        _sorted_obs, _cdf_vals = ecdf(observed)
        return inv_cdf(observed, samples)

    lh_samples = np.vstack([resample(df_tmp, col, lsmp[:, k])
                            for k, col in enumerate(log_columns)])

if True:
    # Standard-normal scores of the LHS points, one row per variable ...
    std_scores = np.vstack([norm.ppf(lsmp[:, k], loc=0, scale=1) for k in range(3)])
    # ... correlated with the Cholesky factor and shifted to the sample mean.
    lh_samples = U @ std_scores
    lh_samples = lh_samples + mean.values[:, np.newaxis]

resampled = pd.DataFrame(lh_samples.T, columns=df_tmp.columns)
resampled['Category'] = 'resampled'
df_tmp['Category'] = 'original'
# NOTE(review): reset_index() without drop=True keeps the old row labels as an
# extra 'index' column in dfr - confirm that is wanted downstream.
dfr = pd.concat((df_tmp, resampled)).reset_index()

In [None]:
# Smallest and largest MER of the selected subset, back-transformed from log space.
log_mer = df_tmp['log MER']
mer_low = np.exp(log_mer.min())
mer_high = np.exp(log_mer.max())
print("{:g} {:g}".format(mer_low, mer_high))

In [None]:
# Same pair grid, now comparing the original rows with the resampled ones.
fig = my_pair_grid_plot(data=dfr, hue='Category')
fig.savefig('plots/scatter_matrix_resample.png', dpi=300, bbox_inches='tight')

In [None]:
# Covariance of the observed log variables only. The three columns are selected
# explicitly so that the string 'Category' column (pandas >= 2.0 raises on it) and
# the 'index' column left by reset_index() (row labels, not data) stay out of it.
dfr.loc[dfr.Category == 'original', ['log Column height', 'log MER', 'log Duration']].cov()

In [None]:
# Covariance of the resampled log variables only. The three columns are selected
# explicitly so that the string 'Category' column (pandas >= 2.0 raises on it) and
# the 'index' column left by reset_index() (row labels, not data) stay out of it.
dfr.loc[dfr.Category == 'resampled', ['log Column height', 'log MER', 'log Duration']].cov()

## Etna analysis

In [None]:
# Rows for Etna only, ordered by increasing MER.
is_etna = df['Volcano'] == 'Etna'
df_etna = df[is_etna].sort_values(by='MER', axis=0)

In [None]:
# Pair plot of the Etna rows with every panel on log-log axes.
etna_vars = ['Duration', 'MER', 'Column_height']
s = sns.pairplot(df_etna, vars=etna_vars, hue='Magma type')
for ax in s.axes.flat:
    ax.set(xscale="log", yscale="log")

In [None]:
# Kernel-density estimate of the Etna MER values, drawn on log-log axes.
s = sns.displot(data=df_etna, x="MER", kind="kde")
s.set(xscale="log", yscale="log")