In [1]:
import h5py
import numpy as np
import matplotlib.pyplot as plt
import pickle
import os
import gudhi as gd
%matplotlib notebook
## IMPORTING OS COMMANDS
import os
## PICKLE FUNCTIONS
from extraction_scripts import load_pickle_general
## LOADING NOMENCLATURE
from core.nomenclature import read_combined_name, extract_sampling_inputs, extract_instance_names
import pickle
import os  # for makedirs
import homcloud.interface as hc  # HomCloud 
import homcloud.paraview_interface as pv # HomCloud <-> paraview interface
import numpy as np  # Numerical array library
import matplotlib.pyplot as plt  # Plotting
import sklearn.linear_model as lm  # Machine learning
from sklearn.decomposition import PCA  # for PCA
from sklearn.model_selection import train_test_split
from tqdm.notebook import tqdm  # For progressbar
import plotly.express as px
import pandas as pd
from sklearn.svm import LinearSVC
import gudhi as gd
from numpy import inf
import scipy
from scipy.spatial.distance import pdist
from matplotlib.ticker import FormatStrFormatter
from matplotlib.ticker import StrMethodFormatter
import matplotlib
from scipy import ndimage
## GLOBAL MATPLOTLIB STYLE: STIX fonts, 20 pt text, axis/grid drawn below the data, automatic layout
matplotlib.rcParams.update({
    'mathtext.fontset': 'stix',
    'font.family': 'STIXGeneral',
    'font.size': 20,
    'axes.axisbelow': True,
    'figure.autolayout': True,
})

In [2]:
## MAIN FUNCTION
if __name__ == "__main__":
    ## ROOT DIRECTORY THAT HOLDS THE SIMULATION OUTPUT (machine-specific absolute path)
    sim_folder = r"C:\Users\adsmith23\Desktop"

    ## NAME OF THE SIMULATION DIRECTORY INSIDE THE ROOT
    specific_sim = r"20200210-SolventNet_For_Bruce"

    ## NAME OF THE SOLVENT NET TRAINING DIRECTORY (also parsed later for its naming fields)
    training_folder = r"20_20_20_20ns_oxy_3chan-split_avg_nonorm-10-strlearn-0.80-solvent_net-500-CEL_ETBE_FRU_LGA_PDO_XYL_tBuOH-10_25_50_75-DIO_GVL_THF"

    ## FILE NAMES OF THE TRAINING INSTANCES AND OF THE TEST SETS
    instances_file = r"20_20_20_20ns_oxy_3chan-split_avg_nonorm-10-CEL_ETBE_FRU_LGA_PDO_XYL_tBuOH-DIO_GVL_THF-10_25_50_75"
    testing_file = r"3D_CNN_TEST_SETS.pickle"

    ## FULL PATHS, ALL ROOTED AT THE SIMULATION DIRECTORY
    path_to_sim = os.path.join(sim_folder, specific_sim)
    path_training_folder = os.path.join(path_to_sim, training_folder)
    path_instances = os.path.join(path_to_sim, instances_file)
    ## NOTE: at this point `test_instances` is a *path*; the loading cell rebinds it to the data
    test_instances = os.path.join(path_to_sim, testing_file)

    ## FILE NAMES OF THE PER-FOLD MODELS
    model_list = ['model_fold_0.hdf5', 'model_fold_1.hdf5', 'model_fold_2.hdf5',
                  'model_fold_3.hdf5', 'model_fold_4.hdf5']

In [3]:
#%% LOADING ALL INSTANCES
## NOTE(review): load_pickle_general presumably unpickles the file; unpickling can run
## arbitrary code, so only point these paths at trusted files.
instances = load_pickle_general(path_instances)
## The test-set path is rebuilt from its parts instead of being read from `test_instances`:
## that name is rebound to the loaded data on this very line, so reading the path from it
## would fail the second time the cell is executed.
test_instances = load_pickle_general(os.path.join(path_to_sim, testing_file))

In [4]:
## PARSE THE TRAINING-FOLDER NAME INTO ITS NAMED FIELDS
combined_name_info = read_combined_name(training_folder)

## BUILD THE SAMPLING DICTIONARY FROM THE PARSED SAMPLING TYPE AND INPUTS
## (the sampling inputs are wrapped in a one-element list, as in the original call)
sampling_dict = extract_sampling_inputs(
    sampling_type=combined_name_info['sampling_type'],
    sampling_inputs=[combined_name_info['sampling_inputs']],
)


In [5]:
## FIELD 0 / FIELD 1 OF EACH LOADED OBJECT (presumably inputs and target values -- confirm)
x_data, y_data = instances[0], instances[1]
x_tdata, y_tdata = test_instances[0], test_instances[1]

## NUMBER OF ENTRIES IN FIELD 2 OF THE TEST SET (the field whose strings are split on '_' later)
len(test_instances[2])

32

In [6]:
## TRAINING AND TEST INPUTS AS NUMPY ARRAYS
x_array, x_tarray = map(np.asarray, (x_data, x_tdata))

In [7]:
## TRAINING AND TEST TARGETS AS NUMPY ARRAYS (the training name `y_data` is reused on purpose)
y_data, y_tarray = map(np.asarray, (y_data, y_tdata))

In [8]:
def Create_Laplacian_Functions(grid_size,maximum,minimum):
    """Build the graph-Laplacian eigenbasis of the strictly upper-triangular histogram grid.

    Bin edges are ``np.arange(minimum, maximum, grid_size)``. Every index pair
    ``[j, i]`` with ``j < i < len(bins) - 1`` becomes a graph node, and two nodes
    are connected when their Euclidean distance in index space is at most 3.

    Parameters
    ----------
    grid_size : float
        Spacing between consecutive bin edges.
    maximum, minimum : float
        Upper (exclusive) and lower bound handed to ``np.arange``.

    Returns
    -------
    w : ndarray, shape (n_nodes,)
        Real eigenvalues of the Laplacian ``L = D - Adj`` in ascending order.
    v : ndarray, shape (n_nodes, n_nodes)
        Orthonormal eigenvectors; column ``v[:, k]`` belongs to ``w[k]``.
    tpoints : ndarray, shape (n_nodes, 2)
        Node coordinates ``[j, i]`` in the order the nodes were generated.
    bins : ndarray
        The bin edges.

    Raises
    ------
    ValueError
        If the bounds leave fewer than three bin edges, i.e. no node at all.
    """
    bins = np.arange(minimum,maximum,grid_size)
    n_cells = len(bins) - 1
    # Outer loop over the second index, inner loop over the first: this is the same
    # traversal Create_Graphs uses, so node k lines up with signal entry k.
    tpoints = [[j,i] for i in range(n_cells) for j in range(i)]
    if not tpoints:
        raise ValueError("need at least three bin edges to build the upper-triangular grid")
    tpoints = np.vstack(tpoints)
    #Create Eigenfunctions
    distmat = scipy.spatial.distance.squareform(pdist(tpoints))
    # The builtin int replaces np.int, which was removed from NumPy in release 1.24.
    # The self-loops on the diagonal (distance 0 <= 3) cancel in D - Adj.
    Adj = (distmat <= 3).astype(int)
    D = np.diag(np.sum(Adj,axis = 1))
    L = D - Adj
    # L is real and symmetric, so eigh is the right solver: it returns real eigenvalues
    # and an orthonormal eigenvector basis even when eigenvalues repeat, which the
    # projection/reconstruction in Diffuse_graphs relies on.
    w,v = np.linalg.eigh(L)
    return w,v,tpoints,bins

## EIGENBASIS USED BY EVERY FEATURISATION CALL: bin edges from 0 up to (excluding) 0.4 in steps of 0.007
w,v,tpoints,bins = Create_Laplacian_Functions(grid_size = .007, maximum = .4, minimum = 0)

In [9]:
def pers_intervals(data,Hdim,inv):
    """Persistence intervals of a min-max-normalised volume, via a GUDHI cubical complex.

    Parameters
    ----------
    data : array
        Volume to analyse. All three complex dimensions are taken from
        ``np.shape(data)[0]``, so a cubic 3-D array is assumed.
    Hdim : int
        Homology dimension whose intervals are returned.
    inv : int
        ``1`` filters on ``|data - max(data)|`` (the normalised field flipped);
        any other value filters on the normalised field itself.

    Returns
    -------
    list
        One-element list holding the interval array, with infinite entries
        replaced by the maximum of the normalised data.
    """
    lowest, highest = np.min(data), np.max(data)
    data = (data - lowest)/(highest - lowest)
    side = np.shape(data)[0]
    if inv == 1:
        cells = np.absolute(data - np.max(data))
    else:
        cells = data
    cubeplex = gd.CubicalComplex(dimensions = [side,side,side],top_dimensional_cells=np.ndarray.flatten(cells))
    # Persistence has to be computed before intervals can be queried; the pairs it
    # returns are not needed here.
    cubeplex.persistence()
    pers = cubeplex.persistence_intervals_in_dimension(Hdim)
    pers[pers == inf] = np.max(data)
    return [pers]

def Create_Graphs(dots,bins,weight):
    """Turn a set of 2-D points into a signal on the upper-triangular histogram grid.

    Parameters
    ----------
    dots : ndarray, shape (n, 2)
        Points to histogram; column 0 is binned along the first axis and
        column 1 along the second.
    bins : array
        Bin edges shared by both axes.
    weight : int
        ``1`` multiplies each count by ``sqrt((i-c)**2 + (j-c)**2)`` with
        ``c = |i-j|/2 + j`` for the cell ``(j, i)``; any other value keeps raw counts.

    Returns
    -------
    list
        One-element list holding the list of cell values for ``j < i``,
        ordered with ``i`` as the outer index and ``j`` as the inner index.
    """
    counts = np.histogram2d(dots[:,0],dots[:,1],bins = bins)[0]
    n_cells = len(bins) - 1
    # (first index, second index) of every cell strictly above the diagonal
    cells = [(lower, upper) for upper in range(n_cells) for lower in range(upper)]

    if weight == 1:
        def scale(lower, upper):
            centre = np.abs(upper - lower)/2 + lower
            return np.sqrt((upper - centre)**2 + (lower - centre)**2)
        signal = [counts[lower, upper]*scale(lower, upper) for lower, upper in cells]
    else:
        signal = [counts[lower, upper] for lower, upper in cells]
    return [signal]

def Diffuse_graphs(w,v,ftime,sigma,tpoints):
    """Smooth each grid signal with the kernel ``exp(-w * sigma)`` in the eigenbasis ``v``.

    For every signal ``f`` the result is
    ``sum_k exp(-w[k]*sigma) * (v[:,k] . f / len(f)) * v[:,k]`` over the first
    ``len(tpoints)`` eigenpairs, divided by its own maximum.

    Parameters
    ----------
    w : ndarray
        Eigenvalues.
    v : ndarray
        Eigenvectors stored as columns; ``v[:, k]`` pairs with ``w[k]``.
    ftime : iterable
        Signals, each with one value per row of ``v``.
    sigma : float
        Factor multiplying the eigenvalues in the exponent.
    tpoints : sequence
        Only its length is used: the number of eigenpairs to include.

    Returns
    -------
    list of ndarray
        One smoothed, max-normalised signal per input signal. A signal whose
        smoothed maximum is 0 yields NaN entries (0/0).
    """
    deg = len(tpoints)
    # Neither the selected eigenvectors nor the decay factors depend on the signal,
    # so they are prepared once instead of once per eigenpair and signal.
    modes = np.asarray(v)[:, :deg]
    decay = np.exp(-np.asarray(w)[:deg]*sigma)
    diffusers = []
    for f in ftime:
        fnew = np.asarray(f)
        # Project onto every eigenvector at once, damp, and map back to the grid.
        coef = np.matmul(modes.T,fnew)/(len(fnew))
        exp = np.matmul(modes,decay*coef)
        exp = exp/exp.max()
        diffusers.append(exp)
    return diffusers

def pd_vectors(data,inv,sigma,samples,hdim,weight,y_data,instindx,instances):
    """Featurise the first ``samples`` entries of one instance.

    Each entry goes through ``pers_intervals`` -> ``Create_Graphs`` ->
    ``Diffuse_graphs`` and is then min-max rescaled. The module-level ``bins``,
    ``w``, ``v`` and ``tpoints`` are read as globals.

    Parameters
    ----------
    data : sequence
        Entries of one instance; ``data[0] .. data[samples-1]`` are used.
    inv, hdim : int
        Forwarded to ``pers_intervals``.
    sigma : float
        Forwarded to ``Diffuse_graphs``.
    samples : int
        Number of leading entries to process.
    weight : int
        Forwarded to ``Create_Graphs``.
    y_data : sequence
        Target values; ``y_data[instindx]`` is the target of this instance.
    instindx : int
        Index of the instance in ``y_data`` and ``instances[2]``.
    instances : sequence
        ``instances[2][instindx]`` is a '_'-separated string whose
        second-to-last token is recorded.

    Returns
    -------
    pdvects : list
        One rescaled feature array per processed entry.
    label : list
        The instance target repeated per entry; values that are not ``>= 0``
        are stored as 0.
    solv : list
        The recorded name token repeated per entry.
    """
    pdvects, label, solv = [], [], []
    for frame in range(samples):
        bars = np.vstack(pers_intervals(data[frame],hdim,inv))
        signal = Create_Graphs(bars,bins,weight)
        smooth = Diffuse_graphs(w,v,signal,sigma,tpoints)
        smooth = (smooth - np.min(smooth))/(np.max(smooth) - np.min(smooth))
        pdvects.append(smooth)

        target = y_data[instindx]
        label.append(target if target >= 0 else 0)
        solv.append(instances[2][instindx].split('_')[-2])

    return pdvects,label,solv

In [10]:
# Spot check of the name parsing used in pd_vectors: second-to-last '_'-separated token
# of the sixth entry in instances[2].
instances[2][5].split('_')[-2]

'GVL'

In [None]:
## FEATURISATION SETTINGS (the test-set loop further down reads the same names)
sigma = .2     # factor in the exp(-eigenvalue * sigma) smoothing of Diffuse_graphs
samples = 10   # leading entries processed per instance
hdim = 1       # homology dimension passed to pers_intervals
weight = 0     # 1 turns on the per-cell weighting in Create_Graphs
inv = 0        # 1 makes pers_intervals filter on the flipped field

vects = []
labels = []
## enumerate supplies the instance index that pd_vectors uses to look up target and name
for i, x in enumerate(tqdm(x_array)):
    ## sum the first two entries of the last axis (alternative: x[:,:,:,:,0] alone)
    x = x[:,:,:,:,0] + x[:,:,:,:,1]
    vect,label,solv = pd_vectors(x,inv,sigma,samples,hdim,weight,y_data,i,instances)
    vects.append(np.vstack(vect))
    labels.append(label)


HBox(children=(FloatProgress(value=0.0, max=76.0), HTML(value='')))

In [None]:

## STACK ALL TRAINING FEATURES AND TARGETS, THEN KEEP ONLY ROWS WITHOUT ANY NaN
allvects = np.vstack(vects)
valid_rows = ~np.isnan(allvects).any(axis=1)

labels = np.hstack(labels)[valid_rows]
allvects = allvects[valid_rows]

np.shape(labels)

In [None]:
## PROJECT THE TRAINING FEATURES ONTO THEIR PRINCIPAL COMPONENTS
pca = PCA()
X = pca.fit_transform(allvects)
plt.figure()
plt.scatter(X[:,0],X[:,1], c = labels, cmap = "coolwarm")

## raw string: "\s" is not a valid Python escape sequence
plt.colorbar().set_label(r"$\sigma$ Value")
coef = pca.components_
plt.grid(True,ls = '--')
## X[:,0] (first component) is on the x axis and X[:,1] (second) on the y axis
plt.xlabel("Principal Component 1")
plt.ylabel("Principal Component 2")
plt.savefig("md_pca.eps",transparent = True)

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.svm import SVR

## HOLD OUT 30 % OF THE TRAINING VECTORS; the fixed seed makes the split, and therefore
## the figure and the error below, reproducible from one run to the next
x_train,x_test,y_train,y_test = train_test_split(allvects,labels,test_size = .3,random_state = 0)

mdl = SVR(kernel = "rbf",C = 1)
mdl.fit(x_train,y_train)
a = mdl.predict(x_test)

## PARITY PLOT: predictions against true values, with the y = x reference line
plt.figure()
plt.plot(y_test,a,'o',label = "Data")
plt.plot([np.min(y_test),np.max(y_test)],[np.min(y_test),np.max(y_test)])

## raw strings: "\s" is not a valid Python escape sequence
plt.ylabel(r"$\sigma$ Predicted")
plt.xlabel(r"$\sigma$ Actual")
plt.grid(True,ls = '--')
plt.legend()

plt.savefig("regression1.svg",transparent = True)

## NOTE: despite its name, `mse` holds absolute errors, so the value shown is the
## mean absolute error on the held-out split
mse = np.absolute(a - y_test)
np.sum(mse)/len(y_test)



In [None]:

## FEATURISE THE TEST SET WITH THE SAME SETTINGS AS THE TRAINING SET
tvects = []
tlabels = []
tsolv = []
## enumerate supplies the instance index that pd_vectors uses to look up target and name
for i, x in enumerate(tqdm(x_tarray)):
    ## sum the first two entries of the last axis, as done for the training inputs
    x = x[:,:,:,:,0] + x[:,:,:,:,1]
    vect,label,solv = pd_vectors(x,inv,sigma,samples,hdim,weight,y_tarray,i,test_instances)
    tvects.append(np.vstack(vect))
    tlabels.append(label)
    tsolv.append(solv)


In [None]:
## STACK THE TEST FEATURES, TARGETS AND NAME TOKENS, THEN KEEP ONLY ROWS WITHOUT ANY NaN
tallvects = np.vstack(tvects)
tvalid_rows = ~np.isnan(tallvects).any(axis=1)

tlabels = np.hstack(tlabels)[tvalid_rows]
tsolv = np.hstack(tsolv)[tvalid_rows]
tallvects = tallvects[tvalid_rows]


In [None]:

## FIT ON EVERY TRAINING VECTOR AND PREDICT THE TEST SET
## (tallvects has already had its NaN rows removed by the preceding cell)
mdl = SVR(kernel = "rbf",C = 1)
mdl.fit(allvects,labels)
a = mdl.predict(tallvects)

## `mse` holds the absolute errors; the displayed value is their mean
mse = np.absolute(a - tlabels)
np.sum(mse)/len(tlabels)

In [None]:



## ONE PARITY PLOT PER DISTINCT NAME TOKEN IN THE TEST SET
## the reference line spans the full range of test targets, so it is computed once
tlabel_range = [np.min(tlabels),np.max(tlabels)]
for i in np.unique(tsolv):
    picked = (tsolv == i)
    plt.figure()
    plt.plot(tlabels[picked],a[picked],'o',label = i)
    plt.plot(tlabel_range,tlabel_range)
    ## raw strings: "\s" is not a valid Python escape sequence
    plt.ylabel(r"$\sigma$ Predicted")
    plt.xlabel(r"$\sigma$ Actual")
    plt.grid(True,ls = '--')
    ## the label passed to plot() is only visible once a legend is drawn
    plt.legend()
    plt.savefig("regression{}.svg".format(i),transparent = True)

# plt.plot([np.min(tlabels),np.max(tlabels)],[np.min(tlabels),np.max(tlabels)])

# plt.ylabel("$\sigma$ Predicted")
# plt.xlabel("$\sigma$ Actual")
# plt.grid(True,ls = '--')

# plt.savefig("regression2.svg",transparent = True)

In [None]:
# Distinct name tokens present in the NaN-filtered test set.
np.unique(tsolv)

In [None]:
# NOTE(review): `solv` is the list returned by the most recent pd_vectors call, i.e. the
# last iteration of the test-set loop, so this covers a single instance only -- confirm
# whether `tsolv` (all test instances) was intended.
solvent = np.hstack(solv)

In [None]:
# Distinct values in `solvent` (built from `solv` in the preceding cell).
np.unique(solvent)