In [1]:
!pip install SciExpeM-API 
!pip install SciExpeM-API --upgrade

In [3]:
from SciExpeM_API.SciExpeM import SciExpeM
import numpy as np
import matplotlib.pyplot as plt
import os 
from getpass import getpass

# Credentials must never be stored in the notebook: they are taken from the
# SCIEXPEM_USERNAME / SCIEXPEM_PASSWORD environment variables and, when those
# are not set, requested interactively (the password is not echoed).
_sciexpem_username = os.environ.get('SCIEXPEM_USERNAME') or input('SciExpeM username: ')
_sciexpem_password = os.environ.get('SCIEXPEM_PASSWORD') or getpass('SciExpeM password: ')

# NOTE(review): verify=False presumably disables TLS certificate verification
# and warning=False presumably silences the related warnings - confirm against
# the SciExpeM-API documentation and enable verification if the server allows it.
db = SciExpeM(username=_sciexpem_username, password=_sciexpem_password, verify=False, warning=False)
db.testConnection(verbose=True)


In [126]:
from scipy.interpolate import CubicSpline
import matplotlib.pyplot as plt
import random
# Fix the seed of Python's `random` module so that the random draws made by the
# functions defined below (random.uniform / random.randint) are repeatable.
random.seed(2)
import pandas as pd
import json

# ex.__dict__

def list_duplicates(seq):
  """Return the distinct values that appear more than once in ``seq``.

  The result is a list built from a set, so every repeated value is reported
  once and the order of the returned values is not meaningful.
  """
  already_seen = set()
  repeated = set()
  for item in seq:
    if item in already_seen:
      # second (or later) occurrence of this value
      repeated.add(item)
    else:
      already_seen.add(item)
  return list(repeated)

def normalizeArray(ar):
    """Standardise ``ar``: subtract its mean and divide by its (population) standard deviation."""
    centre = np.mean(ar)
    spread = np.std(ar)
    centred = ar - centre
    return centred / spread

def deNorm(ar,std,mean):
    """Undo a standardisation: scale ``ar`` by ``std`` and then add ``mean``."""
    rescaled = ar * std
    return rescaled + mean

def random_matrix(data_aug,a,b,arr,n_cols=32):
    """Draw a ``data_aug`` x ``n_cols`` matrix of uniform random numbers.

    Every entry is drawn with ``random.uniform`` from the interval
    ``[a * std(arr), b * std(arr)]``, row by row, so the sequence of draws
    (and therefore the result for a given seed) does not depend on how the
    matrix is later indexed.

    Parameters
    ----------
    data_aug : number of rows (one per augmentation round).
    a, b     : lower / upper bound, expressed in units of ``np.std(arr)``.
    arr      : array whose standard deviation scales the bounds.
    n_cols   : number of columns; defaults to 32, the value that used to be
               hard-coded.

    Returns
    -------
    numpy.ndarray of shape ``(data_aug, n_cols)``; an empty 1-D array when
    ``data_aug`` is 0.
    """
    # The standard deviation does not change between draws: compute it once.
    spread = np.std(arr)
    low = a * spread
    high = b * spread
    rows = [[random.uniform(low, high) for _ in range(n_cols)]
            for _ in range(data_aug)]
    return np.array(rows)

# Extract data from an experiment of type "laminar burning velocity measurement"
# and build an augmented curve-matching dataset from it.
# ex_id = the experiment identifier
# num_data_aug = the number of augmentation rounds (each round produces 31 transformed
#                curves, so the result has up to 31 * num_data_aug rows)
# chemModel_id = the model identifier, which can be omitted
def prova(ex_id, num_data_aug, chemModel_id=0):
    """Build an augmented curve-matching dataset from one experiment.

    The first ``Execution`` returned by ``db.filterDatabase`` for ``ex_id``
    (optionally restricted to ``chemModel_id``) is used. The first x/y pair of
    its experiment is standardised, interpolated with a cubic spline and
    re-sampled at one random abscissa between each pair of consecutive
    original points. For every augmentation round, 31 perturbed copies of the
    re-sampled curve are generated (every non-empty combination of: vertical
    translation, vertical dilatation, outlier on the maximum, outlier on the
    minimum, modification of one random point) and each copy is compared with
    the unperturbed curve through ``db.executeCurveMatching``.

    Parameters
    ----------
    ex_id : experiment identifier, forwarded as ``experiment__id``.
    num_data_aug : number of augmentation rounds; each round contributes 31
        curve-matching calls (rows).
    chemModel_id : model identifier forwarded as ``chemModel__id``; ``0``
        (default) means "do not filter on the model".

    Returns
    -------
    pandas.DataFrame
        One row per curve-matching result whose ``score`` is not negative:
        the fields returned by the service (without ``error``) followed by
        the five feature columns describing the perturbation. An empty
        DataFrame is returned when no execution is found, when the units are
        not ``'unitless'`` / ``'cm/s'``, or when the curve has fewer than two
        distinct points or no spread (it cannot be standardised/interpolated).
    """
    if chemModel_id == 0:
        ex = db.filterDatabase(model_name='Execution', experiment__id=ex_id)
    else:
        ex = db.filterDatabase(model_name='Execution', experiment__id=ex_id, chemModel__id=chemModel_id)

    # Nothing found: an empty frame tells the caller to skip this experiment.
    if not ex:
        return pd.DataFrame()
    ex = ex[0]

    # Select the data columns and make sure they use the expected units.
    pair = ex.experiment.pairs[0]
    x_col, y_col = pair["x"], pair["y"]
    if x_col.units != 'unitless' or y_col.units != 'cm/s':
        print("Warning: different units")
        return pd.DataFrame()

    xx = np.round(x_col.data, 7)
    yy = np.round(y_col.data, 7)

    # CubicSpline needs strictly increasing abscissae: for every repeated x
    # value keep only its first sample.
    for dup_value in list_duplicates(xx):
        repeats = np.where(xx == dup_value)[0][1:]
        yy = np.delete(yy, repeats)
        xx = np.delete(xx, repeats)
    leftover = list_duplicates(xx)
    if leftover:
        print("Warning: duplicates found")
        print("duplicates:", leftover)

    # A curve with fewer than two points, or without any spread, can neither
    # be standardised nor interpolated; skip it instead of failing mid-batch.
    if len(xx) < 2:
        print("Warning: not enough points")
        return pd.DataFrame()
    x_mean, y_mean = np.mean(xx), np.mean(yy)
    x_std, y_std = np.std(xx), np.std(yy)
    if x_std == 0 or y_std == 0:
        print("Warning: not enough points")
        return pd.DataFrame()

    # Standardise both axes (experiments have different scales) and sort by x.
    x_norm = normalizeArray(np.array(xx))
    y_norm = normalizeArray(np.array(yy))
    order = np.argsort(x_norm)
    x_sorted, y_sorted = x_norm[order], y_norm[order]

    # Random amplitudes, one row per round and one column per variant
    # (column 0 is never used): translation / random-point factor,
    # dilatation factor, outlier factor.
    rando = random_matrix(num_data_aug, -1, 1, y_sorted)
    d = random_matrix(num_data_aug, 0.5, 2, y_sorted)
    rando2 = random_matrix(num_data_aug, 0.1, 1, y_sorted)

    # New abscissae: one random point inside every interval between two
    # consecutive original points, so the number of points stays comparable
    # and their distribution roughly uniform.
    x_n = np.array([random.uniform(x_sorted[j], x_sorted[j + 1])
                    for j in range(len(x_sorted) - 1)])

    # Cubic interpolation, for coherence with the curve-matching algorithm.
    cs = CubicSpline(x_sorted, y_sorted)
    base_y = cs(x_n)
    pos_max = int(np.argmax(base_y))
    pos_min = int(np.argmin(base_y))

    # Reference ("simulated") curve in physical units; identical for all rounds.
    x_de_n = deNorm(x_n, x_std, x_mean).tolist()
    y_de_n = deNorm(base_y, y_std, y_mean).tolist()

    records = []
    feature_rows = []
    for data_aug in range(num_data_aug):
        # Build all 31 perturbed curves of this round first, then score them.
        variants = _prova_augment_round(base_y, rando[data_aug], d[data_aug], rando2[data_aug],
                                        pos_max, pos_min, y_std, y_mean)
        for y_exp, features in variants:
            raw = db.executeCurveMatching(x_sim=x_de_n, y_sim=y_de_n, x_exp=x_de_n, y_exp=y_exp,
                                          numberOfBootstrapVariations=1)
            # The service answers with a JSON document (presumably an object
            # with the fields listed in `base_cols` - confirm against the API).
            records.append(json.loads(raw))
            feature_rows.append(features)

    # Build the frame once (DataFrame.append no longer exists in pandas >= 2.0),
    # keeping the expected columns first and any extra field after them.
    base_cols = ["score", "error", "d0L2", "d1L2", "d0Pe", "d1Pe", "shift"]
    df = pd.DataFrame(records)
    df = df.reindex(columns=base_cols + [c for c in df.columns if c not in base_cols])

    # Feature column names are kept byte-for-byte (including the spelling of
    # the last one) because they end up in the exported CSV header.
    feature_names = ("vert_trans_f",
                     "dil_vert_f",
                     "outlier on the max",
                     "outlier on the min",
                     "modfication on a single random point")
    for pos, name in enumerate(feature_names):
        df[name] = [row[pos] for row in feature_rows]

    df = df.drop(columns=["error"])

    # A negative score means the curve matching failed: drop those rows.
    df = df[~(df["score"] < 0)].reset_index(drop=True)

    return df


# The 31 perturbations applied in every augmentation round, in the order in
# which they are scored; entry i (1-based) uses column i of the random
# matrices. Flags: T = vertical translation, D = vertical dilatation,
# X = outlier on the maximum, N = outlier on the minimum, R = random point.
_PROVA_VARIANTS = (
    "T", "D", "TD",
    "X", "TX", "DX", "TDX",
    "N", "TN", "DN", "TDN",
    "XN", "TXN", "DXN", "TDXN",
    "R", "TR", "DR", "XR", "NR", "XNR",
    "TDR", "TXR", "TNR", "DXR", "DNR",
    "TDXR", "TDNR", "TXNR", "DXNR", "TDXNR",
)


def _prova_augment_round(base_y, shifts, scales, bumps, pos_max, pos_min, y_std, y_mean):
    """Return the 31 perturbed, de-normalised curves of one round with their feature rows.

    ``base_y`` is the standardised reference curve and is never modified:
    every variant works on its own copy, so a perturbation cannot leak into
    the variants generated after it. ``shifts`` / ``scales`` / ``bumps`` are
    the rows of the three random matrices for this round. The result is a
    list of ``(y_exp, features)`` where ``y_exp`` is a plain list in physical
    units and ``features`` is the 5-tuple (translation, dilatation,
    max outlier, min outlier, random point), with 0 for unused perturbations.
    """
    variants = []
    for k, flags in enumerate(_PROVA_VARIANTS, start=1):
        translate = "T" in flags
        dilate = "D" in flags
        max_outlier = "X" in flags
        min_outlier = "N" in flags
        random_point = "R" in flags

        shift, scale, bump = shifts[k], scales[k], bumps[k]

        curve = base_y.copy()
        if translate:
            curve = curve + shift
        if dilate:
            curve = curve * scale
        if random_point:
            # One randint draw per random-point variant, in variant order.
            pos_ran = random.randint(0, len(curve) - 1)
            curve[pos_ran] = curve[pos_ran] * (1 + shift)
        if max_outlier:
            curve[pos_max] = curve[pos_max] * (1 + bump)
        if min_outlier:
            curve[pos_min] = curve[pos_min] * (1 - bump)

        # NOTE(review): shift and scale are applied once in standardised space
        # (above) and a second time through the mean/std used to de-normalise
        # (below), exactly as in the previous implementation; likewise the
        # recorded translation feature includes the mean. Confirm this is the
        # intended definition of the features.
        shifted_mean = shift * y_std + y_mean
        eff_std = y_std * scale if dilate else y_std
        if translate and dilate:
            eff_mean = y_mean + y_std * shift * scale
            translation_feature = shifted_mean * scale
        elif translate:
            eff_mean = shifted_mean
            translation_feature = shifted_mean
        else:
            eff_mean = y_mean
            translation_feature = 0

        y_exp = deNorm(curve, eff_std, eff_mean).tolist()
        features = (translation_feature,
                    scale if dilate else 0,
                    bump if max_outlier else 0,
                    -bump if min_outlier else 0,
                    shift if random_point else 0)
        variants.append((y_exp, features))
    return variants

In [140]:
# Single-experiment run: experiment id 1061 with num_data_aug=2.
aaa2 = prova(1061,2)

In [132]:
# Multiple extraction from laminar burning velocity measurements
# num_id = the number of experiments that we want to extract from the db
# num_data_aug = the number of augmentation rounds per experiment (each round produces 31 transformed curves)
# Note that this function calls the function `prova` once per experiment and
# keeps only the dataframes that satisfy the following rules:
# - the experiment must have these two columns: 'equivalence ratio' and 'laminar burning velocity'
# - the result of the curve matching must not be negative
# - the units are 'unitless' and 'cm/s' for 'equivalence ratio' and 'laminar burning velocity' respectively
# All the db results that do not follow these rules are dropped
def laminar_burning_velocity_multiple_ex(num_id,num_data_aug):
    
    ex = db.filterDatabase(model_name='Experiment', experiment_type = 'laminar burning velocity measurement')

    ex_id = [ex[i].id for i in range(0,len(ex))]
    count = 0
    df_list = []
    
    for i in ex_id:
        
        print(i)
        df = prova(i,num_data_aug)# laminar_burning_velocity_ex(i,num_data_aug)

        # drop the empty/irregular dataframe
        if df.empty:
            print('DataFrame is empty!')
        else:
            count = count + 1
            df_list.append(df)
            
        if count == num_id:
            break
       
    # concatenate the dataframes
    df_tot = pd.concat(df_list)
    
    # reset the indexes
    df_tot = df_tot.reset_index()
    df_tot = df_tot.drop(["index"],axis=1)
    
    # write a csv
    df_tot.to_csv('laminar_burning_velocity.csv', index=False)
    
    # to read a csv
    # df2 = pd.read_csv('laminar_burning_velocity.csv')
    
    return df_tot
    

In [133]:
# Build the dataset: stop once 15 experiments have produced data, with num_data_aug=15.
# Side effect: writes 'laminar_burning_velocity.csv' in the current working directory.
aaa = laminar_burning_velocity_multiple_ex(15,15)

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt   #Data visualisation libraries 
import seaborn as sns
import statsmodels.api as sm
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from statsmodels.formula.api import ols
from statsmodels.stats.outliers_influence import variance_inflation_factor
%matplotlib inline

# Load the exported dataset; the path is relative to the notebook's working directory.
SciExp = pd.read_csv('../input/laminar1/laminar_burning_velocity (1).csv')

# Only the last expression of a cell is rendered automatically, so the
# intermediate summaries are passed to display() explicitly (info() prints
# on its own). Without this, head() and describe() produce no visible output.
display(SciExp.head())
SciExp.info()
display(SciExp.describe())
SciExp.columns


In [None]:
# graphics 
# sns.heatmap(SciExp.corr())

In [None]:
# vert.trans.f regression
# Goal: determine whether the dataset that we created can be used for regression.
#
# Keep only the rows whose vertical-translation factor is non-zero, restricted to
# the predictors plus the target. The target is renamed to 'vert_trans_f' so that
# it can be referenced as a plain name in the model formula below.
# .loc + .rename build an independent frame: the original code assigned a new
# column on a filtered slice, which triggers pandas' SettingWithCopyWarning.
# NOTE(review): the extraction code names this column 'vert_trans_f' already;
# the CSV read here apparently still uses 'vert.trans.f' - confirm which export is current.
X = (
    SciExp.loc[
        SciExp['vert.trans.f'] != 0,
        ['score','d0L2','d1L2','d0Pe','d1Pe','shift','vert.trans.f'],
    ]
    .rename(columns={'vert.trans.f': 'vert_trans_f'})
)

# Ordinary least squares fit of the target on all six predictors.
lm = ols("vert_trans_f ~ score + d0L2 + d1L2 + d0Pe + d1Pe + shift", data=X).fit()

In [None]:

# Outlier screening based on Cook's distance.
# The model is refitted, every observation whose Cook's distance exceeds 4/n is
# removed from X, and the procedure repeats until a fit flags no observation.
# On exit, `lm` is the model fitted on the final (cleaned) X.
while True:

    lm = ols("vert_trans_f ~ score + d0L2 + d1L2 + d0Pe + d1Pe + shift", data=X).fit()

    # influence plot of the current fit (one figure per iteration)
    fig = sm.graphics.influence_plot(lm, criterion="cooks")
    fig.tight_layout(pad=1.0)

    # first element of the returned tuple holds the distances
    lm_cooksd = lm.get_influence().cooks_distance[0]

    # critical distance for the current sample size
    n = len(X.index)
    critical_d = 4/n
    print('Critical Cooks distance:', critical_d)

    # flag the observations above the cut-off; stop when none is left
    out_d = lm_cooksd > critical_d
    if not out_d.any():
        break

    X = X.drop(X[out_d].index, axis = 0)

In [None]:
# Separate the regression target from the predictors.
y = X[['vert_trans_f']]
X = X.drop(columns=['vert_trans_f'])

# Hold out 20% of the rows for testing. A fixed random_state makes the split
# (and everything trained on it) reproducible across kernel restarts; the value
# 2 matches the seed given to random.seed earlier in the notebook.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=2)


In [None]:
# Fitted values of the final OLS model for the rows it was fitted on.
lm_pred = lm.predict()

# Observed vs. fitted target plotted against d1L2.
# The target column was moved out of X into y by the split cell, so the
# observations must be read from y (X["vert_trans_f"] raises KeyError here).
fig, ax = plt.subplots(figsize=(12,6))
ax.plot(X["d1L2"], y["vert_trans_f"], 'o', label="observed")
ax.plot(X["d1L2"], lm_pred, 'o', label="fitted")
ax.set_xlabel("d1L2")
ax.set_ylabel("vert_trans_f")
ax.legend();
# model terms: score + d0L2 + d1L2 + d0Pe + d1Pe + shift

In [None]:
# Shapiro-Wilk test on the residuals of the fitted OLS model `lm`, used to check
# the normality assumption. The bare name on the last line makes the notebook
# display the test result.

from scipy import stats
shapiro_test = stats.shapiro(lm.resid)
shapiro_test

In [None]:
# Residuals vs. predictions for the model fitted after outlier removal.

residuals = lm.resid
predictions = lm.predict(X)

# Keep the diagnostics in their own frame instead of adding 'predicted' and
# 'residuals' columns to X: a plotting cell should not modify the feature
# matrix that other cells read.
diagnostics = pd.DataFrame({"predicted": predictions, "residuals": residuals})

ax = sns.scatterplot(data=diagnostics, x="predicted", y="residuals")
# Two reference lines, styled differently so they can be told apart:
# the zero line and the mean of the residuals.
ax.axhline(y=0, color="black", label="zero")
ax.axhline(y=np.mean(residuals), color="red", linestyle="--", label="mean residual")
ax.legend();
# from sklearn.linear_model import LinearRegression
# from yellowbrick.regressor import ResidualsPlot

# # Instantiate and fit the visualizer
# model = LinearRegression()
# visualizer_residuals = ResidualsPlot(model)
# visualizer_residuals.fit(X,y)
# visualizer_residuals.show()

In [None]:
import tensorflow as tf
from tensorflow.keras import Model
from tensorflow.keras import Sequential
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.layers import Dense, Dropout
from tensorflow.keras.losses import MeanSquaredLogarithmicError
tfk = tf.keras

In [None]:
# Widths of the three hidden layers
hidden_units1, hidden_units2, hidden_units3 = 160, 480, 256


def build_model_using_sequential():
  """Assemble the feed-forward regression network.

  Three ReLU hidden layers (sizes taken from the module-level
  hidden_units1/2/3 at call time), with 20% dropout after the first two,
  followed by a single linear output unit. No input shape is declared and the
  model is returned uncompiled.
  """
  net = Sequential()
  net.add(Dense(hidden_units1, kernel_initializer='normal', activation='relu'))
  net.add(Dropout(0.2))
  net.add(Dense(hidden_units2, kernel_initializer='normal', activation='relu'))
  net.add(Dropout(0.2))
  net.add(Dense(hidden_units3, kernel_initializer='normal', activation='relu'))
  # single linear unit: scalar regression output
  net.add(Dense(1, kernel_initializer='normal', activation='linear'))
  return net


# instantiate the network used by the training cell
model = build_model_using_sequential()

In [None]:
# Mean squared logarithmic error serves both as the training loss and as the
# reported metric.
# NOTE(review): MSLE takes logarithms of (1 + value); confirm the target is
# never below -1, otherwise the loss is undefined.
msle = MeanSquaredLogarithmicError()
learning_rate = 0.01

optimizer = Adam(learning_rate=learning_rate)
model.compile(loss=msle, optimizer=optimizer, metrics=[msle])

# Both callbacks watch the validation loss: training stops after 10 epochs
# without improvement (best weights restored), and the learning rate is halved
# after 5 such epochs, never going below 1e-5.
early_stopping = tfk.callbacks.EarlyStopping(monitor='val_loss', mode='min', patience=10, restore_best_weights=True)
reduce_lr = tfk.callbacks.ReduceLROnPlateau(monitor='val_loss', mode='min', patience=5, factor=0.5, min_lr=1e-5)

# train the model; 20% of the training rows are held out for validation
fit_options = dict(
    epochs=200,
    batch_size=64,
    validation_split=0.2,
    callbacks=[early_stopping, reduce_lr],
)
history = model.fit(X_train.values, y_train.values, **fit_options)

In [None]:
# Attach the ground-truth target to the test features for side-by-side inspection.
# NOTE(review): this adds a column to X_test in place. The model was fitted on
# X_train.values, which has no such column, so if the prediction line below is
# re-enabled it must run before the 'target' assignment (as ordered here) and the
# cell must not be re-run without recreating X_test.
#X_test['prediction'] = model.predict(X_test)
X_test['target'] = y_test

In [None]:
# Display the test set (including the 'target' column once the previous cell has run).
X_test

https://stackoverflow.com/questions/66177666/how-do-you-create-a-design-matrix-in-python