In [3]:
!cd

D:\Ansys Simulations\Project\2D


# SCALING NOTEBOOK
This notebook is used to develop the functions needed to script the scaling of the data

In [1]:
## imports
from pathlib import Path
import pandas as pd
import numpy as np
from PREPROCESSING_splitting import get_number

splitting functions imported


In [268]:
## Create a function to get the data
def get_sample_dfs(samples_folder_path, sample_number):
    """Load the input and output dataframes of a single sample.

    Searches ``<samples_folder_path>/input`` and ``<samples_folder_path>/output``
    for the one CSV file whose name matches ``*_<sample_number>.csv`` and reads
    each of them with the first CSV column used as the index.

    Parameters
    ----------
    samples_folder_path : str or Path
        Folder that contains the ``input`` and ``output`` sub-folders.
    sample_number : int or str
        Label of the sample; it is converted with ``str`` to build the glob.

    Returns
    -------
    tuple of (pandas.DataFrame, pandas.DataFrame)
        ``(sample_input_df, sample_output_df)``.

    Raises
    ------
    FileNotFoundError
        If no file matches in the ``input`` or ``output`` folder.
    Exception
        If more than one file matches in the ``input`` or ``output`` folder.
    """
    glob_string = "*_" + str(sample_number) + ".csv"

    def _read_single(kind):
        # `kind` is the sub-folder name ('input' or 'output'); it is also
        # used to word the error messages.
        folder_path = Path(samples_folder_path, kind)
        matches = list(folder_path.glob(glob_string))

        if not matches:
            # Fail with an explicit error instead of falling through to the
            # return statement with an unbound local variable.
            raise FileNotFoundError(
                'error: no ' + kind + ' sample with label ' + str(sample_number)
                + ' found in ' + str(folder_path)
            )
        if len(matches) > 1:
            raise Exception('error: more than one ' + kind + ' sample with label' + str(sample_number))

        return pd.read_csv(matches[0], index_col = 0)

    sample_input_df = _read_single('input')
    sample_output_df = _read_single('output')

    return sample_input_df, sample_output_df

# Root folder of the dataset; get_sample_dfs looks for 'input' and 'output'
# sub-folders inside it.
# NOTE(review): absolute, machine-specific path - consider making it configurable.
data_folder_path =  Path('D:/Ansys Simulations/Project/2D/data') 
print(data_folder_path)
# Load sample 26 as the working example for the following cells.
raw_input_data, raw_output_data = get_sample_dfs(data_folder_path, 26)

D:\Ansys Simulations\Project\2D\data


In [32]:
# Display the raw (unscaled) output dataframe of the example sample.
raw_output_data

Unnamed: 0,node_number,x_loc,y_loc,z_loc,x_disp,y_disp,z_disp
0,1,-0.186530,1.13540,0.0,-0.000264,-0.000852,0.0
1,2,0.000000,0.00000,0.0,-0.000586,-0.000198,0.0
2,3,-0.178300,1.08530,0.0,-0.000265,-0.000852,0.0
3,4,-0.170060,1.03510,0.0,-0.000264,-0.000852,0.0
4,5,-0.161820,0.98496,0.0,-0.000261,-0.000849,0.0
...,...,...,...,...,...,...,...
569,570,0.169450,0.33947,0.0,0.000249,-0.000553,0.0
570,571,0.140460,0.33622,0.0,0.000280,-0.000460,0.0
571,572,0.104840,0.44160,0.0,0.000606,-0.000288,0.0
572,573,0.556150,0.75061,0.0,0.000115,-0.000399,0.0


# Pi Theorem
We can use the Buckingham pi theorem to non-dimensionalize the data. The pint package provides an implementation:

In [51]:
from pint import pi_theorem, formatter, UnitRegistry

In [60]:
# Build a pint unit registry and ask it for the dimensionless (pi) groups
# that can be formed from force, length and pressure quantities.
ureg = UnitRegistry()
pi_groups = ureg.pi_theorem({'nodal_force': '[force]',
                        'disp': '[length]',
                        'youngs_modulus':'[pressure]'})
# Each group maps a variable name to its exponent; print it as an expression.
for group in pi_groups:
      print(formatter(group.items()))

disp ** 2 * youngs_modulus / nodal_force


In [61]:
# Show the raw result: a list with one {variable: exponent} dict per pi group.
pi_groups

[{'nodal_force': -1.0, 'disp': 2.0, 'youngs_modulus': 1.0}]

Thus we arrive at a problem. With only the input values it is not possible to arrive at representative pi groups. Some alternative approaches could be:
* With the idea of having a data-centric approach for the model, what can be done is to scale all the forces by the maximum force in the dataset and all displacements by the maximum displacement in the dataset, and make these maximums be a parameter in the dataset for the model to learn the non-linear scaling required from the data itself. 

* Using non-scaled data might be a good option if an effective way to initialize the weights is found. After all, the data is expected to always show the same behaviour, since it represents real-world physics and should therefore form a stable dataset.

* An "energy" term that accumulates displacements and forces could also be crafted to measure how much "energy" is being provided to the sample, and the data could then be scaled using pi groups built on it. To avoid introducing non-data values, this would just be a term with the dimensions of energy rather than an actual energy value.

In [65]:
# Repeat the pi-theorem analysis with an additional quantity that has the
# dimensions of energy, to see which extra dimensionless group appears.
pi_groups = ureg.pi_theorem({'nodal_force': '[force]',
                        'disp': '[length]',
                        'youngs_modulus':'[pressure]',
                        'energy': '[energy]'})
# Print every resulting group as an expression.
for group in pi_groups:
      print(formatter(group.items()))

disp ** 2 * youngs_modulus / nodal_force
disp * nodal_force / energy


For a small dataset such as the one we have for the proof of concept, it would likely be useful to use a more information-dense form of scaling, such as introducing an engineering constant from outside the data (yield stress, Young's modulus, etc.). However, we don't really care about the performance of the proof-of-concept model, as long as it is able to learn some correct behaviour, so we are going to act as if we had a dataset that can be considered "exhaustive" and simply scale each variable by its largest value in the dataset. This 'naive' approach involves inspecting the dataset to find that largest value, and it is the one that likely scales best to a larger dataset, since in such datasets the maximum force and displacement are likely to have physical meaning.

In [60]:
## Create generator function to iterate through all samples
def sample_iterator(samples_folder_path):
    """Yield the dataframes of every sample found in the samples folder.

    Every ``*.csv`` file in ``<samples_folder_path>/input`` is treated as one
    sample. Its sample number is extracted from the file name with
    ``get_number`` and the matching input/output dataframes are loaded with
    ``get_sample_dfs``.

    Parameters
    ----------
    samples_folder_path : str or Path
        Folder that contains the ``input`` and ``output`` sub-folders.

    Yields
    ------
    tuple
        ``(sample_number, input_data, output_data)`` for each input file.
    """
    input_folder_path = Path(samples_folder_path, 'input')

    # glob() gives no ordering guarantee; sort so the iteration order is
    # the same on every run and platform.
    for input_sample in sorted(input_folder_path.glob("*.csv")):
        sample_number = get_number(input_sample.name)
        # Load from the folder passed by the caller, not from a notebook
        # global, so the function works for any dataset location.
        input_data, output_data = get_sample_dfs(samples_folder_path, sample_number)
        yield sample_number, input_data, output_data

In [142]:
## Create function to run through all data and get max values of displacement and force
def get_max_disp_force(samples_folder_path):
    """Scan every sample and return the largest force and displacement magnitudes.

    Forces are read from the input dataframes only; displacements are read
    from both the input and the output dataframes. A progress line is printed
    whenever a sample raises either running maximum.

    Parameters
    ----------
    samples_folder_path : str or Path
        Folder handed to ``sample_iterator``.

    Returns
    -------
    tuple
        ``(max_force, max_disp)``.
    """
    force_columns = ['x_force','y_force','z_force']
    disp_columns = ['x_disp','y_disp','z_disp']

    def peak_magnitude(frame, columns):
        # absolute values -> maximum of each column -> maximum over the columns
        return frame.loc[:, columns].abs().max().max()

    max_force = 0
    max_disp = 0

    for sample_number, input_data, output_data in sample_iterator(samples_folder_path):
        updated = False

        ## forces only exist in the input data
        force_peak = peak_magnitude(input_data, force_columns)
        if force_peak > max_force:
            max_force = force_peak
            updated = True

        ## displacements: check the input data first, then the output data
        for frame in (input_data, output_data):
            disp_peak = peak_magnitude(frame, disp_columns)
            if disp_peak > max_disp:
                max_disp = disp_peak
                updated = True

        if updated:
            print(f'UPDATED MAX \t sample #{sample_number} \t force: {max_force:.2f} \t displacement: {max_disp:.6f} ')

    return max_force, max_disp

In [149]:
# Scan the whole dataset once; the two maxima are the scaling constants used below.
max_force, max_disp = get_max_disp_force(data_folder_path)

UPDATED MAX 	 sample #1 	 force: 0.00 	 displacement: 0.002053 
UPDATED MAX 	 sample #10 	 force: 152.79 	 displacement: 0.003020 
UPDATED MAX 	 sample #100 	 force: 152.79 	 displacement: 0.019382 
UPDATED MAX 	 sample #101 	 force: 242.87 	 displacement: 0.019382 
UPDATED MAX 	 sample #11 	 force: 287.27 	 displacement: 0.019382 
UPDATED MAX 	 sample #55 	 force: 295.32 	 displacement: 0.019382 


Now that we have the maximum values of the dataset, we can use them to scale a dataframe.

In [267]:
## create function to scale a dataframe based on max values

def scale_dataframe(df_unscaled, max_force, max_disp):
    """Return a copy of a sample dataframe scaled by the dataset maxima.

    The displacement columns are divided by ``max_disp``. The force columns
    are divided by ``max_force`` only when all three of them are present;
    output dataframes carry no force columns and are left as they are in
    that respect. The dataframe passed in is not modified.

    Parameters
    ----------
    df_unscaled : pandas.DataFrame
        Sample dataframe with ``x_disp``, ``y_disp``, ``z_disp`` columns and,
        optionally, ``x_force``, ``y_force``, ``z_force`` columns.
    max_force : float
        Divisor applied to the force columns.
    max_disp : float
        Divisor applied to the displacement columns.

    Returns
    -------
    pandas.DataFrame
        Scaled copy of ``df_unscaled``.

    Raises
    ------
    Exception
        If the displacement columns are missing or cannot be scaled, or if
        the force columns are present but cannot be scaled. The underlying
        error is chained as the cause.
    """
    disp_columns = ['x_disp','y_disp','z_disp']
    force_columns = ['x_force','y_force','z_force']

    df = df_unscaled.copy()

    # Assign whole columns (instead of writing in place through .loc) so the
    # float results are not forced into an existing integer column.
    try:
        df[disp_columns] = df[disp_columns] / max_disp
    except Exception as err:
        raise Exception('error: displacement data error during scaling. Check sample') from err

    ## output dataframes have no force columns: skip them explicitly instead
    ## of swallowing every possible error
    if all(column in df.columns for column in force_columns):
        try:
            df[force_columns] = df[force_columns] / max_force
        except Exception as err:
            raise Exception('error: force data error during scaling. Check sample') from err

    return df

In [283]:
# Scale the example input dataframe with the dataset-wide maxima.
scaled_input_data = scale_dataframe(raw_input_data, max_force, max_disp)
# Show rows 187-196 only, with the row-display limit lifted for this cell.
with pd.option_context("display.max_rows", None):
    display(scaled_input_data.iloc[187:197,:])

Unnamed: 0,node_number,named_selection,x_loc,y_loc,z_loc,x_disp,y_disp,z_disp,x_force,y_force,z_force
187,188,1,0.1318,0.43224,0.0,0.031275,-0.014854,0.0,0.0,0.0,0.0
188,189,-1,0.57219,0.72822,0.0,0.0,0.0,0.0,0.0,0.0,0.0
189,190,9,0.54142,0.85372,0.0,0.0,0.0,0.0,-0.250166,0.139079,0.0
190,191,-1,-0.005765,0.98492,0.0,0.0,0.0,0.0,0.0,0.0,0.0
191,192,-1,0.45286,0.66107,0.0,0.0,0.0,0.0,0.0,0.0,0.0
192,193,-1,0.11496,0.24569,0.0,0.0,0.0,0.0,0.0,0.0,0.0
193,194,9,0.54011,0.773,0.0,0.0,0.0,0.0,-0.250166,0.139079,0.0
194,195,-1,0.40589,0.86991,0.0,0.0,0.0,0.0,0.0,0.0,0.0
195,196,-1,0.074352,0.64324,0.0,0.0,0.0,0.0,0.0,0.0,0.0
196,197,-1,0.18725,0.32169,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [284]:
# Scale the example output dataframe; max_force is passed as well, but this
# frame has no force columns (see the displayed header).
scaled_output_data = scale_dataframe(raw_output_data, max_force, max_disp)
# Show the same row slice as for the scaled input data, for comparison.
with pd.option_context("display.max_rows", None):
    display(scaled_output_data.iloc[187:197,:])

Unnamed: 0,node_number,x_loc,y_loc,z_loc,x_disp,y_disp,z_disp
187,188,0.1318,0.43224,0.0,0.031275,-0.014853,0.0
188,189,0.57219,0.72822,0.0,0.003903,-0.021648,0.0
189,190,0.54142,0.85372,0.0,0.019122,-0.024433,0.0
190,191,-0.005765,0.98492,0.0,-0.01313,-0.044923,0.0
191,192,0.45286,0.66107,0.0,-0.019974,-0.015173,0.0
192,193,0.11496,0.24569,0.0,0.002163,-0.02437,0.0
193,194,0.54011,0.773,0.0,0.007538,-0.019851,0.0
194,195,0.40589,0.86991,0.0,-0.000164,-0.00845,0.0
195,196,0.074352,0.64324,0.0,0.012624,-0.023477,0.0
196,197,0.18725,0.32169,0.0,0.009299,-0.033534,0.0


And these functions are all we need to scale the data. It is not necessary to write these dataframes to new files as it is more memory efficient to use the original input and output data files and then scale the dataframe values as needed for the training batches