In [1]:
import numpy as np
import pandas as pd

# User Input Principle (UIP)

In [2]:
def uip_augmente_data(raw_df, x_cols, n_augmented=0, x_std=0, id_col='sID', da_col='DA#', random_state=None):
    """Augment a dataset by adding Gaussian noise to selected columns.

    Every row of ``raw_df`` is kept once unchanged (``da_col == 0``) and is
    followed by ``n_augmented`` noisy copies (``da_col == 1 .. n_augmented``).
    In each copy the ``x_cols`` values are drawn from a normal distribution
    centred on the original value with standard deviation ``x_std``; all other
    columns are repeated unchanged. The drawn values are not clipped, so they
    can fall outside the range of the original data (e.g. become negative).

    Parameters
    ----------
    raw_df : pandas.DataFrame
        Input data. It is not modified; its index is ignored.
    x_cols : list of str
        Numeric columns to perturb.
    n_augmented : int, default 0
        Number of noisy copies generated per input row.
    x_std : float or array-like, default 0
        Standard deviation of the noise; anything that broadcasts against
        ``raw_df[x_cols]``.
    id_col : str, default 'sID'
        Column holding the sample identifier.
    da_col : str, default 'DA#'
        Name of the output column holding the augmentation number. A column
        with this name already present in ``raw_df`` is overwritten.
    random_state : None, int, or anything accepted by ``numpy.random.default_rng``
        Seed for the random generator.

    Returns
    -------
    pandas.DataFrame
        Rows sorted by ``[id_col, da_col]`` with a fresh RangeIndex and columns
        ordered as ``[id_col, da_col] + x_cols + <remaining columns>``.
    """
    x_cols = list(x_cols)

    # reset_index returns a new frame, so the caller's raw_df is never mutated,
    # and the clean positional index guarantees that the noisy values line up
    # with their source rows even when raw_df was filtered or re-indexed.
    base_df = raw_df.reset_index(drop=True)

    # Derive the pass-through columns from the argument itself (not from a
    # notebook global). da_col is excluded so a pre-existing augmentation
    # column cannot appear twice in the result.
    reserved_cols = {id_col, da_col, *x_cols}
    nontarget_cols = [col for col in base_df.columns if col not in reserved_cols]

    base_df[da_col] = 0

    x_values = base_df[x_cols].to_numpy()
    x_std_arr = np.ones(x_values.shape) * x_std

    rng = np.random.default_rng(random_state)
    frames = [base_df]
    for i in range(n_augmented):
        noisy_x = pd.DataFrame(rng.normal(x_values, x_std_arr), columns=x_cols, index=base_df.index)
        noisy_df = pd.concat([base_df[[id_col]], noisy_x, base_df[nontarget_cols]], axis=1)
        noisy_df[da_col] = i + 1
        frames.append(noisy_df)

    final_df = pd.concat(frames, ignore_index=True).sort_values(by=[id_col, da_col]).reset_index(drop=True)
    return final_df[[id_col, da_col] + x_cols + nontarget_cols]

In [3]:
# Columns that the UIP augmentation perturbs, and the training set it runs on.
target_cols = ['CNT', 'MXN', 'GNP', 'PVA']
df_train = pd.read_csv('data/active_learning_training_data.csv')

# 1000 noisy copies per row with noise std 0.02; seeded so the output is reproducible.
uip_augmente_data(
    df_train,
    target_cols,
    n_augmented=1000,
    x_std=0.02,
    random_state=0,
)

Unnamed: 0,sID,DA#,CNT,MXN,GNP,PVA,1D,2D,2D1D,2D2D,Pre-stretch,thickness,S0,eps10
0,bSC-0001,0,0.200000,0.610000,0.150000,0.040000,1,0,0,0,0,1200,5.574136,0.670663
1,bSC-0001,1,0.202515,0.607358,0.162808,0.042098,1,0,0,0,0,1200,5.574136,0.670663
2,bSC-0001,2,0.197911,0.593954,0.132632,0.048508,1,0,0,0,0,1200,5.574136,0.670663
3,bSC-0001,3,0.215230,0.642719,0.163175,0.028769,1,0,0,0,0,1200,5.574136,0.670663
4,bSC-0001,4,0.209021,0.615124,0.149993,-0.002652,1,0,0,0,0,1200,5.574136,0.670663
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
146141,bSC-0830,996,0.698432,0.145841,0.155980,0.066471,0,0,0,1,300,1600,7.794232,5.666363
146142,bSC-0830,997,0.690565,0.125560,0.109816,0.055085,0,0,0,1,300,1600,7.794232,5.666363
146143,bSC-0830,998,0.658160,0.142843,0.166173,0.068821,0,0,0,1,300,1600,7.794232,5.666363
146144,bSC-0830,999,0.669109,0.095376,0.155236,0.029360,0,0,0,1,300,1600,7.794232,5.666363


# Synthetic Minority Oversampling Technique (SMOTE)

In [4]:
def smote_augmente_data(data, target_columns, n_augmented=10, std_dev=0.1, k_neighbors=3, id_col='sID', da_col='DA#', random_seed=None):
    """Augment a dataset by SMOTE-style interpolation towards synthetic neighbours.

    For every input row, ``k_neighbors`` "neighbours" are created by adding
    Gaussian noise (mean 0, standard deviation ``std_dev``) to the row's
    ``target_columns`` values. Each of the ``n_augmented`` synthetic samples is
    then a random point on the segment between the original row and one
    randomly chosen neighbour. Note that the neighbours are generated from the
    row itself; other rows of ``data`` are never used as neighbours.

    The original row is kept with ``da_col == 0`` and its synthetic samples are
    numbered ``1 .. n_augmented``. All non-target columns are repeated
    unchanged alongside the values generated from the same input row.

    Parameters
    ----------
    data : pandas.DataFrame
        Input data. It is not modified; its index is ignored.
    target_columns : list of str
        Numeric columns to perturb.
    n_augmented : int, default 10
        Number of synthetic samples generated per input row.
    std_dev : float, default 0.1
        Standard deviation of the noise used to create the neighbours.
    k_neighbors : int, default 3
        Number of synthetic neighbours created per input row.
    id_col : str, default 'sID'
        Column holding the sample identifier.
    da_col : str, default 'DA#'
        Name of the output column holding the augmentation number. A column
        with this name already present in ``data`` is replaced.
    random_seed : int or None, default None
        Seed for reproducible output. When ``None`` the global NumPy random
        state is used as-is.

    Returns
    -------
    pandas.DataFrame
        Rows ordered by ``id_col`` (each input row's block kept contiguous, in
        augmentation order) with a fresh RangeIndex and columns ordered as
        ``[id_col, da_col] + target_columns + <remaining columns>``.

    Raises
    ------
    ValueError
        If ``n_augmented`` is negative.
    """
    if n_augmented < 0:
        raise ValueError("n_augmented must be non-negative")

    # A private legacy RandomState yields exactly the stream that
    # np.random.seed(random_seed) would, without reseeding the global generator
    # behind the caller's back. Without a seed, fall back to the global state.
    rng = np.random.RandomState(random_seed) if random_seed is not None else np.random

    target_columns = list(target_columns)
    reserved_cols = {id_col, da_col, *target_columns}
    nontarget_cols = [col for col in data.columns if col not in reserved_cols]

    copies_per_row = n_augmented + 1
    target_values = data[target_columns].to_numpy(dtype=float)
    n_rows, n_targets = target_values.shape

    augmented_targets = np.empty((n_rows * copies_per_row, n_targets))
    for row_pos, original in enumerate(target_values):
        neighbors = original + rng.normal(0, std_dev, (k_neighbors, n_targets))
        start = row_pos * copies_per_row
        augmented_targets[start] = original  # augmentation 0 is the untouched row

        for i in range(n_augmented):
            neighbor = neighbors[rng.randint(0, k_neighbors)]
            lam = rng.uniform(0, 1)
            augmented_targets[start + i + 1] = original + lam * (neighbor - original)

    # Repeat every input row positionally so ids and pass-through columns stay
    # attached to the targets generated from that same row, whatever the input
    # order or index looks like.
    row_positions = np.repeat(np.arange(n_rows), copies_per_row)
    repeated = data.iloc[row_positions].reset_index(drop=True)

    da_df = pd.DataFrame({da_col: np.tile(np.arange(copies_per_row), n_rows)})
    da_df[da_col] = da_df[da_col].astype(int)
    target_df = pd.DataFrame(augmented_targets, columns=target_columns)

    augmented_data_df = pd.concat(
        [repeated[[id_col]], da_df, target_df, repeated[nontarget_cols]],
        axis=1,
    )

    # Stable sort: rows are grouped by id while each block keeps its 0..n order.
    return augmented_data_df.sort_values(by=id_col, kind='mergesort').reset_index(drop=True)

In [5]:
# Columns that the SMOTE-style augmentation perturbs, and the training set it runs on.
target_columns = ['CNT', 'MXN', 'GNP', 'PVA']
df_train = pd.read_csv('data/active_learning_training_data.csv')

# 1000 synthetic samples per row from 5 noisy neighbours (noise std 0.02); seeded for reproducibility.
smote_augmente_data(
    df_train,
    target_columns=target_columns,
    n_augmented=1000,
    std_dev=0.02,
    k_neighbors=5,
    random_seed=0,
)

Unnamed: 0,sID,DA#,CNT,MXN,GNP,PVA,1D,2D,2D1D,2D2D,Pre-stretch,thickness,S0,eps10
0,bSC-0001,0,0.200000,0.610000,0.150000,0.040000,1,0,0,0,0,1200,5.574136,0.670663
1,bSC-0001,1,0.220072,0.599497,0.160211,0.038373,1,0,0,0,0,1200,5.574136,0.670663
2,bSC-0001,2,0.218411,0.614176,0.160215,0.063388,1,0,0,0,0,1200,5.574136,0.670663
3,bSC-0001,3,0.214152,0.608057,0.152965,0.031910,1,0,0,0,0,1200,5.574136,0.670663
4,bSC-0001,4,0.211216,0.611793,0.156542,0.044918,1,0,0,0,0,1200,5.574136,0.670663
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
146141,bSC-0830,996,0.692140,0.118706,0.143884,0.051525,0,0,0,1,300,1600,7.794232,5.666363
146142,bSC-0830,997,0.700736,0.119494,0.136229,0.036651,0,0,0,1,300,1600,7.794232,5.666363
146143,bSC-0830,998,0.698540,0.123743,0.141600,0.034667,0,0,0,1,300,1600,7.794232,5.666363
146144,bSC-0830,999,0.708032,0.098684,0.126810,0.019169,0,0,0,1,300,1600,7.794232,5.666363
