In [1]:
from scripts.pipeline_tools import (
    prepare_data,
    regressor_uncertainty,
    select_unstable_data,
    retrain_regressor,
    uncertainty_change,
    mse_change,
)
from scripts.Models import ITGDatasetDF, load_model, ITGDataset
from sklearn.preprocessing import StandardScaler
from torch.utils.data import DataLoader
from scripts.utils import train_keys
import copy
import numpy as np
from tqdm.auto import tqdm
import matplotlib.pyplot as plt
import yaml 

In [2]:
# Load the pipeline configuration from the working directory.
# The file is opened explicitly as UTF-8 so parsing does not depend on the
# platform's default text encoding.
# NOTE(review): yaml.FullLoader resolves more than plain YAML data types. If this
# config could ever come from an untrusted source, switch to yaml.safe_load.
with open('pipeline_config.yaml', encoding='utf-8') as f:
    cfg = yaml.load(f, Loader=yaml.FullLoader)

# Per-model entries; each is read later for its "trained" flag and "save_path".
pretrained = cfg['pretrained']
# Dataset locations; the 'train' and 'validation' entries are passed to prepare_data.
paths = cfg['data']

In [3]:
# Build the training and validation frames from the configured paths.
train_data, val_data = prepare_data(
    paths['train'], paths['validation'], target_column="efiitg_gb", target_var="itg"
)

# Fit the scaler on the training frame only (every column except "itg").
# `fit` is used instead of `fit_transform`: the transformed array was discarded,
# so computing it was wasted work. The fitted statistics are identical.
scaler = StandardScaler()
scaler.fit(train_data.drop(["itg"], axis=1))

train_dataset = ITGDatasetDF(train_data, target_column="efiitg_gb", target_var="itg")
valid_dataset = ITGDatasetDF(val_data, target_column="efiitg_gb", target_var="itg")

# TODO: further testing of the scale function
# Both datasets are scaled with the same training-fitted scaler.
train_dataset.scale(scaler)
valid_dataset.scale(scaler)

In [4]:
# Load pretrained models.
# Only config entries whose "trained" flag is set are loaded; the resulting dict
# maps each model name to the object returned by load_model.
# The header is printed first because load_model reports each model as it loads.
print("Loaded the following models:\n")
models = {
    name: load_model(name, spec["save_path"])
    for name, spec in pretrained.items()
    if spec["trained"]
}

Loaded the following models:

Model Loaded: ITG_class
Model Loaded: ITG_reg


In [5]:
# TODO: Needs to be the true training samples used!!!
# Take 10,000 points from the scaled training dataset. Per the TODO above, this
# is a stand-in: it is not guaranteed to be the data the pretrained regressor
# was actually trained on. The retraining loop later mutates this object via
# `train_sample.add(...)`, so re-running that loop without re-running this cell
# keeps the previously added points.
train_sample = train_dataset.sample(10_000)

In [None]:
# --- Retraining loop settings -------------------------------------------------
init_epoch = 25  # iteration i retrains for init_epoch * (i + 1) epochs
iterations = 4  # number of sample -> select -> retrain rounds
valid_sample_size = 10_000  # points drawn from the validation pool each round
mc_dropout_runs = 15  # n_runs passed to every regressor_uncertainty call
keep_fraction = 0.25  # `keep` passed for novel data (presumably the share retained - confirm)
retrain_learning_rate = 1e-3
retrain_lam = 0.6  # `lam` passed to retrain_regressor

# One entry per iteration; consumed by the plotting cells.
train_losses = []
test_losses = []

for i in range(iterations):
    print(f"\nIteration: {i}\n")
    valid_sample = valid_dataset.sample(valid_sample_size)

    # remove the sampled data points from the dataset
    valid_dataset.remove(valid_sample.data.index)

    # Return value is unused; this appears to filter `valid_sample` in place
    # using the classifier (the log reports dropped rows) - confirm.
    select_unstable_data(valid_sample, batch_size=100, classifier=models["ITG_class"])

    # Uncertainty of the regressor on the novel (validation-pool) candidates.
    uncertain_dataset, uncert_before, data_idx = regressor_uncertainty(
        valid_sample,
        models["ITG_reg"],
        n_runs=mc_dropout_runs,
        keep=keep_fraction,
        valid_dataset=valid_dataset,
    )
    # Uncertainty on the current training sample, measured BEFORE the novel
    # points are added to it.
    train_sample_origin, train_uncert_before, train_uncert_idx = regressor_uncertainty(
        train_sample,
        models["ITG_reg"],
        n_runs=mc_dropout_runs,
        train_data=True,
    )

    # Grow the training sample with the selected novel points (persists across iterations).
    train_sample.add(uncertain_dataset)

    # Single full-size batch over the enlarged training sample.
    uncertain_loader = DataLoader(train_sample, batch_size=len(train_sample), shuffle=True)

    # The returned index order is reused after retraining so predictions line up.
    prediction_before, prediction_idx_order = models["ITG_reg"].predict(uncertain_loader)

    # Switching validation dataset to numpy arrays to see if it is quicker
    x_array = valid_dataset.data[train_keys].values
    y_array = valid_dataset.data["itg"].values
    z_array = valid_dataset.data["efiitg_gb"].values
    dataset_numpy = ITGDataset(x_array, y_array, z_array)
    valid_loader = DataLoader(
        dataset_numpy,
        # ~10% of the remaining pool per batch; never 0, which DataLoader rejects.
        batch_size=max(1, int(0.1 * len(y_array))),
        shuffle=True,
    )

    # Retrain Regressor (Further research required)
    epochs = init_epoch * (i + 1)
    train_loss, test_loss = retrain_regressor(
        uncertain_loader,
        valid_loader,
        models["ITG_reg"],
        learning_rate=retrain_learning_rate,
        epochs=epochs,
        validation_step=True,
        lam=retrain_lam,
    )

    train_losses.append(train_loss)
    test_losses.append(test_loss)

    prediction_after, _ = models["ITG_reg"].predict(uncertain_loader, prediction_idx_order)

    # Re-measure uncertainty on the same points, in the same order as before retraining.
    _, uncert_after, _ = regressor_uncertainty(
        valid_sample,
        models["ITG_reg"],
        n_runs=mc_dropout_runs,
        keep=keep_fraction,
        order_idx=data_idx,
    )
    _, train_uncert_after, _ = regressor_uncertainty(
        train_sample_origin,
        models["ITG_reg"],
        n_runs=mc_dropout_runs,
        order_idx=train_uncert_idx,
        train_data=True,
    )

    print("\nNovel Data Uncertainty changes...\n")
    uncertainty_change(x=uncert_before, y=uncert_after)

    print("\nTraining Data Uncertainty changes...\n")
    uncertainty_change(x=train_uncert_before, y=train_uncert_after)

    # Novel-data report. The uncertainty pair is passed positionally, exactly as
    # in the original call, because the helper's signature is not visible here.
    mse_change(
        prediction_before,
        prediction_after,
        prediction_idx_order,
        data_idx,
        uncertain_loader,
        [uncert_before, uncert_after],
    )

    # Training-data report.
    mse_change(
        prediction_before,
        prediction_after,
        prediction_idx_order,
        train_uncert_idx,
        uncertain_loader,
        uncertainties=[train_uncert_before, train_uncert_after],
        data="train",
    )


Iteration: 0


Running classifier selection...



  0%|          | 0/100 [00:00<?, ?it/s]


Stable points: 7478
Misclassified points: 554
Percentage of misclassified points:  5.54%

Dropped 7704 rows

Running MC Dropout on Novel Data....



  0%|          | 0/15 [00:00<?, ?it/s]


Number of points passed for MC dropout: 2296
no valid before : 2199776
no valid after : 2201498

Running MC Dropout on Training Data....



  0%|          | 0/15 [00:00<?, ?it/s]


Number of points passed for MC dropout: 10000

Retraining regressor...

Training on 10574 points


  0%|          | 0/11 [00:00<?, ?it/s]

Initial loss: 0.9512
Train Step:  0
Loss: 1.3697
Validation Step:  0


  0%|          | 0/11 [00:00<?, ?it/s]

Test loss: 0.9070
Train Step:  1
Loss: 1.3253
Train Step:  2
Loss: 1.2822
Train Step:  3
Loss: 1.2412
Train Step:  4
Loss: 1.2031
Train Step:  5
Loss: 1.1643
Train Step:  6
Loss: 1.1266
Train Step:  7
Loss: 1.0884
Train Step:  8
Loss: 1.0501
Train Step:  9
Loss: 1.0130
Train Step:  10
Loss: 0.9787
Validation Step:  10


  0%|          | 0/11 [00:00<?, ?it/s]

Test loss: 0.6114
Train Step:  11
Loss: 0.9386
Train Step:  12
Loss: 0.9005
Train Step:  13
Loss: 0.8591
Train Step:  14
Loss: 0.8158
Train Step:  15
Loss: 0.7719
Train Step:  16
Loss: 0.7304
Train Step:  17
Loss: 0.6865
Train Step:  18
Loss: 0.6452
Train Step:  19
Loss: 0.6002
Train Step:  20
Loss: 0.5640
Validation Step:  20


  0%|          | 0/11 [00:00<?, ?it/s]

Test loss: 0.3493
Train Step:  21
Loss: 0.5212
Train Step:  22
Loss: 0.4864
Train Step:  23
Loss: 0.4514
Train Step:  24
Loss: 0.4162
Validation Step:  24


  0%|          | 0/11 [00:00<?, ?it/s]

Test loss: 0.2881

Running MC Dropout on Novel Data....



  0%|          | 0/15 [00:00<?, ?it/s]


Number of points passed for MC dropout: 2296

Running MC Dropout on Training Data....



  0%|          | 0/15 [00:00<?, ?it/s]


Number of points passed for MC dropout: 10000

Novel Data Uncertainty changes...

 Decreased 98.780% Increased: 1.220 % No Change: 0.000 
Initial Average Uncertainty: 0.3793, Final Average Uncertainty: 0.1659

Training Data Uncertainty changes...

 Decreased 40.140% Increased: 59.860 % No Change: 0.000 
Initial Average Uncertainty: 0.0782, Final Average Uncertainty: 0.0676

Change in MSE for novel dataset: 1.8741


Change in MSE for train dataset: 0.1439


Iteration: 1


Running classifier selection...



  0%|          | 0/100 [00:00<?, ?it/s]


Stable points: 7381
Misclassified points: 519
Percentage of misclassified points:  5.19%

Dropped 7632 rows

Running MC Dropout on Novel Data....



  0%|          | 0/15 [00:00<?, ?it/s]


Number of points passed for MC dropout: 2368
no valid before : 2191498
no valid after : 2193274

Running MC Dropout on Training Data....



  0%|          | 0/15 [00:00<?, ?it/s]


Number of points passed for MC dropout: 10574

Retraining regressor...

Training on 11166 points


  0%|          | 0/11 [00:00<?, ?it/s]

Initial loss: 0.7196
Train Step:  0
Loss: 1.2671
Validation Step:  0


  0%|          | 0/11 [00:00<?, ?it/s]

Test loss: 0.6918
Train Step:  1
Loss: 1.2200
Train Step:  2
Loss: 1.1736
Train Step:  3
Loss: 1.1231
Train Step:  4
Loss: 1.0740
Train Step:  5
Loss: 1.0201
Train Step:  6
Loss: 0.9664
Train Step:  7
Loss: 0.9121
Train Step:  8
Loss: 0.8523
Train Step:  9
Loss: 0.7976
Train Step:  10
Loss: 0.7417
Validation Step:  10


  0%|          | 0/11 [00:00<?, ?it/s]

Test loss: 0.4248
Train Step:  11
Loss: 0.6900
Train Step:  12
Loss: 0.6447
Train Step:  13
Loss: 0.5953
Train Step:  14
Loss: 0.5601
Train Step:  15
Loss: 0.5235
Train Step:  16
Loss: 0.4954
Train Step:  17
Loss: 0.4672
Train Step:  18
Loss: 0.4397
Train Step:  19
Loss: 0.4211
Train Step:  20
Loss: 0.3968
Validation Step:  20


  0%|          | 0/11 [00:00<?, ?it/s]

Test loss: 0.2799
Train Step:  21
Loss: 0.3813
Train Step:  22
Loss: 0.3603
Train Step:  23
Loss: 0.3530
Train Step:  24
Loss: 0.3405
Train Step:  25
Loss: 0.3286
Train Step:  26
Loss: 0.3165
Train Step:  27
Loss: 0.3025
Train Step:  28
Loss: 0.2998
Train Step:  29
Loss: 0.2871
Train Step:  30
Loss: 0.2847
Validation Step:  30


  0%|          | 0/11 [00:00<?, ?it/s]

Test loss: 0.2193
Train Step:  31
Loss: 0.2764
Train Step:  32
Loss: 0.2640
Train Step:  33
Loss: 0.2590
Train Step:  34
Loss: 0.2590
Train Step:  35
Loss: 0.2520
Train Step:  36
Loss: 0.2475
Train Step:  37
Loss: 0.2399
Train Step:  38
Loss: 0.2480
Train Step:  39
Loss: 0.2434
Train Step:  40
Loss: 0.2374
Validation Step:  40


  0%|          | 0/11 [00:00<?, ?it/s]

Test loss: 0.1935
Train Step:  41
Loss: 0.2279
Train Step:  42
Loss: 0.2259
Train Step:  43
Loss: 0.2301
Train Step:  44
Loss: 0.2186
Train Step:  45
Loss: 0.2197
Train Step:  46
Loss: 0.2181
Train Step:  47
Loss: 0.2122
Train Step:  48
Loss: 0.2114
Train Step:  49
Loss: 0.2106
Validation Step:  49


  0%|          | 0/11 [00:00<?, ?it/s]

Test loss: 0.1843

Running MC Dropout on Novel Data....



  0%|          | 0/15 [00:00<?, ?it/s]


Number of points passed for MC dropout: 2368

Running MC Dropout on Training Data....



  0%|          | 0/15 [00:00<?, ?it/s]

In [None]:
# Training loss per retraining round: one scatter series per iteration.
fig, ax = plt.subplots()
for i, losses in enumerate(train_losses):
    ax.scatter(np.arange(len(losses)), losses, label=f"Iteration {i}")

# Build the legend once, after every series is drawn, instead of rebuilding it
# on each loop pass. Skipped when there is nothing to label.
if train_losses:
    ax.legend()

ax.set_xlabel("Epochs")
ax.set_ylabel("Training Loss")
fig.savefig("SP_training_loss.png", dpi=300)

In [None]:
# Test loss per retraining round: a connecting line plus markers for each
# iteration; only the markers carry the legend label.
# NOTE(review): the x values are the positions of the recorded test-loss entries.
# The run log shows validation steps only every few training steps, so the
# "Epochs" axis label may not be literal epochs - confirm against retrain_regressor.
plt.figure()
for idx, losses in enumerate(test_losses):
    steps = np.arange(0, len(losses))
    plt.plot(steps, losses)
    plt.scatter(steps, losses, label=f"Iteration {idx}")

plt.legend()
plt.xlabel("Epochs")
plt.ylabel("Test Loss")
plt.savefig("SP_test_loss.png", dpi=300)