 ### image feature map extraction

In [153]:
import os
import dill
import torch
import argparse
from PIL import Image
import torchvision.transforms as transforms
import segmentation_models_pytorch as smp

option = 'in792sx'
feature_num = "1"

## data directory
data_root_path = 'E:\workspace\PROJECT\doosan\doosan_sem-1\in792sx'    
image_path = 'E:\workspace\PROJECT\doosan\doosan_sem-1\in792sx\images'

model_path = 'E:\workspace\PROJECT\doosan\doosan_sem-1\IN792sx_gamma_best.pt'

with open(os.path.join(data_root_path, 'images.txt')) as f:
    lines = f.readlines()
data_list = [line.rstrip('\n') for line in lines]
        
model = smp.DeepLabV3('resnet34', encoder_depth=4, encoder_weights=None, in_channels=1,decoder_channels=32)
model.load_state_dict(torch.load(os.path.join(model_path)))
model.cuda()

model.eval()

image_name = []
feature_map=[]

transform = transforms.Compose([transforms.Resize((512,512)),
    transforms.ToTensor(),
    ])

for data in data_list:
    data = data.split(' ')[0]
    img = Image.open(os.path.join(image_path, data))
    img = transform(img).cuda().unsqueeze(0)

    image_name.append(data)

    with torch.no_grad():
        _, features, decoder_output = model(img)
        
        ## Extract and save feature maps from encoder(->features) and decoder(->decoder_output)
        if feature_num == "1":
            feature_map.append(features[0])
            feature_name = "f1"
        elif feature_num == "2":
            feature_map.append(features[1])
            feature_name = "f2"
        elif feature_num == "3":
            feature_map.append(features[2])
            feature_name = "f3"
        elif feature_num == "4":
            feature_map.append(features[3])
            feature_name = "f4"
        elif feature_num == "5":
            feature_map.append(features[4])
            feature_name = "f5"
        elif feature_num == "6":
            feature_map.append(decoder_output)
            feature_name = "decoder_output"

data={
        'image_name' : image_name,
        'feature_map':feature_map,
        }



In [154]:
data['image_name']

['210324-409-1_m001_r1.png',
 '210324-409-1_m002_r1.png',
 '210324-409-1_m003_r1.png',
 '210324-409-1_m004_r1.png',
 '210324-409-1_m005_r1.png',
 '210324-409-1_m006_r1.png',
 '210324-409-1_m007_r1.png',
 '210324-409-1_m008_r1.png',
 '210324-409-1_m009_r1.png',
 '210324-409-1_m010_r1.png',
 '210324-409-1_m011_r1.png',
 '210324-409-1_m012_r1.png']

In [157]:
print(data['feature_map'][0].shape)
data['feature_map'][0]

torch.Size([1, 1, 512, 512])


tensor([[[[0.4549, 0.4471, 0.4549,  ..., 0.5882, 0.6235, 0.6196],
          [0.4549, 0.4471, 0.4549,  ..., 0.5725, 0.6157, 0.6275],
          [0.4510, 0.4510, 0.4431,  ..., 0.5686, 0.6157, 0.6392],
          ...,
          [0.3961, 0.3961, 0.3843,  ..., 0.6000, 0.5922, 0.5804],
          [0.3961, 0.3961, 0.3843,  ..., 0.5804, 0.5608, 0.5333],
          [0.4000, 0.3882, 0.3804,  ..., 0.5647, 0.5373, 0.4980]]]],
       device='cuda:0')

### Dataset

In [27]:
import pandas as pd
data_path = r"E:/workspace/PROJECT/doosan/doosan_sem-1/regression/data_all_features_add_image.csv"
dataset = pd.read_csv(data_path, encoding='UTF-8', sep=',') # integrated version of IN792sx, interrupt, cm939w data

In [134]:
from tabulate import tabulate
print('independent variable: (1)이미지 피쳐맵')
print(tabulate(dataset[['id','Name','image_feature_1','image_feature_2','image_feature_3','image_feature_4','image_feature_5','image_feature_6']][:10], headers='keys', tablefmt='fancy_outline'))

independent variable: (1)이미지 피쳐맵
╒════╤══════╤════════════╤════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════╤════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════

In [136]:
from tabulate import tabulate
print('independent variable: (2)시험정보 데이터')
print(tabulate(dataset[['id','Name','stress_mpa','temp_oc', "LMP"]].sample(frac=1).reset_index(drop=True)[:10], headers='keys', tablefmt='fancy_outline',  numalign='right',stralign='right' ))

independent variable: (2)시험정보 데이터
╒════╤═══════╤═══════════════════════════╤══════════════╤═══════════╤════════╕
│    │    id │                      Name │   stress_mpa │   temp_oc │    LMP │
╞════╪═══════╪═══════════════════════════╪══════════════╪═══════════╪════════╡
│  0 │ A0410 │  210330-410-2_m005_r1.png │       263.38 │       900 │ 26.666 │
│  1 │  A-C5 │  210324-AC5-2_m008_r1.png │        357.2 │       800 │ 25.526 │
│  2 │  A-C1 │  210415-AC1-1_m013_r1.png │        302.3 │       800 │ 26.049 │
│  3 │  A-C8 │  210415-AC8-1_m005_r1.png │         79.7 │      1000 │ 30.389 │
│  4 │  A-C9 │  210330-AC9-4_m009_r1.png │          329 │       800 │ 25.857 │
│  5 │   7_1 │                7_1_31.png │        181.5 │       900 │  27.75 │
│  6 │  18_1 │               18_1_36.png │         38.4 │      1000 │  32.66 │
│  7 │  A-C3 │  210324-AC3-4_m012_r1.png │         94.9 │       950 │ 29.799 │
│  8 │ A-C10 │ 210406-AC10-1_m006_r1.png │        164.5 │       900 │ 28.142 │
│  9 │  A-C9 │  21

In [118]:
from tabulate import tabulate
print('independent variable: (3)야금학적 물성정보')
print(tabulate(dataset[['id','Name','gamma','gammaP','gammaP_aspect','gammaP_width','gammaP_circle']].sample(frac=1).reset_index(drop=True)[:10], headers='keys', tablefmt='fancy_outline', numalign='right', stralign='right' ))

independent variable: (3)야금학적 물성정보
╒════╤═══════╤══════════════════════════╤═════════╤══════════╤═════════════════╤════════════════╤═════════════════╕
│    │    id │                     Name │   gamma │   gammaP │   gammaP_aspect │   gammaP_width │   gammaP_circle │
╞════╪═══════╪══════════════════════════╪═════════╪══════════╪═════════════════╪════════════════╪═════════════════╡
│  0 │ G-C11 │       950_g-c11_4_13.png │ 82.6375 │  17.3625 │         1.32894 │        15.9146 │         1.05955 │
│  1 │  A-C2 │ 210416-AC2-1_m003_r1.png │ 53.3584 │  46.6416 │         2.72009 │        52.7805 │         0.44296 │
│  2 │  17_2 │              17_2_23.png │ 56.7493 │  43.2507 │         1.14937 │        198.928 │         0.20721 │
│  3 │ A0412 │ 210330-412-2_m010_r1.png │ 62.0824 │  37.9176 │         3.25611 │        41.5151 │        0.378775 │
│  4 │   7_4 │               7_4_45.png │ 62.2771 │  37.7229 │         1.70116 │        33.8658 │        0.586627 │
│  5 │   7_1 │               7_1_38.p

In [133]:
from tabulate import tabulate
print('dependent variable: 열화수준')
print(tabulate(dataset[['id','Name','mean','upper','lower']].sample(frac=1).reset_index(drop=True)[:10], headers='keys', tablefmt='fancy_outline',numalign='right', stralign='right' ))

dependent variable: 열화수준
╒════╤═══════╤══════════════════════════╤════════╤═════════╤═════════╕
│    │    id │                     Name │   mean │   upper │   lower │
╞════╪═══════╪══════════════════════════╪════════╪═════════╪═════════╡
│  0 │  A-C8 │ 210415-AC8-4_m008_r1.png │    100 │     100 │  50.569 │
│  1 │  A-C6 │ 210324-AC6-3_m014_r1.png │    100 │     100 │  47.714 │
│  2 │   9_5 │               9_5_37.png │   0.49 │   1.027 │   0.234 │
│  3 │  18_3 │               18_3_4.png │ 74.531 │     100 │  37.687 │
│  4 │   7_5 │               7_5_22.png │ 10.541 │  22.095 │   5.029 │
│  5 │  G-C2 │        900_g_c2_2_19.png │    100 │     100 │  47.714 │
│  6 │ G0413 │       1000_g0413_3_8.png │    100 │     100 │  50.569 │
│  7 │  17_5 │              17_5_42.png │ 11.482 │  22.708 │   5.806 │
│  8 │   9_4 │               9_4_31.png │ 11.139 │  23.349 │   5.315 │
│  9 │   7_4 │                7_4_5.png │ 31.589 │  66.212 │  15.071 │
╘════╧═══════╧══════════════════════════╧════════╧══

In [77]:
# 통합데이터

dataset[:5]

Unnamed: 0,file_x,id,Name,stress_mpa,temp_oc,LMP,mean,upper,lower,gamma,...,gammaP_distrib,gammaP_aspect,gammaP_width,gammaP_circle,image_feature_1,image_feature_2,image_feature_3,image_feature_4,image_feature_5,image_feature_6
0,in792sx_interrupt,7_1,7_1_1.png,181.5,900,27.75,21.674,45.429,10.34,74.264962,...,"[90.964, 6.928, 2.108, 0.0, 0.0]",1.474768,33.981497,0.50501,"[[0.12024862319231033, 0.15066924691200256, 0....","[[0.11661333590745926, 0.13787168264389038, 0....","[[-0.2253749519586563, -0.2675281763076782, -0...","[[-0.3594183325767517, 0.003142327070236206, 0...","[[0.05846637487411499, -0.09212616086006165, 0...","[[-0.35045769810676575, -0.2591642439365387, -..."
1,in792sx_interrupt,7_1,7_1_10.png,181.5,900,27.75,21.674,45.429,10.34,66.826782,...,"[86.532, 12.121, 1.347, 0.0, 0.0]",1.478818,39.504702,0.698531,"[[0.0669223964214325, 0.2487642467021942, 0.19...","[[0.1659088283777237, 0.29244664311408997, -0....","[[0.6055417060852051, 0.05654902756214142, 0.0...","[[-0.030698813498020172, 0.15434464812278748, ...","[[0.10532476007938385, -0.15585653483867645, 0...","[[-0.009201321750879288, -0.26059597730636597,..."
2,in792sx_interrupt,7_1,7_1_11.png,181.5,900,27.75,21.674,45.429,10.34,66.53329,...,"[87.973, 8.935, 3.093, 0.0, 0.0]",1.57546,38.257919,0.650696,"[[0.12086768448352814, 0.23515920341014862, 0....","[[0.21036866307258606, -0.00779180321842432, 0...","[[-0.17663319408893585, -0.22108863294124603, ...","[[0.23829159140586853, 0.26274266839027405, 0....","[[0.7567316293716431, -0.14118076860904694, -0...","[[-0.019434262067079544, 0.08177691698074341, ..."
3,in792sx_interrupt,7_1,7_1_12.png,181.5,900,27.75,21.674,45.429,10.34,62.865252,...,"[80.0, 17.358, 2.642, 0.0, 0.0]",1.53697,39.448273,0.713092,"[[0.1290624439716339, 0.19674527645111084, 0.0...","[[0.3020085096359253, 0.06212484464049339, 0.2...","[[0.2172091007232666, -0.5811537504196167, -0....","[[0.01682022213935852, 0.15574216842651367, 0....","[[0.12405859678983688, -0.05523787438869476, 0...","[[0.3297439515590668, -0.12446973472833633, -0..."
4,in792sx_interrupt,7_1,7_1_13.png,181.5,900,27.75,21.674,45.429,10.34,68.665074,...,"[86.513, 12.5, 0.987, 0.0, 0.0]",1.43541,38.770547,0.676962,"[[0.07395485043525696, 0.17124620079994202, 0....","[[0.3928568661212921, 0.08067142963409424, 0.2...","[[0.18069368600845337, -0.038399189710617065, ...","[[0.08189915865659714, -0.49590593576431274, 0...","[[-0.010827787220478058, -0.05440230667591095,...","[[-0.2336520105600357, -0.36249321699142456, -..."


### Regression

data preprocessing

In [2]:
from data_openml import data_prep_openml
import numpy as np

feature_num = 1
cat_dims, cat_idxs, con_idxs, X_train, y_train, X_valid, y_valid, X_test, y_test, train_mean, train_std, IF_train, IF_valid, IF_test, y_upper_train, y_upper_valid, y_upper_test, y_lower_train, y_lower_valid, y_lower_test = data_prep_openml(str(feature_num), datasplit=[.65, .15, .2])
continuous_mean_std = np.array([train_mean,train_std]).astype(np.float32)

c:\Users\cho40\Anaconda3\envs\pytorch\lib\site-packages\numpy\.libs\libopenblas.FB5AE2TYXYH2IJRDKGDGQ3XBKLKTF43H.gfortran-win_amd64.dll
c:\Users\cho40\Anaconda3\envs\pytorch\lib\site-packages\numpy\.libs\libopenblas.XWYDX2IKJW2NMTWSFYNGFUWKQU3LYTCZ.gfortran-win_amd64.dll
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._set_item(key, value)


In [3]:
print(X_train)
print("-"*50)
print(y_train)
print("-"*50)
print(IF_train['data'][0].shape)
print(IF_train['data'][0])

{'data': array([[ 900. ,  181.5],
       [ 900. ,  181.5],
       [ 900. ,  181.5],
       ...,
       [1000. ,   25.1],
       [1000. ,   25.1],
       [1000. ,   25.1]]), 'mask': array([[1, 1],
       [1, 1],
       [1, 1],
       ...,
       [1, 1],
       [1, 1],
       [1, 1]])}
--------------------------------------------------
{'data': array([[ 21.674],
       [ 21.674],
       [ 21.674],
       ...,
       [100.   ],
       [100.   ],
       [100.   ]])}
--------------------------------------------------
torch.Size([1, 32])
tensor([[ 0.1202,  0.1507,  0.1546,  0.1247,  0.0643,  0.0926,  0.1591,  0.0743,
          0.1405,  0.1670,  0.0844, -0.0227,  0.1007,  0.1309,  0.1843,  0.0965,
          0.1741,  0.1247,  0.0965, -0.0221,  0.1345,  0.0791,  0.1235,  0.1331,
          0.1097,  0.0371,  0.0495,  0.0930,  0.1544,  0.1267,  0.2695,  0.1939]])


In [4]:
from torch.utils.data import DataLoader
from data_openml import DataSetCatCon

batchsize=216
train_ds = DataSetCatCon(X_train, y_train, y_upper_train, y_lower_train, IF_train, cat_idxs, continuous_mean_std)
trainloader = DataLoader(train_ds, batch_size=batchsize, shuffle=True,num_workers=4)

valid_ds = DataSetCatCon(X_valid, y_valid, y_upper_valid, y_lower_valid, IF_valid, cat_idxs, continuous_mean_std)
validloader = DataLoader(valid_ds, batch_size=batchsize, shuffle=False,num_workers=4)

test_ds = DataSetCatCon(X_test, y_test, y_upper_test, y_lower_test, IF_test, cat_idxs, continuous_mean_std)
testloader = DataLoader(test_ds, batch_size=batchsize, shuffle=False,num_workers=4)

model

In [22]:
import torch.nn as nn
import numpy as np
from models import TabAttention
import torch

y_dim = 1 # regression
cat_dims = np.append(np.array([1]),np.array(cat_dims)).astype(int) #Appending 1 for CLS token, this is later used to generate embeddings.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

embedding_size = 32
transformer_depth = 6
attention_heads = 8
attention_dropout = 0.1
ff_dropout = 0.1
cont_embeddings = 'MLP'
attentiontype = 'colrow'

model = TabAttention(
categories = tuple(cat_dims), 
num_continuous = len(con_idxs),                
dim = embedding_size,                           
dim_out = 1,                       
depth = transformer_depth,                       
heads = attention_heads,                         
attn_dropout = attention_dropout,             
ff_dropout = ff_dropout,                  
mlp_hidden_mults = (4, 2),       
cont_embeddings = cont_embeddings,
attentiontype = attentiontype,
y_dim=y_dim,
)

criterion = nn.MSELoss().to(device)
model.to(device)

TabAttention(
  (norm): LayerNorm((2,), eps=1e-05, elementwise_affine=True)
  (simple_MLP): ModuleList(
    (0): simple_MLP(
      (layers): Sequential(
        (0): Linear(in_features=1, out_features=100, bias=True)
        (1): ReLU()
        (2): Linear(in_features=100, out_features=32, bias=True)
      )
    )
    (1): simple_MLP(
      (layers): Sequential(
        (0): Linear(in_features=1, out_features=100, bias=True)
        (1): ReLU()
        (2): Linear(in_features=100, out_features=32, bias=True)
      )
    )
  )
  (transformer): RowColTransformer(
    (embeds): Embedding(4, 32)
    (layers): ModuleList(
      (0): ModuleList(
        (0): PreNorm(
          (norm): LayerNorm((32,), eps=1e-05, elementwise_affine=True)
          (fn): Residual(
            (fn): Attention(
              (to_qkv): Linear(in_features=32, out_features=384, bias=False)
              (to_out): Linear(in_features=128, out_features=32, bias=True)
              (dropout): Dropout(p=0.1, inplace=Fal

In [None]:
#optimizer
import torch.optim as optim
optimizer = optim.AdamW(model.parameters(),lr=0.0001)

#scheduler
epochs=10
scheduler = torch.optim.lr_scheduler.CosineAnnealingLR(optimizer, epochs)

In [24]:
best_valid_auroc = 0
best_valid_accuracy = 0
best_test_auroc = 0
best_test_accuracy = 0

best_valid_rmsle = 100000
best_valid_ratio = 0
RMSLE_best_test_rmsle = 100000
ACC_best_test_rmsle = 100000
RMSLE_best_test_mae = 100000
ACC_best_test_mae = 100000
RMSLE_best_test_r2 = -100000
ACC_best_test_r2 = -100000
RMSLE_best_test_ratio = 0
ACC_best_test_ratio = 0

In [26]:
from utils import embed_data, mean_sq_error, count_parameters

if __name__ == '__main__':
    print('Training begins now.')
    for epoch in range(epochs):
        model.train()
        running_loss = 0.0
        for i, data in enumerate(trainloader, 0):
            optimizer.zero_grad()
            x_categ, x_cont, y_gts, image_feature = data[0].to(device), data[1].to(device),data[2].to(device), data[3].to(device)

            # convert the data to embeddings in the next step
            _ , x_categ_enc, x_cont_enc = embed_data(x_categ, x_cont, model) 

            reps = model.transformer(x_categ_enc, x_cont_enc, image_feature)
            # select only the representations corresponding to CLS token and apply mlp on it in the next step to get the predictions.
            y_reps = reps[:,0,:]
            y_outs = model.mlpfory(y_reps)

            loss = criterion(y_outs,y_gts) 
            loss.backward()
            optimizer.step()

            scheduler.step()
            running_loss += loss.item()


        if epoch%1==0:
                model.eval()
                with torch.no_grad():

                    valid_mae,valid_mse,valid_rmsle,valid_r2, valid_y_test,valid_y_pred, valid_ratio = mean_sq_error(model, validloader, device)    
                    test_mae,test_mse,test_rmsle,test_r2, test_y_test,test_y_pred, test_ratio = mean_sq_error(model, testloader, device)  

                    print('[EPOCH %d] VALID | %.3f | RMSE: %.3f / MAE: %.3f / MSE: %.3f / R2: %.3f / ACC: %.3f' %(epoch + 1, valid_ratio, valid_rmsle,valid_mae,valid_mse,valid_r2,valid_ratio ))
                    print('[EPOCH %d]  TEST | %.3f | RMSE: %.3f / MAE: %.3f / MSE: %.3f / R2: %.3f / ACC: %.3f' %(epoch + 1, test_ratio, test_rmsle,test_mae,test_mse,test_r2,test_ratio ))
                    
                    if valid_rmsle < best_valid_rmsle:
                        best_valid_rmsle = valid_rmsle
                        RMSLE_best_test_rmsle = test_rmsle
                        RMSLE_best_test_mae = test_mae
                        RMSLE_best_test_r2 = test_r2
                        RMSLE_best_test_ratio = test_ratio
                        #torch.save(model.state_dict(),'%s/bestmodel.pth' % (modelsave_path))
                    if valid_ratio > best_valid_ratio:
                        best_valid_ratio = valid_ratio
                        ACC_best_test_rmsle = test_rmsle
                        ACC_best_test_mae = test_mae
                        ACC_best_test_r2 = test_r2
                        ACC_best_test_ratio = test_ratio
                model.train()


    total_parameters = count_parameters(model)
    print('TOTAL NUMBER OF PARAMS: %d' %(total_parameters))
    print(f"RMSLE-based || MAE:{RMSLE_best_test_mae} | RMSLE:{RMSLE_best_test_rmsle} | R2:{RMSLE_best_test_r2} | ACC:{RMSLE_best_test_ratio}")
    print(f"ACC-based || MAE:{ACC_best_test_mae} | RMSLE:{ACC_best_test_rmsle} | R2:{ACC_best_test_r2} | ACC:{ACC_best_test_ratio}")


Training begins now.
[EPOCH 1] VALID | 0.048 | RMSE: 2.521 / MAE: 66.444 / MSE: 5879.973 / R2: -2.770 / ACC: 0.048
[EPOCH 1]  TEST | 0.035 | RMSE: 2.516 / MAE: 66.037 / MSE: 5882.630 / R2: -2.618 / ACC: 0.035
[EPOCH 2] VALID | 0.058 | RMSE: 2.345 / MAE: 65.509 / MSE: 5720.528 / R2: -2.668 / ACC: 0.058
[EPOCH 2]  TEST | 0.072 | RMSE: 2.343 / MAE: 65.113 / MSE: 5724.020 / R2: -2.521 / ACC: 0.072
[EPOCH 3] VALID | 0.088 | RMSE: 2.167 / MAE: 64.300 / MSE: 5518.912 / R2: -2.538 / ACC: 0.088
[EPOCH 3]  TEST | 0.090 | RMSE: 2.169 / MAE: 63.919 / MSE: 5523.884 / R2: -2.398 / ACC: 0.090
[EPOCH 4] VALID | 0.088 | RMSE: 2.126 / MAE: 63.974 / MSE: 5465.482 / R2: -2.504 / ACC: 0.088
[EPOCH 4]  TEST | 0.090 | RMSE: 2.129 / MAE: 63.598 / MSE: 5470.798 / R2: -2.365 / ACC: 0.090
[EPOCH 5] VALID | 0.088 | RMSE: 1.928 / MAE: 62.043 / MSE: 5156.163 / R2: -2.306 / ACC: 0.088
[EPOCH 5]  TEST | 0.090 | RMSE: 1.937 / MAE: 61.693 / MSE: 5163.832 / R2: -2.176 / ACC: 0.090
[EPOCH 6] VALID | 0.100 | RMSE: 1.877 /