In [1]:
import numpy as np
import pandas as pd
import torch
from torch.utils.data import TensorDataset, DataLoader
import torch.nn as nn
from transformers import BertTokenizer, BertModel
from sklearn.preprocessing import OneHotEncoder, LabelEncoder, StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score
import matplotlib.pyplot as plt
from tqdm import tqdm
import torch.nn.functional as F
from transformers import AdamW
import re

In [2]:
# Load the source spreadsheet; later cells read its columns
# ('청구품목', '발주처', 'Part No.1', '견적단가', '견적화폐', 'Machinery', 'Assembly').
data = pd.read_excel('filtered_30_filled_money.xlsx')

In [3]:
def preprocess_text(text):
    """Normalise a free-text item description.

    Steps: lowercase, drop parenthesised fragments, drop every character that
    is not a word character, whitespace or one of ``* / - + . , # &``, remove
    the stand-alone markers '사용금지' / '사', then collapse whitespace.

    Args:
        text: Raw cell value. Strings are cleaned; ``None``/NaN yield ``''``;
            any other value is converted with ``str()`` first.

    Returns:
        The cleaned string (possibly empty).
    """
    if not isinstance(text, str):
        # Missing spreadsheet cells arrive as None or float NaN; calling
        # .lower() on them would raise AttributeError.
        if text is None or (isinstance(text, float) and text != text):
            return ''
        text = str(text)

    text = text.lower()
    text = re.sub(r'\([^)]*\)', '', text)
    text = re.sub(r'[^\w\s\*/\-\+.,#&]', '', text)
    text = re.sub(r'\b(사용금지|사)\b', '', text, flags=re.IGNORECASE)
    # Collapse whitespace last so that removing a marker from the middle of
    # the text cannot leave a double space behind.
    text = re.sub(r'\s+', ' ', text)
    return text.strip()

def clean_supplier_name(name):
    """Normalise a supplier name so spelling variants map to the same string.

    Lowercases, repairs common misspellings of "corporation", drops the
    '(사용금지)' marker, rewrites 'u.s.a' to '_usa', strips dots, removes
    legal-form words, removes remaining punctuation and collapses whitespace.

    Args:
        name: Raw supplier name. ``None``/NaN yield ``''``; other non-string
            values are converted with ``str()`` first.

    Returns:
        The normalised supplier name (possibly empty).
    """
    if not isinstance(name, str):
        # Missing cells arrive as None or float NaN; .lower() would fail.
        if name is None or (isinstance(name, float) and name != name):
            return ''
        name = str(name)

    name = name.lower()
    name = re.sub(r'coporation|coropration|coproration|corporration', 'corporation', name)
    name = re.sub(r'\(사용금지\)', '', name)
    name = re.sub(r'u\.s\.a', '_usa', name)
    name = re.sub(r'\.', '', name)
    # Latin legal forms and the stand-alone '주' must match whole words only.
    # Without the \b anchors 'co'/'inc'/'주' were deleted from inside real
    # names ("cosmos" -> "smos", "zinc" -> "z", "제주" -> "제").
    # The multi-character Korean forms are still removed anywhere, as before,
    # because they are commonly written attached to the company name.
    suffixes = (
        r'\b(?:corporation|corp|company|co|incorporated|inc|limited|ltd|gmbh|pte ltd|llc|주)\b'
        r'|주식회사|상사|공사|엔지니어링'
    )
    name = re.sub(suffixes, '', name, flags=re.IGNORECASE)
    name = re.sub(r'[^\w\s-]', '', name)
    name = re.sub(r'\s+', ' ', name).strip()
    return name

In [4]:
# Text preprocessing: clean the item description and the supplier name, then
# join them with the part number into one space-separated input string.
data['cleaned_item'] = data['청구품목'].apply(preprocess_text)
data['cleaned_supplier'] = data['발주처'].apply(clean_supplier_name)

item_part, part_no_part, supplier_part = (
    data[column].fillna('') for column in ('cleaned_item', 'Part No.1', 'cleaned_supplier')
)
data['combined_text'] = item_part + " " + part_no_part + " " + supplier_part


In [5]:
# Fixed conversion factors into USD (hard-coded values, not live rates).
exchange_rates = {'USD': 1, 'KRW': 0.00078, 'EUR': 1.18, 'JPY': 0.0091}

# Express every quoted unit price in USD. The lookup is vectorised instead of
# a row-wise apply; an unknown currency still fails loudly with KeyError
# rather than silently producing NaN prices.
usd_rate = data['견적화폐'].map(exchange_rates)
unknown_currencies = data.loc[usd_rate.isna(), '견적화폐'].unique()
if len(unknown_currencies) > 0:
    raise KeyError(f"No exchange rate defined for currency code(s): {list(unknown_currencies)}")
data['converted_price'] = data['견적단가'] * usd_rate


In [6]:
# Sanity check: distinct currency codes and the number of missing currency values.
print(data['견적화폐'].unique(), data['견적화폐'].isnull().sum())


['USD' 'KRW' 'EUR' 'JPY'] 0


In [7]:
# One-hot encode the quote currency as a dense array (one column per code).
currency_frame = data[['견적화폐']]
currency_ohe = OneHotEncoder(sparse_output=False).fit(currency_frame)
currency_encoded = currency_ohe.transform(currency_frame)

In [8]:
# Label encoding: map the Machinery and Assembly class names to integer ids.
# The fitted encoders are kept so ids can be mapped back to names later.
machinery_label_encoder = LabelEncoder().fit(data['Machinery'])
y_machinery = machinery_label_encoder.transform(data['Machinery'])

assembly_label_encoder = LabelEncoder().fit(data['Assembly'])
y_assembly = assembly_label_encoder.transform(data['Assembly'])

In [9]:
# Pack text and numeric features into one array so a single train_test_split
# call keeps them aligned; they are separated again after splitting.

# 1. Combine text + extra features (each 1-D column is reshaped to 2-D first).
text_column = data['combined_text'].values.reshape(-1, 1)
price_column = data['converted_price'].values.reshape(-1, 1)  # unified (USD) unit price
X = np.concatenate([text_column, currency_encoded, price_column], axis=1)

# 2. Hold out 15% as the test set, stratified on the Assembly label.
(X_train_val, X_test,
 y_train_val_machinery, y_test_machinery,
 y_train_val_assembly, y_test_assembly) = train_test_split(
    X, y_machinery, y_assembly,
    test_size=0.15,
    random_state=42,
    stratify=y_assembly,
)

# 3. Split 15% of the remainder off as the validation set, same stratification.
(X_train, X_val,
 y_train_machinery, y_val_machinery,
 y_train_assembly, y_val_assembly) = train_test_split(
    X_train_val, y_train_val_machinery, y_train_val_assembly,
    test_size=0.15,
    random_state=42,
    stratify=y_train_val_assembly,
)

# Shape report
print(f"combined_text shape: {data['combined_text'].shape}")
print(f"currency_encoded shape: {currency_encoded.shape}")
print(f"converted_price shape: {data['converted_price'].shape}")
print(f"X shape after concatenation: {X.shape}")

print(f"X_train size: {X_train.shape}")
print(f"X_val size: {X_val.shape}")
print(f"X_test size: {X_test.shape}")


combined_text shape: (13882,)
currency_encoded shape: (13882, 4)
converted_price shape: (13882,)
X shape after concatenation: (13882, 6)
X_train size: (10029, 6)
X_val size: (1770, 6)
X_test size: (2083, 6)


In [10]:
# Separate the text: column 0 of every split holds the combined text.
train_combined_text = X_train[:, 0]
val_combined_text = X_val[:, 0]
test_combined_text = X_test[:, 0]

# The remaining columns are the numeric extra features (already 2-D).
# X is an object array because it mixes strings and numbers, so the cast to
# float must happen BEFORE np.nan_to_num: on non-float (object) input
# nan_to_num returns the array unchanged and no NaN would be replaced.
train_extra_features = np.nan_to_num(X_train[:, 1:].astype(float), nan=0.0)
val_extra_features = np.nan_to_num(X_val[:, 1:].astype(float), nan=0.0)
test_extra_features = np.nan_to_num(X_test[:, 1:].astype(float), nan=0.0)

# Scaling: fit on the training split only, then reuse the statistics.
scaler = StandardScaler()
train_extra_features = scaler.fit_transform(train_extra_features)
val_extra_features = scaler.transform(val_extra_features)
test_extra_features = scaler.transform(test_extra_features)



In [11]:
# Derive an extra feature by clustering the converted_price column.
def _price_column(features):
    """Return the last feature column as an (n_samples, 1) array."""
    return features[:, -1].reshape(-1, 1)


kmeans = KMeans(n_clusters=5, random_state=42)
train_clusters = kmeans.fit_predict(_price_column(train_extra_features))
val_clusters = kmeans.predict(_price_column(val_extra_features))
test_clusters = kmeans.predict(_price_column(test_extra_features))


Found Intel OpenMP ('libiomp') and LLVM OpenMP ('libomp') loaded at
the same time. Both libraries are known to be incompatible and this
can cause random crashes or deadlocks on Linux when loaded in the
same Python program.
Using threadpoolctl may cause crashes or deadlocks. For more
information and possible workarounds, please see
    https://github.com/joblib/threadpoolctl/blob/master/multiple_openmp.md



In [12]:
# One-hot encode the cluster labels; the encoder is fitted on the training
# split and then applied unchanged to every split.
cluster_ohe = OneHotEncoder(sparse_output=False).fit(train_clusters.reshape(-1, 1))
train_clusters_encoded = cluster_ohe.transform(train_clusters.reshape(-1, 1))
val_clusters_encoded = cluster_ohe.transform(val_clusters.reshape(-1, 1))
test_clusters_encoded = cluster_ohe.transform(test_clusters.reshape(-1, 1))


In [13]:
# Append the one-hot cluster columns to the scaled extra features.
# Only the first `scaler.n_features_in_` columns (the ones the scaler was
# fitted on) are kept before appending, so re-running this cell does not
# stack a second copy of the cluster columns onto the arrays.
base_feature_count = scaler.n_features_in_
train_extra_features = np.concatenate(
    [train_extra_features[:, :base_feature_count], train_clusters_encoded], axis=1)
val_extra_features = np.concatenate(
    [val_extra_features[:, :base_feature_count], val_clusters_encoded], axis=1)
test_extra_features = np.concatenate(
    [test_extra_features[:, :base_feature_count], test_clusters_encoded], axis=1)


In [14]:
# Convert the extra features to float32 tensors and move them to the device.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")


def _as_device_tensor(features):
    """Copy `features` into a float32 tensor placed on `device`."""
    return torch.tensor(features, dtype=torch.float32).to(device)


train_extra_features_tensor = _as_device_tensor(train_extra_features)
val_extra_features_tensor = _as_device_tensor(val_extra_features)
test_extra_features_tensor = _as_device_tensor(test_extra_features)


  train_extra_features_tensor = torch.tensor(train_extra_features, dtype=torch.float32).to(device)


In [15]:
# BERT tokenizer (text processing).
# NOTE(review): 'bert-base-uncased' is an English checkpoint, while the source
# columns have Korean names and the cleaning code handles Korean markers —
# presumably Korean tokens are poorly covered by this vocabulary; confirm
# whether a multilingual/Korean checkpoint is more appropriate.
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')



In [16]:
# Convert only the text part of X into BERT input tensors.
def encode_data(texts):
    """Tokenise an array of strings into padded PyTorch tensors.

    Sequences are truncated to at most 128 tokens and padded to the longest
    sequence in `texts`. `texts` must provide `.tolist()`.
    """
    sentences = texts.tolist()
    return tokenizer(
        sentences,
        padding=True,
        truncation=True,
        max_length=128,
        return_tensors='pt',
    )


train_encodings, val_encodings, test_encodings = (
    encode_data(split_text)
    for split_text in (train_combined_text, val_combined_text, test_combined_text)
)


In [17]:
def _build_machinery_dataset(encodings, extra_features_tensor, labels):
    """Bundle token ids, attention mask, extra features and Machinery labels.

    The labels are converted to a long tensor and placed on `device`.
    """
    label_tensor = torch.tensor(labels, dtype=torch.long).to(device)
    return TensorDataset(
        encodings['input_ids'],
        encodings['attention_mask'],
        extra_features_tensor,
        label_tensor,
    )


train_machinery_dataset = _build_machinery_dataset(
    train_encodings, train_extra_features_tensor, y_train_machinery)
val_machinery_dataset = _build_machinery_dataset(
    val_encodings, val_extra_features_tensor, y_val_machinery)
test_machinery_dataset = _build_machinery_dataset(
    test_encodings, test_extra_features_tensor, y_test_machinery)

In [18]:
# Sanity check: the label arrays must have one entry per row of the matching split.
print(f"y_train size: {y_train_machinery.shape}")
print(f"y_val size: {y_val_machinery.shape}")
print(f"y_test size: {y_test_machinery.shape}")


y_train size: (10029,)
y_val size: (1770,)
y_test size: (2083,)


In [19]:
# 3. Build the DataLoaders: only the training loader shuffles, so validation
# and test predictions stay in dataset order.
from torch.utils.data import DataLoader

batch_size = 16

train_loader_machinery = DataLoader(train_machinery_dataset, batch_size=batch_size, shuffle=True)
val_loader_machinery, test_loader_machinery = (
    DataLoader(eval_dataset, batch_size=batch_size, shuffle=False)
    for eval_dataset in (val_machinery_dataset, test_machinery_dataset)
)

In [20]:
class BertForMachinery(nn.Module):
    """BERT-based classifier for the Machinery label.

    The BERT pooled output is concatenated with the extra feature vector and
    passed through Linear(→256) → ReLU → Dropout(0.3) → Linear(→num labels).
    The forward pass returns raw logits (no softmax).
    """

    def __init__(self, num_machinery_labels, extra_features_dim):
        super().__init__()
        self.bert = BertModel.from_pretrained('bert-base-uncased')
        fused_width = self.bert.config.hidden_size + extra_features_dim
        self.fc1 = nn.Linear(fused_width, 256)
        self.relu = nn.ReLU()
        self.dropout = nn.Dropout(0.3)
        self.machinery_classifier = nn.Linear(256, num_machinery_labels)

    def forward(self, input_ids, attention_mask, extra_features):
        """Return Machinery logits for a batch.

        A 1-D `extra_features` tensor is treated as one feature per sample
        and expanded to a column before concatenation.
        """
        sentence_vector = self.bert(
            input_ids=input_ids, attention_mask=attention_mask
        ).pooler_output

        side_features = extra_features
        if side_features.dim() == 1:
            side_features = side_features.unsqueeze(1)

        fused = torch.cat((sentence_vector, side_features), dim=1)
        hidden = self.dropout(self.relu(self.fc1(fused)))
        return self.machinery_classifier(hidden)

In [21]:
import torch
# Release unoccupied cached GPU memory held by PyTorch's caching allocator.
torch.cuda.empty_cache()

In [22]:
# Device selection
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
machinery_model = BertForMachinery(
    num_machinery_labels=len(machinery_label_encoder.classes_),
    # Take the width from the feature matrix instead of hard-coding 10, so the
    # model stays in sync if the number of currency or cluster columns changes
    # (the reload cell further down already derives it this way).
    extra_features_dim=train_extra_features.shape[1],
)
machinery_model.to(device)


BertForMachinery(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-11): 12 x BertLayer(
          (attention): BertAttention(
            (self): BertSdpaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12, elementw

In [23]:
# Optimizer and loss for the Machinery model.
# torch.optim.AdamW replaces the deprecated transformers.AdamW; eps and
# weight_decay are set explicitly to the old implementation's defaults
# (eps=1e-6, weight_decay=0.0) so training behaviour stays comparable.
optimizer_machinery = torch.optim.AdamW(
    machinery_model.parameters(), lr=2e-5, eps=1e-6, weight_decay=0.0)
loss_fn_machinery = torch.nn.CrossEntropyLoss()



In [24]:
def train_machinery(model, dataloader, optimizer, device, loss_fn):
    """Run one training epoch of the Machinery model.

    Args:
        model: Module called as ``model(input_ids, attention_mask=..., extra_features=...)``.
        dataloader: Yields ``(input_ids, attention_mask, extra_features, labels)`` batches.
        optimizer: Optimizer over the model parameters.
        device: Device every batch tensor is moved to.
        loss_fn: Loss called as ``loss_fn(outputs, labels)``.

    Returns:
        The mean of the per-batch losses over the epoch.
    """
    model.train()
    total_loss = 0
    for batch in tqdm(dataloader):
        input_ids, attention_mask, extra_features, labels = [b.to(device) for b in batch]

        # Flatten (batch, 1) labels to (batch,). reshape(-1) is used instead of
        # squeeze() because squeeze() turns a batch of size 1 into a 0-d
        # tensor, which no longer matches the (1, num_classes) logits.
        if labels.dim() > 1:
            labels = labels.reshape(-1)
        labels = labels.to(torch.int64)  # CrossEntropyLoss expects int64 class ids

        optimizer.zero_grad()
        outputs = model(input_ids, attention_mask=attention_mask, extra_features=extra_features)
        loss = loss_fn(outputs, labels)
        total_loss += loss.item()
        loss.backward()
        optimizer.step()

    return total_loss / len(dataloader)


In [25]:
import torch.nn.functional as F

# Evaluation function: the model emits one logit per Machinery class, and the
# arg-max over those logits is taken as the prediction.
def evaluate_machinery(model, dataloader, device, loss_fn_machinery):
    """Evaluate the Machinery model on a dataloader without gradient tracking.

    Returns:
        ``(avg_loss, accuracy, machinery_predictions)``: the mean per-batch
        loss, the fraction of correct predictions, and a NumPy array with the
        predicted class id of every sample in dataloader order.
    """
    model.eval()
    loss_sum = 0
    n_correct = 0
    n_seen = 0
    batch_predictions = []

    with torch.no_grad():
        for batch in tqdm(dataloader):
            input_ids, attention_mask, extra_features, labels = [t.to(device) for t in batch]

            logits = model(input_ids, attention_mask=attention_mask, extra_features=extra_features)
            loss_sum += loss_fn_machinery(logits, labels).item()

            # Softmax is monotonic, so the arg-max of the raw logits is enough.
            _, predicted = torch.max(logits, 1)
            batch_predictions.append(predicted.cpu().numpy())

            n_correct += (predicted == labels).sum().item()
            n_seen += labels.size(0)

    accuracy = n_correct / n_seen
    avg_loss = loss_sum / len(dataloader)
    machinery_predictions = np.concatenate(batch_predictions, axis=0)

    return avg_loss, accuracy, machinery_predictions

In [26]:
import torch
# Release unoccupied cached GPU memory held by PyTorch's caching allocator.
torch.cuda.empty_cache()

In [27]:
# Train the Machinery model with early stopping on validation accuracy.
num_epochs = 20
best_val_acc_machinery = 0
patience = 3
trigger_times = 0
best_checkpoint_path = "best_machinery_model.pth"
best_checkpoint_saved = False

for epoch in range(num_epochs):
    print(f"Epoch {epoch+1}/{num_epochs}")

    # One training epoch
    train_loss_machinery = train_machinery(
        machinery_model,
        train_loader_machinery,
        optimizer_machinery,
        device,
        loss_fn_machinery
    )

    # Validation metrics drive checkpointing and early stopping.
    val_loss_machinery, val_acc_machinery, val_preds_machinery = evaluate_machinery(
        machinery_model,
        val_loader_machinery,
        device,
        loss_fn_machinery
    )
    # NOTE(review): test accuracy is reported for monitoring only; it must not
    # be used for model selection.
    test_loss_machinery, test_acc_machinery, test_preds_machinery = evaluate_machinery(
        machinery_model,
        test_loader_machinery,
        device,
        loss_fn_machinery
    )

    print(f"Machinery - Train Loss: {train_loss_machinery:.4f}, Val Loss: {val_loss_machinery:.4f}, Val Acc: {val_acc_machinery:.4f}, Test Acc: {test_acc_machinery:.4f}")

    # Early stopping for Machinery
    if val_acc_machinery > best_val_acc_machinery:
        best_val_acc_machinery = val_acc_machinery
        trigger_times = 0
        torch.save(machinery_model.state_dict(), best_checkpoint_path)
        best_checkpoint_saved = True
    else:
        trigger_times += 1
        print(f"Trigger Times (Machinery): {trigger_times}")
        if trigger_times >= patience:
            print("Early stopping for Machinery!")
            break

# Final test evaluation on the BEST checkpoint. Previously the last-epoch
# weights were evaluated and then saved over "best_machinery_model.pth",
# discarding the best model whenever the last epoch was not the best one.
if best_checkpoint_saved:
    machinery_model.load_state_dict(
        torch.load(best_checkpoint_path, map_location=device, weights_only=True))
else:
    # No epoch improved on the initial accuracy: still write a checkpoint so
    # later cells that load this file keep working.
    torch.save(machinery_model.state_dict(), best_checkpoint_path)

final_test_loss_machinery, final_test_acc_machinery, final_machinery_predictions = evaluate_machinery(
    machinery_model,
    test_loader_machinery,
    device,
    loss_fn_machinery
)
print(f"Final Test Accuracy (Machinery): {final_test_acc_machinery:.4f}")


Epoch 1/20


  attn_output = torch.nn.functional.scaled_dot_product_attention(
100%|████████████████████████████████████████████████████████████████████████████████| 627/627 [03:07<00:00,  3.35it/s]
100%|████████████████████████████████████████████████████████████████████████████████| 111/111 [00:12<00:00,  8.82it/s]
100%|████████████████████████████████████████████████████████████████████████████████| 131/131 [00:11<00:00, 11.89it/s]


Machinery - Train Loss: 2.2436, Val Loss: 1.4900, Val Acc: 0.6232, Test Acc: 0.6207
Epoch 2/20


100%|████████████████████████████████████████████████████████████████████████████████| 627/627 [03:05<00:00,  3.37it/s]
100%|████████████████████████████████████████████████████████████████████████████████| 111/111 [00:12<00:00,  8.83it/s]
100%|████████████████████████████████████████████████████████████████████████████████| 131/131 [00:11<00:00, 11.90it/s]


Machinery - Train Loss: 1.3356, Val Loss: 1.1412, Val Acc: 0.6989, Test Acc: 0.6961
Epoch 3/20


100%|████████████████████████████████████████████████████████████████████████████████| 627/627 [03:05<00:00,  3.37it/s]
100%|████████████████████████████████████████████████████████████████████████████████| 111/111 [00:12<00:00,  8.83it/s]
100%|████████████████████████████████████████████████████████████████████████████████| 131/131 [00:11<00:00, 11.90it/s]


Machinery - Train Loss: 1.0550, Val Loss: 0.9203, Val Acc: 0.7412, Test Acc: 0.7408
Epoch 4/20


100%|████████████████████████████████████████████████████████████████████████████████| 627/627 [03:05<00:00,  3.37it/s]
100%|████████████████████████████████████████████████████████████████████████████████| 111/111 [00:12<00:00,  8.83it/s]
100%|████████████████████████████████████████████████████████████████████████████████| 131/131 [00:11<00:00, 11.89it/s]


Machinery - Train Loss: 0.8936, Val Loss: 0.8294, Val Acc: 0.7655, Test Acc: 0.7571
Epoch 5/20


100%|████████████████████████████████████████████████████████████████████████████████| 627/627 [03:05<00:00,  3.37it/s]
100%|████████████████████████████████████████████████████████████████████████████████| 111/111 [00:12<00:00,  8.84it/s]
100%|████████████████████████████████████████████████████████████████████████████████| 131/131 [00:11<00:00, 11.89it/s]


Machinery - Train Loss: 0.7785, Val Loss: 0.7455, Val Acc: 0.7915, Test Acc: 0.7916
Epoch 6/20


100%|████████████████████████████████████████████████████████████████████████████████| 627/627 [03:05<00:00,  3.37it/s]
100%|████████████████████████████████████████████████████████████████████████████████| 111/111 [00:12<00:00,  8.83it/s]
100%|████████████████████████████████████████████████████████████████████████████████| 131/131 [00:11<00:00, 11.90it/s]


Machinery - Train Loss: 0.6703, Val Loss: 0.7176, Val Acc: 0.7972, Test Acc: 0.8008
Epoch 7/20


100%|████████████████████████████████████████████████████████████████████████████████| 627/627 [03:05<00:00,  3.38it/s]
100%|████████████████████████████████████████████████████████████████████████████████| 111/111 [00:12<00:00,  8.89it/s]
100%|████████████████████████████████████████████████████████████████████████████████| 131/131 [00:10<00:00, 11.96it/s]


Machinery - Train Loss: 0.5998, Val Loss: 0.6557, Val Acc: 0.8141, Test Acc: 0.8133
Epoch 8/20


100%|████████████████████████████████████████████████████████████████████████████████| 627/627 [03:05<00:00,  3.38it/s]
100%|████████████████████████████████████████████████████████████████████████████████| 111/111 [00:12<00:00,  8.90it/s]
100%|████████████████████████████████████████████████████████████████████████████████| 131/131 [00:10<00:00, 11.95it/s]


Machinery - Train Loss: 0.5363, Val Loss: 0.6060, Val Acc: 0.8209, Test Acc: 0.8147
Epoch 9/20


100%|████████████████████████████████████████████████████████████████████████████████| 627/627 [03:05<00:00,  3.38it/s]
100%|████████████████████████████████████████████████████████████████████████████████| 111/111 [00:12<00:00,  8.84it/s]
100%|████████████████████████████████████████████████████████████████████████████████| 131/131 [00:10<00:00, 11.97it/s]


Machinery - Train Loss: 0.4834, Val Loss: 0.5831, Val Acc: 0.8186, Test Acc: 0.8238
Trigger Times (Machinery): 1
Epoch 10/20


100%|████████████████████████████████████████████████████████████████████████████████| 627/627 [03:05<00:00,  3.38it/s]
100%|████████████████████████████████████████████████████████████████████████████████| 111/111 [00:12<00:00,  8.92it/s]
100%|████████████████████████████████████████████████████████████████████████████████| 131/131 [00:10<00:00, 11.98it/s]


Machinery - Train Loss: 0.4433, Val Loss: 0.5829, Val Acc: 0.8249, Test Acc: 0.8272
Epoch 11/20


100%|████████████████████████████████████████████████████████████████████████████████| 627/627 [03:05<00:00,  3.38it/s]
100%|████████████████████████████████████████████████████████████████████████████████| 111/111 [00:12<00:00,  8.91it/s]
100%|████████████████████████████████████████████████████████████████████████████████| 131/131 [00:10<00:00, 11.98it/s]


Machinery - Train Loss: 0.4091, Val Loss: 0.5549, Val Acc: 0.8328, Test Acc: 0.8382
Epoch 12/20


100%|████████████████████████████████████████████████████████████████████████████████| 627/627 [03:05<00:00,  3.38it/s]
100%|████████████████████████████████████████████████████████████████████████████████| 111/111 [00:12<00:00,  8.91it/s]
100%|████████████████████████████████████████████████████████████████████████████████| 131/131 [00:10<00:00, 11.97it/s]


Machinery - Train Loss: 0.3821, Val Loss: 0.5626, Val Acc: 0.8328, Test Acc: 0.8243
Trigger Times (Machinery): 1
Epoch 13/20


100%|████████████████████████████████████████████████████████████████████████████████| 627/627 [03:05<00:00,  3.38it/s]
100%|████████████████████████████████████████████████████████████████████████████████| 111/111 [00:12<00:00,  8.91it/s]
100%|████████████████████████████████████████████████████████████████████████████████| 131/131 [00:10<00:00, 11.98it/s]


Machinery - Train Loss: 0.3660, Val Loss: 0.5904, Val Acc: 0.8328, Test Acc: 0.8320
Trigger Times (Machinery): 2
Epoch 14/20


100%|████████████████████████████████████████████████████████████████████████████████| 627/627 [03:05<00:00,  3.38it/s]
100%|████████████████████████████████████████████████████████████████████████████████| 111/111 [00:12<00:00,  8.91it/s]
100%|████████████████████████████████████████████████████████████████████████████████| 131/131 [00:10<00:00, 11.98it/s]


Machinery - Train Loss: 0.3404, Val Loss: 0.5610, Val Acc: 0.8367, Test Acc: 0.8368
Epoch 15/20


100%|████████████████████████████████████████████████████████████████████████████████| 627/627 [03:05<00:00,  3.38it/s]
100%|████████████████████████████████████████████████████████████████████████████████| 111/111 [00:12<00:00,  8.91it/s]
100%|████████████████████████████████████████████████████████████████████████████████| 131/131 [00:10<00:00, 11.98it/s]


Machinery - Train Loss: 0.3314, Val Loss: 0.5801, Val Acc: 0.8418, Test Acc: 0.8401
Epoch 16/20


100%|████████████████████████████████████████████████████████████████████████████████| 627/627 [03:05<00:00,  3.38it/s]
100%|████████████████████████████████████████████████████████████████████████████████| 111/111 [00:12<00:00,  8.91it/s]
100%|████████████████████████████████████████████████████████████████████████████████| 131/131 [00:10<00:00, 11.98it/s]


Machinery - Train Loss: 0.3042, Val Loss: 0.5583, Val Acc: 0.8424, Test Acc: 0.8421
Epoch 17/20


100%|████████████████████████████████████████████████████████████████████████████████| 627/627 [03:05<00:00,  3.38it/s]
100%|████████████████████████████████████████████████████████████████████████████████| 111/111 [00:12<00:00,  8.91it/s]
100%|████████████████████████████████████████████████████████████████████████████████| 131/131 [00:10<00:00, 11.97it/s]


Machinery - Train Loss: 0.2983, Val Loss: 0.6133, Val Acc: 0.8362, Test Acc: 0.8358
Trigger Times (Machinery): 1
Epoch 18/20


100%|████████████████████████████████████████████████████████████████████████████████| 627/627 [03:05<00:00,  3.38it/s]
100%|████████████████████████████████████████████████████████████████████████████████| 111/111 [00:12<00:00,  8.91it/s]
100%|████████████████████████████████████████████████████████████████████████████████| 131/131 [00:10<00:00, 11.98it/s]


Machinery - Train Loss: 0.2847, Val Loss: 0.5576, Val Acc: 0.8407, Test Acc: 0.8406
Trigger Times (Machinery): 2
Epoch 19/20


100%|████████████████████████████████████████████████████████████████████████████████| 627/627 [03:05<00:00,  3.38it/s]
100%|████████████████████████████████████████████████████████████████████████████████| 111/111 [00:12<00:00,  8.91it/s]
100%|████████████████████████████████████████████████████████████████████████████████| 131/131 [00:10<00:00, 11.98it/s]


Machinery - Train Loss: 0.2753, Val Loss: 0.5670, Val Acc: 0.8441, Test Acc: 0.8425
Epoch 20/20


100%|████████████████████████████████████████████████████████████████████████████████| 627/627 [03:05<00:00,  3.38it/s]
100%|████████████████████████████████████████████████████████████████████████████████| 111/111 [00:12<00:00,  8.91it/s]
100%|████████████████████████████████████████████████████████████████████████████████| 131/131 [00:10<00:00, 11.98it/s]


Machinery - Train Loss: 0.2764, Val Loss: 0.5816, Val Acc: 0.8458, Test Acc: 0.8387


100%|████████████████████████████████████████████████████████████████████████████████| 131/131 [00:10<00:00, 11.98it/s]


Final Test Accuracy (Machinery): 0.8387


### Feature Extraction: 이미 학습된 모델을 고정(freeze)하고, 그 모델이 생성한 특징을 새로운 모델의 입력으로 사용함

> 프로토타입-1개 최상위 예측 가져오도록 함

In [27]:
# Reload the Machinery model from its checkpoint and freeze its weights.
num_machinery_labels = len(np.unique(y_machinery))

machinery_model = BertForMachinery(
    num_machinery_labels=num_machinery_labels,
    extra_features_dim=train_extra_features.shape[1]
).to(device)

# map_location lets a checkpoint saved on GPU load on a CPU-only machine;
# weights_only=True restricts unpickling to tensors/plain containers, which
# is all a state_dict needs, and avoids the torch.load FutureWarning.
machinery_model.load_state_dict(
    torch.load("best_machinery_model.pth", map_location=device, weights_only=True))
machinery_model.eval()

# Freeze every parameter: the model is used as a fixed feature extractor.
for param in machinery_model.parameters():
    param.requires_grad = False

  machinery_model.load_state_dict(torch.load("best_machinery_model.pth"))


In [28]:
def get_top_1_machinery_predictions(model, dataloader):
    """Return the top-1 predicted class id for every sample of `dataloader`.

    Batches must be ``(input_ids, attention_mask, extra_features)`` triples;
    they are moved to the module-level `device`. The result is a 1-D CPU
    tensor in dataloader order.
    """
    model.eval()
    collected = []
    with torch.no_grad():
        for batch in tqdm(dataloader, desc="Generating Top-1 Machinery Predictions"):
            input_ids, attention_mask, extra_features = [t.to(device) for t in batch]
            logits = model(input_ids, attention_mask=attention_mask, extra_features=extra_features)

            # Keep only the index of the highest logit, moved back to the CPU.
            best_index = torch.max(logits, dim=1)[1]
            collected.append(best_index.cpu())
    return torch.cat(collected, dim=0)


In [29]:
import torch
# Release unoccupied cached GPU memory held by PyTorch's caching allocator.
torch.cuda.empty_cache()

In [30]:
# Generate top-1 Machinery predictions for every split.
def _top1_for_split(encodings, extra_features_tensor):
    """Run the frozen Machinery model over one split, in order, batch size 16."""
    split_loader = DataLoader(
        TensorDataset(
            encodings['input_ids'],
            encodings['attention_mask'],
            extra_features_tensor,
        ),
        batch_size=16,
        shuffle=False,
    )
    return get_top_1_machinery_predictions(machinery_model, split_loader)


train_machinery_predictions_top_1 = _top1_for_split(train_encodings, train_extra_features_tensor)
val_machinery_predictions_top_1 = _top1_for_split(val_encodings, val_extra_features_tensor)
test_machinery_predictions_top_1 = _top1_for_split(test_encodings, test_extra_features_tensor)


  attn_output = torch.nn.functional.scaled_dot_product_attention(
Generating Top-1 Machinery Predictions: 100%|████████████████████████████████████████| 627/627 [01:11<00:00,  8.76it/s]
Generating Top-1 Machinery Predictions: 100%|████████████████████████████████████████| 111/111 [00:12<00:00,  8.95it/s]
Generating Top-1 Machinery Predictions: 100%|████████████████████████████████████████| 131/131 [00:11<00:00, 11.88it/s]


In [31]:
# Turn the 1-D prediction vectors into (n_samples, 1) columns.
# reshape(-1, 1) yields the same result as unsqueeze(1) on the first run and,
# unlike unsqueeze, does not add another axis if this cell is executed again.
train_machinery_predictions_top_1 = train_machinery_predictions_top_1.reshape(-1, 1)
val_machinery_predictions_top_1 = val_machinery_predictions_top_1.reshape(-1, 1)
test_machinery_predictions_top_1 = test_machinery_predictions_top_1.reshape(-1, 1)

In [32]:
# Check the shapes of the Machinery prediction tensors (one row per sample).
print(train_machinery_predictions_top_1.shape)
print(val_machinery_predictions_top_1.shape)
print(test_machinery_predictions_top_1.shape)

torch.Size([10029, 1])
torch.Size([1770, 1])
torch.Size([2083, 1])


In [33]:
from torch.utils.data import DataLoader, WeightedRandomSampler
from collections import Counter
import numpy as np

# Count the training samples of every Assembly class.
class_sample_count = Counter(y_train_assembly)

# Only minority classes (at most 30 training samples) are oversampled.
MINORITY_THRESHOLD = 30
small_class_sample_count = {
    cls: count for cls, count in class_sample_count.items() if count <= MINORITY_THRESHOLD
}
print(f"Support 30개 이하 Assembly 클래스: {small_class_sample_count}")

# Per-sample sampling weights. Samples of larger classes keep weight 1.
# Samples of a minority class get threshold / count (>= 1), so the class is
# drawn about as often as a class with `MINORITY_THRESHOLD` samples.
# The previous weight of 1 / count was BELOW 1 and therefore sampled minority
# classes less often than everything else — the opposite of oversampling.
weights = np.ones_like(y_train_assembly, dtype=np.float32)
for cls, count in small_class_sample_count.items():
    weights[y_train_assembly == cls] = MINORITY_THRESHOLD / count

# Draw len(weights) samples per epoch, with replacement.
sampler = WeightedRandomSampler(weights=weights, num_samples=len(weights), replacement=True)


Support 30개 이하 Assembly 클래스: {87: 27, 152: 26, 39: 22, 208: 26, 104: 29, 31: 26, 113: 28, 83: 26, 35: 30, 156: 22, 202: 28, 114: 25, 28: 29, 101: 23, 177: 27, 207: 26, 4: 29, 118: 30, 89: 29, 42: 23, 121: 29, 43: 26, 184: 28, 55: 30, 77: 23, 86: 29, 201: 26, 38: 22, 50: 26, 110: 22, 19: 30, 193: 23, 47: 25, 26: 22, 136: 28, 197: 30, 12: 29, 30: 24, 117: 25, 97: 23, 145: 28, 131: 24, 7: 27, 206: 25, 67: 28, 107: 26, 99: 30, 137: 25, 109: 26, 51: 27, 150: 23, 119: 26, 203: 27, 70: 22, 165: 26, 1: 29, 163: 26, 166: 26, 100: 27, 162: 22, 65: 22, 146: 30, 41: 29, 81: 24, 56: 22, 139: 24, 24: 23, 173: 26, 3: 26, 20: 22, 183: 26, 37: 24, 103: 25, 73: 26, 45: 25}


In [34]:
# Assembly datasets: token ids, attention mask, extra features, the top-1
# Machinery prediction and the Assembly label for every sample.
# The predictions are stored as (n_samples, 1) columns. reshape(-1, 1) is used
# instead of unsqueeze(1) because the tensors were already expanded to a
# column in an earlier cell; another unsqueeze would produce (n_samples, 1, 1).
train_assembly_dataset = TensorDataset(
    train_encodings['input_ids'],
    train_encodings['attention_mask'],
    train_extra_features_tensor,
    train_machinery_predictions_top_1.reshape(-1, 1),  # top-1 Machinery prediction
    torch.tensor(y_train_assembly, dtype=torch.long).to(device)  # Assembly label
)

val_assembly_dataset = TensorDataset(
    val_encodings['input_ids'],
    val_encodings['attention_mask'],
    val_extra_features_tensor,
    val_machinery_predictions_top_1.reshape(-1, 1),  # top-1 Machinery prediction
    torch.tensor(y_val_assembly, dtype=torch.long).to(device)
)

test_assembly_dataset = TensorDataset(
    test_encodings['input_ids'],
    test_encodings['attention_mask'],
    test_extra_features_tensor,
    test_machinery_predictions_top_1.reshape(-1, 1),  # top-1 Machinery prediction
    torch.tensor(y_test_assembly, dtype=torch.long).to(device)
)

In [35]:
# Build the Assembly DataLoaders. Training batches are drawn by the weighted
# sampler; validation and test are read in dataset order.
batch_size = 16
train_loader_assembly = DataLoader(train_assembly_dataset, batch_size=batch_size, sampler=sampler)
val_loader_assembly, test_loader_assembly = (
    DataLoader(eval_dataset, batch_size=batch_size, shuffle=False)
    for eval_dataset in (val_assembly_dataset, test_assembly_dataset)
)

In [36]:
num_assembly_labels = len(assembly_label_encoder.classes_)
# Width of the Machinery model's output (one logit per Machinery class),
# derived from the label encoder instead of the hard-coded 62.
machinery_output_dim = len(machinery_label_encoder.classes_)

In [44]:
class AssemblyModel(nn.Module):
    """BERT-based classifier for the Assembly label.

    The BERT pooled output, the Machinery model outputs and the extra features
    are concatenated and passed through
    Linear(840→128) → ReLU → BatchNorm1d → Dropout(0.5) → Linear(128→labels).

    NOTE(review): the input width of `fc1` is hard-coded to 840, and
    `extra_features_dim` / `machinery_output_dim` are accepted but not used.
    840 presumably equals 768 (BERT hidden size) + 62 (Machinery logits)
    + 10 (extra features) — confirm, and derive it from the arguments once
    every caller passes the real widths.
    """

    def __init__(self, num_assembly_labels, extra_features_dim, machinery_output_dim):
        super().__init__()
        self.bert = BertModel.from_pretrained('bert-base-uncased')

        # Project the concatenated features down to 128 dimensions.
        self.fc1 = nn.Linear(840, 128)
        self.batchnorm = nn.BatchNorm1d(128)
        self.relu = nn.ReLU()
        self.dropout = nn.Dropout(0.5)  # dropout probability is 0.5
        self.fc2 = nn.Linear(128, num_assembly_labels)

    def forward(self, input_ids, attention_mask, machinery_outputs, extra_features):
        """Return Assembly logits for a batch."""
        sentence_vector = self.bert(
            input_ids=input_ids, attention_mask=attention_mask
        ).pooler_output

        # The concatenated width must be 840 to match fc1.
        fused = torch.cat((sentence_vector, machinery_outputs, extra_features), dim=1)
        hidden = self.batchnorm(self.relu(self.fc1(fused)))
        return self.fc2(self.dropout(hidden))

In [45]:
import torch
# Release unoccupied cached GPU memory held by PyTorch's caching allocator.
torch.cuda.empty_cache()

In [46]:
# Initialise the Assembly model.
num_assembly_labels = len(np.unique(y_assembly))
# The Assembly model is fed the Machinery model's full output (one logit per
# Machinery class), not a single top-1 id, so the width is the number of
# Machinery classes rather than 1.
machinery_output_dim = len(machinery_label_encoder.classes_)

assembly_model = AssemblyModel(
    num_assembly_labels=num_assembly_labels,
    extra_features_dim=train_extra_features.shape[1],
    machinery_output_dim=machinery_output_dim
).to(device)

In [47]:
from torch.optim.lr_scheduler import StepLR

# Cross-entropy loss with label smoothing of 0.1.
loss_fn_assembly = nn.CrossEntropyLoss(label_smoothing=0.1)

# Optimizer: torch.optim.AdamW replaces the deprecated transformers.AdamW;
# eps=1e-6 keeps the old implementation's default.
optimizer_assembly = torch.optim.AdamW(
    assembly_model.parameters(), lr=2e-5, weight_decay=0.01, eps=1e-6)
# Multiply the learning rate by 0.1 every 3 scheduler steps.
scheduler = StepLR(optimizer_assembly, step_size=3, gamma=0.1)


In [48]:
import torch
# Release unoccupied cached GPU memory held by PyTorch's caching allocator.
torch.cuda.empty_cache()

In [49]:
# Use the GPU when one is available, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Move both models to the selected device.
assembly_model.to(device)
machinery_model.to(device)

BertForMachinery(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-11): 12 x BertLayer(
          (attention): BertAttention(
            (self): BertSdpaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12, elementw

In [50]:
# Training function for the Assembly model.
def train_assembly(model, machinery_model, dataloader, optimizer, device, loss_fn):
    """Run one training epoch of the Assembly model.

    The frozen Machinery model is evaluated on every batch (without gradient
    tracking) and its outputs are passed to the Assembly model as features.

    Args:
        model: Assembly model, called as
            ``model(input_ids, attention_mask, machinery_outputs, extra_features)``.
        machinery_model: Machinery model, kept in eval mode.
        dataloader: Yields ``(input_ids, attention_mask, extra_features,
            machinery_predictions, labels)`` batches.
        optimizer: Optimizer over the Assembly model parameters.
        device: Device the batch tensors are moved to.
        loss_fn: Loss called as ``loss_fn(outputs, labels)``.

    Returns:
        The mean of the per-batch losses over the epoch.
    """
    model.train()
    machinery_model.eval()  # the Machinery model is not trained here
    total_loss = 0
    for batch in tqdm(dataloader, desc="Training Assembly"):
        # The precomputed top-1 Machinery predictions in the batch are not
        # used: the Machinery outputs are recomputed below, so that tensor is
        # no longer copied to the device.
        input_ids, attention_mask, extra_features, _, labels = batch
        input_ids = input_ids.to(device)
        attention_mask = attention_mask.to(device)
        extra_features = extra_features.to(device)
        labels = labels.to(device)

        optimizer.zero_grad()

        # Machinery outputs act as fixed input features (no gradients).
        with torch.no_grad():
            machinery_outputs = machinery_model(input_ids, attention_mask, extra_features)

        # Assembly forward pass and optimisation step
        outputs = model(input_ids, attention_mask, machinery_outputs, extra_features)
        loss = loss_fn(outputs, labels)
        total_loss += loss.item()

        loss.backward()
        optimizer.step()

    return total_loss / len(dataloader)

In [51]:
def evaluate_assembly(model, machinery_model, dataloader, device):
    """Compute accuracy and predictions of the Assembly model on ``dataloader``.

    Both models are switched to eval mode and the whole pass runs under
    ``torch.no_grad()``.

    Args:
        model: Assembly model, called as
            ``model(input_ids, attention_mask, machinery_outputs, extra_features)``.
        machinery_model: Model called as
            ``machinery_model(input_ids, attention_mask, extra_features)``.
        dataloader: Iterable yielding 5-tuples ``(input_ids, attention_mask,
            extra_features, machinery_predictions, labels)``.
        device: Device the batch tensors are moved to.

    Returns:
        tuple: ``(accuracy, predictions)`` where ``accuracy`` is the fraction
        of samples whose predicted class equals the label and ``predictions``
        is a NumPy array of predicted class indices concatenated in batch
        order. For a loader with no samples this is ``(0.0, empty int64 array)``.
    """
    model.eval()
    machinery_model.eval()
    total_correct = 0
    total_samples = 0
    all_predictions = []

    with torch.no_grad():
        for batch in tqdm(dataloader, desc="Evaluating Assembly"):
            # The 4th element (precomputed machinery predictions) is not used:
            # the machinery outputs are recomputed below, so it stays off-device.
            input_ids, attention_mask, extra_features, _, labels = batch
            input_ids = input_ids.to(device)
            attention_mask = attention_mask.to(device)
            extra_features = extra_features.to(device)
            labels = labels.to(device)

            machinery_outputs = machinery_model(input_ids, attention_mask, extra_features)
            outputs = model(input_ids, attention_mask, machinery_outputs, extra_features)

            # Softmax is monotonic, so the argmax of the raw outputs selects
            # the same class as the argmax of the softmax probabilities.
            predicted = outputs.argmax(dim=1)

            all_predictions.append(predicted.cpu().numpy())
            total_correct += (predicted == labels).sum().item()
            total_samples += labels.size(0)

    # An empty loader would otherwise divide by zero and make np.concatenate
    # raise on an empty list.
    if total_samples == 0:
        return 0.0, np.empty(0, dtype=np.int64)

    accuracy = total_correct / total_samples
    return accuracy, np.concatenate(all_predictions, axis=0)

In [52]:
# Ask PyTorch's caching allocator to release cached GPU memory that is not
# currently in use. `torch` is already imported in the notebook's first cell,
# so it is not re-imported here.
torch.cuda.empty_cache()

In [53]:
# Train the Assembly model with early stopping on validation accuracy.
num_epochs = 20
best_val_acc_assembly = 0
patience = 3            # epochs without validation improvement before stopping
trigger_times = 0
best_assembly_checkpoint = "best_assembly_model.pth"
best_checkpoint_saved = False  # True once this run has written a checkpoint

for epoch in range(num_epochs):
    print(f"Epoch {epoch+1}/{num_epochs}")

    # One training pass over the training set.
    train_loss_assembly = train_assembly(
        assembly_model,
        machinery_model,
        train_loader_assembly,
        optimizer_assembly,
        device,
        loss_fn_assembly
    )

    # Accuracy on the training and validation sets.
    train_acc_assembly, _ = evaluate_assembly(
        assembly_model,
        machinery_model,
        train_loader_assembly,
        device
    )
    val_acc_assembly, _ = evaluate_assembly(
        assembly_model,
        machinery_model,
        val_loader_assembly,
        device
    )
    scheduler.step()  # adjust the learning rate once per epoch

    # Test accuracy is reported for monitoring only; checkpoint selection and
    # early stopping below depend solely on validation accuracy.
    test_acc_assembly, _ = evaluate_assembly(
        assembly_model,
        machinery_model,
        test_loader_assembly,
        device
    )

    print(f"Train Loss: {train_loss_assembly:.4f}")
    print(f"Train Accuracy: {train_acc_assembly:.4f}")
    print(f"Validation Accuracy: {val_acc_assembly:.4f}")
    print(f"Test Accuracy: {test_acc_assembly:.4f}")

    # Early stopping
    if val_acc_assembly > best_val_acc_assembly:
        best_val_acc_assembly = val_acc_assembly
        trigger_times = 0
        # Persist the best-so-far weights.
        torch.save(assembly_model.state_dict(), best_assembly_checkpoint)
        best_checkpoint_saved = True
    else:
        trigger_times += 1
        print(f"Trigger Times: {trigger_times}")
        if trigger_times >= patience:
            print("Early stopping!")
            break

# Final test evaluation.
# Restore the best-validation weights first: after early stopping the in-memory
# model is from the last epoch, which is up to `patience` epochs past the best
# checkpoint. The flag ensures only a checkpoint written by this run is loaded,
# never a stale file from an earlier run.
if best_checkpoint_saved:
    assembly_model.load_state_dict(
        torch.load(best_assembly_checkpoint, map_location=device)
    )

final_test_acc_assembly, final_assembly_predictions = evaluate_assembly(
    assembly_model,
    machinery_model,
    test_loader_assembly,
    device
)
print(f"Final Test Accuracy (Assembly): {final_test_acc_assembly:.4f}")


Epoch 1/20


Training Assembly:  71%|███████████████████████████████████████████                  | 443/627 [03:08<01:18,  2.35it/s]


KeyboardInterrupt: 