In [1]:
import numpy as np
import pandas as pd
import torch
from torch.utils.data import TensorDataset, DataLoader
import torch.nn as nn
from transformers import BertTokenizer, BertModel
from sklearn.preprocessing import OneHotEncoder, LabelEncoder, StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score
import matplotlib.pyplot as plt
from tqdm import tqdm
import torch.nn.functional as F
from transformers import AdamW
import re

In [2]:
data = pd.read_excel('filtered_30_filled_money.xlsx')

In [3]:
def preprocess_text(text):
    text = text.lower()
    text = re.sub(r'\([^)]*\)', '', text)
    text = re.sub(r'[^\w\s\*/\-\+.,#&]', '', text)
    text = re.sub(r'\s+', ' ', text)
    text = re.sub(r'\b(사용금지|사)\b', '', text, flags=re.IGNORECASE)
    text = text.strip()
    return text

def clean_supplier_name(name):
    name = name.lower()
    name = re.sub(r'coporation|coropration|coproration|corporration', 'corporation', name)
    name = re.sub(r'\(사용금지\)', '', name)
    name = re.sub(r'u\.s\.a', '_usa', name)
    name = re.sub(r'\.', '', name)
    suffixes = r'(corporation|corp|company|co|incorporated|inc|limited|ltd|상사|공사|엔지니어링|주식회사|주|gmbh|pte ltd|llc)'
    name = re.sub(suffixes, '', name, flags=re.IGNORECASE)
    name = re.sub(r'[^\w\s-]', '', name)
    name = re.sub(r'\s+', ' ', name).strip()
    return name

In [4]:
# 텍스트 전처리
data['cleaned_item'] = data['청구품목'].apply(preprocess_text)
data['cleaned_supplier'] = data['발주처'].apply(clean_supplier_name)
data['combined_text'] = data['cleaned_item'].fillna('') + " " + data['Part No.1'].fillna('') + " " + data['cleaned_supplier'].fillna('')


In [5]:
exchange_rates = {'USD': 1, 'KRW': 0.00078, 'EUR': 1.18, 'JPY': 0.0091}

# usd기준해서 금액 통일함 
data['converted_price'] = data.apply(lambda x: x['견적단가'] * exchange_rates[x['견적화폐']], axis=1)


In [6]:
from sklearn.preprocessing import OneHotEncoder

# '견적화폐' 컬럼을 OneHotEncoder를 통해 인코딩
currency_ohe = OneHotEncoder(sparse_output=False) 
currency_encoded = currency_ohe.fit_transform(data[['견적화폐']])

In [7]:
import numpy as np
data['converted_price_log'] = np.log1p(data['converted_price'])  # 로그 변환된 가격


In [8]:
# 레이블 인코딩
machinery_label_encoder = LabelEncoder()
y_machinery= machinery_label_encoder.fit_transform(data['Machinery'])

assembly_label_encoder = LabelEncoder()
y_assembly = assembly_label_encoder.fit_transform(data['Assembly'])

In [9]:
# train_test split 을 위해 하나로 모으고, 분할하고 다시 텍스트랑 추가피쳐로 분리해줄거임 

X = np.concatenate([
    data['combined_text'].values.reshape(-1, 1),  # 2차원 배열로 바꿔서 결합해줌 
    currency_encoded.astype(float),  # currency_encoded를 float로 변환
    data['converted_price_log'].values.reshape(-1, 1)  # 통일한단가
], axis=1)

X_train_val, X_test, y_train_val_machinery, y_test_machinery, y_train_val_assembly, y_test_assembly = train_test_split(
    X, y_machinery, y_assembly, 
    test_size=0.15, 
    random_state=42, 
    stratify=y_assembly)  # stratify는 주로 메인 레이블 기준으로 설정

X_train, X_val, y_train_machinery, y_val_machinery, y_train_assembly, y_val_assembly = train_test_split(
    X_train_val, y_train_val_machinery, y_train_val_assembly, 
    test_size=0.15, 
    random_state=42, 
    stratify=y_train_val_assembly)  # 다시 stratify 기준으로 설정

# 크기 확인
print(f"combined_text shape: {data['combined_text'].shape}")
print(f"currency_encoded shape: {currency_encoded.shape}")
print(f"converted_price shape: {data['converted_price'].shape}")
print(f"X shape after concatenation: {X.shape}")

print(f"X_train size: {X_train.shape}")
print(f"X_val size: {X_val.shape}")
print(f"X_test size: {X_test.shape}")


combined_text shape: (13882,)
currency_encoded shape: (13882, 4)
converted_price shape: (13882,)
X shape after concatenation: (13882, 6)
X_train size: (10029, 6)
X_val size: (1770, 6)
X_test size: (2083, 6)


In [10]:
# 텍스트 분리
train_combined_text = X_train[:, 0]
val_combined_text = X_val[:, 0]
test_combined_text = X_test[:, 0]

# 추가 피처 분리 (currency_encoded와 로그 변환된 가격)
train_extra_features = X_train[:, 1:]
val_extra_features = X_val[:, 1:]
test_extra_features = X_test[:, 1:]

# 추가 피처 분리 (원핫 인코딩된 통화와 로그 변환된 가격)
train_currency_encoded = train_extra_features[:, :-1]  # 마지막 컬럼 제외 (원핫 인코딩된 통화)
val_currency_encoded = val_extra_features[:, :-1]
test_currency_encoded = test_extra_features[:, :-1]

train_price_log = train_extra_features[:, -1].reshape(-1, 1)  # 마지막 컬럼만 (로그 변환된 가격)
val_price_log = val_extra_features[:, -1].reshape(-1, 1)
test_price_log = test_extra_features[:, -1].reshape(-1, 1)

train_currency_encoded = train_currency_encoded.astype(float)
val_currency_encoded = val_currency_encoded.astype(float)
test_currency_encoded = test_currency_encoded.astype(float)

train_price_log = train_price_log.astype(float)
val_price_log = val_price_log.astype(float)
test_price_log = test_price_log.astype(float)

# 로그 변환된 가격에만 스케일링 적용
scaler = StandardScaler()
train_price_log_scaled = scaler.fit_transform(train_price_log)
val_price_log_scaled = scaler.transform(val_price_log)
test_price_log_scaled = scaler.transform(test_price_log)

In [11]:
print(f"currency_encoded dtype: {currency_encoded.dtype}")
print(f"X dtype after concatenation: {X.dtype}")


currency_encoded dtype: float64
X dtype after concatenation: object


In [12]:
# 최종 피처 결합 (원핫 인코딩된 통화 + 스케일링된 가격)
train_final_features = np.hstack([train_currency_encoded, train_price_log])
val_final_features = np.hstack([val_currency_encoded, val_price_log])
test_final_features = np.hstack([test_currency_encoded, test_price_log])

# float 변환 (결합 후 타입 변환)
train_final_features = train_final_features.astype(float)
val_final_features = val_final_features.astype(float)
test_final_features = test_final_features.astype(float)

In [13]:
# Torch Tensor로 변환
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
train_extra_features_tensor = torch.tensor(train_final_features, dtype=torch.float32).to(device)
val_extra_features_tensor = torch.tensor(val_final_features, dtype=torch.float32).to(device)
test_extra_features_tensor = torch.tensor(test_final_features, dtype=torch.float32).to(device)

  train_extra_features_tensor = torch.tensor(train_final_features, dtype=torch.float32).to(device)


In [14]:
# BERT 토크나이저 (텍스트처리)
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')



In [15]:
# X중 텍스트만 BERT 입력 형식으로 변환
def encode_data(texts):
    return tokenizer(texts.tolist(), padding=True, truncation=True, max_length=128, return_tensors='pt')

train_machinery_encodings = encode_data(train_combined_text)
val_machinery_encodings = encode_data(val_combined_text)
test_machinery_encodings = encode_data(test_combined_text)


In [16]:
train_machinery_dataset = TensorDataset(
    train_machinery_encodings['input_ids'],
    train_machinery_encodings['attention_mask'],
    train_extra_features_tensor,
    torch.tensor(y_train_machinery, dtype=torch.long).to(device)  # Machinery 레이블
)

val_machinery_dataset = TensorDataset(
    val_machinery_encodings['input_ids'],
    val_machinery_encodings['attention_mask'],
    val_extra_features_tensor,
    torch.tensor(y_val_machinery, dtype=torch.long).to(device)
)

test_machinery_dataset = TensorDataset(
    test_machinery_encodings['input_ids'],
    test_machinery_encodings['attention_mask'],
    test_extra_features_tensor,
    torch.tensor(y_test_machinery, dtype=torch.long).to(device)
)

In [17]:
print(f"y_train size: {y_train_machinery.shape}")
print(f"y_val size: {y_val_machinery.shape}")
print(f"y_test size: {y_test_machinery.shape}")


y_train size: (10029,)
y_val size: (1770,)
y_test size: (2083,)


In [18]:
# 3. DataLoader 생성
from torch.utils.data import DataLoader

batch_size = 8

train_loader_machinery = DataLoader(train_machinery_dataset, batch_size=batch_size, shuffle=True)
val_loader_machinery  = DataLoader(val_machinery_dataset, batch_size=batch_size, shuffle=False)
test_loader_machinery = DataLoader(test_machinery_dataset, batch_size=batch_size, shuffle=False)

In [19]:
class BertForMachinery(nn.Module):
    def __init__(self, num_machinery_labels, extra_features_dim):
        super(BertForMachinery, self).__init__()
        self.bert = BertModel.from_pretrained('bert-base-uncased')
        self.fc1 = nn.Linear(773, 256)
        self.relu = nn.ReLU()
        self.dropout = nn.Dropout(0.3)
        self.machinery_classifier = nn.Linear(256, num_machinery_labels)

    def forward(self, input_ids, attention_mask, extra_features):
        outputs = self.bert(input_ids=input_ids, attention_mask=attention_mask)
        pooled_output = outputs.pooler_output
        
        if extra_features.dim() == 1:
            extra_features = extra_features.unsqueeze(1)
        
        machinery_combined_features = torch.cat((pooled_output, extra_features), dim=1)
        x = self.fc1(machinery_combined_features)
        x = self.relu(x)
        x = self.dropout(x)
        machinery_outputs = self.machinery_classifier(x)
        
        return machinery_outputs

In [20]:
import torch
torch.cuda.empty_cache()

In [21]:
# 디바이스 설정
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
machinery_model = BertForMachinery(num_machinery_labels=len(machinery_label_encoder.classes_), extra_features_dim=10) 
machinery_model.to(device)


BertForMachinery(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-11): 12 x BertLayer(
          (attention): BertAttention(
            (self): BertSdpaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12, elementw

In [22]:
# 옵티마이저 및 학습률 스케줄러 설정
optimizer_machinery = AdamW(machinery_model.parameters(), lr=2e-5)
loss_fn_machinery=torch.nn.CrossEntropyLoss()



In [23]:
def train_machinery(model, dataloader, optimizer, device, loss_fn):
    model.train()
    total_loss = 0
    for batch in tqdm(dataloader):
        input_ids, attention_mask, extra_features, labels = [b.to(device) for b in batch]  # 순서 수정
        
        if labels.dim() > 1:
            labels = labels.squeeze()

        optimizer.zero_grad()
        outputs = model(input_ids, attention_mask=attention_mask, extra_features=extra_features)
        labels = labels.to(torch.int64)  # CrossEntropyLoss에 맞게 변환
        loss = loss_fn(outputs, labels)
        total_loss += loss.item()
        loss.backward()
        optimizer.step()

    return total_loss / len(dataloader)


In [24]:
import torch.nn.functional as F

# 평가 함수 - logits-62개짜리 각각의 자신감
def evaluate_machinery(model, dataloader, device, loss_fn_machinery):
    model.eval()
    total_loss = 0
    total_correct = 0
    total_samples = 0
    machinery_predictions = []
    
    with torch.no_grad():
        for batch in tqdm(dataloader):
            input_ids, attention_mask, extra_features, labels = [b.to(device) for b in batch]
            
            # 모델 예측
            outputs = model(input_ids, attention_mask=attention_mask, extra_features=extra_features)
            
            # Loss 계산
            loss = loss_fn_machinery(outputs, labels)
            total_loss += loss.item()
            
            _, predicted = torch.max(outputs, 1)  # softmax 없이 직접 logits에서 최대값 클래스 예측
            
            # 예측값을 저장
            machinery_predictions.append(predicted.cpu().numpy())  # 리스트에 추가
            
            # 정확도 계산
            total_correct += (predicted == labels).sum().item()
            total_samples += labels.size(0)

    accuracy = total_correct / total_samples
    machinery_predictions = np.concatenate(machinery_predictions, axis=0)  
    avg_loss = total_loss / len(dataloader)
    
    return avg_loss, accuracy, machinery_predictions

In [25]:
import torch
torch.cuda.empty_cache()

In [26]:
import pickle

# Machinery 모델 학습 실행
num_epochs = 40
best_val_acc_machinery = 0
best_val_loss_machinery = float('inf')
best_model = None
patience = 3
trigger_times = 0

for epoch in range(num_epochs):
    print(f"Epoch {epoch+1}/{num_epochs}")
    
    # Machinery 모델 학습
    train_loss_machinery = train_machinery(
        machinery_model, 
        train_loader_machinery, 
        optimizer_machinery, 
        device, 
        loss_fn_machinery
    )
    
    # Machinery 모델 평가
    val_loss_machinery, val_acc_machinery, val_preds_machinery = evaluate_machinery(
        machinery_model, 
        val_loader_machinery, 
        device, 
        loss_fn_machinery
    )
    
    test_loss_machinery, test_acc_machinery, test_preds_machinery = evaluate_machinery(
        machinery_model, 
        test_loader_machinery, 
        device, 
        loss_fn_machinery
    )
        
    print(f"Machinery - Train Loss: {train_loss_machinery:.4f}, Val Loss: {val_loss_machinery:.4f}, Val Acc: {val_acc_machinery:.4f}, Test Acc: {test_acc_machinery:.4f}")
    
    # 현재 모델이 가장 높은 검증 정확도를 기록한 경우
    if val_acc_machinery > best_val_acc_machinery or (val_acc_machinery == best_val_acc_machinery and val_loss_machinery < best_val_loss_machinery):
        best_val_acc_machinery = val_acc_machinery
        best_val_loss_machinery = val_loss_machinery
        best_model = machinery_model  # 가장 좋은 모델 저장
        trigger_times = 0
    else:
        trigger_times += 1
        print(f"Trigger Times (Machinery): {trigger_times}")
        if trigger_times >= patience:
            print("Early stopping for Machinery!")
            break

# 최종 최고의 모델을 피클로 저장
with open("0924_best_machinery_model.pkl", 'wb') as f:
    pickle.dump(best_model, f)

# 최종 테스트 성능 평가
final_test_loss_machinery, final_test_acc_machinery, final_machinery_predictions = evaluate_machinery(
    best_model, 
    test_loader_machinery, 
    device, 
    loss_fn_machinery
)
print(f"Final Test Accuracy (Machinery): {final_test_acc_machinery:.4f}")

Epoch 1/40


  attn_output = torch.nn.functional.scaled_dot_product_attention(
100%|██████████████████████████████████████████████████████████████████████████████| 1254/1254 [03:54<00:00,  5.34it/s]
100%|████████████████████████████████████████████████████████████████████████████████| 222/222 [00:11<00:00, 19.19it/s]
100%|████████████████████████████████████████████████████████████████████████████████| 261/261 [00:10<00:00, 25.19it/s]


Machinery - Train Loss: 2.0171, Val Loss: 1.3681, Val Acc: 0.6520, Test Acc: 0.6524
Epoch 2/40


100%|██████████████████████████████████████████████████████████████████████████████| 1254/1254 [03:53<00:00,  5.37it/s]
100%|████████████████████████████████████████████████████████████████████████████████| 222/222 [00:11<00:00, 19.25it/s]
100%|████████████████████████████████████████████████████████████████████████████████| 261/261 [00:10<00:00, 25.41it/s]


Machinery - Train Loss: 1.2391, Val Loss: 1.0625, Val Acc: 0.7119, Test Acc: 0.7062
Epoch 3/40


100%|██████████████████████████████████████████████████████████████████████████████| 1254/1254 [03:53<00:00,  5.37it/s]
100%|████████████████████████████████████████████████████████████████████████████████| 222/222 [00:11<00:00, 19.25it/s]
100%|████████████████████████████████████████████████████████████████████████████████| 261/261 [00:10<00:00, 25.40it/s]


Machinery - Train Loss: 0.9769, Val Loss: 0.8609, Val Acc: 0.7480, Test Acc: 0.7518
Epoch 4/40


100%|██████████████████████████████████████████████████████████████████████████████| 1254/1254 [03:54<00:00,  5.34it/s]
100%|████████████████████████████████████████████████████████████████████████████████| 222/222 [00:11<00:00, 19.17it/s]
100%|████████████████████████████████████████████████████████████████████████████████| 261/261 [00:10<00:00, 25.04it/s]


Machinery - Train Loss: 0.7999, Val Loss: 0.7434, Val Acc: 0.7847, Test Acc: 0.7782
Epoch 5/40


100%|██████████████████████████████████████████████████████████████████████████████| 1254/1254 [03:55<00:00,  5.33it/s]
100%|████████████████████████████████████████████████████████████████████████████████| 222/222 [00:11<00:00, 19.13it/s]
100%|████████████████████████████████████████████████████████████████████████████████| 261/261 [00:10<00:00, 25.43it/s]


Machinery - Train Loss: 0.6755, Val Loss: 0.6922, Val Acc: 0.7955, Test Acc: 0.7969
Epoch 6/40


100%|██████████████████████████████████████████████████████████████████████████████| 1254/1254 [03:56<00:00,  5.31it/s]
100%|████████████████████████████████████████████████████████████████████████████████| 222/222 [00:11<00:00, 19.12it/s]
100%|████████████████████████████████████████████████████████████████████████████████| 261/261 [00:10<00:00, 25.53it/s]


Machinery - Train Loss: 0.5884, Val Loss: 0.6362, Val Acc: 0.8062, Test Acc: 0.8147
Epoch 7/40


100%|██████████████████████████████████████████████████████████████████████████████| 1254/1254 [03:58<00:00,  5.27it/s]
100%|████████████████████████████████████████████████████████████████████████████████| 222/222 [00:11<00:00, 19.11it/s]
100%|████████████████████████████████████████████████████████████████████████████████| 261/261 [00:10<00:00, 25.27it/s]


Machinery - Train Loss: 0.5200, Val Loss: 0.5965, Val Acc: 0.8096, Test Acc: 0.8147
Epoch 8/40


100%|██████████████████████████████████████████████████████████████████████████████| 1254/1254 [03:54<00:00,  5.34it/s]
100%|████████████████████████████████████████████████████████████████████████████████| 222/222 [00:11<00:00, 19.25it/s]
100%|████████████████████████████████████████████████████████████████████████████████| 261/261 [00:10<00:00, 25.40it/s]


Machinery - Train Loss: 0.4672, Val Loss: 0.5604, Val Acc: 0.8282, Test Acc: 0.8248
Epoch 9/40


100%|██████████████████████████████████████████████████████████████████████████████| 1254/1254 [03:52<00:00,  5.39it/s]
100%|████████████████████████████████████████████████████████████████████████████████| 222/222 [00:11<00:00, 19.43it/s]
100%|████████████████████████████████████████████████████████████████████████████████| 261/261 [00:10<00:00, 25.58it/s]


Machinery - Train Loss: 0.4282, Val Loss: 0.5542, Val Acc: 0.8333, Test Acc: 0.8353
Epoch 10/40


100%|██████████████████████████████████████████████████████████████████████████████| 1254/1254 [03:52<00:00,  5.40it/s]
100%|████████████████████████████████████████████████████████████████████████████████| 222/222 [00:11<00:00, 19.44it/s]
100%|████████████████████████████████████████████████████████████████████████████████| 261/261 [00:10<00:00, 25.59it/s]


Machinery - Train Loss: 0.3956, Val Loss: 0.5274, Val Acc: 0.8367, Test Acc: 0.8277
Epoch 11/40


100%|██████████████████████████████████████████████████████████████████████████████| 1254/1254 [03:52<00:00,  5.40it/s]
100%|████████████████████████████████████████████████████████████████████████████████| 222/222 [00:11<00:00, 19.45it/s]
100%|████████████████████████████████████████████████████████████████████████████████| 261/261 [00:10<00:00, 25.59it/s]


Machinery - Train Loss: 0.3701, Val Loss: 0.5367, Val Acc: 0.8390, Test Acc: 0.8368
Epoch 12/40


100%|██████████████████████████████████████████████████████████████████████████████| 1254/1254 [03:52<00:00,  5.40it/s]
100%|████████████████████████████████████████████████████████████████████████████████| 222/222 [00:11<00:00, 19.45it/s]
100%|████████████████████████████████████████████████████████████████████████████████| 261/261 [00:10<00:00, 25.61it/s]


Machinery - Train Loss: 0.3470, Val Loss: 0.5883, Val Acc: 0.8294, Test Acc: 0.8257
Trigger Times (Machinery): 1
Epoch 13/40


100%|██████████████████████████████████████████████████████████████████████████████| 1254/1254 [03:52<00:00,  5.40it/s]
100%|████████████████████████████████████████████████████████████████████████████████| 222/222 [00:11<00:00, 19.45it/s]
100%|████████████████████████████████████████████████████████████████████████████████| 261/261 [00:10<00:00, 25.59it/s]


Machinery - Train Loss: 0.3322, Val Loss: 0.5384, Val Acc: 0.8384, Test Acc: 0.8397
Trigger Times (Machinery): 2
Epoch 14/40


100%|██████████████████████████████████████████████████████████████████████████████| 1254/1254 [03:52<00:00,  5.40it/s]
100%|████████████████████████████████████████████████████████████████████████████████| 222/222 [00:11<00:00, 19.45it/s]
100%|████████████████████████████████████████████████████████████████████████████████| 261/261 [00:10<00:00, 25.59it/s]


Machinery - Train Loss: 0.3113, Val Loss: 0.5324, Val Acc: 0.8424, Test Acc: 0.8435
Epoch 15/40


100%|██████████████████████████████████████████████████████████████████████████████| 1254/1254 [03:52<00:00,  5.40it/s]
100%|████████████████████████████████████████████████████████████████████████████████| 222/222 [00:11<00:00, 19.46it/s]
100%|████████████████████████████████████████████████████████████████████████████████| 261/261 [00:10<00:00, 25.60it/s]


Machinery - Train Loss: 0.2920, Val Loss: 0.5431, Val Acc: 0.8486, Test Acc: 0.8368
Epoch 16/40


100%|██████████████████████████████████████████████████████████████████████████████| 1254/1254 [03:51<00:00,  5.41it/s]
100%|████████████████████████████████████████████████████████████████████████████████| 222/222 [00:11<00:00, 19.49it/s]
100%|████████████████████████████████████████████████████████████████████████████████| 261/261 [00:10<00:00, 25.65it/s]


Machinery - Train Loss: 0.2928, Val Loss: 0.5636, Val Acc: 0.8395, Test Acc: 0.8344
Trigger Times (Machinery): 1
Epoch 17/40


100%|██████████████████████████████████████████████████████████████████████████████| 1254/1254 [03:51<00:00,  5.41it/s]
100%|████████████████████████████████████████████████████████████████████████████████| 222/222 [00:11<00:00, 19.50it/s]
100%|████████████████████████████████████████████████████████████████████████████████| 261/261 [00:10<00:00, 25.65it/s]


Machinery - Train Loss: 0.2737, Val Loss: 0.5512, Val Acc: 0.8418, Test Acc: 0.8435
Trigger Times (Machinery): 2
Epoch 18/40


100%|██████████████████████████████████████████████████████████████████████████████| 1254/1254 [03:51<00:00,  5.41it/s]
100%|████████████████████████████████████████████████████████████████████████████████| 222/222 [00:11<00:00, 19.50it/s]
100%|████████████████████████████████████████████████████████████████████████████████| 261/261 [00:10<00:00, 25.64it/s]


Machinery - Train Loss: 0.2721, Val Loss: 0.5699, Val Acc: 0.8435, Test Acc: 0.8358
Trigger Times (Machinery): 3
Early stopping for Machinery!


100%|████████████████████████████████████████████████████████████████████████████████| 261/261 [00:10<00:00, 25.65it/s]

Final Test Accuracy (Machinery): 0.8358





### BERT 기반 machinery 모델 학습 및 예측값

> xgboost assembly

In [34]:
import torch
import pickle

# 저장된 최적의 모델을 불러옵니다 (에포크 14에서 저장된 모델)
with open("0924_best_machinery_model.pkl", 'rb') as f:
    best_machinery_model = pickle.load(f)

best_machinery_model.to(device)  # 모델을 GPU로 이동

# Machinery 모델을 평가 모드로 설정
best_machinery_model.eval()

  return torch.load(io.BytesIO(b))


BertForMachinery(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-11): 12 x BertLayer(
          (attention): BertAttention(
            (self): BertSdpaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12, elementw

In [45]:
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from xgboost import XGBClassifier
from collections import Counter
from imblearn.combine import SMOTETomek
from sklearn.metrics import accuracy_score, classification_report
import numpy as np

# 1. 어셈블리 토크나이저 피팅
assembly_tokenizer = Tokenizer(num_words=20000)
assembly_tokenizer.fit_on_texts(data['combined_text'])  # 필요한 데이터를 사용

# 2. 텍스트를 정수 시퀀스로 변환
sequences = assembly_tokenizer.texts_to_sequences(data['combined_text'])

# 3. 시퀀스 패딩
max_len = 50
X_seq = pad_sequences(sequences, maxlen=max_len)

# 어셈블리 토크나이저를 파일로 저장
with open('models/assembly_tokenizer.pkl', 'wb') as f:
    pickle.dump(assembly_tokenizer, f)
    
# 3. 정수형 레이블 (y)
assembly_labels = data['Assembly'].values

label_encoder_assembly = LabelEncoder()
y_assembly = label_encoder_assembly.fit_transform(assembly_labels)

# Train-Test Split을 일관되게 적용
X_seq_train, X_seq_test, y_train_assembly, y_test_assembly = train_test_split(
    X_seq, y_assembly, test_size=0.15, random_state=42, stratify=y_assembly
)

X_seq_train_final, X_seq_val, y_train_assembly_final, y_val_assembly = train_test_split(
    X_seq_train, y_train_assembly, test_size=0.15, random_state=42, stratify=y_train_assembly
)

# 분할된 데이터 크기 확인
print(f"X_seq_train_final shape: {X_seq_train_final.shape}")
print(f"X_seq_val shape: {X_seq_val.shape}")
print(f"X_seq_test shape: {X_seq_test.shape}")

X_seq_train_final shape: (10029, 50)
X_seq_val shape: (1770, 50)
X_seq_test shape: (2083, 50)


In [36]:
def predict_machinery_classes(model, dataloader, device):
    model.eval()
    all_preds = []
    with torch.no_grad():
        for batch in dataloader:
            input_ids, attention_mask, extra_features = [b.to(device) for b in batch[:3]]
            outputs = model(input_ids=input_ids, attention_mask=attention_mask, extra_features=extra_features)
            preds = torch.argmax(outputs, dim=1)  # Get the class with the highest probability
            all_preds.append(preds.cpu().numpy())
    return np.concatenate(all_preds)


In [37]:
machinery_preds_train = predict_machinery_classes(best_machinery_model, train_loader_machinery, device)
machinery_preds_val = predict_machinery_classes(best_machinery_model, val_loader_machinery, device)
machinery_preds_test = predict_machinery_classes(best_machinery_model, test_loader_machinery, device)

In [38]:
# 예측된 machinery 클래스를 시퀀스 데이터에 추가
X_seq_train_with_machinery = np.hstack([X_seq_train_final, machinery_preds_train.reshape(-1, 1)])
X_seq_val_with_machinery = np.hstack([X_seq_val, machinery_preds_val.reshape(-1, 1)])
X_seq_test_with_machinery = np.hstack([X_seq_test, machinery_preds_test.reshape(-1, 1)])

# 결합된 데이터 크기 확인
print(f"X_seq_train_with_machinery shape: {X_seq_train_with_machinery.shape}")
print(f"X_seq_val_with_machinery shape: {X_seq_val_with_machinery.shape}")
print(f"X_seq_test_with_machinery shape: {X_seq_test_with_machinery.shape}")


X_seq_train_with_machinery shape: (10029, 51)
X_seq_val_with_machinery shape: (1770, 51)
X_seq_test_with_machinery shape: (2083, 51)


In [39]:
# 크기가 일치하는지 확인
assert X_seq_train_with_machinery.shape[0] == y_train_assembly_final.shape[0], "X_train과 y_train의 크기가 일치하지 않습니다."

# SMOTE + Tomek Links 적용
smote_tomek = SMOTETomek(random_state=42)
X_resampled_train, y_resampled_train_assembly = smote_tomek.fit_resample(X_seq_train_with_machinery, y_train_assembly_final)

# 결과 확인
print(f"Resampled X shape: {X_resampled_train.shape}")
print(f"Resampled y shape: {y_resampled_train_assembly.shape}")





Resampled X shape: (82185, 51)
Resampled y shape: (82185,)


In [40]:
# 5. XGBoost Assembly 모델 설정 및 학습
assembly_model = XGBClassifier(
    objective='multi:softmax',
    num_class=209,  # Assembly 클래스 수
    learning_rate=0.05,
    max_depth=8,
    n_estimators=200,
    subsample=0.8,
    colsample_bytree=0.8,
    reg_lambda=1,
    random_state=42,
    verbosity=1
)

# SMOTE + Tomek Links로 불균형 데이터를 해결한 후 훈련 세트로 학습
assembly_model.fit(X_resampled_train, y_resampled_train_assembly)

# 6. 성능 평가 (검증 세트)
assembly_preds_val = assembly_model.predict(X_seq_val_with_machinery)
assembly_accuracy_val = accuracy_score(y_val_assembly, assembly_preds_val)
print(f'Assembly Validation Accuracy: {assembly_accuracy_val:.4f}')

# 성능 평가 (테스트 세트)
assembly_preds_test = assembly_model.predict(X_seq_test_with_machinery)
assembly_accuracy_test = accuracy_score(y_test_assembly, assembly_preds_test)
print(f'Assembly Test Accuracy: {assembly_accuracy_test:.4f}')

Assembly Validation Accuracy: 0.7599
Assembly Test Accuracy: 0.7710


In [41]:
from sklearn.model_selection import RandomizedSearchCV
from xgboost import XGBClassifier
from scipy.stats import uniform, randint
import numpy as np

assembly_model = XGBClassifier(
    objective='multi:softmax',
    num_class=209,  # Assembly 클래스 수에 맞게 설정
    random_state=42,
    verbosity=1
)

param_grid = {
    'learning_rate': uniform(0.01, 0.1),  # 0.05를 중심으로 ±0.05 범위
    'max_depth': randint(5, 12),  # 8을 중심으로 ±3
    'n_estimators': randint(150, 300),  # 200을 중심으로 ±50
    'subsample': uniform(0.6, 0.4),  # 0.8을 중심으로 ±0.2
    'colsample_bytree': uniform(0.6, 0.3),  # 0.8을 중심으로 ±0.2
    'reg_lambda': [0.5, 1, 2, 5],  # 다양한 lambda 값
    'reg_alpha': [0, 0.5, 1, 2],  # L1 정규화 추가
    'min_child_weight': randint(1, 6),  # 최소 자식 가중치
    'gamma': uniform(0, 0.5),  # 트리 분할 시 요구되는 최소 손실 감소
}

random_search_assembly = RandomizedSearchCV(
    estimator=assembly_model,
    param_distributions=param_grid,
    n_iter=50,  # 시도할 파라미터 조합 수를 늘릴 수 있습니다
    scoring='accuracy',
    cv=3,
    verbose=1,
    n_jobs=-1,
    random_state=42
)

# 4. RandomizedSearchCV 실행 (훈련 세트 사용)
random_search_assembly.fit(X_seq_train_with_machinery, y_train_assembly_final)

# 5. 최적의 하이퍼파라미터 출력
print(f"Best parameters for Assembly: {random_search_assembly.best_params_}")

# 6. 검증 세트 성능 평가
assembly_preds_val = random_search_assembly.best_estimator_.predict(X_seq_val_with_machinery)
assembly_accuracy_val = accuracy_score(y_val_assembly, assembly_preds_val)
print(f'Assembly Validation Accuracy: {assembly_accuracy_val:.4f}')

# 7. 테스트 세트 성능 평가
assembly_preds_test = random_search_assembly.best_estimator_.predict(X_seq_test_with_machinery)
assembly_accuracy_test = accuracy_score(y_test_assembly, assembly_preds_test)
print(f'Assembly Test Accuracy: {assembly_accuracy_test:.4f}')

Fitting 3 folds for each of 50 candidates, totalling 150 fits
Best parameters for Assembly: {'colsample_bytree': 0.7949321047928289, 'gamma': 0.024029462098516863, 'learning_rate': 0.10491457315913859, 'max_depth': 8, 'min_child_weight': 2, 'n_estimators': 287, 'reg_alpha': 0.5, 'reg_lambda': 2, 'subsample': 0.8012545034320351}
Assembly Validation Accuracy: 0.7791
Assembly Test Accuracy: 0.7792


In [42]:
import joblib

# 최적의 모델을 파일로 저장
joblib.dump(random_search_assembly.best_estimator_, '0924_final_assembly_model.pkl')
print("Model saved as 0924_final_assembly_model.pkl")

Model saved as 0924_final_assembly_model.pkl
