In [1]:
import numpy as np
import pandas as pd
import torch
from torch.utils.data import TensorDataset, DataLoader
import torch.nn as nn
from transformers import BertTokenizer, BertModel
from sklearn.preprocessing import OneHotEncoder, LabelEncoder, StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score
import matplotlib.pyplot as plt
from tqdm import tqdm
import torch.nn.functional as F
from transformers import AdamW
import re

In [2]:
data = pd.read_excel('filtered_30_filled_money.xlsx')

In [3]:
def preprocess_text(text):
    text = text.lower()
    text = re.sub(r'\([^)]*\)', '', text)
    text = re.sub(r'[^\w\s\*/\-\+.,#&]', '', text)
    text = re.sub(r'\s+', ' ', text)
    text = re.sub(r'\b(사용금지|사)\b', '', text, flags=re.IGNORECASE)
    text = text.strip()
    return text

def clean_supplier_name(name):
    name = name.lower()
    name = re.sub(r'coporation|coropration|coproration|corporration', 'corporation', name)
    name = re.sub(r'\(사용금지\)', '', name)
    name = re.sub(r'u\.s\.a', '_usa', name)
    name = re.sub(r'\.', '', name)
    suffixes = r'(corporation|corp|company|co|incorporated|inc|limited|ltd|상사|공사|엔지니어링|주식회사|주|gmbh|pte ltd|llc)'
    name = re.sub(suffixes, '', name, flags=re.IGNORECASE)
    name = re.sub(r'[^\w\s-]', '', name)
    name = re.sub(r'\s+', ' ', name).strip()
    return name

In [4]:
# 텍스트 전처리
data['cleaned_item'] = data['청구품목'].apply(preprocess_text)
data['cleaned_supplier'] = data['발주처'].apply(clean_supplier_name)
data['combined_text'] = data['cleaned_item'].fillna('') + " " + data['Part No.1'].fillna('') + " " + data['cleaned_supplier'].fillna('')


In [5]:
exchange_rates = {'USD': 1, 'KRW': 0.00078, 'EUR': 1.18, 'JPY': 0.0091}

# usd기준해서 금액 통일함 
data['converted_price'] = data.apply(lambda x: x['견적단가'] * exchange_rates[x['견적화폐']], axis=1)


In [6]:
from sklearn.preprocessing import OneHotEncoder

# '견적화폐' 컬럼을 OneHotEncoder를 통해 인코딩
currency_ohe = OneHotEncoder(sparse_output=False) 
currency_encoded = currency_ohe.fit_transform(data[['견적화폐']])

In [7]:
import numpy as np
data['converted_price_log'] = np.log1p(data['converted_price'])  # 로그 변환된 가격


In [8]:
# 레이블 인코딩
machinery_label_encoder = LabelEncoder()
y_machinery= machinery_label_encoder.fit_transform(data['Machinery'])

assembly_label_encoder = LabelEncoder()
y_assembly = assembly_label_encoder.fit_transform(data['Assembly'])

In [9]:
# train_test split 을 위해 하나로 모으고, 분할하고 다시 텍스트랑 추가피쳐로 분리해줄거임 

# 1. 텍스트 + 추가 피처 결합
X = np.concatenate([
    data['combined_text'].values.reshape(-1, 1),  # 2차원 배열로 바꿔서 결합해줌 
    currency_encoded, 
    data['converted_price_log'].values.reshape(-1, 1)  # 통일한단가
], axis=1)

X_train_val, X_test, y_train_val_machinery, y_test_machinery, y_train_val_assembly, y_test_assembly = train_test_split(
    X, y_machinery, y_assembly, 
    test_size=0.15, 
    random_state=42, 
    stratify=y_assembly)  # stratify는 주로 메인 레이블 기준으로 설정

X_train, X_val, y_train_machinery, y_val_machinery, y_train_assembly, y_val_assembly = train_test_split(
    X_train_val, y_train_val_machinery, y_train_val_assembly, 
    test_size=0.15, 
    random_state=42, 
    stratify=y_train_val_assembly)  # 다시 stratify 기준으로 설정

# 크기 확인
print(f"combined_text shape: {data['combined_text'].shape}")
print(f"currency_encoded shape: {currency_encoded.shape}")
print(f"converted_price shape: {data['converted_price'].shape}")
print(f"X shape after concatenation: {X.shape}")

print(f"X_train size: {X_train.shape}")
print(f"X_val size: {X_val.shape}")
print(f"X_test size: {X_test.shape}")


combined_text shape: (13882,)
currency_encoded shape: (13882, 4)
converted_price shape: (13882,)
X shape after concatenation: (13882, 6)
X_train size: (10029, 6)
X_val size: (1770, 6)
X_test size: (2083, 6)


In [10]:
# 텍스트 분리
train_combined_text = X_train[:, 0]
val_combined_text = X_val[:, 0]
test_combined_text = X_test[:, 0]

# 추가 피처 분리 (currency_encoded와 로그 변환된 가격)
train_extra_features = X_train[:, 1:]
val_extra_features = X_val[:, 1:]
test_extra_features = X_test[:, 1:]

# 값이 float으로 변환되도록 nan 처리
train_extra_features = np.nan_to_num(train_extra_features, nan=0.0).astype(float)
val_extra_features = np.nan_to_num(val_extra_features, nan=0.0).astype(float)
test_extra_features = np.nan_to_num(test_extra_features, nan=0.0).astype(float)

# 스케일링 적용
scaler = StandardScaler()
train_extra_features = scaler.fit_transform(train_extra_features)
val_extra_features = scaler.transform(val_extra_features)
test_extra_features = scaler.transform(test_extra_features)



In [11]:
# Torch Tensor로 변환 후 디바이스 이동
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
train_extra_features_tensor = torch.tensor(train_extra_features, dtype=torch.float32).to(device)
val_extra_features_tensor = torch.tensor(val_extra_features, dtype=torch.float32).to(device)
test_extra_features_tensor = torch.tensor(test_extra_features, dtype=torch.float32).to(device)


  train_extra_features_tensor = torch.tensor(train_extra_features, dtype=torch.float32).to(device)


In [12]:
# BERT 토크나이저 (텍스트처리)
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')



In [13]:
# X중 텍스트만 BERT 입력 형식으로 변환
def encode_data(texts):
    return tokenizer(texts.tolist(), padding=True, truncation=True, max_length=128, return_tensors='pt')

train_machinery_encodings = encode_data(train_combined_text)
val_machinery_encodings = encode_data(val_combined_text)
test_machinery_encodings = encode_data(test_combined_text)

# Assembly용 인코딩
train_assembly_encodings = encode_data(train_combined_text)  # 같은 텍스트지만 독립적으로 관리
val_assembly_encodings = encode_data(val_combined_text)
test_assembly_encodings = encode_data(test_combined_text)


In [14]:
train_machinery_dataset = TensorDataset(
    train_machinery_encodings['input_ids'],
    train_machinery_encodings['attention_mask'],
    train_extra_features_tensor,
    torch.tensor(y_train_machinery, dtype=torch.long).to(device)  # Machinery 레이블
)

val_machinery_dataset = TensorDataset(
    val_machinery_encodings['input_ids'],
    val_machinery_encodings['attention_mask'],
    val_extra_features_tensor,
    torch.tensor(y_val_machinery, dtype=torch.long).to(device)
)

test_machinery_dataset = TensorDataset(
    test_machinery_encodings['input_ids'],
    test_machinery_encodings['attention_mask'],
    test_extra_features_tensor,
    torch.tensor(y_test_machinery, dtype=torch.long).to(device)
)

In [15]:
print(f"y_train size: {y_train_machinery.shape}")
print(f"y_val size: {y_val_machinery.shape}")
print(f"y_test size: {y_test_machinery.shape}")


y_train size: (10029,)
y_val size: (1770,)
y_test size: (2083,)


In [16]:
# 3. DataLoader 생성
from torch.utils.data import DataLoader

batch_size = 16

train_loader_machinery = DataLoader(train_machinery_dataset, batch_size=batch_size, shuffle=True)
val_loader_machinery  = DataLoader(val_machinery_dataset, batch_size=batch_size, shuffle=False)
test_loader_machinery = DataLoader(test_machinery_dataset, batch_size=batch_size, shuffle=False)

In [17]:
class BertForMachinery(nn.Module):
    def __init__(self, num_machinery_labels, extra_features_dim):
        super(BertForMachinery, self).__init__()
        self.bert = BertModel.from_pretrained('bert-base-uncased')
        self.fc1 = nn.Linear(773, 256)
        self.relu = nn.ReLU()
        self.dropout = nn.Dropout(0.2)
        self.machinery_classifier = nn.Linear(256, num_machinery_labels)

    def forward(self, input_ids, attention_mask, extra_features):
        outputs = self.bert(input_ids=input_ids, attention_mask=attention_mask)
        pooled_output = outputs.pooler_output
        
        if extra_features.dim() == 1:
            extra_features = extra_features.unsqueeze(1)
        
        machinery_combined_features = torch.cat((pooled_output, extra_features), dim=1)
        x = self.fc1(machinery_combined_features)
        x = self.relu(x)
        x = self.dropout(x)
        machinery_outputs = self.machinery_classifier(x)
        
        return machinery_outputs

In [18]:
#!conda install conda-forge::optuna -y

In [19]:
import torch
torch.cuda.empty_cache()

In [20]:
def train_machinery(model, dataloader, optimizer, device, loss_fn):
    model.train()
    total_loss = 0
    for batch in tqdm(dataloader):
        input_ids, attention_mask, extra_features, labels = [b.to(device) for b in batch]  # 순서 수정
        
        if labels.dim() > 1:
            labels = labels.squeeze()

        optimizer.zero_grad()
        outputs = model(input_ids, attention_mask=attention_mask, extra_features=extra_features)
        labels = labels.to(torch.int64)  # CrossEntropyLoss에 맞게 변환
        loss = loss_fn(outputs, labels)
        total_loss += loss.item()
        loss.backward()
        optimizer.step()

    return total_loss / len(dataloader)


In [21]:
import torch.nn.functional as F

# 평가 함수 - logits-62개짜리 각각의 자신감
def evaluate_machinery(model, dataloader, device, loss_fn_machinery):
    model.eval()
    total_loss = 0
    total_correct = 0
    total_samples = 0
    machinery_predictions = []
    
    with torch.no_grad():
        for batch in tqdm(dataloader):
            input_ids, attention_mask, extra_features, labels = [b.to(device) for b in batch]
            
            # 모델 예측
            outputs = model(input_ids, attention_mask=attention_mask, extra_features=extra_features)
            
            # Loss 계산
            loss = loss_fn_machinery(outputs, labels)
            total_loss += loss.item()
            
            _, predicted = torch.max(outputs, 1)
            machinery_predictions.append(predicted.cpu().numpy())  # 리스트에 추가
            
            # 정확도 계산
            total_correct += (predicted == labels).sum().item()
            total_samples += labels.size(0)

    accuracy = total_correct / total_samples
    avg_loss = total_loss / len(dataloader)
    machinery_predictions = np.concatenate(machinery_predictions, axis=0)  # 최종 예측값
    
    return avg_loss, accuracy, machinery_predictions

> class, train, evaluate define 하고 모델 피팅 전 "하이퍼파라미터 튜닝" => 최적의 param 나온 걸로 모델 정의하고 피팅함

In [23]:
import optuna
import torch
from torch import nn
from transformers import AdamW

loss_fn_machinery=torch.nn.CrossEntropyLoss()

# Objective Function 정의
def objective(trial):
    # 하이퍼파라미터 설정
    learning_rate = trial.suggest_loguniform('learning_rate', 1e-5, 1e-2)  # 1e-5 ~ 1e-2
    dropout_rate = trial.suggest_uniform('dropout_rate', 0.1, 0.5)         # 0.1 ~ 0.5
    optimizer_name = trial.suggest_categorical('optimizer', ['adamw', 'sgd'])  # AdamW, SGD 선택
    
    # 모델 정의
    model = BertForMachinery(num_machinery_labels=len(machinery_label_encoder.classes_), extra_features_dim=10)
    model.dropout = nn.Dropout(dropout_rate)
    model.to(device)
    
    # Optimizer 선택
    if optimizer_name == 'adamw':
        optimizer = AdamW(model.parameters(), lr=learning_rate)
    else:  # 'sgd'
        optimizer = torch.optim.SGD(model.parameters(), lr=learning_rate, momentum=0.9)

    # Training 및 Validation 평가
    train_loss = train_machinery(model, train_loader_machinery, optimizer, device, loss_fn_machinery)
    val_loss, val_acc, machinery_predictions = evaluate_machinery(model, val_loader_machinery, device, loss_fn_machinery)

    return val_acc  # Validation accuracy 최대화

# Optuna로 Hyperparameter Optimization 실행
study = optuna.create_study(direction='maximize')  # 최대화 방향 설정
study.optimize(objective, n_trials=50)  # 50번의 시도

# 최적 하이퍼파라미터 출력
print("Best hyperparameters: ", study.best_trial.params)

[I 2024-09-20 10:15:53,588] A new study created in memory with name: no-name-dedb0b86-950e-4fcb-b45e-d8628cb10483
  learning_rate = trial.suggest_loguniform('learning_rate', 1e-5, 1e-2)  # 1e-5 ~ 1e-2
  dropout_rate = trial.suggest_uniform('dropout_rate', 0.1, 0.5)         # 0.1 ~ 0.5
  attn_output = torch.nn.functional.scaled_dot_product_attention(
100%|████████████████████████████████████████████████████████████████████████████████| 627/627 [02:50<00:00,  3.68it/s]
100%|████████████████████████████████████████████████████████████████████████████████| 111/111 [00:12<00:00,  8.81it/s]
[I 2024-09-20 10:18:57,436] Trial 0 finished with value: 0.576271186440678 and parameters: {'learning_rate': 0.001013134137284379, 'dropout_rate': 0.4811521460895716, 'optimizer': 'sgd'}. Best is trial 0 with value: 0.576271186440678.
100%|████████████████████████████████████████████████████████████████████████████████| 627/627 [03:11<00:00,  3.27it/s]
100%|████████████████████████████████████████████████

Best hyperparameters:  {'learning_rate': 0.002361194991533609, 'dropout_rate': 0.18317574580310583, 'optimizer': 'sgd'}


In [22]:
# 디바이스 설정
# best_params = study.best_trial.params
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
# 4. 최적 하이퍼파라미터로 모델 정의 및 학습
#machinery_model = BertForMachinery(num_machinery_labels=len(machinery_label_encoder.classes_), extra_features_dim=10).to(device)
# machinery_model.dropout = nn.Dropout(best_params['dropout_rate'])
machinery_model = BertForMachinery(num_machinery_labels=len(machinery_label_encoder.classes_), extra_features_dim=10) 
machinery_model.to(device)

optimizer_machinery = AdamW(machinery_model.parameters(), lr=3e-5)
loss_fn_machinery=torch.nn.CrossEntropyLoss()



In [28]:
# 옵티마이저 및 학습률 스케줄러 설정
#optimizer_machinery = (
#    AdamW(machinery_model.parameters(), lr=best_params['learning_rate']) 
#    if best_params['optimizer'] == 'adamw' 
#    else torch.optim.SGD(machinery_model.parameters(), lr=best_params['learning_rate'], momentum=0.9)
#)

In [23]:
import torch
torch.cuda.empty_cache()

In [24]:
# Machinery 모델 학습 실행
num_epochs = 20
best_val_acc_machinery = 0
patience = 3
trigger_times = 0

for epoch in range(num_epochs):
    print(f"Epoch {epoch+1}/{num_epochs}")
    
    # Machinery 모델 학습
    train_loss_machinery = train_machinery(
        machinery_model, 
        train_loader_machinery, 
        optimizer_machinery, 
        device, 
        loss_fn_machinery
    )
    
    # Machinery 모델 평가
    val_loss_machinery, val_acc_machinery, val_preds_machinery = evaluate_machinery(
        machinery_model, 
        val_loader_machinery, 
        device, 
        loss_fn_machinery
    )
    test_loss_machinery, test_acc_machinery, test_preds_machinery = evaluate_machinery(
        machinery_model, 
        test_loader_machinery, 
        device, 
        loss_fn_machinery
    )
    
    print(f"Machinery - Train Loss: {train_loss_machinery:.4f}, Val Loss: {val_loss_machinery:.4f}, Val Acc: {val_acc_machinery:.4f}, Test Acc: {test_acc_machinery:.4f}")
    
    # Early Stopping for Machinery
    if val_acc_machinery > best_val_acc_machinery:
        best_val_acc_machinery = val_acc_machinery
        trigger_times = 0
        torch.save(machinery_model.state_dict(), "final_machinery_model.pth")
    else:
        trigger_times += 1
        print(f"Trigger Times (Machinery): {trigger_times}")
        if trigger_times >= patience:
            print("Early stopping for Machinery!")
            break

# 최종 테스트 성능 평가
final_test_loss_machinery, final_test_acc_machinery, final_machinery_predictions = evaluate_machinery(
    machinery_model, 
    test_loader_machinery, 
    device, 
    loss_fn_machinery
)
print(f"Final Test Accuracy (Machinery): {final_test_acc_machinery:.4f}")
torch.save(machinery_model.state_dict(), "final_machinery_model.pth")


Epoch 1/20


  attn_output = torch.nn.functional.scaled_dot_product_attention(
100%|████████████████████████████████████████████████████████████████████████████████| 627/627 [03:06<00:00,  3.36it/s]
100%|████████████████████████████████████████████████████████████████████████████████| 111/111 [00:12<00:00,  8.94it/s]
100%|████████████████████████████████████████████████████████████████████████████████| 131/131 [00:10<00:00, 12.00it/s]


Machinery - Train Loss: 2.0721, Val Loss: 1.3901, Val Acc: 0.6356, Test Acc: 0.6323
Epoch 2/20


100%|████████████████████████████████████████████████████████████████████████████████| 627/627 [03:06<00:00,  3.36it/s]
100%|████████████████████████████████████████████████████████████████████████████████| 111/111 [00:12<00:00,  8.90it/s]
100%|████████████████████████████████████████████████████████████████████████████████| 131/131 [00:11<00:00, 11.85it/s]


Machinery - Train Loss: 1.2565, Val Loss: 1.0643, Val Acc: 0.7023, Test Acc: 0.6976
Epoch 3/20


100%|████████████████████████████████████████████████████████████████████████████████| 627/627 [03:06<00:00,  3.37it/s]
100%|████████████████████████████████████████████████████████████████████████████████| 111/111 [00:12<00:00,  8.90it/s]
100%|████████████████████████████████████████████████████████████████████████████████| 131/131 [00:11<00:00, 11.91it/s]


Machinery - Train Loss: 0.9909, Val Loss: 0.9139, Val Acc: 0.7390, Test Acc: 0.7331
Epoch 4/20


100%|████████████████████████████████████████████████████████████████████████████████| 627/627 [03:07<00:00,  3.34it/s]
100%|████████████████████████████████████████████████████████████████████████████████| 111/111 [00:12<00:00,  8.71it/s]
100%|████████████████████████████████████████████████████████████████████████████████| 131/131 [00:11<00:00, 11.48it/s]


Machinery - Train Loss: 0.8381, Val Loss: 0.7969, Val Acc: 0.7576, Test Acc: 0.7504
Epoch 5/20


100%|████████████████████████████████████████████████████████████████████████████████| 627/627 [03:07<00:00,  3.34it/s]
100%|████████████████████████████████████████████████████████████████████████████████| 111/111 [00:12<00:00,  8.90it/s]
100%|████████████████████████████████████████████████████████████████████████████████| 131/131 [00:10<00:00, 11.91it/s]


Machinery - Train Loss: 0.7175, Val Loss: 0.7505, Val Acc: 0.7780, Test Acc: 0.7643
Epoch 6/20


100%|████████████████████████████████████████████████████████████████████████████████| 627/627 [03:08<00:00,  3.32it/s]
100%|████████████████████████████████████████████████████████████████████████████████| 111/111 [00:12<00:00,  8.71it/s]
100%|████████████████████████████████████████████████████████████████████████████████| 131/131 [00:11<00:00, 11.85it/s]


Machinery - Train Loss: 0.6354, Val Loss: 0.6658, Val Acc: 0.7921, Test Acc: 0.7926
Epoch 7/20


100%|████████████████████████████████████████████████████████████████████████████████| 627/627 [03:07<00:00,  3.34it/s]
100%|████████████████████████████████████████████████████████████████████████████████| 111/111 [00:12<00:00,  8.80it/s]
100%|████████████████████████████████████████████████████████████████████████████████| 131/131 [00:11<00:00, 11.88it/s]


Machinery - Train Loss: 0.5699, Val Loss: 0.6159, Val Acc: 0.8102, Test Acc: 0.8080
Epoch 8/20


100%|████████████████████████████████████████████████████████████████████████████████| 627/627 [03:06<00:00,  3.36it/s]
100%|████████████████████████████████████████████████████████████████████████████████| 111/111 [00:12<00:00,  8.81it/s]
100%|████████████████████████████████████████████████████████████████████████████████| 131/131 [00:11<00:00, 11.82it/s]


Machinery - Train Loss: 0.5121, Val Loss: 0.6201, Val Acc: 0.8051, Test Acc: 0.8051
Trigger Times (Machinery): 1
Epoch 9/20


100%|████████████████████████████████████████████████████████████████████████████████| 627/627 [03:05<00:00,  3.37it/s]
100%|████████████████████████████████████████████████████████████████████████████████| 111/111 [00:12<00:00,  8.89it/s]
100%|████████████████████████████████████████████████████████████████████████████████| 131/131 [00:11<00:00, 11.90it/s]


Machinery - Train Loss: 0.4584, Val Loss: 0.5659, Val Acc: 0.8254, Test Acc: 0.8248
Epoch 10/20


100%|████████████████████████████████████████████████████████████████████████████████| 627/627 [03:09<00:00,  3.31it/s]
100%|████████████████████████████████████████████████████████████████████████████████| 111/111 [00:12<00:00,  8.76it/s]
100%|████████████████████████████████████████████████████████████████████████████████| 131/131 [00:11<00:00, 11.85it/s]


Machinery - Train Loss: 0.4281, Val Loss: 0.5486, Val Acc: 0.8260, Test Acc: 0.8301
Epoch 11/20


100%|████████████████████████████████████████████████████████████████████████████████| 627/627 [03:10<00:00,  3.28it/s]
100%|████████████████████████████████████████████████████████████████████████████████| 111/111 [00:13<00:00,  8.41it/s]
100%|████████████████████████████████████████████████████████████████████████████████| 131/131 [00:11<00:00, 11.26it/s]


Machinery - Train Loss: 0.4014, Val Loss: 0.5530, Val Acc: 0.8266, Test Acc: 0.8181
Epoch 12/20


100%|████████████████████████████████████████████████████████████████████████████████| 627/627 [03:11<00:00,  3.28it/s]
100%|████████████████████████████████████████████████████████████████████████████████| 111/111 [00:12<00:00,  8.70it/s]
100%|████████████████████████████████████████████████████████████████████████████████| 131/131 [00:11<00:00, 11.21it/s]


Machinery - Train Loss: 0.3789, Val Loss: 0.5754, Val Acc: 0.8367, Test Acc: 0.8344
Epoch 13/20


100%|████████████████████████████████████████████████████████████████████████████████| 627/627 [05:05<00:00,  2.05it/s]
100%|████████████████████████████████████████████████████████████████████████████████| 111/111 [00:25<00:00,  4.33it/s]
100%|████████████████████████████████████████████████████████████████████████████████| 131/131 [00:21<00:00,  5.96it/s]


Machinery - Train Loss: 0.3573, Val Loss: 0.5412, Val Acc: 0.8429, Test Acc: 0.8382
Epoch 14/20


100%|████████████████████████████████████████████████████████████████████████████████| 627/627 [04:04<00:00,  2.57it/s]
100%|████████████████████████████████████████████████████████████████████████████████| 111/111 [00:12<00:00,  8.73it/s]
100%|████████████████████████████████████████████████████████████████████████████████| 131/131 [00:11<00:00, 11.84it/s]


Machinery - Train Loss: 0.3326, Val Loss: 0.5363, Val Acc: 0.8446, Test Acc: 0.8377
Epoch 15/20


100%|████████████████████████████████████████████████████████████████████████████████| 627/627 [03:06<00:00,  3.36it/s]
100%|████████████████████████████████████████████████████████████████████████████████| 111/111 [00:12<00:00,  8.85it/s]
100%|████████████████████████████████████████████████████████████████████████████████| 131/131 [00:11<00:00, 11.78it/s]


Machinery - Train Loss: 0.3177, Val Loss: 0.5397, Val Acc: 0.8345, Test Acc: 0.8301
Trigger Times (Machinery): 1
Epoch 16/20


100%|████████████████████████████████████████████████████████████████████████████████| 627/627 [03:16<00:00,  3.19it/s]
100%|████████████████████████████████████████████████████████████████████████████████| 111/111 [00:13<00:00,  8.48it/s]
100%|████████████████████████████████████████████████████████████████████████████████| 131/131 [00:11<00:00, 11.39it/s]


Machinery - Train Loss: 0.2998, Val Loss: 0.5457, Val Acc: 0.8407, Test Acc: 0.8373
Trigger Times (Machinery): 2
Epoch 17/20


100%|████████████████████████████████████████████████████████████████████████████████| 627/627 [03:06<00:00,  3.36it/s]
100%|████████████████████████████████████████████████████████████████████████████████| 111/111 [00:12<00:00,  8.94it/s]
100%|████████████████████████████████████████████████████████████████████████████████| 131/131 [00:10<00:00, 12.00it/s]


Machinery - Train Loss: 0.2840, Val Loss: 0.5342, Val Acc: 0.8424, Test Acc: 0.8382
Trigger Times (Machinery): 3
Early stopping for Machinery!


100%|████████████████████████████████████████████████████████████████████████████████| 131/131 [00:10<00:00, 11.96it/s]


Final Test Accuracy (Machinery): 0.8382


### BERT 기반 machinery 모델 학습 및 예측값

> xgboost assembly

In [135]:
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from xgboost import XGBClassifier
from collections import Counter
from imblearn.combine import SMOTETomek
from sklearn.metrics import accuracy_score, classification_report
import numpy as np

# 1. 텍스트를 정수 시퀀스로 변환
tokenizer = Tokenizer(num_words=20000)
tokenizer.fit_on_texts(data['combined_text'])
sequences = tokenizer.texts_to_sequences(data['combined_text'])

# 2. 시퀀스 패딩 (X_seq로 변경)
max_len = 50
X_seq = pad_sequences(sequences, maxlen=max_len)

# 3. 정수형 레이블 (y)
assembly_labels = data['Assembly'].values

label_encoder_assembly = LabelEncoder()
y_assembly = label_encoder_assembly.fit_transform(assembly_labels)

# Train-Test Split을 일관되게 적용
X_seq_train, X_seq_test, y_train_assembly, y_test_assembly = train_test_split(
    X_seq, y_assembly, test_size=0.15, random_state=42, stratify=y_assembly
)

X_seq_train_final, X_seq_val, y_train_assembly_final, y_val_assembly = train_test_split(
    X_seq_train, y_train_assembly, test_size=0.15, random_state=42, stratify=y_train_assembly
)

# 분할된 데이터 크기 확인
print(f"X_seq_train_final shape: {X_seq_train_final.shape}")
print(f"X_seq_val shape: {X_seq_val.shape}")
print(f"X_seq_test shape: {X_seq_test.shape}")

X_seq_train_final shape: (10029, 50)
X_seq_val shape: (1770, 50)
X_seq_test shape: (2083, 50)


In [129]:
def predict_machinery_classes(model, dataloader, device):
    model.eval()
    all_preds = []
    with torch.no_grad():
        for batch in dataloader:
            input_ids, attention_mask, extra_features = [b.to(device) for b in batch[:3]]
            outputs = model(input_ids=input_ids, attention_mask=attention_mask, extra_features=extra_features)
            preds = torch.argmax(outputs, dim=1)  # Get the class with the highest probability
            all_preds.append(preds.cpu().numpy())
    return np.concatenate(all_preds)


In [130]:

# 1. Machinery 예측 클래스 가져오기
machinery_preds_train = predict_machinery_classes(machinery_model, train_loader_machinery, device)
machinery_preds_val = predict_machinery_classes(machinery_model, val_loader_machinery, device)
machinery_preds_test = predict_machinery_classes(machinery_model, test_loader_machinery, device)


In [136]:
# 2. Machinery 예측값을 시퀀스 데이터에 추가
X_seq_train_with_machinery = np.hstack([X_seq_train_final, machinery_preds_train.reshape(-1, 1)])
X_seq_val_with_machinery = np.hstack([X_seq_val, machinery_preds_val.reshape(-1, 1)])
X_seq_test_with_machinery = np.hstack([X_seq_test, machinery_preds_test.reshape(-1, 1)])


In [137]:
# 데이터 크기 확인
print(f"X_seq_train_with_machinery shape: {X_seq_train_with_machinery.shape}")
print(f"y_train_assembly_final shape: {y_train_assembly_final.shape}")

X_seq_train_with_machinery shape: (10029, 51)
y_train_assembly_final shape: (10029,)


In [139]:
# 크기가 일치하는지 확인
assert X_seq_train_with_machinery.shape[0] == y_train_assembly_final.shape[0], "X_train과 y_train의 크기가 일치하지 않습니다."

# SMOTE + Tomek Links 적용
smote_tomek = SMOTETomek(random_state=42)
X_resampled_train, y_resampled_train_assembly = smote_tomek.fit_resample(X_seq_train_with_machinery, y_train_assembly_final)

# 결과 확인
print(f"Resampled X shape: {X_resampled_train.shape}")
print(f"Resampled y shape: {y_resampled_train_assembly.shape}")




Resampled X shape: (82267, 51)
Resampled y shape: (82267,)


In [141]:
# 5. XGBoost Assembly 모델 설정 및 학습
assembly_model = XGBClassifier(
    objective='multi:softmax',
    num_class=209,  # Assembly 클래스 수
    learning_rate=0.05,
    max_depth=8,
    n_estimators=200,
    subsample=0.8,
    colsample_bytree=0.8,
    reg_lambda=1,
    random_state=42,
    verbosity=1
)

# SMOTE + Tomek Links로 불균형 데이터를 해결한 후 훈련 세트로 학습
assembly_model.fit(X_resampled_train, y_resampled_train_assembly)

# 6. 성능 평가 (검증 세트)
assembly_preds_val = assembly_model.predict(X_seq_val_with_machinery)
assembly_accuracy_val = accuracy_score(y_val_assembly, assembly_preds_val)
print(f'Assembly Validation Accuracy: {assembly_accuracy_val:.4f}')

# 성능 평가 (테스트 세트)
assembly_preds_test = assembly_model.predict(X_seq_test_with_machinery)
assembly_accuracy_test = accuracy_score(y_test_assembly, assembly_preds_test)
print(f'Assembly Test Accuracy: {assembly_accuracy_test:.4f}')

Assembly Validation Accuracy: 0.7701
Assembly Test Accuracy: 0.7835


In [143]:
from sklearn.model_selection import RandomizedSearchCV
from xgboost import XGBClassifier
from scipy.stats import uniform, randint
import numpy as np

assembly_model = XGBClassifier(
    objective='multi:softmax',
    num_class=209,  # Assembly 클래스 수에 맞게 설정
    random_state=42,
    verbosity=1
)

param_grid = {
    'learning_rate': uniform(0.03, 0.03),  # 0.05를 중심으로 ±0.03
    'max_depth': randint(7, 10),  # 8을 중심으로 ±1
    'n_estimators': randint(180, 220),  # 200을 중심으로 ±20
    'subsample': uniform(0.75, 0.1),  # 0.8을 중심으로 ±0.05
    'colsample_bytree': uniform(0.75, 0.05),  # 0.8을 중심으로 ±0.05
    'reg_lambda': [1, 2],
}

# 3. RandomizedSearchCV 설정
random_search_assembly = RandomizedSearchCV(
    estimator=assembly_model,
    param_distributions=param_grid,
    n_iter=30,  # 시도할 파라미터 조합 수
    scoring='accuracy',
    cv=3,  # 교차 검증 fold 수
    verbose=1,
    n_jobs=-1,  # 가능한 모든 코어 사용
    random_state=42
)

# 4. RandomizedSearchCV 실행 (훈련 세트 사용)
random_search_assembly.fit(X_seq_train_with_machinery, y_train_assembly_final)

# 5. 최적의 하이퍼파라미터 출력
print(f"Best parameters for Assembly: {random_search_assembly.best_params_}")

# 6. 검증 세트 성능 평가
assembly_preds_val = random_search_assembly.best_estimator_.predict(X_seq_val_with_machinery)
assembly_accuracy_val = accuracy_score(y_val_assembly, assembly_preds_val)
print(f'Assembly Validation Accuracy: {assembly_accuracy_val:.4f}')

# 7. 테스트 세트 성능 평가
assembly_preds_test = random_search_assembly.best_estimator_.predict(X_seq_test_with_machinery)
assembly_accuracy_test = accuracy_score(y_test_assembly, assembly_preds_test)
print(f'Assembly Test Accuracy: {assembly_accuracy_test:.4f}')

Fitting 3 folds for each of 30 candidates, totalling 90 fits
Best parameters for Assembly: {'colsample_bytree': 0.7687270059423681, 'learning_rate': 0.05852142919229748, 'max_depth': 9, 'n_estimators': 187, 'reg_lambda': 1, 'subsample': 0.8096850157946487}
Assembly Validation Accuracy: 0.7983
Assembly Test Accuracy: 0.8113


In [144]:
import joblib

# 최적의 모델을 파일로 저장
joblib.dump(random_search_assembly.best_estimator_, 'berttoxgboost_assembly_model.pkl')
print("Model saved as berttoxgboost_assembly_model.pkl")

Model saved as berttoxgboost_assembly_model.pkl
