In [1]:
import pandas as pd
import numpy as np
import torch.nn as nn
import torch

import re
from transformers import BertTokenizer, BertForSequenceClassification, AdamW, BertModel
from torch.utils.data import DataLoader, TensorDataset
from sklearn.preprocessing import OneHotEncoder
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from tqdm import tqdm

In [2]:
data = pd.read_excel('filtered_30_filled_money.xlsx')

In [3]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 13882 entries, 0 to 13881
Data columns (total 32 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   청구서번호        13882 non-null  object 
 1   No.          13882 non-null  int64  
 2   Subject      13872 non-null  object 
 3   Machinery    13882 non-null  object 
 4   Assembly     13882 non-null  object 
 5   청구품목         13882 non-null  object 
 6   Unnamed: 6   0 non-null      float64
 7   Part No.1    13881 non-null  object 
 8   Part No.2    2430 non-null   object 
 9   청구량          13818 non-null  float64
 10  견적           13698 non-null  object 
 11  견적수량         13818 non-null  float64
 12  견적화폐         13882 non-null  object 
 13  견적단가         13882 non-null  float64
 14  발주번호         13882 non-null  object 
 15  발주처          13882 non-null  object 
 16  발주           13882 non-null  object 
 17  발주수량         13818 non-null  float64
 18  발주금액         13818 non-null  float64
 19  D/T 

In [4]:
data['견적화폐'].isnull().sum()

0

In [5]:
def preprocess_text(text):
    text = text.lower()
    text = re.sub(r'\([^)]*\)', '', text)
    text = re.sub(r'[^\w\s\*/\-\+.,#&]', '', text)
    text = re.sub(r'\s+', ' ', text)
    text = re.sub(r'\b(사용금지|사)\b', '', text, flags=re.IGNORECASE)
    text = text.strip()
    return text

def clean_supplier_name(name):
    name = name.lower()
    name = re.sub(r'coporation|coropration|coproration|corporration', 'corporation', name)
    name = re.sub(r'\(사용금지\)', '', name)
    name = re.sub(r'u\.s\.a', '_usa', name)
    name = re.sub(r'\.', '', name)
    suffixes = r'(corporation|corp|company|co|incorporated|inc|limited|ltd|상사|공사|엔지니어링|주식회사|주|gmbh|pte ltd|llc)'
    name = re.sub(suffixes, '', name, flags=re.IGNORECASE)
    name = re.sub(r'[^\w\s-]', '', name)
    name = re.sub(r'\s+', ' ', name).strip()
    return name

In [6]:
# 텍스트 전처리
data['cleaned_item'] = data['청구품목'].apply(preprocess_text)
data['cleaned_supplier'] = data['발주처'].apply(clean_supplier_name)
data['combined_text'] = data['cleaned_item'].fillna('') + " " + data['Part No.1'].fillna('') + " " + data['cleaned_supplier'].fillna('')


In [7]:
exchange_rates = {'USD': 1, 'KRW': 0.00078, 'EUR': 1.18, 'JPY': 0.0091}

# usd기준해서 금액 통일함 
data['converted_price'] = data.apply(lambda x: x['견적단가'] * exchange_rates[x['견적화폐']], axis=1)


In [8]:
print(data['견적화폐'].unique(), data['견적화폐'].isnull().sum())


['USD' 'KRW' 'EUR' 'JPY'] 0


In [9]:
currency_ohe = OneHotEncoder(sparse_output=False) 
currency_encoded = currency_ohe.fit_transform(data[['견적화폐']])

In [10]:
# 레이블 인코딩
machinery_label_encoder = LabelEncoder()
y_machinery = machinery_label_encoder.fit_transform(data['Machinery'])

assembly_label_encoder = LabelEncoder()
y_assembly = assembly_label_encoder.fit_transform(data['Assembly'])

In [11]:
# train_test split 을 위해 하나로 모으고, 분할하고 다시 텍스트랑 추가피쳐로 분리해줄거임 

# 1. 텍스트 + 추가 피처 결합
X = np.concatenate([
    data['combined_text'].values.reshape(-1, 1),  # 2차원 배열로 바꿔서 결합해줌 
    currency_encoded, 
    data['converted_price'].values.reshape(-1, 1)  # 통일한단가
], axis=1)

X_train_val, X_test, y_train_val_machinery, y_test_machinery, y_train_val_assembly, y_test_assembly = train_test_split(
    X, y_machinery, y_assembly, test_size=0.15, random_state=42, stratify=y_machinery)

X_train, X_val, y_train_machinery, y_val_machinery, y_train_assembly, y_val_assembly = train_test_split(
    X_train_val, y_train_val_machinery, y_train_val_assembly, test_size=0.15, random_state=42, stratify=y_train_val_machinery)

# 크기 확인
print(f"combined_text shape: {data['combined_text'].shape}")
print(f"currency_encoded shape: {currency_encoded.shape}")
print(f"converted_price shape: {data['converted_price'].shape}")
print(f"X shape after concatenation: {X.shape}")

print(f"X_train size: {X_train.shape}")
print(f"X_val size: {X_val.shape}")
print(f"X_test size: {X_test.shape}")


combined_text shape: (13882,)
currency_encoded shape: (13882, 4)
converted_price shape: (13882,)
X shape after concatenation: (13882, 6)
X_train size: (10029, 6)
X_val size: (1770, 6)
X_test size: (2083, 6)


In [12]:
#텍스트분리
train_combined_text = X_train[:, 0] 
val_combined_text = X_val[:, 0]
test_combined_text = X_test[:, 0]

train_extra_features = X_train[:, 1:]  # 이 부분에서 이미 2차원으로 분리됨
val_extra_features = X_val[:, 1:]
test_extra_features = X_test[:, 1:]

# object타입이 섞여있다고 해서 astype float 명시해줌
train_extra_features = np.nan_to_num(train_extra_features, nan=0.0).astype(float)
val_extra_features = np.nan_to_num(val_extra_features, nan=0.0).astype(float)
test_extra_features = np.nan_to_num(test_extra_features, nan=0.0).astype(float)

# Torch Tensor로 변환 - 추가로 변환할 필요 없이 2차원 유지
train_extra_features_tensor = torch.tensor(train_extra_features, dtype=torch.float32)  # 이미 2차원
val_extra_features_tensor = torch.tensor(val_extra_features, dtype=torch.float32)
test_extra_features_tensor = torch.tensor(test_extra_features, dtype=torch.float32)

# 크기 확인
print(f"train_extra_features size: {train_extra_features.shape}")
print(f"val_extra_features size: {val_extra_features.shape}")
print(f"test_extra_features size: {test_extra_features.shape}")



train_extra_features size: (10029, 5)
val_extra_features size: (1770, 5)
test_extra_features size: (2083, 5)


In [13]:
# 데이터 타입 확인
print(f"train_extra_features dtype: {train_extra_features.dtype}")
print(f"val_extra_features dtype: {val_extra_features.dtype}")
print(f"test_extra_features dtype: {test_extra_features.dtype}")

train_extra_features dtype: float64
val_extra_features dtype: float64
test_extra_features dtype: float64


In [14]:
# BERT 토크나이저 (텍스트처리)
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')



In [15]:
# X중 텍스트만 BERT 입력 형식으로 변환
def encode_data(texts):
    return tokenizer(texts.tolist(), padding=True, truncation=True, max_length=128, return_tensors='pt')

train_encodings = encode_data(train_combined_text)
val_encodings = encode_data(val_combined_text)
test_encodings = encode_data(test_combined_text)


In [16]:
# BERT 텍스트 인코딩 + 추가 피처 더해서 dataset 생성
train_dataset = TensorDataset(
    train_encodings['input_ids'],
    train_encodings['attention_mask'],
    train_extra_features_tensor,
    torch.tensor(y_train_machinery),
    torch.tensor(y_train_assembly)
)
val_dataset = TensorDataset(
    val_encodings['input_ids'],
    val_encodings['attention_mask'],
    val_extra_features_tensor,
    torch.tensor(y_val_machinery),
    torch.tensor(y_val_assembly)
)
test_dataset = TensorDataset(
    test_encodings['input_ids'],
    test_encodings['attention_mask'],
    test_extra_features_tensor,
    torch.tensor(y_test_machinery),
    torch.tensor(y_test_assembly)
)

In [17]:
print(f"y_train size: {y_train_assembly.shape}")
print(f"y_val size: {y_val_assembly.shape}")
print(f"y_test size: {y_test_assembly.shape}")


y_train size: (10029,)
y_val size: (1770,)
y_test size: (2083,)


In [18]:
# 데이터 로더
batch_size = 8
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True, num_workers=0)
val_loader  = DataLoader(val_dataset, batch_size=batch_size, shuffle=False, num_workers=0)
test_loader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False, num_workers=0)


In [19]:
# Machinery 예측 후 Assembly 모델에 전달
class BertWithTwoStages(nn.Module):
    def __init__(self, num_machinery_labels, num_assembly_labels, extra_features_dim):
        super(BertWithTwoStages, self).__init__()
        self.bert = BertModel.from_pretrained('bert-base-uncased')
        self.machinery_classifier = nn.Linear(self.bert.config.hidden_size + extra_features_dim, num_machinery_labels)
        
        self.assembly_hidden = nn.Linear(self.bert.config.hidden_size + extra_features_dim + num_machinery_labels, 512) 

        self.assembly_activation = nn.ReLU()  # 활성화 함수
        self.assembly_dropout = nn.Dropout(0.3)  # 드롭아웃 추가
        self.assembly_classifier = nn.Linear(512, num_assembly_labels)

    def forward(self, input_ids, attention_mask, extra_features):
        # BERT에서 pooled_output 추출
        outputs = self.bert(input_ids=input_ids, attention_mask=attention_mask)
        pooled_output = outputs.pooler_output

        # Machinery 예측
        machinery_combined_features = torch.cat((pooled_output, extra_features), dim=1)
        machinery_outputs = self.machinery_classifier(machinery_combined_features)

        # Assembly 예측 (machinery 예측값 추가)
        combined_features = torch.cat((pooled_output, extra_features, machinery_outputs), dim=1)
        
        assembly_hidden_output = self.assembly_hidden(combined_features)
        assembly_hidden_output = self.assembly_activation(assembly_hidden_output)
        assembly_hidden_output = self.assembly_dropout(assembly_hidden_output)
        
        assembly_outputs = self.assembly_classifier(assembly_hidden_output)
    

        return machinery_outputs, assembly_outputs

In [20]:
import torch
torch.cuda.empty_cache()

In [21]:
# 디바이스 설정
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
# Machinery와 Assembly의 클래스 개수 확인
num_machinery_labels = len(machinery_label_encoder.classes_)  # machinery 레이블의 클래스 수
num_assembly_labels = len(assembly_label_encoder.classes_)    # assembly 레이블의 클래스 수

# 모델 초기화
model = BertWithTwoStages(num_machinery_labels=num_machinery_labels, num_assembly_labels=num_assembly_labels, extra_features_dim=5)
model.to(device)

  return t.to(


BertWithTwoStages(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-11): 12 x BertLayer(
          (attention): BertAttention(
            (self): BertSdpaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12, element

In [22]:
# 옵티마이저 및 학습률 스케줄러 설정
optimizer = AdamW(model.parameters(), lr=2e-5)
loss_fn=torch.nn.CrossEntropyLoss()



In [23]:
def train(model, dataloader, optimizer, device):
    model.train()
    total_loss = 0
    for batch in tqdm(dataloader):
        input_ids, attention_mask, extra_features, y_machinery, y_assembly = [b.to(device) for b in batch]  # 두 개의 레이블
        
        # 레이블을 int64로 변환
        y_machinery = y_machinery.to(torch.int64)
        y_assembly = y_assembly.to(torch.int64)
        
        optimizer.zero_grad()

        machinery_outputs, assembly_outputs = model(input_ids, attention_mask=attention_mask, extra_features=extra_features)

        machinery_loss = loss_fn(machinery_outputs, y_machinery)
        assembly_loss = loss_fn(assembly_outputs, y_assembly)
        
        loss = machinery_loss + assembly_loss
        total_loss += loss.item()

        loss.backward()
        optimizer.step()

    return total_loss / len(dataloader)

In [24]:
import torch.nn.functional as F

# 평가 함수 - logits-62개짜리 각각의 자신감
def evaluate(model, dataloader, device):
    model.eval()
    total_correct_machinery = 0
    total_correct_assembly = 0
    total_samples = 0
    
    with torch.no_grad():
        for batch in tqdm(dataloader):
            input_ids, attention_mask, extra_features, y_machinery, y_assembly = [b.to(device) for b in batch]
            machinery_outputs, assembly_outputs = model(input_ids, attention_mask=attention_mask, extra_features=extra_features)

            # Machinery 예측 정확도 계산
            _, predicted_machinery = torch.max(F.softmax(machinery_outputs, dim=1), 1)
            total_correct_machinery += (predicted_machinery == y_machinery).sum().item()

            # Assembly 예측 정확도 계산
            _, predicted_assembly = torch.max(F.softmax(assembly_outputs, dim=1), 1)
            total_correct_assembly += (predicted_assembly == y_assembly).sum().item()

            total_samples += y_machinery.size(0)

    # 두 개의 정확도 반환
    accuracy_machinery = total_correct_machinery / total_samples
    accuracy_assembly = total_correct_assembly / total_samples
    return accuracy_machinery, accuracy_assembly

In [25]:
# 학습 실행
num_epochs = 15
for epoch in range(num_epochs):
    train_loss = train(model, train_loader, optimizer, device)
    
    # 두 개의 평가 함수 호출
    train_acc_machinery, train_acc_assembly = evaluate(model, train_loader, device)
    val_acc_machinery, val_acc_assembly = evaluate(model, val_loader, device)
    test_acc_machinery, test_acc_assembly = evaluate(model, test_loader, device)

    print(f"Epoch {epoch+1}/{num_epochs}")
    print(f"Train Loss: {train_loss:.4f}, Train Machinery Acc: {train_acc_machinery:.4f}, Train Assembly Acc: {train_acc_assembly:.4f}")
    print(f"Val Machinery Acc: {val_acc_machinery:.4f}, Val Assembly Acc: {val_acc_assembly:.4f}")
    print(f"Test Machinery Acc: {test_acc_machinery:.4f}, Test Assembly Acc: {test_acc_assembly:.4f}")

# 최종 테스트 성능 평가
final_test_acc_machinery, final_test_acc_assembly = evaluate(model, test_loader, device)
print(f"Final Test Machinery Accuracy: {final_test_acc_machinery:.4f}")
print(f"Final Test Assembly Accuracy: {final_test_acc_assembly:.4f}")

  attn_output = torch.nn.functional.scaled_dot_product_attention(
100%|██████████████████████████████████████████████████████████████████████████████| 1254/1254 [03:56<00:00,  5.30it/s]
100%|██████████████████████████████████████████████████████████████████████████████| 1254/1254 [01:05<00:00, 19.26it/s]
100%|████████████████████████████████████████████████████████████████████████████████| 222/222 [00:11<00:00, 19.13it/s]
100%|████████████████████████████████████████████████████████████████████████████████| 261/261 [00:10<00:00, 24.33it/s]


Epoch 1/15
Train Loss: 34.9749, Train Machinery Acc: 0.5485, Train Assembly Acc: 0.1576
Val Machinery Acc: 0.5328, Val Assembly Acc: 0.1469
Test Machinery Acc: 0.5425, Test Assembly Acc: 0.1637


100%|██████████████████████████████████████████████████████████████████████████████| 1254/1254 [03:56<00:00,  5.30it/s]
100%|██████████████████████████████████████████████████████████████████████████████| 1254/1254 [01:05<00:00, 19.15it/s]
100%|████████████████████████████████████████████████████████████████████████████████| 222/222 [00:11<00:00, 19.19it/s]
100%|████████████████████████████████████████████████████████████████████████████████| 261/261 [00:10<00:00, 24.46it/s]


Epoch 2/15
Train Loss: 22.3900, Train Machinery Acc: 0.6330, Train Assembly Acc: 0.2092
Val Machinery Acc: 0.6192, Val Assembly Acc: 0.1893
Test Machinery Acc: 0.6284, Test Assembly Acc: 0.1959


 32%|█████████████████████████▍                                                     | 403/1254 [01:16<02:41,  5.26it/s]


KeyboardInterrupt: 

In [None]:
!nvidia-smi
