In [1]:
import pandas as pd
import numpy as np
import torch.nn as nn
import torch

import re
from transformers import BertTokenizer, BertForSequenceClassification, AdamW, BertModel
from torch.utils.data import DataLoader, TensorDataset
from sklearn.preprocessing import OneHotEncoder
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from tqdm import tqdm

In [2]:
data = pd.read_excel('filtered_30_filled_money.xlsx')

In [3]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 13882 entries, 0 to 13881
Data columns (total 32 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   청구서번호        13882 non-null  object 
 1   No.          13882 non-null  int64  
 2   Subject      13872 non-null  object 
 3   Machinery    13882 non-null  object 
 4   Assembly     13882 non-null  object 
 5   청구품목         13882 non-null  object 
 6   Unnamed: 6   0 non-null      float64
 7   Part No.1    13881 non-null  object 
 8   Part No.2    2430 non-null   object 
 9   청구량          13818 non-null  float64
 10  견적           13698 non-null  object 
 11  견적수량         13818 non-null  float64
 12  견적화폐         13882 non-null  object 
 13  견적단가         13882 non-null  float64
 14  발주번호         13882 non-null  object 
 15  발주처          13882 non-null  object 
 16  발주           13882 non-null  object 
 17  발주수량         13818 non-null  float64
 18  발주금액         13818 non-null  float64
 19  D/T 

In [4]:
data['견적화폐'].isnull().sum()

0

In [5]:
def preprocess_text(text):
    text = text.lower()
    text = re.sub(r'\([^)]*\)', '', text)
    text = re.sub(r'[^\w\s\*/\-\+.,#&]', '', text)
    text = re.sub(r'\s+', ' ', text)
    text = re.sub(r'\b(사용금지|사)\b', '', text, flags=re.IGNORECASE)
    text = text.strip()
    return text

def clean_supplier_name(name):
    name = name.lower()
    name = re.sub(r'coporation|coropration|coproration|corporration', 'corporation', name)
    name = re.sub(r'\(사용금지\)', '', name)
    name = re.sub(r'u\.s\.a', '_usa', name)
    name = re.sub(r'\.', '', name)
    suffixes = r'(corporation|corp|company|co|incorporated|inc|limited|ltd|상사|공사|엔지니어링|주식회사|주|gmbh|pte ltd|llc)'
    name = re.sub(suffixes, '', name, flags=re.IGNORECASE)
    name = re.sub(r'[^\w\s-]', '', name)
    name = re.sub(r'\s+', ' ', name).strip()
    return name

In [6]:
# 텍스트 전처리
data['cleaned_item'] = data['청구품목'].apply(preprocess_text)
data['cleaned_supplier'] = data['발주처'].apply(clean_supplier_name)
data['combined_text'] = data['cleaned_item'].fillna('') + " " + data['Part No.1'].fillna('') + " " + data['cleaned_supplier'].fillna('')


In [7]:
exchange_rates = {'USD': 1, 'KRW': 0.00078, 'EUR': 1.18, 'JPY': 0.0091}

# usd기준해서 금액 통일함 
data['converted_price'] = data.apply(lambda x: x['견적단가'] * exchange_rates[x['견적화폐']], axis=1)


In [8]:
print(data['견적화폐'].unique(), data['견적화폐'].isnull().sum())


['USD' 'KRW' 'EUR' 'JPY'] 0


In [9]:
currency_ohe = OneHotEncoder(sparse_output=False) 
currency_encoded = currency_ohe.fit_transform(data[['견적화폐']])

In [10]:
# 레이블 인코딩
label_encoder = LabelEncoder()
y = label_encoder.fit_transform(data['Machinery'])

In [11]:
# train_test split 을 위해 하나로 모으고, 분할하고 다시 텍스트랑 추가피쳐로 분리해줄거임 

# 1. 텍스트 + 추가 피처 결합
X = np.concatenate([
    data['combined_text'].values.reshape(-1, 1),  # 2차원 배열로 바꿔서 결합해줌 
    currency_encoded, 
    data['converted_price'].values.reshape(-1, 1)  # 통일한단가
], axis=1)

X_train_val, X_test, y_train_val, y_test = train_test_split(X, y, test_size=0.15, random_state=42, stratify=y)
X_train, X_val, y_train, y_val = train_test_split(X_train_val, y_train_val, test_size=0.15, random_state=42, stratify=y_train_val)

# 크기 확인
print(f"combined_text shape: {data['combined_text'].shape}")
print(f"currency_encoded shape: {currency_encoded.shape}")
print(f"converted_price shape: {data['converted_price'].shape}")
print(f"X shape after concatenation: {X.shape}")

print(f"X_train size: {X_train.shape}")
print(f"X_val size: {X_val.shape}")
print(f"X_test size: {X_test.shape}")


combined_text shape: (13882,)
currency_encoded shape: (13882, 4)
converted_price shape: (13882,)
X shape after concatenation: (13882, 6)
X_train size: (10029, 6)
X_val size: (1770, 6)
X_test size: (2083, 6)


In [12]:
#텍스트분리
train_combined_text = X_train[:, 0] 
val_combined_text = X_val[:, 0]
test_combined_text = X_test[:, 0]

train_extra_features = X_train[:, 1:]  # 이 부분에서 이미 2차원으로 분리됨
val_extra_features = X_val[:, 1:]
test_extra_features = X_test[:, 1:]

# object타입이 섞여있다고 해서 astype float 명시해줌
train_extra_features = np.nan_to_num(train_extra_features, nan=0.0).astype(float)
val_extra_features = np.nan_to_num(val_extra_features, nan=0.0).astype(float)
test_extra_features = np.nan_to_num(test_extra_features, nan=0.0).astype(float)

# Torch Tensor로 변환 - 추가로 변환할 필요 없이 2차원 유지
train_extra_features_tensor = torch.tensor(train_extra_features, dtype=torch.float32)  # 이미 2차원
val_extra_features_tensor = torch.tensor(val_extra_features, dtype=torch.float32)
test_extra_features_tensor = torch.tensor(test_extra_features, dtype=torch.float32)

# 크기 확인
print(f"train_extra_features size: {train_extra_features.shape}")
print(f"val_extra_features size: {val_extra_features.shape}")
print(f"test_extra_features size: {test_extra_features.shape}")



train_extra_features size: (10029, 5)
val_extra_features size: (1770, 5)
test_extra_features size: (2083, 5)


In [13]:
# 데이터 타입 확인
print(f"train_extra_features dtype: {train_extra_features.dtype}")
print(f"val_extra_features dtype: {val_extra_features.dtype}")
print(f"test_extra_features dtype: {test_extra_features.dtype}")

train_extra_features dtype: float64
val_extra_features dtype: float64
test_extra_features dtype: float64


In [14]:
# BERT 토크나이저 (텍스트처리)
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')



In [15]:
# X중 텍스트만 BERT 입력 형식으로 변환
def encode_data(texts):
    return tokenizer(texts.tolist(), padding=True, truncation=True, max_length=128, return_tensors='pt')

train_encodings = encode_data(train_combined_text)
val_encodings = encode_data(val_combined_text)
test_encodings = encode_data(test_combined_text)


In [16]:
# BERT 텍스트 인코딩 + 추가 피처 더해서 dataset 생성
train_dataset = TensorDataset(
    train_encodings['input_ids'],
    train_encodings['attention_mask'],
    train_extra_features_tensor,
    torch.tensor(y_train),
)
val_dataset = TensorDataset(
    val_encodings['input_ids'],
    val_encodings['attention_mask'],
    val_extra_features_tensor,
    torch.tensor(y_val),
)
test_dataset = TensorDataset(
    test_encodings['input_ids'],
    test_encodings['attention_mask'],
    test_extra_features_tensor,
    torch.tensor(y_test),
)

In [17]:
print(f"y_train size: {y_train.shape}")
print(f"y_val size: {y_val.shape}")
print(f"y_test size: {y_test.shape}")


y_train size: (10029,)
y_val size: (1770,)
y_test size: (2083,)


In [18]:
# 데이터 로더
batch_size = 8
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
val_loader  = DataLoader(val_dataset, batch_size=batch_size, shuffle=False)
test_loader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False)


In [19]:
class BertWithExtraFeatures(nn.Module):
    def __init__(self, num_labels, extra_features_dim):
        super(BertWithExtraFeatures, self).__init__()
        self.bert = BertModel.from_pretrained('bert-base-uncased')
        # BERT의 hidden size(768) + extra features(5차원)
        self.classifier = nn.Linear(self.bert.config.hidden_size + extra_features_dim, num_labels)

    def forward(self, input_ids, attention_mask, extra_features):
        outputs = self.bert(input_ids=input_ids, attention_mask=attention_mask)
        pooled_output = outputs.pooler_output

        # extra_features가 1차원이 되지 않도록 차원 조정
        if extra_features.dim() == 1:
            extra_features = extra_features.unsqueeze(1)  # 2차원으로 만듦 (batch_size, 1)

        # pooled_output과 extra_features 결합
        combined_features = torch.cat((pooled_output, extra_features), dim=1)  # (batch_size, 768 + 5)

        results = self.classifier(combined_features)
        return results

In [20]:
import torch
torch.cuda.empty_cache()

In [21]:
# 디바이스 설정
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model = BertWithExtraFeatures(num_labels=len(label_encoder.classes_), extra_features_dim=5) 
model.to(device)

  return t.to(


BertWithExtraFeatures(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-11): 12 x BertLayer(
          (attention): BertAttention(
            (self): BertSdpaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12, ele

In [22]:
# 옵티마이저 및 학습률 스케줄러 설정
optimizer = AdamW(model.parameters(), lr=2e-5)
loss_fn=torch.nn.CrossEntropyLoss()



In [23]:
for batch in train_loader:
    input_ids, attention_mask, extra_features, labels = batch
    print(f"Labels batch shape before: {labels.shape}")  # 이 부분을 확인
    if labels.dim() > 1:
        labels = labels.squeeze()
    print(f"Labels batch shape after: {labels.shape}")
    break

Labels batch shape before: torch.Size([8])
Labels batch shape after: torch.Size([8])


In [24]:
print(f"y_train shape: {torch.tensor(y_train).shape}")
print(f"y_val shape: {torch.tensor(y_val).shape}")
print(f"y_test shape: {torch.tensor(y_test).shape}")

y_train shape: torch.Size([10029])
y_val shape: torch.Size([1770])
y_test shape: torch.Size([2083])


In [25]:
def train(model, dataloader, optimizer, device):
    model.train()
    total_loss = 0
    for batch in tqdm(dataloader):
        input_ids, attention_mask, extra_features, labels = [b.to(device) for b in batch]  # 순서 수정
        
        if labels.dim() > 1:
            labels = labels.squeeze()

        optimizer.zero_grad()
        outputs = model(input_ids, attention_mask=attention_mask, extra_features=extra_features)
        labels = labels.to(torch.int64)  # CrossEntropyLoss에 맞게 변환
        loss = loss_fn(outputs, labels)
        total_loss += loss.item()
        loss.backward()
        optimizer.step()

    return total_loss / len(dataloader)


In [26]:
import torch.nn.functional as F

# 평가 함수 - logits-62개짜리 각각의 자신감
def evaluate(model, dataloader, device):
    model.eval()
    total_correct = 0
    total_samples = 0
    machinery_predictions = []
    with torch.no_grad():
        for batch in tqdm(dataloader):
            input_ids, attention_mask, extra_features, labels = [b.to(device) for b in batch]
            outputs = model(input_ids, attention_mask=attention_mask, extra_features=extra_features)
            logits = outputs
            probs = F.softmax(logits, dim=1)
            _, predicted = torch.max(probs, 1)
            machinery_predictions.extend(predicted.cpu().numpy())  # 예측값을 저장

            total_correct += (predicted == labels).sum().item()
            total_samples += labels.size(0)

    accuracy = total_correct / total_samples 
    return accuracy, machinery_predictions


In [None]:
import torch
torch.cuda.empty_cache()

In [28]:
# 학습 실행
num_epochs = 5
for epoch in range(num_epochs):
    train_loss = train(model, train_loader, optimizer, device)
    train_acc, train_machinery_predictions = evaluate(model, train_loader, device)  # Train 예측값 저장
    val_acc, val_machinery_predictions = evaluate(model, val_loader, device)        # Val 예측값 저장
    test_acc, test_machinery_predictions = evaluate(model, test_loader, device)     # Test 예측값 저장

    # 정확도만 출력
    print(f"Epoch {epoch+1}/{num_epochs}")
    print(f"Train Loss: {train_loss:.4f}, Train Acc: {train_acc:.4f}, Val Acc : {val_acc:.4f}, Test Acc: {test_acc:.4f}")

# 최종 테스트 성능 평가
final_test_acc, final_machinery_predictions = evaluate(model, test_loader, device)
print(f"Final Test Accuracy: {final_test_acc:.4f}")

100%|██████████████████████████████████████████████████████████████████████████████| 1254/1254 [03:55<00:00,  5.33it/s]
100%|██████████████████████████████████████████████████████████████████████████████| 1254/1254 [01:05<00:00, 19.16it/s]
100%|████████████████████████████████████████████████████████████████████████████████| 222/222 [00:11<00:00, 19.06it/s]
100%|████████████████████████████████████████████████████████████████████████████████| 261/261 [00:10<00:00, 24.61it/s]


Epoch 1/5
Train Loss: 19.6582, Train Acc: 0.5947, Val Acc : 0.5751, Test Acc: 0.5867


100%|██████████████████████████████████████████████████████████████████████████████| 1254/1254 [03:54<00:00,  5.34it/s]
100%|██████████████████████████████████████████████████████████████████████████████| 1254/1254 [01:06<00:00, 18.94it/s]
100%|████████████████████████████████████████████████████████████████████████████████| 222/222 [00:11<00:00, 19.19it/s]
100%|████████████████████████████████████████████████████████████████████████████████| 261/261 [00:10<00:00, 24.68it/s]


Epoch 2/5
Train Loss: 14.0505, Train Acc: 0.6770, Val Acc : 0.6480, Test Acc: 0.6567


100%|██████████████████████████████████████████████████████████████████████████████| 1254/1254 [03:55<00:00,  5.31it/s]
100%|██████████████████████████████████████████████████████████████████████████████| 1254/1254 [01:05<00:00, 19.18it/s]
100%|████████████████████████████████████████████████████████████████████████████████| 222/222 [00:11<00:00, 19.30it/s]
100%|████████████████████████████████████████████████████████████████████████████████| 261/261 [00:10<00:00, 24.58it/s]


Epoch 3/5
Train Loss: 9.8494, Train Acc: 0.7278, Val Acc : 0.6791, Test Acc: 0.7000


100%|██████████████████████████████████████████████████████████████████████████████| 1254/1254 [03:55<00:00,  5.34it/s]
100%|██████████████████████████████████████████████████████████████████████████████| 1254/1254 [01:04<00:00, 19.30it/s]
100%|████████████████████████████████████████████████████████████████████████████████| 222/222 [00:11<00:00, 19.32it/s]
100%|████████████████████████████████████████████████████████████████████████████████| 261/261 [00:10<00:00, 24.39it/s]


Epoch 4/5
Train Loss: 6.6179, Train Acc: 0.7803, Val Acc : 0.7333, Test Acc: 0.7316


 13%|██████████▌                                                                    | 168/1254 [00:31<03:23,  5.34it/s]


KeyboardInterrupt: 

### Machinery 예측값을 assembly 모델의 추가 피처로 결합하여 Assembly 모델 생성

In [None]:
assembly_label_encoder = LabelEncoder()
y_assembly = assembly_label_encoder.fit_transform(data['Assembly'])

In [None]:
# 2. 예측값을 assembly 모델의 추가 피처로 결합


# 1. 텍스트 + 추가 피처 결합
X = np.concatenate([
    data['combined_text'].values.reshape(-1, 1),  # 2차원 배열로 바꿔서 결합해줌 
    currency_encoded, 
    data['converted_price'].values.reshape(-1, 1)  # 통일한단가
], axis=1)


In [None]:
X_train_val_assembly, X_test_assembly, y_train_val_assembly, y_test_assembly = train_test_split(
    X, y_assembly, test_size=0.15, random_state=42, stratify=y_assembly)

X_train_assembly, X_val_assembly, y_train_assembly, y_val_assembly = train_test_split(
    X_train_val_assembly, y_train_val_assembly, test_size=0.15, stratify=y_train_val_assembly)


In [None]:
# 4. Machinery 모델에서 나온 예측값을 Train, Val, Test에 추가
machinery_train_predictions = np.array(train_machinery_predictions).reshape(-1, 1)  # Train 예측값
machinery_val_predictions = np.array(val_machinery_predictions).reshape(-1, 1)      # Val 예측값
machinery_test_predictions = np.array(test_machinery_predictions).reshape(-1, 1)    # Test 예측값


In [None]:
#Assembly 예측에 사용할 X 데이터에 예측값 추가
X_train_assembly = np.concatenate([X_train_assembly, machinery_train_predictions], axis=1)
X_val_assembly = np.concatenate([X_val_assembly, machinery_val_predictions], axis=1)
X_test_assembly = np.concatenate([X_test_assembly, machinery_test_predictions], axis=1)


In [None]:
# 3. assembly 모델용 데이터셋 생성
train_assembly_dataset = TensorDataset(
    train_encodings['input_ids'],
    train_encodings['attention_mask'],
    torch.tensor(machinery_train_predictions, dtype=torch.float32),  # 추가된 피처
    torch.tensor(y_assembly_train),  # assembly 레이블
)
