In [1]:
import pandas as pd
import numpy as np
import torch.nn as nn
import torch

import re
from transformers import BertTokenizer, BertForSequenceClassification, AdamW, BertModel
from torch.utils.data import DataLoader, TensorDataset
from sklearn.preprocessing import OneHotEncoder
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from tqdm import tqdm

In [2]:
# Load the source spreadsheet into a DataFrame (the path is relative to the working directory).
data = pd.read_excel('filtered_30_filled_money.xlsx')

In [3]:
def preprocess_text(text):
    """Normalize a free-text item description.

    Steps: lowercase, drop parenthesized segments, drop every character that
    is not a word character, whitespace or one of ``* / - + . , # &``, remove
    the standalone marker words ``사용금지`` / ``사``, then collapse whitespace.

    Args:
        text: Raw description. ``None`` and float NaN are treated as empty;
            any other non-string value is converted with ``str()``.

    Returns:
        The cleaned string, with single spaces and no leading/trailing space.
    """
    # Missing cells arrive as None or float NaN (NaN is the only float != itself).
    if text is None or (isinstance(text, float) and text != text):
        return ''
    text = str(text).lower()
    text = re.sub(r'\([^)]*\)', '', text)
    text = re.sub(r'[^\w\s\*/\-\+.,#&]', '', text)
    text = re.sub(r'\b(사용금지|사)\b', '', text, flags=re.IGNORECASE)
    # Collapse whitespace last so the removals above cannot leave double spaces.
    text = re.sub(r'\s+', ' ', text)
    return text.strip()

def clean_supplier_name(name):
    """Normalize a supplier name so spelling variants map to the same string.

    Lowercases, fixes common misspellings of "corporation", removes the
    ``(사용금지)`` marker, rewrites ``u.s.a`` as ``_usa``, drops periods,
    strips legal-form suffixes, removes remaining punctuation (except ``-``)
    and collapses whitespace.

    Args:
        name: Raw supplier name. ``None`` and float NaN are treated as empty;
            any other non-string value is converted with ``str()``.

    Returns:
        The cleaned supplier name.
    """
    if name is None or (isinstance(name, float) and name != name):
        return ''
    name = str(name).lower()
    name = re.sub(r'coporation|coropration|coproration|corporration', 'corporation', name)
    name = re.sub(r'\(사용금지\)', '', name)
    name = re.sub(r'u\.s\.a', '_usa', name)
    name = re.sub(r'\.', '', name)
    # Latin suffixes are removed only as whole tokens. Without the lookarounds,
    # "co" and "inc" are stripped from inside real names ("costco" -> "st").
    latin_suffixes = (
        r'(?<![a-z0-9])'
        r'(?:corporation|corp|company|co|incorporated|inc|limited|ltd|gmbh|pte ltd|llc)'
        r'(?![a-z0-9])'
    )
    name = re.sub(latin_suffixes, '', name)
    # Multi-character Korean suffixes are still removed when attached to the
    # name; the single character "주" is removed only when it stands alone
    # (e.g. "(주)"), so it is no longer cut out of words such as "제주".
    korean_suffixes = r'상사|공사|엔지니어링|주식회사|(?<!\w)주(?!\w)'
    name = re.sub(korean_suffixes, '', name)
    name = re.sub(r'[^\w\s-]', '', name)
    name = re.sub(r'\s+', ' ', name).strip()
    return name

In [4]:
# Text preprocessing.
# Missing cells are filled and values coerced to str BEFORE the cleaners run:
# the cleaners call .lower(), which fails on NaN, so filling afterwards is too late.
data['cleaned_item'] = data['청구품목'].fillna('').astype(str).apply(preprocess_text)
data['cleaned_supplier'] = data['발주처'].fillna('').astype(str).apply(clean_supplier_name)
# Part numbers may be parsed from the spreadsheet as numbers; cast to str so the
# concatenation below cannot fail on mixed types.
data['combined_text'] = (
    data['cleaned_item'] + " " + data['Part No.1'].fillna('').astype(str) + " " + data['cleaned_supplier']
)


In [5]:
# Fixed conversion factors from each quote currency to USD.
exchange_rates = {'USD': 1, 'KRW': 0.00078, 'EUR': 1.18, 'JPY': 0.0091}

# Convert every quoted unit price to USD.
# A vectorized lookup replaces the row-wise apply; unknown or missing currency
# codes still raise KeyError, now with the offending values listed.
rate_per_row = data['견적화폐'].map(exchange_rates)
unknown_currencies = data.loc[rate_per_row.isna(), '견적화폐'].unique()
if len(unknown_currencies) > 0:
    raise KeyError(f"No exchange rate defined for currencies: {list(unknown_currencies)}")
data['converted_price'] = data['견적단가'] * rate_per_row


In [6]:
# Sanity check: list the distinct currency codes and count missing currency values.
print(data['견적화폐'].unique(), data['견적화폐'].isnull().sum())


['USD' 'KRW' 'EUR' 'JPY'] 0


In [7]:
# One-hot encode the currency column; sparse_output=False makes fit_transform return a dense array.
currency_ohe = OneHotEncoder(sparse_output=False) 
currency_encoded = currency_ohe.fit_transform(data[['견적화폐']])

In [8]:
# Label encoding: map each Machinery class to an integer id.
label_encoder = LabelEncoder()
y= label_encoder.fit_transform(data['Machinery'])

In [9]:
# Gather text and extra features into one array so train_test_split can split
# them together; they are separated again into text and extra features afterwards.

# 1. Combine text + extra features
X = np.concatenate([
    data['combined_text'].values.reshape(-1, 1),  # reshape to 2-D so it can be concatenated column-wise
    currency_encoded, 
    data['converted_price'].values.reshape(-1, 1)  # unit price converted to a common currency
], axis=1)

# Two-stage stratified split: hold out 15% as test, then 15% of the remainder as validation.
X_train_val, X_test, y_train_val, y_test = train_test_split(X, y, test_size=0.15, random_state=42, stratify=y)
X_train, X_val, y_train, y_val = train_test_split(X_train_val, y_train_val, test_size=0.15, random_state=42, stratify=y_train_val)

# Check sizes
print(f"combined_text shape: {data['combined_text'].shape}")
print(f"currency_encoded shape: {currency_encoded.shape}")
print(f"converted_price shape: {data['converted_price'].shape}")
print(f"X shape after concatenation: {X.shape}")

print(f"X_train size: {X_train.shape}")
print(f"X_val size: {X_val.shape}")
print(f"X_test size: {X_test.shape}")


combined_text shape: (13882,)
currency_encoded shape: (13882, 4)
converted_price shape: (13882,)
X shape after concatenation: (13882, 6)
X_train size: (10029, 6)
X_val size: (1770, 6)
X_test size: (2083, 6)


In [10]:
# Separate the text column (column 0) from each split.
train_combined_text = X_train[:, 0]
val_combined_text = X_val[:, 0]
test_combined_text = X_test[:, 0]


def _extra_features_as_float(split):
    """Return columns 1.. of `split` as a float array with NaN replaced by 0.0."""
    # The split mixes strings and numbers, so the cast to float must come first:
    # np.nan_to_num returns non-floating input (such as object arrays) unchanged,
    # which previously left NaNs in place.
    return np.nan_to_num(split[:, 1:].astype(float), nan=0.0)


# Extra (non-text) features, already 2-D: (n_samples, n_extra_features).
train_extra_features = _extra_features_as_float(X_train)
val_extra_features = _extra_features_as_float(X_val)
test_extra_features = _extra_features_as_float(X_test)

# Convert to torch tensors; the arrays stay 2-D, so no reshaping is needed.
train_extra_features_tensor = torch.tensor(train_extra_features, dtype=torch.float32)
val_extra_features_tensor = torch.tensor(val_extra_features, dtype=torch.float32)
test_extra_features_tensor = torch.tensor(test_extra_features, dtype=torch.float32)

# Check sizes
print(f"train_extra_features size: {train_extra_features.shape}")
print(f"val_extra_features size: {val_extra_features.shape}")
print(f"test_extra_features size: {test_extra_features.shape}")



train_extra_features size: (10029, 5)
val_extra_features size: (1770, 5)
test_extra_features size: (2083, 5)


In [11]:
# Check the dtype of the extra-feature arrays for each split.
print(f"train_extra_features dtype: {train_extra_features.dtype}")
print(f"val_extra_features dtype: {val_extra_features.dtype}")
print(f"test_extra_features dtype: {test_extra_features.dtype}")

train_extra_features dtype: float64
val_extra_features dtype: float64
test_extra_features dtype: float64


In [12]:
# BERT tokenizer (text processing).
# NOTE(review): 'bert-base-uncased' is an English checkpoint, while the cleaning
# functions in this notebook handle Korean tokens — confirm the vocabulary
# covers the input text adequately.
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')



In [13]:
# Convert only the text part of X into BERT input format.
def encode_data(texts):
    """Tokenize an array of texts with the module-level `tokenizer`.

    Args:
        texts: Array-like exposing ``.tolist()`` that yields the input strings.

    Returns:
        The tokenizer output as PyTorch tensors, padded and truncated to at
        most 128 tokens.
    """
    sentences = texts.tolist()
    encoded = tokenizer(
        sentences,
        padding=True,
        truncation=True,
        max_length=128,
        return_tensors='pt',
    )
    return encoded

# Tokenize the text column of each split separately.
train_encodings = encode_data(train_combined_text)
val_encodings = encode_data(val_combined_text)
test_encodings = encode_data(test_combined_text)


In [14]:
# Build one dataset per split from the BERT encodings, the extra features and the labels.
def _make_split_dataset(encodings, extra_features_tensor, labels):
    """Bundle token ids, attention mask, extra features and labels into a TensorDataset."""
    label_tensor = torch.tensor(labels)
    return TensorDataset(
        encodings['input_ids'],
        encodings['attention_mask'],
        extra_features_tensor,
        label_tensor,
    )


train_dataset = _make_split_dataset(train_encodings, train_extra_features_tensor, y_train)
val_dataset = _make_split_dataset(val_encodings, val_extra_features_tensor, y_val)
test_dataset = _make_split_dataset(test_encodings, test_extra_features_tensor, y_test)

In [15]:
# Check the label array size of each split.
print(f"y_train size: {y_train.shape}")
print(f"y_val size: {y_val.shape}")
print(f"y_test size: {y_test.shape}")


y_train size: (10029,)
y_val size: (1770,)
y_test size: (2083,)


In [16]:
# Data loaders: only the training loader shuffles; validation and test keep dataset order.
batch_size = 16
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
val_loader  = DataLoader(val_dataset, batch_size=batch_size, shuffle=False)
test_loader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False)


In [17]:
class BertForMachinery(nn.Module):
    """BERT encoder with a linear head that classifies Machinery.

    The pooled BERT output is concatenated with the extra (non-text) features
    and passed through a single linear layer.

    Args:
        num_machinery_labels: Number of output classes.
        extra_features_dim: Number of extra features per sample.
        pretrained_model_name: Checkpoint passed to ``BertModel.from_pretrained``.
            Defaults to ``'bert-base-uncased'``, the value previously hard-coded.
    """

    def __init__(self, num_machinery_labels, extra_features_dim, pretrained_model_name='bert-base-uncased'):
        super().__init__()
        self.bert = BertModel.from_pretrained(pretrained_model_name)
        self.machinery_classifier = nn.Linear(
            self.bert.config.hidden_size + extra_features_dim,
            num_machinery_labels,
        )

    def forward(self, input_ids, attention_mask, extra_features):
        """Return the Machinery logits for a batch.

        Args:
            input_ids: Token ids forwarded to BERT.
            attention_mask: Attention mask forwarded to BERT.
            extra_features: Extra features; a 1-D tensor is treated as one
                feature per sample and expanded to ``(batch, 1)``.

        Returns:
            Output of the linear head, one row of logits per sample.
        """
        outputs = self.bert(input_ids=input_ids, attention_mask=attention_mask)
        pooled_output = outputs.pooler_output

        # Make the extra features 2-D, (batch_size, 1), so they can be concatenated.
        if extra_features.dim() == 1:
            extra_features = extra_features.unsqueeze(1)

        machinery_combined_features = torch.cat((pooled_output, extra_features), dim=1)
        machinery_outputs = self.machinery_classifier(machinery_combined_features)

        return machinery_outputs

In [18]:
# Release cached GPU memory held by PyTorch's allocator.
import torch
torch.cuda.empty_cache()

In [19]:
# Device setup: use the GPU when available, otherwise the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
machinery_model = BertForMachinery(
    num_machinery_labels=len(label_encoder.classes_),
    # Derived from the feature tensor instead of a hard-coded 5, so the model
    # stays in sync if the number of extra features changes.
    extra_features_dim=train_extra_features_tensor.shape[1],
)
machinery_model.to(device)


  return t.to(


BertForMachinery(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-11): 12 x BertLayer(
          (attention): BertAttention(
            (self): BertSdpaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12, elementw

In [20]:
# Check the shape of the label tensors for each split.
print(f"y_train shape: {torch.tensor(y_train).shape}")
print(f"y_val shape: {torch.tensor(y_val).shape}")
print(f"y_test shape: {torch.tensor(y_test).shape}")

y_train shape: torch.Size([10029])
y_val shape: torch.Size([1770])
y_test shape: torch.Size([2083])


In [21]:
# Optimizer and loss function (no learning-rate scheduler is configured here).
# torch.optim.AdamW replaces the deprecated transformers.AdamW. eps and
# weight_decay are set explicitly to mirror the defaults of the transformers
# implementation (verify against the installed version).
optimizer = torch.optim.AdamW(machinery_model.parameters(), lr=2e-5, eps=1e-6, weight_decay=0.0)
loss_fn = torch.nn.CrossEntropyLoss()



In [22]:
def train(model, dataloader, optimizer, device):
    """Run one training epoch and return the mean batch loss.

    Uses the module-level ``loss_fn`` as the criterion.

    Args:
        model: Module called as ``model(input_ids, attention_mask=..., extra_features=...)``.
        dataloader: Iterable of ``(input_ids, attention_mask, extra_features, labels)`` batches.
        optimizer: Optimizer updating ``model``'s parameters.
        device: Device the batch tensors are moved to.

    Returns:
        Mean loss over the batches seen, or ``0.0`` when the loader is empty.
    """
    model.train()
    total_loss = 0.0
    num_batches = 0
    for batch in tqdm(dataloader):
        input_ids, attention_mask, extra_features, labels = [b.to(device) for b in batch]

        # Flatten (batch, 1) labels to (batch,). reshape(-1) keeps the batch
        # axis even for a batch of one, where squeeze() would yield a 0-dim
        # tensor. int64 is what CrossEntropyLoss expects for class indices.
        labels = labels.reshape(-1).to(torch.int64)

        optimizer.zero_grad()
        outputs = model(input_ids, attention_mask=attention_mask, extra_features=extra_features)
        loss = loss_fn(outputs, labels)
        loss.backward()
        optimizer.step()

        total_loss += loss.item()
        num_batches += 1

    # Guard against an empty loader instead of dividing by zero.
    if num_batches == 0:
        return 0.0
    return total_loss / num_batches


In [23]:
import torch.nn.functional as F

# 평가 함수 - logits-62개짜리 각각의 자신감
def evaluate(model, dataloader, device):
    """Compute accuracy and collect predicted class ids.

    Args:
        model: Module called as ``model(input_ids, attention_mask=..., extra_features=...)``
            and returning one row of logits per sample.
        dataloader: Iterable of ``(input_ids, attention_mask, extra_features, labels)`` batches.
        device: Device the batch tensors are moved to.

    Returns:
        ``(accuracy, predictions)``. ``predictions`` is a 1-D NumPy array in the
        order the loader yields samples, so it lines up with the dataset only
        when the loader does not shuffle. An empty loader returns
        ``(0.0, empty int64 array)``.
    """
    model.eval()
    total_correct = 0
    total_samples = 0
    machinery_predictions = []

    with torch.no_grad():
        for batch in tqdm(dataloader):
            input_ids, attention_mask, extra_features, labels = [b.to(device) for b in batch]
            outputs = model(input_ids, attention_mask=attention_mask, extra_features=extra_features)

            # Softmax is monotonic, so the argmax of the logits is the predicted class.
            predicted = outputs.argmax(dim=1)
            # Flatten (batch, 1) labels so the comparison is element-wise, not broadcast.
            labels = labels.reshape(-1)

            machinery_predictions.append(predicted.cpu().numpy())

            total_correct += (predicted == labels).sum().item()
            total_samples += labels.size(0)

    if total_samples == 0:
        return 0.0, np.empty(0, dtype=np.int64)

    accuracy = total_correct / total_samples
    machinery_predictions = np.concatenate(machinery_predictions, axis=0)
    return accuracy, machinery_predictions

In [None]:
# Run training.
num_epochs = 5

# Unshuffled view of the training set, used only for evaluation. evaluate()
# returns predictions in loader order, so collecting them from the shuffled
# train_loader would misalign them with train_dataset.
train_eval_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=False)

best_val_acc = -1.0
for epoch in range(num_epochs):
    train_loss = train(machinery_model, train_loader, optimizer, device)
    epoch_train_acc, epoch_train_predictions = evaluate(machinery_model, train_eval_loader, device)
    epoch_val_acc, epoch_val_predictions = evaluate(machinery_model, val_loader, device)

    print(f"Epoch {epoch+1}/{num_epochs}")
    print(f"Train Loss: {train_loss:.4f}, Train Acc: {epoch_train_acc:.4f}, Val Acc : {epoch_val_acc:.4f}")

    # Keep the checkpoint, and its predictions, from the epoch with the best
    # validation accuracy. The test set is not touched during training.
    if epoch_val_acc > best_val_acc:
        best_val_acc = epoch_val_acc
        train_acc, train_machinery_predictions = epoch_train_acc, epoch_train_predictions
        val_acc, val_machinery_predictions = epoch_val_acc, epoch_val_predictions
        # Save the model
        torch.save(machinery_model.state_dict(), "machinery_model.pth")

# Final test evaluation, run once on the best checkpoint.
machinery_model.load_state_dict(torch.load("machinery_model.pth", map_location=device))
final_test_acc, final_machinery_predictions = evaluate(machinery_model, test_loader, device)
test_acc, test_machinery_predictions = final_test_acc, final_machinery_predictions
print(f"Final Test Accuracy: {final_test_acc:.4f}")

  attn_output = torch.nn.functional.scaled_dot_product_attention(
 53%|██████████████████████████████████████████▏                                     | 331/627 [01:40<01:28,  3.36it/s]

### 전이 학습으로 Assembly 모델 

In [None]:
# Load the Machinery model and freeze its weights.
machinery_model = BertForMachinery(
    num_machinery_labels=len(label_encoder.classes_),
    extra_features_dim=train_extra_features_tensor.shape[1],
)
# map_location="cpu" lets a checkpoint saved from a GPU run load on any
# machine; the model is moved to the target device later.
machinery_model.load_state_dict(torch.load("machinery_model.pth", map_location="cpu"))
machinery_model.eval()

# Freeze every parameter so only the Assembly head is trained.
for param in machinery_model.parameters():
    param.requires_grad = False

In [None]:
# Label-encode the Assembly target into integer class ids.
assembly_label_encoder = LabelEncoder()
y_assembly = assembly_label_encoder.fit_transform(data['Assembly'])

In [None]:

# Combine text + extra features — identical to what was built for the Machinery model.
X = np.concatenate([
    data['combined_text'].values.reshape(-1, 1), 
    currency_encoded, 
    data['converted_price'].values.reshape(-1, 1)
], axis=1)


In [None]:
# Split the Assembly labels with exactly the same partition as the Machinery split.
# The Assembly datasets reuse train/val/test encodings and extra features produced
# by the Machinery split, so the labels must follow the same row assignment.
# Splitting independently (stratified on y_assembly, second split unseeded) pairs
# each input with the label of a different row. Passing the same stratify array
# and random_state reproduces the Machinery partition; y_assembly rides along as
# an extra array. Trade-off: the split is stratified by Machinery, not Assembly.
(X_train_val_assembly, X_test_assembly,
 y_train_val_machinery, _y_test_machinery,
 y_train_val_assembly, y_test_assembly) = train_test_split(
    X, y, y_assembly, test_size=0.15, random_state=42, stratify=y)

(X_train_assembly, X_val_assembly,
 _y_train_machinery, _y_val_machinery,
 y_train_assembly, y_val_assembly) = train_test_split(
    X_train_val_assembly, y_train_val_machinery, y_train_val_assembly,
    test_size=0.15, random_state=42, stratify=y_train_val_machinery)

# Fail loudly if the rows ever stop lining up with the tokenized splits.
assert np.array_equal(X_train_assembly[:, 0], train_combined_text)
assert np.array_equal(X_val_assembly[:, 0], val_combined_text)
assert np.array_equal(X_test_assembly[:, 0], test_combined_text)


In [None]:
# Reshape the per-split Machinery class predictions into column vectors of shape (n, 1).
# NOTE(review): train_assembly and evaluate_assembly unpack this field from each
# batch but never read it — confirm whether it is still needed.
train_machinery_predictions = train_machinery_predictions.reshape(-1, 1)
val_machinery_predictions = val_machinery_predictions.reshape(-1, 1)
test_machinery_predictions = test_machinery_predictions.reshape(-1, 1)


In [None]:
# Print the shapes of the pieces that go into the Assembly training dataset.
print("train_encodings['input_ids'] shape:", train_encodings['input_ids'].shape)
print("train_encodings['attention_mask'] shape:", train_encodings['attention_mask'].shape)
print("train_extra_features_tensor shape:", train_extra_features_tensor.shape)
print("train_machinery_predictions shape:", train_machinery_predictions.shape)
print("y_train_assembly shape:", torch.tensor(y_train_assembly, dtype=torch.long).shape)

In [None]:
# Torch Tensor로 변환
train_assembly_dataset = TensorDataset(
    train_encodings['input_ids'],                
    train_encodings['attention_mask'],           
    train_extra_features_tensor,                 
    torch.tensor(train_machinery_predictions, dtype=torch.float32), 
    torch.tensor(y_train_assembly, dtype=torch.long)  # Assembly 레이블
)

val_assembly_dataset = TensorDataset(
    val_encodings['input_ids'],
    val_encodings['attention_mask'],
    val_extra_features_tensor,
    torch.tensor(val_machinery_predictions, dtype=torch.float32),
    torch.tensor(y_val_assembly, dtype=torch.long)
)

test_assembly_dataset = TensorDataset(
    test_encodings['input_ids'],
    test_encodings['attention_mask'],
    test_extra_features_tensor,
    torch.tensor(test_machinery_predictions, dtype=torch.float32),
    torch.tensor(y_test_assembly, dtype=torch.long)
)


In [None]:
# Data loaders for the Assembly datasets; only the training loader shuffles.
batch_size = 16
train_loader_assembly = DataLoader(train_assembly_dataset, batch_size=batch_size, shuffle=True)
val_loader_assembly  = DataLoader(val_assembly_dataset, batch_size=batch_size, shuffle=False)
test_loader_assembly = DataLoader(test_assembly_dataset, batch_size=batch_size, shuffle=False)

In [None]:
from torch.utils.data import DataLoader

train_loader = DataLoader(train_assembly_dataset, batch_size=16, shuffle=True)

for batch in train_loader:
    input_ids, attention_mask, extra_features, machinery_predictions, labels = batch
    print(f"Batch size: {len(batch)}")
    print(f"input_ids shape: {input_ids.shape}")
    print(f"attention_mask shape: {attention_mask.shape}")
    print(f"extra_features shape: {extra_features.shape}")
    print(f"machinery_predictions shape: {machinery_predictions.shape}")
    print(f"labels shape: {labels.shape}")
    break  # 확인 후 반복을 중지

In [None]:
for batch in train_loader:
    print("Batch length:", len(batch))
    print("First element shape (input_ids):", batch[0].shape)
    print("Second element shape (attention_mask):", batch[1].shape)
    print("Third element shape (extra_features):", batch[2].shape)
    print("Fourth element shape (machinery_predictions):", batch[3].shape)
    print("Fifth element shape (labels):", batch[4].shape)
    break

In [None]:
num_assembly_labels = len(assembly_label_encoder.classes_)
# Width of the Machinery model's output (one logit per Machinery class), derived
# from the fitted encoder instead of a hard-coded 62 so it follows the data.
machinery_output_dim = len(label_encoder.classes_)

In [None]:
class AssemblyModel(nn.Module):
    """Linear head predicting Assembly classes from Machinery outputs plus extra features."""

    def __init__(self, num_assembly_labels, extra_features_dim, machinery_output_dim):
        super().__init__()
        in_features = extra_features_dim + machinery_output_dim
        self.assembly_classifier = nn.Linear(in_features, num_assembly_labels)

    def forward(self, machinery_predictions, extra_features):
        """Concatenate both inputs along dim 1 and apply the linear classifier."""
        width = machinery_predictions.size(1)
        flattened = machinery_predictions.view(-1, width)

        # Join the Machinery outputs with the extra features.
        joined = torch.cat([flattened, extra_features], dim=1)

        # Assembly prediction.
        return self.assembly_classifier(joined)

In [None]:
# Build the Assembly head and move it to the training device.
assembly_model = AssemblyModel(num_assembly_labels=len(assembly_label_encoder.classes_), extra_features_dim=5, machinery_output_dim=machinery_output_dim)
assembly_model.to(device)

# torch.optim.AdamW replaces the deprecated transformers.AdamW. eps and
# weight_decay are set explicitly to mirror the defaults of the transformers
# implementation (verify against the installed version).
# NOTE(review): lr=2e-5 is a BERT fine-tuning rate; confirm it is intended for a
# freshly initialized linear head.
optimizer_assembly = torch.optim.AdamW(assembly_model.parameters(), lr=2e-5, eps=1e-6, weight_decay=0.0)
loss_fn = nn.CrossEntropyLoss()

In [None]:
# Select the device again: GPU when available, otherwise CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Move both models to the selected device.
assembly_model.to(device)
machinery_model.to(device)

In [None]:
# Assembly 모델 학습 함수
def train_assembly(model, machinery_model, dataloader, optimizer, device):
    """Run one training epoch of the Assembly head and return the mean batch loss.

    The Machinery model is used as a fixed feature extractor: it is put in eval
    mode and run under ``torch.no_grad()``. Uses the module-level ``loss_fn``.

    Args:
        model: Assembly head called as ``model(machinery_outputs, extra_features)``.
        machinery_model: Model called as
            ``machinery_model(input_ids, attention_mask, extra_features)``.
        dataloader: Iterable of five-element batches ``(input_ids, attention_mask,
            extra_features, machinery_predictions, labels)``; the fourth element
            is not used.
        optimizer: Optimizer updating ``model``'s parameters.
        device: Device the batch tensors are moved to.

    Returns:
        Mean loss over the batches seen, or ``0.0`` when the loader is empty.
    """
    model.train()
    # Keep the feature extractor deterministic (e.g. dropout off) whatever mode
    # the caller left it in.
    machinery_model.eval()
    total_loss = 0.0
    num_batches = 0
    for batch in tqdm(dataloader):
        input_ids, attention_mask, extra_features, _, labels = [b.to(device) for b in batch]

        optimizer.zero_grad()

        with torch.no_grad():
            machinery_outputs = machinery_model(input_ids, attention_mask, extra_features)

        outputs = model(machinery_outputs, extra_features)
        loss = loss_fn(outputs, labels.to(torch.int64))

        loss.backward()
        optimizer.step()

        total_loss += loss.item()
        num_batches += 1

    # Guard against an empty loader instead of dividing by zero.
    return total_loss / num_batches if num_batches else 0.0

In [None]:
def evaluate_assembly(model, dataloader, device, machinery_model=None):
    """Compute the accuracy of the Assembly head on a loader.

    Args:
        model: Assembly head called as ``model(machinery_outputs, extra_features)``.
        dataloader: Iterable of five-element batches ``(input_ids, attention_mask,
            extra_features, machinery_predictions, labels)``; the fourth element
            is not used.
        device: Device the batch tensors are moved to.
        machinery_model: Feature extractor called as
            ``machinery_model(input_ids, attention_mask, extra_features)``. When
            omitted, the module-level ``machinery_model`` is used, which matches
            the previous behavior.

    Returns:
        Fraction of correctly predicted samples, or ``0.0`` for an empty loader.
    """
    if machinery_model is None:
        # Backward-compatible fallback for callers that pass three arguments.
        machinery_model = globals()['machinery_model']

    model.eval()  # evaluation mode
    machinery_model.eval()
    total_correct = 0
    total_samples = 0

    with torch.no_grad():
        for batch in dataloader:
            input_ids, attention_mask, extra_features, _, labels = [b.to(device) for b in batch]

            # Outputs of the Machinery model.
            machinery_outputs = machinery_model(input_ids, attention_mask, extra_features)

            # Predictions of the Assembly head.
            outputs = model(machinery_outputs, extra_features)

            # Compare predictions with the true labels.
            predicted = outputs.argmax(dim=1)
            total_correct += (predicted == labels).sum().item()
            total_samples += labels.size(0)

    if total_samples == 0:
        return 0.0
    return total_correct / total_samples

In [None]:
# Training loop for the Assembly head: one training pass per epoch, followed by
# accuracy on the train, validation and test loaders.
num_epochs = 10
for epoch in range(num_epochs):
    train_loss = train_assembly(assembly_model, machinery_model, train_loader_assembly, optimizer_assembly, device)
    train_acc, val_acc, test_acc = [
        evaluate_assembly(assembly_model, split_loader, device)
        for split_loader in (train_loader_assembly, val_loader_assembly, test_loader_assembly)
    ]

    report_lines = [
        f"Epoch {epoch+1}/{num_epochs}",
        f"Train Loss: {train_loss:.4f}",
        f"Train Accuracy: {train_acc:.4f}",
        f"Validation Accuracy: {val_acc:.4f}",
        f"Test Accuracy: {test_acc:.4f}",
    ]
    print("\n".join(report_lines))