In [1]:
import torch

print("CUDA available:", torch.cuda.is_available())

print("Current device:", torch.cuda.current_device())
print("Device name:", torch.cuda.get_device_name(torch.cuda.current_device()))

CUDA available: True
Current device: 0
Device name: NVIDIA GeForce GTX 1650


In [2]:
import pandas as pd
import numpy as np
import re

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, accuracy_score

In [3]:
data=pd.read_excel('dataset0828.xlsx')

In [4]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 24664 entries, 0 to 24663
Data columns (total 32 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   청구서번호        24664 non-null  object 
 1   No.          24664 non-null  int64  
 2   Subject      24642 non-null  object 
 3   Machinery    24664 non-null  object 
 4   Assembly     24664 non-null  object 
 5   청구품목         24664 non-null  object 
 6   Unnamed: 6   0 non-null      float64
 7   Part No.1    24645 non-null  object 
 8   Part No.2    3599 non-null   object 
 9   청구량          24546 non-null  float64
 10  견적           24200 non-null  object 
 11  견적수량         24546 non-null  float64
 12  견적화폐         24546 non-null  object 
 13  견적단가         24664 non-null  float64
 14  발주번호         24664 non-null  object 
 15  발주처          24664 non-null  object 
 16  발주           24664 non-null  object 
 17  발주수량         24546 non-null  float64
 18  발주금액         24546 non-null  float64
 19  D/T 

In [5]:
import re

def preprocess_text(text):
    # 괄호 안의 내용 제거
    text = re.sub(r'\([^)]*\)', '', text)
    # 특수 문자 제거 (알파벳, 숫자, 일부 허용된 특수문자 제외)
    text = re.sub(r'[^\w\s\*\-\+/.,]', '', text)
    # 여러 공백을 언더스코어로 변환
    text = re.sub(r'\s+', '_', text)
    # 텍스트 중간의 연속된 언더스코어를 하나로 줄임
    text = re.sub(r'_+', '_', text)
    # 중간에 언더스코어가 불필요하게 남아있는 경우 처리
    text = re.sub(r'(?<!\w)_(?!\w)', '', text)
    # 언더스코어 앞뒤로 존재하는 특수문자 제거
    text = re.sub(r'_([^\w]+)_', '_', text)
    text = re.sub(r'_([^\w]+)$', '', text)
    text = re.sub(r'^([^\w]+)_', '', text)
    # 텍스트 끝부분의 불필요한 언더스코어 제거
    text = re.sub(r'_+$', '', text)
    # 영어 단어는 소문자로 변환
    text = ' '.join([word.lower() if re.match(r'[A-Za-z]', word) else word for word in text.split()])
    text = text.strip()
    return text

def clean_supplier_name(name):
    # 접미사 제거
    suffixes = r'\b(Corp\.?|Corporation|Company|Co\.?|Incorporated|Inc\.?|Limited|Ltd\.?|GmbH|S\.L\.|SDN\. BHD\.)\b'
    name = re.sub(suffixes, '', name, flags=re.IGNORECASE)
    # 특수 문자 제거
    name = re.sub(r'[^\w\s]', '', name)
    # 불필요한 단어 제거
    name = re.sub(r'\b(사용금지|사)\b', '', name, flags=re.IGNORECASE)
    # 공백 정리
    name = re.sub(r'\s+', ' ', name).strip()
    # 오타 수정 및 문자열 정리
    name = re.sub(r'coporation|coropration|coproration|corporration', 'corporation', name, flags=re.IGNORECASE)
    name = name.lower().strip()
    return name

In [6]:
# 각 칼럼 전처리
data['cleaned_machinery'] = data['Machinery'].apply(preprocess_text)
data['cleaned_assembly'] = data['Assembly'].apply(preprocess_text)
data['cleaned_item'] = data['청구품목'].apply(preprocess_text)
data['cleaned_part'] = data['Part No.1'].astype(str).apply(preprocess_text)
data['cleaned_supplier'] = data['발주처'].apply(clean_supplier_name)

# 전처리된 칼럼 결합
data['combined_text'] = data['cleaned_machinery'].fillna('') + " " + data['cleaned_assembly'].fillna('') + " " + data['cleaned_item'].fillna('') + " " + data['cleaned_part'].fillna('') + " " + data['cleaned_supplier'].fillna('')


In [7]:
print(data[['combined_text']])

                                           combined_text
0      cargo_boom_vang_block block mckissick_construc...
1      spanish_boom_vang_block block mckissick_constr...
2      purse_block tow_block westec_20ton_tow_block w...
3      main_engine power_pack_as ge_power_pack_fork_e...
4      main_engine power_pack_as ge_power_pack_fork_e...
...                                                  ...
24659  no.3_generator_engine 342-0537_GENERATOR_GP-E ...
24660  no.3_generator_engine 342-0537_GENERATOR_GP-E ...
24661  no.3_generator_engine 342-0537_GENERATOR_GP-E ...
24662  no.3_generator_engine 342-0537_GENERATOR_GP-E ...
24663  no.3_generator_engine 342-0537_GENERATOR_GP-E ...

[24664 rows x 1 columns]


In [8]:
from gensim.models import FastText
import torch


# 문장을 토큰화하여 리스트로 만들어야 합니다.
sentences = [text.split() for text in data['combined_text']]

# Gensim을 사용하여 FastText 모델 학습
model = FastText(vector_size=100, window=3, min_count=1)  # 파라미터를 원하는 대로 조정 가능
model.build_vocab(sentences)  # 어휘 빌드
model.train(sentences, total_examples=len(sentences), epochs=10)  # 모델 학습

(1054460, 1471130)

In [9]:

# FastText 임베딩 생성
embeddings = []
for text in data['combined_text']:
    # 각 텍스트의 평균 임베딩을 구합니다
    words = text.split()
    word_vectors = [model.wv[word] for word in words if word in model.wv]
    if word_vectors:  # 단어가 모델 어휘에 있는 경우
        embedding = torch.tensor(word_vectors).mean(dim=0)  # 단어 벡터의 평균 계산
    else:
        embedding = torch.zeros(model.vector_size)  # 단어가 없는 경우, 0 벡터로 처리
    embeddings.append(embedding)

# 임베딩 리스트를 텐서로 변환
embeddings_tensor = torch.stack(embeddings)

print(embeddings_tensor.shape)  # 확인용 출력

  embedding = torch.tensor(word_vectors).mean(dim=0)  # 단어 벡터의 평균 계산


torch.Size([24664, 100])


In [10]:
import torch
import torch.nn as nn
import torch.optim as optim
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split

# 데이터 준비
machinery = data['Machinery'].values
assembly = data['Assembly'].values

# LabelEncoder를 사용하여 문자열을 정수로 인코딩
machinery_encoder = LabelEncoder()
assembly_encoder = LabelEncoder()

machinery_labels = machinery_encoder.fit_transform(machinery)
assembly_labels = assembly_encoder.fit_transform(assembly)

In [11]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")

Using device: cuda


In [18]:
import torch
import torch.nn as nn
import torch.optim as optim
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.model_selection import train_test_split

# 1. 데이터 준비 및 인코딩
machinery = data['Machinery'].values
assembly = data['Assembly'].values

machinery_encoder = LabelEncoder()
assembly_encoder = LabelEncoder()

machinery_labels = machinery_encoder.fit_transform(machinery)
assembly_labels = assembly_encoder.fit_transform(assembly)

# 2. 임베딩을 numpy 배열로 변환
X = embeddings_tensor.numpy()

# 3. Train-Test Split (각 레이블에 대해 동일한 분할 사용)
X_train, X_test, y_train_machinery, y_test_machinery, y_train_assembly, y_test_assembly = train_test_split(
    X, machinery_labels, assembly_labels, test_size=0.2, random_state=42)

# 4. 데이터 정규화 (StandardScaler)
scaler = StandardScaler()
X_train_normalized = scaler.fit_transform(X_train)
X_test_normalized = scaler.transform(X_test)


# 5. Train 데이터를 torch Tensor로 변환
X_train_tensor = torch.tensor(X_train_normalized, dtype=torch.float32).to(device)
X_test_tensor = torch.tensor(X_test_normalized, dtype=torch.float32).to(device)
y_train_machinery_tensor = torch.tensor(y_train_machinery, dtype=torch.long).to(device)
y_test_machinery_tensor = torch.tensor(y_test_machinery, dtype=torch.long).to(device)
y_train_assembly_tensor = torch.tensor(y_train_assembly, dtype=torch.long).to(device)
y_test_assembly_tensor = torch.tensor(y_test_assembly, dtype=torch.long).to(device)

In [80]:
# SharedTransformer 정의
class SharedTransformer(nn.Module):
    def __init__(self, input_dim, hidden_dim, num_heads, num_layers, dropout=0.1):
        super(SharedTransformer, self).__init__()
        self.embedding = nn.Linear(input_dim, hidden_dim)
        transformer_layer = nn.TransformerEncoderLayer(d_model=hidden_dim, nhead=num_heads, dropout=dropout)
        self.transformer_encoder = nn.TransformerEncoder(transformer_layer, num_layers=num_layers)
        
    def forward(self, x):
        x = self.embedding(x)
        x = self.transformer_encoder(x)
        return x

# MachineryHead 정의
class MachineryHead(nn.Module):
    def __init__(self, hidden_dim, output_dim):
        super(MachineryHead, self).__init__()
        self.fc = nn.Linear(hidden_dim, output_dim)
    
    def forward(self, x):
        x = x.view(x.size(0), -1)
        return self.fc(x)

# AssemblyHead 정의
class AssemblyHead(nn.Module):
    def __init__(self, hidden_dim, output_dim):
        super(AssemblyHead, self).__init__()
        self.fc = nn.Linear(hidden_dim, output_dim)
    
    def forward(self, x):
        return self.fc(x.mean(dim=1))

# HierarchicalModel 정의
class HierarchicalModel(nn.Module):
    def __init__(self, input_dim, hidden_dim, machinery_output_dim, assembly_output_dim):
        super(HierarchicalModel, self).__init__()
        self.shared_transformer = SharedTransformer(input_dim, hidden_dim, num_heads=8, num_layers=6)
        self.machinery_head = MachineryHead(hidden_dim, machinery_output_dim)
        
        # assembly_head에 들어가는 Linear 레이어의 입력 차원을 맞추기 위해 hidden_dim + machinery_output_dim을 지정합니다.
        self.assembly_head = nn.Sequential(
            nn.Linear(hidden_dim + machinery_output_dim, hidden_dim),
            nn.ReLU(),
            nn.Linear(hidden_dim, assembly_output_dim)
        )
    
    def forward(self, x):
        shared_rep = self.shared_transformer(x)  # [batch_size, seq_len, hidden_dim]
        machinery_out = self.machinery_head(shared_rep)  # [batch_size, machinery_output_dim]
    
        # shared_rep의 시퀀스 차원을 평균하여 [batch_size, hidden_dim] 형태로 만듭니다.
        shared_rep_mean = shared_rep.mean(dim=1)  # [batch_size, hidden_dim]
    
        combined_input = torch.cat((shared_rep_mean, machinery_out), dim=-1)  # [batch_size, hidden_dim + machinery_output_dim]

        print("Shared Rep Mean:", shared_rep_mean.shape)
        print("Machinery Out:", machinery_out.shape)
        print("Combined Input:", combined_input.shape)
        
        assembly_out = self.assembly_head(combined_input)  # [batch_size, assembly_output_dim]
        return machinery_out, assembly_out
            

In [81]:
input_dim = 100  # 임베딩 벡터의 크기
hidden_dim = 128  # 은닉층의 뉴런 수
machinery_classes = len(machinery_encoder.classes_)
assembly_classes = len(assembly_encoder.classes_)

In [82]:
# 공유된 Transformer .
model = HierarchicalModel(input_dim, hidden_dim, machinery_classes, assembly_classes).to(device)


In [83]:
criterion_machinery = nn.CrossEntropyLoss()
criterion_assembly = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=0.001)

In [84]:
def train_model(model, optimizer, criterion_machinery, criterion_assembly, num_epochs=50):
    model.train()
    for epoch in range(num_epochs):
        total_loss = 0.0
        for inputs, machinery_labels, assembly_labels in zip(X_train_tensor, y_train_machinery_tensor, y_train_assembly_tensor):
            inputs = inputs.unsqueeze(0)  # 배치 차원 추가
            optimizer.zero_grad()
            machinery_out, assembly_out = model(inputs)
            loss_machinery = criterion_machinery(machinery_out, machinery_labels.unsqueeze(0))
            loss_assembly = criterion_assembly(assembly_out, assembly_labels.unsqueeze(0))
            loss = loss_machinery + loss_assembly
            loss.backward()
            optimizer.step()
            total_loss += loss.item()
        print(f"Epoch [{epoch + 1}/{num_epochs}], Total Loss: {total_loss:.4f}")


In [85]:
train_model(model, optimizer, criterion_machinery, criterion_assembly, num_epochs=50)


RuntimeError: Tensors must have same number of dimensions: got 1 and 2

In [43]:
print("Shared Rep Mean:", shared_rep.mean(dim=1).shape)
print("Machinery Out:", machinery_out.shape)

NameError: name 'shared_rep' is not defined