In [1]:
import torch

print("CUDA available:", torch.cuda.is_available())

print("Current device:", torch.cuda.current_device())
print("Device name:", torch.cuda.get_device_name(torch.cuda.current_device()))

CUDA available: True
Current device: 0
Device name: NVIDIA GeForce GTX 1650


In [1]:
import pandas as pd
import numpy as np
import re

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, accuracy_score

In [2]:
data=pd.read_excel('dataset0828.xlsx')

In [3]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 24664 entries, 0 to 24663
Data columns (total 32 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   청구서번호        24664 non-null  object 
 1   No.          24664 non-null  int64  
 2   Subject      24642 non-null  object 
 3   Machinery    24664 non-null  object 
 4   Assembly     24664 non-null  object 
 5   청구품목         24664 non-null  object 
 6   Unnamed: 6   0 non-null      float64
 7   Part No.1    24645 non-null  object 
 8   Part No.2    3599 non-null   object 
 9   청구량          24546 non-null  float64
 10  견적           24200 non-null  object 
 11  견적수량         24546 non-null  float64
 12  견적화폐         24546 non-null  object 
 13  견적단가         24664 non-null  float64
 14  발주번호         24664 non-null  object 
 15  발주처          24664 non-null  object 
 16  발주           24664 non-null  object 
 17  발주수량         24546 non-null  float64
 18  발주금액         24546 non-null  float64
 19  D/T 

In [4]:
import re

def preprocess_text(text):
    # 괄호 안의 내용 제거
    text = re.sub(r'\([^)]*\)', '', text)
    # 특수 문자 제거 (알파벳, 숫자, 일부 허용된 특수문자 제외)
    text = re.sub(r'[^\w\s\*\-\+/.,]', '', text)
    # 여러 공백을 언더스코어로 변환
    text = re.sub(r'\s+', '_', text)
    # 텍스트 중간의 연속된 언더스코어를 하나로 줄임
    text = re.sub(r'_+', '_', text)
    # 중간에 언더스코어가 불필요하게 남아있는 경우 처리
    text = re.sub(r'(?<!\w)_(?!\w)', '', text)
    # 언더스코어 앞뒤로 존재하는 특수문자 제거
    text = re.sub(r'_([^\w]+)_', '_', text)
    text = re.sub(r'_([^\w]+)$', '', text)
    text = re.sub(r'^([^\w]+)_', '', text)
    # 텍스트 끝부분의 불필요한 언더스코어 제거
    text = re.sub(r'_+$', '', text)
    # 영어 단어는 소문자로 변환
    text = ' '.join([word.lower() if re.match(r'[A-Za-z]', word) else word for word in text.split()])
    text = text.strip()
    return text

def clean_supplier_name(name):
    # 접미사 제거
    suffixes = r'\b(Corp\.?|Corporation|Company|Co\.?|Incorporated|Inc\.?|Limited|Ltd\.?|GmbH|S\.L\.|SDN\. BHD\.)\b'
    name = re.sub(suffixes, '', name, flags=re.IGNORECASE)
    # 특수 문자 제거
    name = re.sub(r'[^\w\s]', '', name)
    # 불필요한 단어 제거
    name = re.sub(r'\b(사용금지|사)\b', '', name, flags=re.IGNORECASE)
    # 공백 정리
    name = re.sub(r'\s+', ' ', name).strip()
    # 오타 수정 및 문자열 정리
    name = re.sub(r'coporation|coropration|coproration|corporration', 'corporation', name, flags=re.IGNORECASE)
    name = name.lower().strip()
    return name

In [5]:
# 각 칼럼 전처리
data['cleaned_machinery'] = data['Machinery'].apply(preprocess_text)
data['cleaned_assembly'] = data['Assembly'].apply(preprocess_text)
data['cleaned_item'] = data['청구품목'].apply(preprocess_text)
data['cleaned_part'] = data['Part No.1'].astype(str).apply(preprocess_text)
data['cleaned_supplier'] = data['발주처'].apply(clean_supplier_name)

# 전처리된 칼럼 결합
data['combined_text'] = data['cleaned_machinery'].fillna('') + " " + data['cleaned_assembly'].fillna('') + " " + data['cleaned_item'].fillna('') + " " + data['cleaned_part'].fillna('') + " " + data['cleaned_supplier'].fillna('')


In [6]:
print(data[['combined_text']])

                                           combined_text
0      cargo_boom_vang_block block mckissick_construc...
1      spanish_boom_vang_block block mckissick_constr...
2      purse_block tow_block westec_20ton_tow_block w...
3      main_engine power_pack_as ge_power_pack_fork_e...
4      main_engine power_pack_as ge_power_pack_fork_e...
...                                                  ...
24659  no.3_generator_engine 342-0537_GENERATOR_GP-E ...
24660  no.3_generator_engine 342-0537_GENERATOR_GP-E ...
24661  no.3_generator_engine 342-0537_GENERATOR_GP-E ...
24662  no.3_generator_engine 342-0537_GENERATOR_GP-E ...
24663  no.3_generator_engine 342-0537_GENERATOR_GP-E ...

[24664 rows x 1 columns]


In [7]:
from gensim.models import FastText
import torch


# 문장을 토큰화하여 리스트로 만들어야 합니다.
sentences = [text.split() for text in data['combined_text']]

# Gensim을 사용하여 FastText 모델 학습
model = FastText(vector_size=100, window=3, min_count=1)  # 파라미터를 원하는 대로 조정 가능
model.build_vocab(sentences)  # 어휘 빌드
model.train(sentences, total_examples=len(sentences), epochs=10)  # 모델 학습

(1054726, 1471130)

In [8]:

# FastText 임베딩 생성
embeddings = []
for text in data['combined_text']:
    # 각 텍스트의 평균 임베딩을 구합니다
    words = text.split()
    word_vectors = [model.wv[word] for word in words if word in model.wv]
    if word_vectors:  # 단어가 모델 어휘에 있는 경우
        embedding = torch.tensor(word_vectors).mean(dim=0)  # 단어 벡터의 평균 계산
    else:
        embedding = torch.zeros(model.vector_size)  # 단어가 없는 경우, 0 벡터로 처리
    embeddings.append(embedding)

# 임베딩 리스트를 텐서로 변환
embeddings_tensor = torch.stack(embeddings)

print(embeddings_tensor.shape)  # 확인용 출력

  embedding = torch.tensor(word_vectors).mean(dim=0)  # 단어 벡터의 평균 계산


torch.Size([24664, 100])


In [12]:
import torch
import torch.nn as nn
import torch.optim as optim
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split

# 데이터 준비
machinery = data['Machinery'].values
assembly = data['Assembly'].values

# LabelEncoder를 사용하여 문자열을 정수로 인코딩
machinery_encoder = LabelEncoder()
assembly_encoder = LabelEncoder()

machinery_labels = machinery_encoder.fit_transform(machinery)
assembly_labels = assembly_encoder.fit_transform(assembly)

In [53]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")

Using device: cuda


In [55]:
import os
os.environ['CUDA_LAUNCH_BLOCKING'] = "1"

In [67]:
import torch
import torch.nn as nn
import torch.optim as optim
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.model_selection import train_test_split

# 1. 데이터 준비 및 인코딩
machinery = data['Machinery'].values
assembly = data['Assembly'].values

machinery_encoder = LabelEncoder()
assembly_encoder = LabelEncoder()

machinery_labels = machinery_encoder.fit_transform(machinery)
assembly_labels = assembly_encoder.fit_transform(assembly)

X = embeddings_tensor.numpy()
y_combined = combined_labels


# 3. Train-Test Split (각각의 타겟에 대해 독립적으로 분리)
X_train, X_test, y_train_machinery, y_test_machinery = train_test_split(
    X, machinery_labels, test_size=0.2, random_state=42)

X_train, X_test, y_train_assembly, y_test_assembly = train_test_split(
    X, assembly_labels, test_size=0.2, random_state=42)

# 4. 데이터 정규화 (StandardScaler)
scaler = StandardScaler()
X_train_normalized = scaler.fit_transform(X_train)
X_test_normalized = scaler.transform(X_test)


# 5. Train 데이터를 torch Tensor로 변환
X_train_tensor = torch.tensor(X_train_normalized, dtype=torch.float32)
X_test_tensor = torch.tensor(X_test_normalized, dtype=torch.float32)
y_train_machinery_tensor = torch.tensor(y_train_machinery, dtype=torch.long)
y_test_machinery_tensor = torch.tensor(y_test_machinery, dtype=torch.long)
y_train_assembly_tensor = torch.tensor(y_train_assembly, dtype=torch.long)
y_test_assembly_tensor = torch.tensor(y_test_assembly, dtype=torch.long)


In [68]:
# 레이블 텐서 확인
print("y_train_tensor:", y_train_tensor)
print("Type of y_train_tensor:", type(y_train_tensor))

# dtype 확인
print("Data type of y_train_tensor:", y_train_tensor.dtype)

y_train_tensor: tensor([  8045,  87373,  87371,  ..., 102592, 102030, 104610])
Type of y_train_tensor: <class 'torch.Tensor'>
Data type of y_train_tensor: torch.int64


In [69]:

# 파라미터 설정
input_dim = 100  # 임베딩 벡터의 크기
hidden_dim = 128  # 은닉층의 뉴런 수
machinery_classes = len(machinery_encoder.classes_)
assembly_classes = len(assembly_encoder.classes_)

In [70]:
import torch
import torch.nn as nn
import torch.optim as optim

class ComplexCombinedModel(nn.Module):
    def __init__(self, input_dim, hidden_dim, machinery_classes, assembly_classes):
        super(ComplexCombinedModel, self).__init__()
        
        # Feature extractor with more layers and batch normalization
        self.feature_extractor = nn.Sequential(
            nn.Linear(input_dim, hidden_dim),
            nn.ReLU(),
            nn.BatchNorm1d(hidden_dim),
            nn.Dropout(0.3),  # Dropout for regularization
            nn.Linear(hidden_dim, hidden_dim),
            nn.ReLU(),
            nn.BatchNorm1d(hidden_dim),
            nn.Dropout(0.3),
            nn.Linear(hidden_dim, hidden_dim),
            nn.ReLU(),
            nn.BatchNorm1d(hidden_dim),
            nn.Dropout(0.3)
        )
        
        # Separate output heads for machinery and assembly with additional layers
        self.machinery_output = nn.Sequential(
            nn.Linear(hidden_dim, hidden_dim // 2),
            nn.ReLU(),
            nn.BatchNorm1d(hidden_dim // 2),
            nn.Linear(hidden_dim // 2, machinery_classes)
        )
        
        self.assembly_output = nn.Sequential(
            nn.Linear(hidden_dim, hidden_dim // 2),
            nn.ReLU(),
            nn.BatchNorm1d(hidden_dim // 2),
            nn.Linear(hidden_dim // 2, assembly_classes)
        )
    
    def forward(self, x):
        x = self.feature_extractor(x)
        machinery_out = self.machinery_output(x)
        assembly_out = self.assembly_output(x)
        return (machinery_out, assembly_out)

In [72]:
def init_weights(m):
    if isinstance(m, nn.Linear):
        nn.init.xavier_uniform_(m.weight)
        if m.bias is not None:
            nn.init.constant_(m.bias, 0)

model.apply(init_weights)

ComplexCombinedModel(
  (feature_extractor): Sequential(
    (0): Linear(in_features=100, out_features=128, bias=True)
    (1): ReLU()
    (2): BatchNorm1d(128, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    (3): Dropout(p=0.3, inplace=False)
    (4): Linear(in_features=128, out_features=128, bias=True)
    (5): ReLU()
    (6): BatchNorm1d(128, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    (7): Dropout(p=0.3, inplace=False)
    (8): Linear(in_features=128, out_features=128, bias=True)
    (9): ReLU()
    (10): BatchNorm1d(128, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    (11): Dropout(p=0.3, inplace=False)
  )
  (machinery_output): Sequential(
    (0): Linear(in_features=128, out_features=64, bias=True)
    (1): ReLU()
    (2): BatchNorm1d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    (3): Linear(in_features=64, out_features=168, bias=True)
  )
  (assembly_output): Sequential(
    (0): Linear(in

In [73]:
machinery_classes = len(machinery_encoder.classes_)
assembly_classes = len(assembly_encoder.classes_)
model = ComplexCombinedModel(input_dim, hidden_dim, machinery_classes, assembly_classes)


model.apply(init_weights)

criterion_machinery = nn.CrossEntropyLoss()
criterion_assembly = nn.CrossEntropyLoss()

optimizer = optim.Adam(model.parameters(), lr=0.0001)

In [74]:

# 학습 함수
def train_model(model, data_loader, optimizer, num_epochs=50):
    model.train()
    for epoch in range(num_epochs):
        total_loss = 0.0
        for inputs, machinery_targets, assembly_targets in data_loader:
            optimizer.zero_grad()
            machinery_out, assembly_out = model(inputs)
            loss_machinery = criterion_machinery(machinery_out, machinery_targets)
            loss_assembly = criterion_assembly(assembly_out, assembly_targets)
            loss = loss_machinery + loss_assembly
            loss.backward()
            optimizer.step()
            total_loss += loss.item()
        print(f"Epoch [{epoch + 1}/{num_epochs}], Total Loss: {total_loss:.4f}")


In [75]:
from torch.utils.data import DataLoader, TensorDataset
train_dataset = TensorDataset(X_train_tensor, y_train_machinery_tensor, y_train_assembly_tensor)
train_loader = DataLoader(train_dataset, batch_size=16, shuffle=True)


In [None]:
train_model(model, train_loader, optimizer, num_epochs=50)

Epoch [1/50], Total Loss: 13158.4228
Epoch [2/50], Total Loss: 10631.5210
Epoch [3/50], Total Loss: 9214.9710
Epoch [4/50], Total Loss: 8404.2737
Epoch [5/50], Total Loss: 7817.4108
Epoch [6/50], Total Loss: 7379.7694
Epoch [7/50], Total Loss: 6999.5066
Epoch [8/50], Total Loss: 6691.7838
Epoch [9/50], Total Loss: 6402.7395
Epoch [10/50], Total Loss: 6163.2464
Epoch [11/50], Total Loss: 5925.9040
Epoch [12/50], Total Loss: 5753.4661
Epoch [13/50], Total Loss: 5563.0537
Epoch [14/50], Total Loss: 5404.0167
Epoch [15/50], Total Loss: 5261.4957
Epoch [16/50], Total Loss: 5129.6849
Epoch [17/50], Total Loss: 5007.9639
Epoch [18/50], Total Loss: 4894.8624
Epoch [19/50], Total Loss: 4786.6193
Epoch [20/50], Total Loss: 4690.0902


In [35]:
unique_labels = len(set(y_combined))
print(unique_labels)



2272


In [59]:
import torch
print(torch.cuda.is_available())

True
