In [1]:
import torch

# CUDA가 사용 가능한지 확인
print("CUDA available:", torch.cuda.is_available())

# 현재 사용 중인 디바이스 확인
print("Current device:", torch.cuda.current_device())

# 사용 가능한 디바이스의 이름 확인
print("Device name:", torch.cuda.get_device_name(torch.cuda.current_device()))

CUDA available: True
Current device: 0
Device name: NVIDIA GeForce GTX 1650


In [2]:
import pandas as pd
import numpy as np
import re

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, accuracy_score

In [3]:
data=pd.read_excel('dataset0828.xlsx')

In [5]:
data['Assembly'].unique()

array(['BLOCK', 'TOW BLOCK', 'POWER PACK AS', ...,
       'JRV-FF21 PRESSURE SAFETY VALVE', 'COLOR SCANNING SONAR',
       'COLOR FISH FINDER'], dtype=object)

In [9]:
#!conda install conda-forge/label/cf202003::fasttext -y

In [4]:
import re

def preprocess_text(text):
    # 괄호 안의 내용 제거
    text = re.sub(r'\([^)]*\)', '', text)
    # 특수 문자 제거 (알파벳, 숫자, 일부 허용된 특수문자 제외)
    text = re.sub(r'[^\w\s\*\-\+/.,]', '', text)
    # 여러 공백을 언더스코어로 변환
    text = re.sub(r'\s+', '_', text)
    # 텍스트 중간의 연속된 언더스코어를 하나로 줄임
    text = re.sub(r'_+', '_', text)
    # 중간에 언더스코어가 불필요하게 남아있는 경우 처리
    text = re.sub(r'(?<!\w)_(?!\w)', '', text)
    # 언더스코어 앞뒤로 존재하는 특수문자 제거
    text = re.sub(r'_([^\w]+)_', '_', text)
    text = re.sub(r'_([^\w]+)$', '', text)
    text = re.sub(r'^([^\w]+)_', '', text)
    # 텍스트 끝부분의 불필요한 언더스코어 제거
    text = re.sub(r'_+$', '', text)
    # 영어 단어는 소문자로 변환
    text = ' '.join([word.lower() if re.match(r'[A-Za-z]', word) else word for word in text.split()])
    text = text.strip()
    return text

def clean_supplier_name(name):
    # 접미사 제거
    suffixes = r'\b(Corp\.?|Corporation|Company|Co\.?|Incorporated|Inc\.?|Limited|Ltd\.?|GmbH|S\.L\.|SDN\. BHD\.)\b'
    name = re.sub(suffixes, '', name, flags=re.IGNORECASE)
    # 특수 문자 제거
    name = re.sub(r'[^\w\s]', '', name)
    # 불필요한 단어 제거
    name = re.sub(r'\b(사용금지|사)\b', '', name, flags=re.IGNORECASE)
    # 공백 정리
    name = re.sub(r'\s+', ' ', name).strip()
    # 오타 수정 및 문자열 정리
    name = re.sub(r'coporation|coropration|coproration|corporration', 'corporation', name, flags=re.IGNORECASE)
    name = name.lower().strip()
    return name

In [5]:
# 각 칼럼 전처리
data['cleaned_item'] = data['청구품목'].apply(preprocess_text)
data['cleaned_supplier'] = data['발주처'].apply(clean_supplier_name)

# 전처리된 칼럼 결합
data['combined_text'] = data['cleaned_item'] + " " + data['cleaned_supplier']

In [6]:
# 전처리된 데이터 확인
print(data[['combined_text']])

                                           combined_text
0      mckissick_construction_blocks matsuiusa corpor...
1      mckissick_construction_blocks matsuiusa corpor...
2           westec_20ton_tow_block matsuiusa corporation
3            ge_power_pack_fork_e7 matsuiusa corporation
4            ge_power_pack_fork_e7 matsuiusa corporation
...                                                  ...
24659                   ring-o haein corporation_cheonan
24660           ring-retaining haein corporation_cheonan
24661           sleeve-bearing haein corporation_cheonan
24662             bearing-ball haein corporation_cheonan
24663          bearing-ball_de haein corporation_cheonan

[24664 rows x 1 columns]


In [8]:
!conda install conda-forge::gensim -y

^C
Channels:
 - defaults
 - conda-forge
 - anaconda
Platform: win-64
Collecting package metadata (repodata.json): ...working... done
Solving environment: ...working... done

# All requested packages already installed.



In [7]:
from gensim.models import FastText
import torch


# 문장을 토큰화하여 리스트로 만들어야 합니다.
sentences = [text.split() for text in data['combined_text']]

# Gensim을 사용하여 FastText 모델 학습
model = FastText(vector_size=100, window=3, min_count=1)  # 파라미터를 원하는 대로 조정 가능
model.build_vocab(sentences)  # 어휘 빌드
model.train(sentences, total_examples=len(sentences), epochs=10)  # 모델 학습

ModuleNotFoundError: No module named 'gensim'

In [120]:

# FastText 임베딩 생성
embeddings = []
for text in data['combined_text']:
    # 각 텍스트의 평균 임베딩을 구합니다
    words = text.split()
    word_vectors = [model.wv[word] for word in words if word in model.wv]
    if word_vectors:  # 단어가 모델 어휘에 있는 경우
        embedding = torch.tensor(word_vectors).mean(dim=0)  # 단어 벡터의 평균 계산
    else:
        embedding = torch.zeros(model.vector_size)  # 단어가 없는 경우, 0 벡터로 처리
    embeddings.append(embedding)

# 임베딩 리스트를 텐서로 변환
embeddings_tensor = torch.stack(embeddings)

print(embeddings_tensor.shape)  # 확인용 출력

torch.Size([24664, 100])


In [121]:
import torch
import torch.nn as nn
import torch.optim as optim
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split

# 데이터 준비
machinery = data['Machinery'].values
assembly = data['Assembly'].values

# LabelEncoder를 사용하여 문자열을 정수로 인코딩
machinery_encoder = LabelEncoder()
assembly_encoder = LabelEncoder()

machinery_labels = machinery_encoder.fit_transform(machinery)
assembly_labels = assembly_encoder.fit_transform(assembly)

In [138]:
import torch
import torch.nn as nn
import torch.optim as optim
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.model_selection import train_test_split

# 1. 데이터 준비 및 인코딩
machinery = data['Machinery'].values
assembly = data['Assembly'].values

machinery_encoder = LabelEncoder()
assembly_encoder = LabelEncoder()

machinery_labels = machinery_encoder.fit_transform(machinery)
assembly_labels = assembly_encoder.fit_transform(assembly)

# 2. 임베딩을 numpy 배열로 변환
X = embeddings_tensor.numpy()

# 3. Train-Test Split (각 레이블에 대해 동일한 분할 사용)
X_train, X_test, y_train_machinery, y_test_machinery, y_train_assembly, y_test_assembly = train_test_split(
    X, machinery_labels, assembly_labels, test_size=0.2, random_state=42)

# 4. 데이터 정규화 (StandardScaler)
scaler = StandardScaler()
X_train_normalized = scaler.fit_transform(X_train)
X_test_normalized = scaler.transform(X_test)


# 5. Train 데이터를 torch Tensor로 변환
X_train_tensor = torch.tensor(X_train_normalized, dtype=torch.float32)
X_test_tensor = torch.tensor(X_test_normalized, dtype=torch.float32)
y_train_machinery_tensor = torch.tensor(y_train_machinery, dtype=torch.long)
y_test_machinery_tensor = torch.tensor(y_test_machinery, dtype=torch.long)
y_train_assembly_tensor = torch.tensor(y_train_assembly, dtype=torch.long)
y_test_assembly_tensor = torch.tensor(y_test_assembly, dtype=torch.long)

In [148]:
class SharedTransformer(nn.Module):
    def __init__(self, input_dim, hidden_dim, num_heads, num_layers, dropout=0.1):
        super(SharedTransformer, self).__init__()
        self.embedding = nn.Linear(input_dim, hidden_dim)
        transformer_layer = nn.TransformerEncoderLayer(d_model=hidden_dim, nhead=num_heads, dropout=dropout)
        self.transformer_encoder = nn.TransformerEncoder(transformer_layer, num_layers=num_layers)
        
    def forward(self, x):
        x = self.embedding(x)
        x = self.transformer_encoder(x)
        return x

class MachineryHead(nn.Module):
    def __init__(self, hidden_dim, output_dim):
        super(MachineryHead, self).__init__()
        self.fc = nn.Linear(hidden_dim, output_dim)
    
    def forward(self, x):
        return self.fc(x.mean(dim=1))

class AssemblyHead(nn.Module):
    def __init__(self, hidden_dim, output_dim):
        super(AssemblyHead, self).__init__()
        self.fc = nn.Linear(hidden_dim, output_dim)
    
    def forward(self, x):
        return self.fc(x.mean(dim=1))

In [149]:
input_dim = X_train_tensor.size(1)
hidden_dim = 256
# 각 레이블의 클래스 수에 따라 모델의 출력 차원 정의
machinery_output_dim = len(np.unique(machine..y_labels))  # 전체 클래스 개수
assembly_output_dim = len(np.unique(assembly_labels))  # 전체 클래스 개수
num_heads = 4
num_layers = 2

In [150]:
# 공유된 Transformer .
shared_transformer = SharedTransformer(input_dim, hidden_dim, num_heads, num_layers)

# Machinery와 Assembly에 대한 개별 최종 레이어
machinery_head = MachineryHead(hidden_dim, machinery_output_dim)
assembly_head = AssemblyHead(hidden_dim, assembly_output_dim)

In [151]:
criterion_machinery = nn.CrossEntropyLoss()
criterion_assembly = nn.CrossEntropyLoss()

In [152]:
optimizer = optim.Adam(list(shared_transformer.parameters()) +
                       list(machinery_head.parameters()) +
                       list(assembly_head.parameters()), lr=0.001)


In [153]:
scheduler = optim.lr_scheduler.StepLR(optimizer, step_size=20, gamma=0.5)


In [154]:
#!conda install conda-forge::xgboost -y

In [155]:
num_epochs = 200
for epoch in range(num_epochs):
    optimizer.zero_grad()
    
    # 공유된 Transformer 모델로 입력 데이터 처리
    shared_output = shared_transformer(X_train_tensor.unsqueeze(1))
    
    # Machinery 예측
    machinery_output = machinery_head(shared_output)
    loss_machinery = criterion_machinery(machinery_output, y_train_machinery_tensor)
    
    # Assembly 예측
    assembly_output = assembly_head(shared_output)
    loss_assembly = criterion_assembly(assembly_output, y_train_assembly_tensor)
    
    # 총 손실 계산 및 역전파
    total_loss = loss_machinery + loss_assembly
    total_loss.backward()
    optimizer.step()
    
    # 학습률 스케줄러 업데이트
    scheduler.step()

    if (epoch + 1) % 10 == 0:
        print(f"Epoch [{epoch + 1}/{num_epochs}], "
              f"Machinery Loss: {loss_machinery.item():.4f}, "
              f"Assembly Loss: {loss_assembly.item():.4f}, "
              f"Total Loss: {total_loss.item():.4f}")

KeyboardInterrupt: 

In [147]:
machinery_output_dim = len(np.unique(y_train_machinery))
assembly_output_dim = len(np.unique(y_train_assembly))

print(f"Expected Machinery Output Dim: {machinery_output_dim}")
print(f"Actual Machinery Head Output Dim: {machinery_head.fc.out_features}")

print(f"Expected Assembly Output Dim: {assembly_output_dim}")
print(f"Actual Assembly Head Output Dim: {assembly_head.fc.out_features}")

Expected Machinery Output Dim: 164
Actual Machinery Head Output Dim: 164
Expected Assembly Output Dim: 1709
Actual Assembly Head Output Dim: 1709
