In [1]:
import torch

print("CUDA available:", torch.cuda.is_available())

print("Current device:", torch.cuda.current_device())
print("Device name:", torch.cuda.get_device_name(torch.cuda.current_device()))

CUDA available: True
Current device: 0
Device name: NVIDIA GeForce GTX 1650


In [2]:
import pandas as pd
import numpy as np
import re

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, accuracy_score

In [3]:
data=pd.read_excel('filtered_dataset0.1.xlsx')

In [4]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 15162 entries, 0 to 15161
Data columns (total 32 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   청구서번호        15162 non-null  object 
 1   No.          15162 non-null  int64  
 2   Subject      15152 non-null  object 
 3   Machinery    15162 non-null  object 
 4   Assembly     15162 non-null  object 
 5   청구품목         15162 non-null  object 
 6   Unnamed: 6   0 non-null      float64
 7   Part No.1    15154 non-null  object 
 8   Part No.2    2614 non-null   object 
 9   청구량          15092 non-null  float64
 10  견적           14957 non-null  object 
 11  견적수량         15092 non-null  float64
 12  견적화폐         15092 non-null  object 
 13  견적단가         15162 non-null  float64
 14  발주번호         15162 non-null  object 
 15  발주처          15162 non-null  object 
 16  발주           15162 non-null  object 
 17  발주수량         15092 non-null  float64
 18  발주금액         15092 non-null  float64
 19  D/T 

In [97]:
import re

def preprocess_text(text):
    # 괄호 안의 내용 제거
    text = re.sub(r'\([^)]*\)', '', text)
    # 특수 문자 제거 (알파벳, 숫자, 일부 허용된 특수문자 제외)
    text = re.sub(r'[^\w\s\*\-\+/.,]', '', text)
    # 여러 공백을 언더스코어로 변환
    text = re.sub(r'\s+', '_', text)
    # 텍스트 중간의 연속된 언더스코어를 하나로 줄임
    text = re.sub(r'_+', '_', text)
    # 중간에 언더스코어가 불필요하게 남아있는 경우 처리
    text = re.sub(r'(?<!\w)_(?!\w)', '', text)
    # 언더스코어 앞뒤로 존재하는 특수문자 제거
    text = re.sub(r'_([^\w]+)_', '_', text)
    text = re.sub(r'_([^\w]+)$', '', text)
    text = re.sub(r'^([^\w]+)_', '', text)
    # 텍스트 끝부분의 불필요한 언더스코어 제거
    text = re.sub(r'_+$', '', text)
    # 영어 단어는 소문자로 변환
    text = ' '.join([word.lower() if re.match(r'[A-Za-z]', word) else word for word in text.split()])
    text = text.strip()
    return text

def clean_supplier_name(name):
    # 접미사 제거
    suffixes = r'\b(Corp\.?|Corporation|Company|Co\.?|Incorporated|Inc\.?|Limited|Ltd\.?|GmbH|S\.L\.|SDN\. BHD\.)\b'
    name = re.sub(suffixes, '', name, flags=re.IGNORECASE)
    # 특수 문자 제거
    name = re.sub(r'[^\w\s]', '', name)
    # 불필요한 단어 제거
    name = re.sub(r'\b(사용금지|사)\b', '', name, flags=re.IGNORECASE)
    # 공백 정리
    name = re.sub(r'\s+', ' ', name).strip()
    # 오타 수정 및 문자열 정리
    name = re.sub(r'coporation|coropration|coproration|corporration', 'corporation', name, flags=re.IGNORECASE)
    name = name.lower().strip()
    return name

In [98]:
# 각 칼럼 전처리
data['cleaned_item'] = data['청구품목'].apply(preprocess_text)
data['cleaned_supplier'] = data['발주처'].apply(clean_supplier_name)

# 전처리된 칼럼 결합
data['combined_text'] =data['cleaned_item'].fillna('') + " " + data['cleaned_supplier'].fillna('')


In [99]:
print(data[['combined_text']])

                                     combined_text
0      ge_power_pack_fork_e7 matsuiusa corporation
1      ge_power_pack_fork_e7 matsuiusa corporation
2                  nylon_54_4_1/4,_100md_50fms kti
3                  nylon_48_4_1/4,_100md_50fms kti
4                  nylon_42_4_1/4,_100md_50fms kti
...                                            ...
15157             ring-o haein corporation_cheonan
15158     ring-retaining haein corporation_cheonan
15159     sleeve-bearing haein corporation_cheonan
15160       bearing-ball haein corporation_cheonan
15161    bearing-ball_de haein corporation_cheonan

[15162 rows x 1 columns]


In [100]:
from gensim.models import FastText, Word2Vec
import torch

# 문장을 토큰화하여 리스트로 만들어야 합니다.
sentences = [text.split() for text in data['combined_text']]

# FastText 모델 학습
ft_model = FastText(vector_size=100, window=5, min_count=1, min_n=3, max_n=6, sg=1)
ft_model.build_vocab(sentences)
ft_model.train(sentences, total_examples=len(sentences), epochs=10)

# Word2Vec 모델 학습
w2v_model = Word2Vec(sentences, vector_size=100, window=5, min_count=1, sg=1)
w2v_model.train(sentences, total_examples=len(sentences), epochs=10)

(216309, 443560)

In [101]:
# FastText 임베딩을 가져오는 함수
def get_embedding(word, model):
    if word in model.wv:
        return torch.tensor(model.wv[word])
    else:
        # 서브워드 임베딩의 평균을 계산
        subwords = [word[i:j] for i in range(len(word)) for j in range(i+1, len(word)+1)]
        subword_vectors = [model.wv[subword] for subword in subwords if subword in model.wv]
        
        if subword_vectors:
            return torch.tensor(subword_vectors).mean(dim=0)
        else:
            return torch.zeros(model.vector_size)  # 단어가 없는 경우 0 벡터로 처리
# FastText 임베딩과 Word2Vec 임베딩을 결합한 함수
def get_combined_embedding(word, ft_model, w2v_model):
    ft_vector = get_embedding(word, ft_model)  # FastText에서 얻은 임베딩
    if word in w2v_model.wv:
        w2v_vector = torch.tensor(w2v_model.wv[word])  # Word2Vec에서 얻은 임베딩
    else:
        w2v_vector = torch.zeros(w2v_model.vector_size)  # 단어가 없는 경우 0 벡터로 처리

    combined_vector = torch.cat((ft_vector, w2v_vector))  # 두 임베딩을 결합 (concatenate)
    return combined_vector

# 결합된 임베딩을 생성
combined_embeddings = []
for text in data['combined_text']:
    words = text.split()
    word_vectors = [get_combined_embedding(word, ft_model, w2v_model) for word in words]
    if word_vectors:
        embedding = torch.stack(word_vectors).mean(dim=0)  # 단어 벡터의 평균 계산
    else:
        embedding = torch.zeros(model.vector_size + w2v_model.vector_size)  # 단어가 없는 경우 0 벡터로 처리
    combined_embeddings.append(embedding)

# 결합된 임베딩 리스트를 텐서로 변환
combined_embeddings_tensor = torch.stack(combined_embeddings)

print(combined_embeddings_tensor.shape)  # 결합된 임베딩 텐서

torch.Size([15162, 200])


In [10]:
import torch
import torch.nn.functional as F

# 모든 단어에 대해 결합된 임베딩을 계산하고 저장
combined_word_vectors = {}
for word in ft_model.wv.index_to_key:  # FastText 모델의 모든 단어에 대해 반복
    combined_word_vectors[word] = get_combined_embedding(word, ft_model, w2v_model)

# 특정 단어와 가장 유사한 단어 5개를 찾는 함수 정의
def find_similar_words(target_word, combined_word_vectors, topn=5):
    if target_word not in combined_word_vectors:
        print(f"Word '{target_word}' not in vocabulary.")
        return []

    target_vector = combined_word_vectors[target_word]
    similarities = {}

    for word, vector in combined_word_vectors.items():
        similarity = F.cosine_similarity(target_vector.unsqueeze(0), vector.unsqueeze(0)).item()
        similarities[word] = similarity

    # 상위 topn개의 유사 단어를 찾음
    sorted_similarities = sorted(similarities.items(), key=lambda item: item[1], reverse=True)
    return sorted_similarities[:topn]

In [102]:
word = "SCREW".lower()  # 소문자로 변환
if word in combined_word_vectors:
    print(f"Embedding vector for '{word}': {combined_word_vectors[word]}")
else:
    print(f"Word '{word}' not in vocabulary.")

Embedding vector for 'screw': tensor([-2.6953e-01,  1.0618e+00,  9.7410e-02,  8.4718e-02, -1.4775e-01,
         3.5462e-01,  2.9825e-01,  2.1511e-01, -6.5101e-01,  2.9122e-01,
        -4.4524e-01,  4.6695e-01, -6.2215e-02,  3.8362e-01,  3.8894e-01,
         9.5985e-02, -9.2560e-02, -4.6851e-01,  2.4317e-01, -4.0242e-01,
        -1.1162e-01, -8.0334e-01, -2.9055e-01,  3.7296e-01,  5.1049e-01,
        -4.0715e-01,  2.8165e-01, -7.3628e-01,  4.3052e-01,  1.3294e-01,
         5.5091e-01,  1.5341e-02, -3.1327e-03, -1.1906e-01,  3.4249e-01,
         6.6751e-01,  3.4811e-01,  3.3393e-01, -9.9389e-01,  2.7712e-01,
         2.8915e-01, -1.1394e+00,  1.0444e-01, -9.9757e-02, -3.5578e-01,
        -7.7339e-01,  2.0051e-01, -1.0524e-01, -1.9788e-01, -2.9482e-02,
         6.0938e-01,  4.6622e-01,  5.4943e-02,  7.0522e-01,  2.0831e-01,
        -3.7474e-02, -1.8525e-01, -5.6877e-01, -2.3078e-01, -6.2758e-01,
        -2.9584e-01, -3.1617e-01, -8.6180e-01,  3.4211e-01,  8.8007e-02,
        -2.9705e-01, 

In [103]:
# 특정 단어의 유사 단어 찾기
word = "valve".lower()   # 여기에는 확인하고 싶은 단어를 넣으세요
similar_words = find_similar_words(word, combined_word_vectors, topn=5)

# 결과 출력
print(f"Words most similar to '{word}':")
for similar_word, similarity in similar_words:
    print(f"{similar_word}: {similarity:.4f}")

Words most similar to 'valve':
valve: 1.0000
check_valve: 0.9892
valve_c/bal_h-1180: 0.9868
valve_pcl402-db_sae: 0.9867
bearing_cover: 0.9846


In [107]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")

Using device: cuda


In [119]:
import torch
import torch.nn as nn
import torch.optim as optim
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.decomposition import PCA

# 1. 데이터 준비 및 인코딩
machinery = data['Machinery'].values
assembly = data['Assembly'].values

machinery_encoder = LabelEncoder()
assembly_encoder = LabelEncoder()

machinery_labels = machinery_encoder.fit_transform(machinery)
assembly_labels = assembly_encoder.fit_transform(assembly)

# 임베딩을 numpy 배열로 변환
X = combined_embeddings_tensor.numpy()

# 데이터 스케일링
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)  

# PCA를 통한 차원 축소
pca = PCA(n_components=100)  # 예시로 50개의 주성분만 유지
X_pca = pca.fit_transform(X_scaled)

# 데이터셋을 훈련, 검증, 테스트 세트로 분할
X_train_val, X_test, y_train_val_machinery, y_test_machinery, y_train_val_assembly, y_test_assembly = train_test_split(
    X_pca,  # PCA 적용 데이터 사용
    machinery_labels,  # 머시너리 레이블
    assembly_labels,  # 어셈블리 레이블
    test_size=0.2,  # 테스트 세트는 전체 데이터의 20%
    random_state=42  # 난수 생성기 시드 값
)

# 훈련 세트를 다시 훈련과 검증 세트로 분할
X_train, X_val, y_train_machinery, y_val_machinery, y_train_assembly, y_val_assembly = train_test_split(
    X_train_val,  # 훈련 및 검증 세트
    y_train_val_machinery,  # 훈련 및 검증 머시너리 레이블
    y_train_val_assembly,  # 훈련 및 검증 어셈블리 레이블
    test_size=0.25,  # 검증 세트는 훈련 및 검증 세트의 25% (전체 데이터의 20%)
    random_state=42  # 난수 생성기 시드 값
)

In [123]:
#!conda install conda-forge::catboost -y

In [125]:
# XGBoost 모델 학습 및 평가
from catboost import CatBoostClassifier
from sklearn.metrics import accuracy_score

# CatBoost 분류기 준비
machinery_model = CatBoostClassifier(
    iterations=500,          # 반복 수 (트리 수)
    depth=6,                 # 트리의 깊이
    learning_rate=0.05,      # 학습률
    loss_function='MultiClass',  # 손실 함수 설정
    eval_metric='Accuracy',  # 평가 메트릭
    random_seed=42,          # 난수 시드
    od_type='Iter',          # 조기 종료 옵션
    od_wait=20,              # 조기 종료를 위한 대기 단계
    verbose=10               # 학습 과정에서 진행 상황 출력 간격
)

assembly_model = CatBoostClassifier(
    iterations=500,
    depth=6,
    learning_rate=0.05,
    loss_function='MultiClass',
    eval_metric='Accuracy',
    random_seed=42,
    od_type='Iter',
    od_wait=20,
    verbose=10
)

# Machinery 레이블에 대한 모델 훈련
machinery_model.fit(X_train, y_train_machinery, eval_set=(X_val, y_val_machinery))

# Assembly 레이블에 대한 모델 훈련
assembly_model.fit(X_train, y_train_assembly, eval_set=(X_val, y_val_assembly))


0:	learn: 0.5198989	test: 0.5225849	best: 0.5225849 (0)	total: 888ms	remaining: 7m 22s
10:	learn: 0.5731091	test: 0.5700626	best: 0.5700626 (10)	total: 8.68s	remaining: 6m 25s
20:	learn: 0.5914688	test: 0.5865480	best: 0.5865480 (20)	total: 16.4s	remaining: 6m 14s
30:	learn: 0.6035620	test: 0.5970986	best: 0.5970986 (30)	total: 24.2s	remaining: 6m 6s
40:	learn: 0.6142260	test: 0.6073195	best: 0.6073195 (40)	total: 32s	remaining: 5m 58s
50:	learn: 0.6225814	test: 0.6142433	best: 0.6142433 (50)	total: 39.8s	remaining: 5m 50s
60:	learn: 0.6299472	test: 0.6195186	best: 0.6195186 (60)	total: 47.6s	remaining: 5m 42s
70:	learn: 0.6409411	test: 0.6294098	best: 0.6300692 (67)	total: 55.3s	remaining: 5m 34s
80:	learn: 0.6519349	test: 0.6369931	best: 0.6369931 (80)	total: 1m 3s	remaining: 5m 26s
90:	learn: 0.6608399	test: 0.6452357	best: 0.6452357 (89)	total: 1m 10s	remaining: 5m 18s
100:	learn: 0.6691953	test: 0.6524893	best: 0.6524893 (100)	total: 1m 18s	remaining: 5m 10s
110:	learn: 0.6797493	

KeyboardInterrupt: 

In [121]:
# 예측 및 평가
y_pred_machinery = machinery_model.predict(X_val)
y_pred_assembly = assembly_model.predict(X_val)

accuracy_machinery = accuracy_score(y_val_machinery, y_pred_machinery)
accuracy_assembly = accuracy_score(y_val_assembly, y_pred_assembly)

print(f"Validation Accuracy for Machinery: {accuracy_machinery}")
print(f"Validation Accuracy for Assembly: {accuracy_assembly}")

Validation Accuracy for Machinery: 0.7309594460929772
Validation Accuracy for Assembly: 0.5792944279591163


In [15]:
# 크기 출력 - tensor를 안붙이고 그냥 size를 달라고 하니까 못뱉어냄.
print(f"X_train_tensor size: {X_train_tensor.size()}")
print(f"y_train_machinery_tensor size: {y_train_machinery_tensor.size()}")
print(f"y_train_assembly_tensor size: {y_train_assembly_tensor.size()}")

print(f"X_val_tensor size: {X_val_tensor.size()}")
print(f"y_val_machinery_tensor size: {y_val_machinery_tensor.size()}")
print(f"y_val_assembly_tensor size: {y_val_assembly_tensor.size()}")

print(f"X_test_tensor size: {X_test_tensor.size()}")
print(f"y_test_machinery_tensor size: {y_test_machinery_tensor.size()}")
print(f"y_test_assembly_tensor size: {y_test_assembly_tensor.size()}")


X_train_tensor size: torch.Size([9703, 200])
y_train_machinery_tensor size: torch.Size([9703])
y_train_assembly_tensor size: torch.Size([9703])
X_val_tensor size: torch.Size([2426, 200])
y_val_machinery_tensor size: torch.Size([2426])
y_val_assembly_tensor size: torch.Size([2426])
X_test_tensor size: torch.Size([3033, 200])
y_test_machinery_tensor size: torch.Size([3033])
y_test_assembly_tensor size: torch.Size([3033])


In [64]:
from torch.utils.data import TensorDataset, DataLoader

# TensorDataset 정의
train_dataset = TensorDataset(X_train_tensor, y_train_machinery_tensor, y_train_assembly_tensor)
val_dataset = TensorDataset(X_val_tensor, y_val_machinery_tensor, y_val_assembly_tensor)
test_dataset = TensorDataset(X_test_tensor, y_test_machinery_tensor, y_test_assembly_tensor)

# DataLoader 정의
batch_size = 32

train_loader = DataLoader(dataset=train_dataset, batch_size=batch_size, shuffle=True, drop_last=True)
val_loader = DataLoader(dataset=val_dataset, batch_size=batch_size, shuffle=False)
test_loader = DataLoader(dataset=test_dataset, batch_size=batch_size, shuffle=False)


In [90]:
import torch
import torch.nn as nn
import torch.optim as optim


class SharedTransformer(nn.Module):
    def __init__(self, input_dim, hidden_dim, num_heads, num_layers, dropout=0.3):
        super(SharedTransformer, self).__init__()
        self.embedding = nn.Linear(input_dim, hidden_dim)
        transformer_layer = nn.TransformerEncoderLayer(
            d_model=hidden_dim, nhead=num_heads, dropout=dropout, batch_first=True
        )
        self.transformer_encoder = nn.TransformerEncoder(transformer_layer, num_layers=num_layers)

    def forward(self, x):
        x = self.embedding(x)
        x = self.transformer_encoder(x)
        return x.mean(dim=1)  # 평균을 취해 시퀀스 차원을 축소

class MachineryHead(nn.Module):
    def __init__(self, hidden_dim, output_dim):
        super(MachineryHead, self).__init__()
        self.fc = nn.Linear(hidden_dim, output_dim)

    def forward(self, x):
        return self.fc(x)

class AssemblyHead(nn.Module):
    def __init__(self, hidden_dim, output_dim):
        super(AssemblyHead, self).__init__()
        self.fc = nn.Linear(hidden_dim, output_dim)

    def forward(self, x):
        return self.fc(x)

class MultiOutputModel(nn.Module):
    def __init__(self, input_dim, hidden_dim, num_heads, num_layers, machinery_classes, assembly_classes):
        super(MultiOutputModel, self).__init__()
        self.shared_transformer = SharedTransformer(input_dim, hidden_dim, num_heads, num_layers)
        self.machinery_head = MachineryHead(hidden_dim, machinery_classes)
        self.assembly_head = AssemblyHead(hidden_dim, assembly_classes)

    def forward(self, x):
        shared_features = self.shared_transformer(x)
        machinery_output = self.machinery_head(shared_features)
        assembly_output = self.assembly_head(shared_features)
        return machinery_output, assembly_output

In [91]:
input_dim = X_train_tensor.shape[1]
hidden_dim = 256
machinery_output_dim = len(np.unique(machinery_labels))  # 전체 클래스 개수
assembly_output_dim = len(np.unique(assembly_labels))  # 전체 클래스 개수
num_heads = 4
num_layers = 2

In [93]:
# 공유된 Transformer .
shared_transformer = SharedTransformer(input_dim, hidden_dim, num_heads, num_layers).to(device)
print(shared_transformer)

# Machinery와 Assembly에 대한 개별 최종 레이어
machinery_head = MachineryHead(hidden_dim, machinery_output_dim).to(device)
print(machinery_head)

assembly_head = AssemblyHead(hidden_dim, assembly_output_dim).to(device)
print(assembly_head)

model = MultiOutputModel(input_dim, hidden_dim, num_heads, num_layers, machinery_classes, assembly_classes).to(device)

print(model)

SharedTransformer(
  (embedding): Linear(in_features=200, out_features=256, bias=True)
  (transformer_encoder): TransformerEncoder(
    (layers): ModuleList(
      (0-1): 2 x TransformerEncoderLayer(
        (self_attn): MultiheadAttention(
          (out_proj): NonDynamicallyQuantizableLinear(in_features=256, out_features=256, bias=True)
        )
        (linear1): Linear(in_features=256, out_features=2048, bias=True)
        (dropout): Dropout(p=0.3, inplace=False)
        (linear2): Linear(in_features=2048, out_features=256, bias=True)
        (norm1): LayerNorm((256,), eps=1e-05, elementwise_affine=True)
        (norm2): LayerNorm((256,), eps=1e-05, elementwise_affine=True)
        (dropout1): Dropout(p=0.3, inplace=False)
        (dropout2): Dropout(p=0.3, inplace=False)
      )
    )
  )
)
MachineryHead(
  (fc): Linear(in_features=256, out_features=68, bias=True)
)
AssemblyHead(
  (fc): Linear(in_features=256, out_features=256, bias=True)
)
MultiOutputModel(
  (shared_transforme

In [94]:
# 옵티마이저 및 손실 함수
optimizer = optim.Adam(model.parameters(), lr=0.001)
criterion = nn.CrossEntropyLoss().to(device)

In [95]:
num_epochs = 100  # 예시로 50 epoch 설정
def train_model(model, train_loader, val_loader, optimizer, criterion, epochs=50):
    for epoch in range(epochs):
        model.train()
        total_loss = 0
        for inputs, labels_machinery, labels_assembly in train_loader:
            inputs = inputs.to(device)
            labels_machinery = labels_machinery.to(device)
            labels_assembly = labels_assembly.to(device)
            
            optimizer.zero_grad()
            outputs_machinery, outputs_assembly = model(inputs)
            
            loss_machinery = criterion(outputs_machinery, labels_machinery)
            loss_assembly = criterion(outputs_assembly, labels_assembly)
            loss = loss_machinery + loss_assembly
            loss.backward()
            optimizer.step()
            total_loss += loss.item()
        
        print(f'Epoch {epoch+1}/{epochs}, Loss: {total_loss/len(train_loader)}')

        # 검증 단계
        model.eval()
        with torch.no_grad():
            val_loss = 0
            for inputs, labels_machinery, labels_assembly in val_loader:
                inputs = inputs.to(device)
                labels_machinery = labels_machinery.to(device)
                labels_assembly = labels_assembly.to(device)
                outputs_machinery, outputs_assembly = model(inputs)
                loss_machinery = criterion(outputs_machinery, labels_machinery)
                loss_assembly = criterion(outputs_assembly, labels_assembly)
                val_loss += (loss_machinery.item() + loss_assembly.item())
            
            print(f'Validation Loss: {val_loss/len(val_loader)}')

In [96]:
train_model(model, train_loader, val_loader, optimizer, criterion)


RuntimeError: mat1 and mat2 shapes cannot be multiplied (1x32 and 256x68)

In [89]:
print(f'X_train shape: {X_train.shape}')
print(f'y_train_machinery shape: {y_train_machinery.shape}')
print(f'y_train_assembly shape: {y_train_assembly.shape}')

X_train shape: (9703, 200)
y_train_machinery shape: (9703,)
y_train_assembly shape: (9703,)
