In [1]:
import torch

print("CUDA available:", torch.cuda.is_available())

print("Current device:", torch.cuda.current_device())
print("Device name:", torch.cuda.get_device_name(torch.cuda.current_device()))

CUDA available: True
Current device: 0
Device name: NVIDIA GeForce GTX 1650


In [2]:
import pandas as pd
import numpy as np
import re

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, accuracy_score

In [3]:
data=pd.read_excel('filtered_dataset0.1.xlsx')

In [4]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 15162 entries, 0 to 15161
Data columns (total 32 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   청구서번호        15162 non-null  object 
 1   No.          15162 non-null  int64  
 2   Subject      15152 non-null  object 
 3   Machinery    15162 non-null  object 
 4   Assembly     15162 non-null  object 
 5   청구품목         15162 non-null  object 
 6   Unnamed: 6   0 non-null      float64
 7   Part No.1    15154 non-null  object 
 8   Part No.2    2614 non-null   object 
 9   청구량          15092 non-null  float64
 10  견적           14957 non-null  object 
 11  견적수량         15092 non-null  float64
 12  견적화폐         15092 non-null  object 
 13  견적단가         15162 non-null  float64
 14  발주번호         15162 non-null  object 
 15  발주처          15162 non-null  object 
 16  발주           15162 non-null  object 
 17  발주수량         15092 non-null  float64
 18  발주금액         15092 non-null  float64
 19  D/T 

In [5]:
import re

def preprocess_text(text):
    # 괄호 안의 내용 제거
    text = re.sub(r'\([^)]*\)', '', text)
    # 특수 문자 제거 (알파벳, 숫자, 일부 허용된 특수문자 제외)
    text = re.sub(r'[^\w\s\*\-\+/.,]', '', text)
    # 여러 공백을 언더스코어로 변환
    text = re.sub(r'\s+', '_', text)
    # 텍스트 중간의 연속된 언더스코어를 하나로 줄임
    text = re.sub(r'_+', '_', text)
    # 중간에 언더스코어가 불필요하게 남아있는 경우 처리
    text = re.sub(r'(?<!\w)_(?!\w)', '', text)
    # 언더스코어 앞뒤로 존재하는 특수문자 제거
    text = re.sub(r'_([^\w]+)_', '_', text)
    text = re.sub(r'_([^\w]+)$', '', text)
    text = re.sub(r'^([^\w]+)_', '', text)
    # 텍스트 끝부분의 불필요한 언더스코어 제거
    text = re.sub(r'_+$', '', text)
    # 영어 단어는 소문자로 변환
    text = ' '.join([word.lower() if re.match(r'[A-Za-z]', word) else word for word in text.split()])
    text = text.strip()
    return text

def clean_supplier_name(name):
    # 접미사 제거
    suffixes = r'\b(Corp\.?|Corporation|Company|Co\.?|Incorporated|Inc\.?|Limited|Ltd\.?|GmbH|S\.L\.|SDN\. BHD\.)\b'
    name = re.sub(suffixes, '', name, flags=re.IGNORECASE)
    # 특수 문자 제거
    name = re.sub(r'[^\w\s]', '', name)
    # 불필요한 단어 제거
    name = re.sub(r'\b(사용금지|사)\b', '', name, flags=re.IGNORECASE)
    # 공백 정리
    name = re.sub(r'\s+', ' ', name).strip()
    # 오타 수정 및 문자열 정리
    name = re.sub(r'coporation|coropration|coproration|corporration', 'corporation', name, flags=re.IGNORECASE)
    name = name.lower().strip()
    return name

In [6]:
# 각 칼럼 전처리
data['cleaned_item'] = data['청구품목'].apply(preprocess_text)
data['cleaned_supplier'] = data['발주처'].apply(clean_supplier_name)

# 전처리된 칼럼 결합
data['combined_text'] =data['cleaned_item'].fillna('') + " " + data['cleaned_supplier'].fillna('')


In [7]:
print(data[['combined_text']])

                                     combined_text
0      ge_power_pack_fork_e7 matsuiusa corporation
1      ge_power_pack_fork_e7 matsuiusa corporation
2                  nylon_54_4_1/4,_100md_50fms kti
3                  nylon_48_4_1/4,_100md_50fms kti
4                  nylon_42_4_1/4,_100md_50fms kti
...                                            ...
15157             ring-o haein corporation_cheonan
15158     ring-retaining haein corporation_cheonan
15159     sleeve-bearing haein corporation_cheonan
15160       bearing-ball haein corporation_cheonan
15161    bearing-ball_de haein corporation_cheonan

[15162 rows x 1 columns]


In [8]:
from gensim.models import FastText, Word2Vec
import torch

# 문장을 토큰화하여 리스트로 만들어야 합니다.
sentences = [text.split() for text in data['combined_text']]

# FastText 모델 학습
ft_model = FastText(vector_size=100, window=5, min_count=1, min_n=3, max_n=6, sg=1)
ft_model.build_vocab(sentences)
ft_model.train(sentences, total_examples=len(sentences), epochs=10)

# Word2Vec 모델 학습
w2v_model = Word2Vec(sentences, vector_size=100, window=5, min_count=1, sg=1)
w2v_model.train(sentences, total_examples=len(sentences), epochs=10)

(216327, 443560)

In [9]:
# FastText 임베딩을 가져오는 함수
def get_embedding(word, model):
    if word in model.wv:
        return torch.tensor(model.wv[word])
    else:
        # 서브워드 임베딩의 평균을 계산
        subwords = [word[i:j] for i in range(len(word)) for j in range(i+1, len(word)+1)]
        subword_vectors = [model.wv[subword] for subword in subwords if subword in model.wv]
        
        if subword_vectors:
            return torch.tensor(subword_vectors).mean(dim=0)
        else:
            return torch.zeros(model.vector_size)  # 단어가 없는 경우 0 벡터로 처리
# FastText 임베딩과 Word2Vec 임베딩을 결합한 함수
def get_combined_embedding(word, ft_model, w2v_model):
    ft_vector = get_embedding(word, ft_model)  # FastText에서 얻은 임베딩
    if word in w2v_model.wv:
        w2v_vector = torch.tensor(w2v_model.wv[word])  # Word2Vec에서 얻은 임베딩
    else:
        w2v_vector = torch.zeros(w2v_model.vector_size)  # 단어가 없는 경우 0 벡터로 처리

    combined_vector = torch.cat((ft_vector, w2v_vector))  # 두 임베딩을 결합 (concatenate)
    return combined_vector

# 결합된 임베딩을 생성
combined_embeddings = []
for text in data['combined_text']:
    words = text.split()
    word_vectors = [get_combined_embedding(word, ft_model, w2v_model) for word in words]
    if word_vectors:
        embedding = torch.stack(word_vectors).mean(dim=0)  # 단어 벡터의 평균 계산
    else:
        embedding = torch.zeros(model.vector_size + w2v_model.vector_size)  # 단어가 없는 경우 0 벡터로 처리
    combined_embeddings.append(embedding)

# 결합된 임베딩 리스트를 텐서로 변환
combined_embeddings_tensor = torch.stack(combined_embeddings)

print(combined_embeddings_tensor.shape)  # 결합된 임베딩 텐서

torch.Size([15162, 200])


In [20]:
import torch
import torch.nn.functional as F

# 모든 단어에 대해 결합된 임베딩을 계산하고 저장
combined_word_vectors = {}
for word in ft_model.wv.index_to_key:  # FastText 모델의 모든 단어에 대해 반복
    combined_word_vectors[word] = get_combined_embedding(word, ft_model, w2v_model)

# 특정 단어와 가장 유사한 단어 5개를 찾는 함수 정의
def find_similar_words(target_word, combined_word_vectors, topn=5):
    if target_word not in combined_word_vectors:
        print(f"Word '{target_word}' not in vocabulary.")
        return []

    target_vector = combined_word_vectors[target_word]
    similarities = {}

    for word, vector in combined_word_vectors.items():
        similarity = F.cosine_similarity(target_vector.unsqueeze(0), vector.unsqueeze(0)).item()
        similarities[word] = similarity

    # 상위 topn개의 유사 단어를 찾음
    sorted_similarities = sorted(similarities.items(), key=lambda item: item[1], reverse=True)
    return sorted_similarities[:topn]

In [28]:
word = "SCREW".lower()  # 소문자로 변환
if word in combined_word_vectors:
    print(f"Embedding vector for '{word}': {combined_word_vectors[word]}")
else:
    print(f"Word '{word}' not in vocabulary.")

Embedding vector for 'screw': tensor([-2.3752e-01,  8.8612e-01,  1.1302e-01,  3.6747e-02, -1.2051e-01,
         3.2228e-01,  2.9404e-01,  1.9792e-01, -5.9556e-01,  2.9528e-01,
        -4.4746e-01,  4.3174e-01, -4.1572e-02,  3.1334e-01,  3.6142e-01,
         1.3623e-01, -8.2154e-02, -4.5074e-01,  2.8683e-01, -4.0106e-01,
        -1.4053e-01, -7.0436e-01, -3.3354e-01,  3.9041e-01,  5.1434e-01,
        -3.4883e-01,  2.8162e-01, -6.8667e-01,  4.3864e-01,  1.0361e-01,
         4.6554e-01, -6.1421e-03, -2.9522e-02, -1.0084e-01,  3.3759e-01,
         6.2895e-01,  3.7990e-01,  3.6856e-01, -1.0317e+00,  3.2129e-01,
         3.3814e-01, -1.1638e+00,  6.3883e-02, -6.3173e-02, -3.2802e-01,
        -8.2053e-01,  2.4092e-01, -1.1650e-01, -1.6257e-01, -1.8954e-02,
         6.2246e-01,  4.4317e-01,  5.2351e-02,  7.2208e-01,  2.6477e-01,
        -4.7466e-02, -1.4115e-01, -6.3458e-01, -1.4125e-01, -6.3156e-01,
        -2.6505e-01, -2.7526e-01, -8.7236e-01,  4.1500e-01,  1.1056e-01,
        -3.2957e-01, 

In [105]:
# 특정 단어의 유사 단어 찾기
word = "bearing".lower()   # 여기에는 확인하고 싶은 단어를 넣으세요
similar_words = find_similar_words(word, combined_word_vectors, topn=5)

# 결과 출력
print(f"Words most similar to '{word}':")
for similar_word, similarity in similar_words:
    print(f"{similar_word}: {similarity:.4f}")

Words most similar to 'bearing':
bearing: 1.0000
bearing..needle: 0.9888
rod_bearing: 0.9870
bearing_ta04: 0.9867
sleeve-bearing: 0.9864


In [31]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")

Using device: cuda


In [32]:
import torch
import torch.nn as nn
import torch.optim as optim
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.model_selection import train_test_split

# 1. 데이터 준비 및 인코딩
machinery = data['Machinery'].values
assembly = data['Assembly'].values

machinery_encoder = LabelEncoder()
assembly_encoder = LabelEncoder()

machinery_labels = machinery_encoder.fit_transform(machinery)
assembly_labels = assembly_encoder.fit_transform(assembly)

# 2. 임베딩을 numpy 배열로 변환-스케일러 위해서. 스케일러 안쓸거면 그냥써도됨
X = combined_embeddings_tensor

# 3. Train-Test Split (각 레이블에 대해 동일한 분할 사용)
X_train_val, X_test, y_train_val_machinery, y_test_machinery, y_train_val_assembly, y_test_assembly = train_test_split(
    X.numpy(), machinery_labels, assembly_labels, test_size=0.2, random_state=42
)

X_train, X_val, y_train_machinery, y_val_machinery, y_train_assembly, y_val_assembly = train_test_split(
    X_train_val, y_train_val_machinery, y_train_val_assembly, test_size=0.2, random_state=42
)

# 5. Train 데이터를 torch Tensor로 변환
X_train_tensor = torch.tensor(X_train, dtype=torch.float32).to(device)
X_val_tensor = torch.tensor(X_val, dtype=torch.float32).to(device)
X_test_tensor = torch.tensor(X_test, dtype=torch.float32).to(device)

y_train_machinery_tensor = torch.tensor(y_train_machinery, dtype=torch.long).to(device)
y_val_machinery_tensor = torch.tensor(y_val_machinery, dtype=torch.long).to(device)
y_test_machinery_tensor = torch.tensor(y_test_machinery, dtype=torch.long).to(device)

y_train_assembly_tensor = torch.tensor(y_train_assembly, dtype=torch.long).to(device)
y_val_assembly_tensor = torch.tensor(y_val_assembly, dtype=torch.long).to(device)
y_test_assembly_tensor = torch.tensor(y_test_assembly, dtype=torch.long).to(device)


In [36]:
# 크기 출력 - tensor를 안붙이고 그냥 size를 달라고 하니까 못뱉어냄.
print(f"X_train_tensor size: {X_train_tensor.size()}")
print(f"y_train_machinery_tensor size: {y_train_machinery_tensor.size()}")
print(f"y_train_assembly_tensor size: {y_train_assembly_tensor.size()}")

print(f"X_val_tensor size: {X_val_tensor.size()}")
print(f"y_val_machinery_tensor size: {y_val_machinery_tensor.size()}")
print(f"y_val_assembly_tensor size: {y_val_assembly_tensor.size()}")

print(f"X_test_tensor size: {X_test_tensor.size()}")
print(f"y_test_machinery_tensor size: {y_test_machinery_tensor.size()}")
print(f"y_test_assembly_tensor size: {y_test_assembly_tensor.size()}")


X_train_tensor size: torch.Size([9703, 200])
y_train_machinery_tensor size: torch.Size([9703])
y_train_assembly_tensor size: torch.Size([9703])
X_val_tensor size: torch.Size([2426, 200])
y_val_machinery_tensor size: torch.Size([2426])
y_val_assembly_tensor size: torch.Size([2426])
X_test_tensor size: torch.Size([3033, 200])
y_test_machinery_tensor size: torch.Size([3033])
y_test_assembly_tensor size: torch.Size([3033])


In [37]:
from torch.utils.data import TensorDataset, DataLoader

# TensorDataset 정의
train_dataset = TensorDataset(X_train_tensor, y_train_machinery_tensor, y_train_assembly_tensor)
val_dataset = TensorDataset(X_val_tensor, y_val_machinery_tensor, y_val_assembly_tensor)
test_dataset = TensorDataset(X_test_tensor, y_test_machinery_tensor, y_test_assembly_tensor)

# DataLoader 정의
batch_size = 32

train_loader = DataLoader(dataset=train_dataset, batch_size=batch_size, shuffle=True, drop_last=True)
val_loader = DataLoader(dataset=val_dataset, batch_size=batch_size, shuffle=False)
test_loader = DataLoader(dataset=test_dataset, batch_size=batch_size, shuffle=False)


In [98]:
import torch
import torch.nn as nn
import torch.optim as optim

class SharedTransformer(nn.Module):
    def __init__(self, input_dim, hidden_dim, num_heads, num_layers, dropout=0.1):
        super(SharedTransformer, self).__init__()
        self.embedding = nn.Linear(input_dim, hidden_dim)
        transformer_layer = nn.TransformerEncoderLayer(d_model=hidden_dim, nhead=num_heads, dropout=dropout)
        self.transformer_encoder = nn.TransformerEncoder(transformer_layer, num_layers=num_layers)
        
    def forward(self, x):
        x = self.embedding(x)
        x = self.transformer_encoder(x)
        return x

class MachineryHead(nn.Module):
    def __init__(self, hidden_dim, output_dim):
        super(MachineryHead, self).__init__()
        self.fc1 = nn.Linear(256, 256)  # 추가된 레이어
        self.relu = nn.ReLU()                  # 활성화 함수 추가
        self.fc2 = nn.Linear(256, output_dim)  # 기존 레이어
    
    def forward(self, x):
        x = self.fc1(x)
        x = self.relu(x)
        x = self.fc2(x)
        return x

class AssemblyHead(nn.Module):
    def __init__(self, hidden_dim, output_dim):
        super(AssemblyHead, self).__init__()
        self.flatten = nn.Flatten()
        self.fc = nn.Linear(hidden_dim, output_dim)
    
    def forward(self, x):
        x = self.flatten(x)
        return self.fc(x)

class CombinedModel(nn.Module):
    def __init__(self, shared_transformer, machinery_head, assembly_head):
        super(CombinedModel, self).__init__()
        self.shared_transformer = shared_transformer
        self.machinery_head = machinery_head
        self.assembly_head = assembly_head

    def forward(self, x):
        shared_output = self.shared_transformer(x)
        machinery_out = self.machinery_head(shared_output)
        assembly_out = self.assembly_head(shared_output)
        return machinery_out, assembly_out

In [99]:
# 하이퍼파라미터 설정
sequence_length = X_train_tensor.size(1)
input_dim = combined_embeddings_tensor.size(1)
hidden_dim = 256
num_heads = 8
num_layers = 2
dropout = 0.1

num_classes_machinery = len(machinery_encoder.classes_)
num_classes_assembly = len(assembly_encoder.classes_)


In [100]:
#모델 초기화
shared_transformer = SharedTransformer(input_dim, hidden_dim, num_heads, num_layers, dropout)
machinery_head = MachineryHead(hidden_dim, num_classes_machinery)
assembly_head = AssemblyHead(hidden_dim, num_classes_assembly)

model = CombinedModel(shared_transformer, machinery_head, assembly_head).to(device)

# 옵티마이저와 손실 함수 설정
optimizer = optim.Adam(list(shared_transformer.parameters()) +
                       list(machinery_head.parameters()) +
                       list(assembly_head.parameters()), lr=0.001)

criterion_machinery = nn.CrossEntropyLoss().to(device)
criterion_assembly = nn.CrossEntropyLoss().to(device)

scheduler = optim.lr_scheduler.StepLR(optimizer, step_size=20, gamma=0.5)

print(model)

CombinedModel(
  (shared_transformer): SharedTransformer(
    (embedding): Linear(in_features=200, out_features=256, bias=True)
    (transformer_encoder): TransformerEncoder(
      (layers): ModuleList(
        (0-1): 2 x TransformerEncoderLayer(
          (self_attn): MultiheadAttention(
            (out_proj): NonDynamicallyQuantizableLinear(in_features=256, out_features=256, bias=True)
          )
          (linear1): Linear(in_features=256, out_features=2048, bias=True)
          (dropout): Dropout(p=0.1, inplace=False)
          (linear2): Linear(in_features=2048, out_features=256, bias=True)
          (norm1): LayerNorm((256,), eps=1e-05, elementwise_affine=True)
          (norm2): LayerNorm((256,), eps=1e-05, elementwise_affine=True)
          (dropout1): Dropout(p=0.1, inplace=False)
          (dropout2): Dropout(p=0.1, inplace=False)
        )
      )
    )
  )
  (machinery_head): MachineryHead(
    (fc1): Linear(in_features=256, out_features=256, bias=True)
    (relu): ReLU()

In [101]:
num_epochs = 100  # 예시로 50 epoch 설정
for epoch in range(num_epochs):
    # Training Phase
    model.train()
    total_loss = 0
    for i, (inputs, y_machinery, y_assembly) in enumerate(train_loader):
        inputs = inputs.to(device)
        y_machinery = y_machinery.to(device)
        y_assembly = y_assembly.to(device)

        optimizer.zero_grad()
        machinery_out, assembly_out = model(inputs)
        loss_machinery = criterion_machinery(machinery_out, y_machinery)
        loss_assembly = criterion_assembly(assembly_out, y_assembly)
        loss = loss_machinery + loss_assembly
        loss.backward()
        optimizer.step()

        total_loss += loss.item()

    avg_train_loss = total_loss / len(train_loader)

    # Validation Phase
    model.eval()
    total_val_loss = 0
    with torch.no_grad():
        for i, (val_inputs, val_y_machinery, val_y_assembly) in enumerate(val_loader):
            val_inputs = val_inputs.to(device)
            val_y_machinery = val_y_machinery.to(device)
            val_y_assembly = val_y_assembly.to(device)

            val_machinery_out, val_assembly_out = model(val_inputs)
            val_loss_machinery = criterion_machinery(val_machinery_out, val_y_machinery)
            val_loss_assembly = criterion_assembly(val_assembly_out, val_y_assembly)
            val_loss = val_loss_machinery + val_loss_assembly

            total_val_loss += val_loss.item()

    avg_val_loss = total_val_loss / len(val_loader)

    # Learning rate scheduler step
    scheduler.step()

    print(f"Epoch {epoch + 1}/{num_epochs} - Train Loss: {avg_train_loss:.4f}, Val Loss: {avg_val_loss:.4f}")

Epoch 1/100 - Train Loss: 5.7193, Val Loss: 5.1514
Epoch 2/100 - Train Loss: 5.0071, Val Loss: 4.8910
Epoch 3/100 - Train Loss: 4.8208, Val Loss: 4.8667
Epoch 4/100 - Train Loss: 4.7510, Val Loss: 4.6347
Epoch 5/100 - Train Loss: 4.6629, Val Loss: 4.8315
Epoch 6/100 - Train Loss: 4.6482, Val Loss: 4.5361
Epoch 7/100 - Train Loss: 4.5711, Val Loss: 4.5218
Epoch 8/100 - Train Loss: 4.5317, Val Loss: 4.5296
Epoch 9/100 - Train Loss: 4.4256, Val Loss: 4.4170
Epoch 10/100 - Train Loss: 4.3537, Val Loss: 4.3311
Epoch 11/100 - Train Loss: 4.3017, Val Loss: 4.3248
Epoch 12/100 - Train Loss: 4.3498, Val Loss: 4.2287
Epoch 13/100 - Train Loss: 4.2211, Val Loss: 4.3157
Epoch 14/100 - Train Loss: 4.1571, Val Loss: 4.1097
Epoch 15/100 - Train Loss: 4.1987, Val Loss: 4.2147
Epoch 16/100 - Train Loss: 4.2590, Val Loss: 4.6742
Epoch 17/100 - Train Loss: 4.2031, Val Loss: 4.0491
Epoch 18/100 - Train Loss: 4.0360, Val Loss: 4.0230
Epoch 19/100 - Train Loss: 3.9713, Val Loss: 4.0059
Epoch 20/100 - Train 

KeyboardInterrupt: 

In [None]:
print(f'X_train shape: {X_train.shape}')
print(f'y_train_machinery shape: {y_train_machinery.shape}')
print(f'y_train_assembly shape: {y_train_assembly.shape}')