In [1]:
import torch

print("CUDA available:", torch.cuda.is_available())

print("Current device:", torch.cuda.current_device())
print("Device name:", torch.cuda.get_device_name(torch.cuda.current_device()))

CUDA available: True
Current device: 0
Device name: NVIDIA GeForce GTX 1650


In [2]:
import pandas as pd
import numpy as np
import re

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, accuracy_score

In [3]:
data=pd.read_excel('filtered_dataset0.1.xlsx')

In [4]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 15162 entries, 0 to 15161
Data columns (total 32 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   청구서번호        15162 non-null  object 
 1   No.          15162 non-null  int64  
 2   Subject      15152 non-null  object 
 3   Machinery    15162 non-null  object 
 4   Assembly     15162 non-null  object 
 5   청구품목         15162 non-null  object 
 6   Unnamed: 6   0 non-null      float64
 7   Part No.1    15154 non-null  object 
 8   Part No.2    2614 non-null   object 
 9   청구량          15092 non-null  float64
 10  견적           14957 non-null  object 
 11  견적수량         15092 non-null  float64
 12  견적화폐         15092 non-null  object 
 13  견적단가         15162 non-null  float64
 14  발주번호         15162 non-null  object 
 15  발주처          15162 non-null  object 
 16  발주           15162 non-null  object 
 17  발주수량         15092 non-null  float64
 18  발주금액         15092 non-null  float64
 19  D/T 

In [5]:
import re

def preprocess_text(text):
    # 괄호 안의 내용 제거
    text = re.sub(r'\([^)]*\)', '', text)
    # 특수 문자 제거 (알파벳, 숫자, 일부 허용된 특수문자 제외)
    text = re.sub(r'[^\w\s\*\-\+/.,]', '', text)
    # 여러 공백을 언더스코어로 변환
    text = re.sub(r'\s+', '_', text)
    # 텍스트 중간의 연속된 언더스코어를 하나로 줄임
    text = re.sub(r'_+', '_', text)
    # 중간에 언더스코어가 불필요하게 남아있는 경우 처리
    text = re.sub(r'(?<!\w)_(?!\w)', '', text)
    # 언더스코어 앞뒤로 존재하는 특수문자 제거
    text = re.sub(r'_([^\w]+)_', '_', text)
    text = re.sub(r'_([^\w]+)$', '', text)
    text = re.sub(r'^([^\w]+)_', '', text)
    # 텍스트 끝부분의 불필요한 언더스코어 제거
    text = re.sub(r'_+$', '', text)
    # 영어 단어는 소문자로 변환
    text = ' '.join([word.lower() if re.match(r'[A-Za-z]', word) else word for word in text.split()])
    text = text.strip()
    return text

def clean_supplier_name(name):
    # 접미사 제거
    suffixes = r'\b(Corp\.?|Corporation|Company|Co\.?|Incorporated|Inc\.?|Limited|Ltd\.?|GmbH|S\.L\.|SDN\. BHD\.)\b'
    name = re.sub(suffixes, '', name, flags=re.IGNORECASE)
    # 특수 문자 제거
    name = re.sub(r'[^\w\s]', '', name)
    # 불필요한 단어 제거
    name = re.sub(r'\b(사용금지|사)\b', '', name, flags=re.IGNORECASE)
    # 공백 정리
    name = re.sub(r'\s+', ' ', name).strip()
    # 오타 수정 및 문자열 정리
    name = re.sub(r'coporation|coropration|coproration|corporration', 'corporation', name, flags=re.IGNORECASE)
    name = name.lower().strip()
    return name

In [6]:
# 각 칼럼 전처리
data['cleaned_item'] = data['청구품목'].apply(preprocess_text)
data['cleaned_supplier'] = data['발주처'].apply(clean_supplier_name)

# 전처리된 칼럼 결합
data['combined_text'] =data['cleaned_item'].fillna('') + " " + data['cleaned_supplier'].fillna('')


In [7]:
print(data[['combined_text']])

                                     combined_text
0      ge_power_pack_fork_e7 matsuiusa corporation
1      ge_power_pack_fork_e7 matsuiusa corporation
2                  nylon_54_4_1/4,_100md_50fms kti
3                  nylon_48_4_1/4,_100md_50fms kti
4                  nylon_42_4_1/4,_100md_50fms kti
...                                            ...
15157             ring-o haein corporation_cheonan
15158     ring-retaining haein corporation_cheonan
15159     sleeve-bearing haein corporation_cheonan
15160       bearing-ball haein corporation_cheonan
15161    bearing-ball_de haein corporation_cheonan

[15162 rows x 1 columns]


In [8]:
from gensim.models import FastText, Word2Vec
import torch

# 문장을 토큰화하여 리스트로 만들어야 합니다.
sentences = [text.split() for text in data['combined_text']]

# FastText 모델 학습
ft_model = FastText(vector_size=100, window=5, min_count=1, min_n=3, max_n=6, sg=1)
ft_model.build_vocab(sentences)
ft_model.train(sentences, total_examples=len(sentences), epochs=10)

# Word2Vec 모델 학습
w2v_model = Word2Vec(sentences, vector_size=100, window=5, min_count=1, sg=1)
w2v_model.train(sentences, total_examples=len(sentences), epochs=10)

(216300, 443560)

In [11]:
# FastText 임베딩을 가져오는 함수
def get_embedding(word, model):
    if word in model.wv:
        return torch.tensor(model.wv[word])
    else:
        # 서브워드 임베딩의 평균을 계산
        subwords = [word[i:j] for i in range(len(word)) for j in range(i+1, len(word)+1)]
        subword_vectors = [model.wv[subword] for subword in subwords if subword in model.wv]
        
        if subword_vectors:
            return torch.tensor(subword_vectors).mean(dim=0)
        else:
            return torch.zeros(model.vector_size)  # 단어가 없는 경우 0 벡터로 처리
# FastText 임베딩과 Word2Vec 임베딩을 결합한 함수
def get_combined_embedding(word, ft_model, w2v_model):
    ft_vector = get_embedding(word, ft_model)  # FastText에서 얻은 임베딩
    if word in w2v_model.wv:
        w2v_vector = torch.tensor(w2v_model.wv[word])  # Word2Vec에서 얻은 임베딩
    else:
        w2v_vector = torch.zeros(w2v_model.vector_size)  # 단어가 없는 경우 0 벡터로 처리

    combined_vector = torch.cat((ft_vector, w2v_vector))  # 두 임베딩을 결합 (concatenate)
    return combined_vector

# 결합된 임베딩을 생성
combined_embeddings = []
for text in data['combined_text']:
    words = text.split()
    word_vectors = [get_combined_embedding(word, ft_model, w2v_model) for word in words]
    if word_vectors:
        embedding = torch.stack(word_vectors).mean(dim=0)  # 단어 벡터의 평균 계산
    else:
        embedding = torch.zeros(model.vector_size + w2v_model.vector_size)  # 단어가 없는 경우 0 벡터로 처리
    combined_embeddings.append(embedding)

# 결합된 임베딩 리스트를 텐서로 변환
combined_embeddings_tensor = torch.stack(combined_embeddings)

print(combined_embeddings_tensor.shape)  # 결합된 임베딩 텐서

torch.Size([15162, 200])


In [13]:
import torch
import torch.nn.functional as F

# 모든 단어에 대해 결합된 임베딩을 계산하고 저장
combined_word_vectors = {}
for word in ft_model.wv.index_to_key:  # FastText 모델의 모든 단어에 대해 반복
    combined_word_vectors[word] = get_combined_embedding(word, ft_model, w2v_model)

# 특정 단어와 가장 유사한 단어 5개를 찾는 함수 정의
def find_similar_words(target_word, combined_word_vectors, topn=5):
    if target_word not in combined_word_vectors:
        print(f"Word '{target_word}' not in vocabulary.")
        return []

    target_vector = combined_word_vectors[target_word]
    similarities = {}

    for word, vector in combined_word_vectors.items():
        similarity = F.cosine_similarity(target_vector.unsqueeze(0), vector.unsqueeze(0)).item()
        similarities[word] = similarity

    # 상위 topn개의 유사 단어를 찾음
    sorted_similarities = sorted(similarities.items(), key=lambda item: item[1], reverse=True)
    return sorted_similarities[:topn]

In [14]:
# 특정 단어의 유사 단어 찾기
word = "valve"  # 여기에는 확인하고 싶은 단어를 넣으세요
similar_words = find_similar_words(word, combined_word_vectors, topn=5)

# 결과 출력
print(f"Words most similar to '{word}':")
for similar_word, similarity in similar_words:
    print(f"{similar_word}: {similarity:.4f}")

Words most similar to 'valve':
valve: 1.0000
valve_pcl402/3-sc-bsp: 0.9899
valve_c/bal_h-1180: 0.9895
check_valve: 0.9864
valve_pcl402/2-se-sae: 0.9858


In [15]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")

Using device: cuda


In [16]:
import torch
import torch.nn as nn
import torch.optim as optim
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.model_selection import train_test_split

# 1. 데이터 준비 및 인코딩
machinery = data['Machinery'].values
assembly = data['Assembly'].values

machinery_encoder = LabelEncoder()
assembly_encoder = LabelEncoder()

machinery_labels = machinery_encoder.fit_transform(machinery)
assembly_labels = assembly_encoder.fit_transform(assembly)

# 2. 임베딩을 numpy 배열로 변환-스케일러 위해서. 스케일러 안쓸거면 그냥써도됨
X = combined_embeddings_tensor

# 3. Train-Test Split (각 레이블에 대해 동일한 분할 사용)
X_train_val, X_test, y_train_val_machinery, y_test_machinery, y_train_val_assembly, y_test_assembly = train_test_split(
    X.numpy(), machinery_labels, assembly_labels, test_size=0.2, random_state=42
)

X_train, X_val, y_train_machinery, y_val_machinery, y_train_assembly, y_val_assembly = train_test_split(
    X_train_val, y_train_val_machinery, y_train_val_assembly, test_size=0.2, random_state=42
)

# 5. Train 데이터를 torch Tensor로 변환
X_train_tensor = torch.tensor(X_train, dtype=torch.float32).to(device)
X_val_tensor = torch.tensor(X_val, dtype=torch.float32).to(device)
X_test_tensor = torch.tensor(X_test, dtype=torch.float32).to(device)

y_train_machinery_tensor = torch.tensor(y_train_machinery, dtype=torch.long).to(device)
y_val_machinery_tensor = torch.tensor(y_val_machinery, dtype=torch.long).to(device)
y_test_machinery_tensor = torch.tensor(y_test_machinery, dtype=torch.long).to(device)

y_train_assembly_tensor = torch.tensor(y_train_assembly, dtype=torch.long).to(device)
y_val_assembly_tensor = torch.tensor(y_val_assembly, dtype=torch.long).to(device)
y_test_assembly_tensor = torch.tensor(y_test_assembly, dtype=torch.long).to(device)


In [30]:
from transformers import BertTokenizer

# 토크나이저 초기화
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

# 1.텍스트를 BERT 입력 형식으로 변환
def encode_texts(texts, tokenizer, max_length=128):
    encoding = tokenizer.batch_encode_plus(
        texts,
        truncation=True,
        padding=True,
        max_length=max_length,
        return_tensors='pt'
    )
    return encoding['input_ids'], encoding['attention_mask']
    
X_input_ids, X_attention_mask = encode_texts(data['combined_text'], tokenizer)


In [34]:
#2. train_test split
# 80%의 데이터를 train_val (train+validation) 세트로, 20%를 test 세트로 분할
X_train_val_input_ids, X_test_input_ids, X_train_val_attention_mask, X_test_attention_mask, y_train_val_machinery, y_test_machinery, y_train_val_assembly, y_test_assembly = train_test_split(
    X_input_ids, X_attention_mask, machinery_labels, assembly_labels, test_size=0.2, random_state=42
)

# 80%의 train_val 세트를 다시 75% (train)와 25% (validation)로 분할
X_train_input_ids, X_val_input_ids, X_train_attention_mask, X_val_attention_mask, y_train_machinery, y_val_machinery, y_train_assembly, y_val_assembly = train_test_split(
    X_train_val_input_ids, X_train_val_attention_mask, y_train_val_machinery, y_train_val_assembly, test_size=0.25, random_state=42
)
print(type(y_train_machinery))
print(y_train_machinery[:5])  #

<class 'numpy.ndarray'>
[54 38 43 39 30]


In [32]:
# 3. 텐서로 변환
y_train_machinery_tensor = torch.tensor(y_train_machinery, dtype=torch.long)
y_val_machinery_tensor = torch.tensor(y_val_machinery, dtype=torch.long)
y_test_machinery_tensor = torch.tensor(y_test_machinery, dtype=torch.long)

y_train_assembly_tensor = torch.tensor(y_train_assembly, dtype=torch.long)
y_val_assembly_tensor = torch.tensor(y_val_assembly, dtype=torch.long)
y_test_assembly_tensor = torch.tensor(y_test_assembly, dtype=torch.long)


In [35]:
# 크기 출력 - tensor를 안붙이고 그냥 size를 달라고 하니까 못뱉어냄.
print(f"y_train_machinery size: {y_train_machinery_tensor.size()}")
print(f"y_train_assembly size: {y_train_assembly_tensor.size()}")

print(f"X_val_input_ids size: {X_val_input_ids.size()}")
print(f"X_val_attention_mask size: {X_val_attention_mask.size()}")
print(f"y_val_machinery size: {y_val_machinery_tensor.size()}")
print(f"y_val_assembly size: {y_val_assembly_tensor.size()}")

y_train_machinery size: torch.Size([9096])
y_train_assembly size: torch.Size([9096])
X_val_input_ids size: torch.Size([3033, 64])
X_val_attention_mask size: torch.Size([3033, 64])
y_val_machinery size: torch.Size([3033])
y_val_assembly size: torch.Size([3033])


In [36]:

# 4. TensorDataset과 DataLoader 생성
train_dataset = TensorDataset(X_train_input_ids, X_train_attention_mask, y_train_machinery_tensor, y_train_assembly_tensor)
val_dataset = TensorDataset(X_val_input_ids, X_val_attention_mask, y_val_machinery_tensor, y_val_assembly_tensor)

batch_size = 32
train_loader = DataLoader(dataset=train_dataset, batch_size=batch_size, shuffle=True, drop_last=True)
val_loader = DataLoader(dataset=val_dataset, batch_size=batch_size, shuffle=False)

In [37]:
import torch
import torch.nn as nn
from transformers import BertModel, BertTokenizer 

class BERTClassifier(nn.Module):
    def __init__(self, bert_model_name, embedding_dim, num_classes_machinery, num_classes_assembly):
        super(BERTClassifier, self).__init__()
        self.bert = BertModel.from_pretrained(bert_model_name)
        self.fc_machinery = nn.Linear(embedding_dim, num_classes_machinery)
        self.fc_assembly = nn.Linear(embedding_dim, num_classes_assembly)

    def forward(self, input_ids, attention_mask):
        outputs = self.bert(input_ids=input_ids, attention_mask=attention_mask)
        pooled_output = outputs[1]  # BERT의 [CLS] 토큰 출력 사용
        machinery_out = self.fc_machinery(pooled_output)
        assembly_out = self.fc_assembly(pooled_output)
        return machinery_out, assembly_out

In [38]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model = BERTClassifier('bert-base-uncased', embedding_dim=768, num_classes_machinery=len(machinery_encoder.classes_), num_classes_assembly=len(assembly_encoder.classes_)).to(device)


In [40]:
# 옵티마이저와 손실 함수 설정
optimizer = torch.optim.Adam(model.parameters(), lr=2e-5)
criterion = nn.CrossEntropyLoss()

num_epochs=100
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

for epoch in range(num_epochs):
    model.train()
    for i, (input_ids, attention_mask, y_machinery, y_assembly) in enumerate(train_loader):

        input_ids = input_ids.to(device)
        attention_mask = attention_mask.to(device)
        y_machinery = y_machinery.to(device)
        y_assembly = y_assembly.to(device)
        
        optimizer.zero_grad()
        machinery_out, assembly_out = model(input_ids, attention_mask)
        loss_machinery = criterion(machinery_out, y_machinery)
        loss_assembly = criterion(assembly_out, y_assembly)
        loss = loss_machinery + loss_assembly
        loss.backward()
        optimizer.step()

    print(f"Epoch {epoch + 1}/{num_epochs} - Loss: {loss.item()}")

  attn_output = torch.nn.functional.scaled_dot_product_attention(


Epoch 1/100 - Loss: 6.135068416595459
Epoch 2/100 - Loss: 5.425899982452393
Epoch 3/100 - Loss: 4.623385429382324
Epoch 4/100 - Loss: 4.47451114654541
Epoch 5/100 - Loss: 3.911362648010254
Epoch 6/100 - Loss: 3.908961296081543


KeyboardInterrupt: 

In [None]:
print(f'X_train shape: {X_train.shape}')
print(f'y_train_machinery shape: {y_train_machinery.shape}')
print(f'y_train_assembly shape: {y_train_assembly.shape}')