In [1]:
import torch

print("CUDA available:", torch.cuda.is_available())

print("Current device:", torch.cuda.current_device())
print("Device name:", torch.cuda.get_device_name(torch.cuda.current_device()))

CUDA available: True
Current device: 0
Device name: NVIDIA GeForce GTX 1650


In [2]:
import pandas as pd
import numpy as np
import re

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, accuracy_score

## 모델이 클래스 특성을 학습하기에 충분한 표본 갯수로 데이터 제거

> Machinery에서 데이터가 30개 이하인 클래스 수: 100
> 
> Assembly에서 데이터가 30개 이하인 클래스 수: 1583
>
> 제거 후, 남은 데이터: 13882, MACHINERY : 62 ASSEMBLY:209

In [3]:
data=pd.read_excel('filtered_dataset_30.xlsx')

In [4]:
print(len(data['Machinery'].unique()),len(data['Assembly'].unique()))

62 209


In [5]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 13882 entries, 0 to 13881
Data columns (total 32 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   청구서번호        13882 non-null  object 
 1   No.          13882 non-null  int64  
 2   Subject      13872 non-null  object 
 3   Machinery    13882 non-null  object 
 4   Assembly     13882 non-null  object 
 5   청구품목         13882 non-null  object 
 6   Unnamed: 6   0 non-null      float64
 7   Part No.1    13881 non-null  object 
 8   Part No.2    2430 non-null   object 
 9   청구량          13818 non-null  float64
 10  견적           13698 non-null  object 
 11  견적수량         13818 non-null  float64
 12  견적화폐         13818 non-null  object 
 13  견적단가         13882 non-null  float64
 14  발주번호         13882 non-null  object 
 15  발주처          13882 non-null  object 
 16  발주           13882 non-null  object 
 17  발주수량         13818 non-null  float64
 18  발주금액         13818 non-null  float64
 19  D/T 

In [6]:
print(data['청구품목'].unique())

['GE POWER PACK FORK - E7(B)'
 'SAMSON SUPER STRONG DOUBLE BRAID ROPE 1 3/4", 300FT'
 'WIRE ROPE G)6X(S)19 A3 CMP SLPP 28MM X 400M' ... 'BRACKET '
 'WASHER, 10 ' 'COVER,MANIFOLD.EXH ']


### 청구품목 전처리 

1. 텍스트 전처리
2. TF-IDF 기반 강조 (엠퍼사이징)
3. FastText 임베딩
   
### part no.1 전처리

> 콤마 위치에 따른 세부적인 차이가 많은 텍스트이므로 특수기호 및 문자 유지 필요 => 별도 전처리 X

In [7]:
import re

def preprocess_text(text):
    text = text.lower()
    text = re.sub(r'\([^)]*\)', '', text)   
    text = re.sub(r'[^\w\s\*/\-\+.,#&]', '', text) 
    text = re.sub(r'\s+', ' ', text)    
    text = re.sub(r'\b(사용금지|사)\b', '', text, flags=re.IGNORECASE)    
    text = text.strip()    
    return text


In [8]:
# 청구품목 클리닝
data['cleaned_item'] = data['청구품목'].apply(preprocess_text)

> 청구품목 데이터에서는 각 단어 의미적 연관성보다 주요단어가 있는 것이므로, 가중치 부여하는 것으로 접근함

In [10]:
from sklearn.feature_extraction.text import TfidfVectorizer

# 확인용
claim_items = data['청구품목'].tolist() 

# TF-IDF 벡터화
tfidf = TfidfVectorizer(max_features=30)
tfidf_matrix = tfidf.fit_transform(claim_items)

# 중요한 단어 추출
important_words = tfidf.get_feature_names_out()

print("청구품목 내 주요단어:", important_words)

청구품목 내 주요단어: ['as' 'bearing' 'bolt' 'charges' 'core' 'cover' 'cylinder' 'for' 'fuel'
 'gasket' 'gear' 'gp' 'head' 'hex' 'in' 'kit' 'nut' 'oil' 'plate' 'pump'
 'ring' 'screw' 'seal' 'sensor' 'set' 'shaft' 'spring' 'valve' 'washer'
 'water']


In [11]:
from sklearn.feature_extraction.text import TfidfVectorizer

# 2. TF-IDF 벡터화
tfidf = TfidfVectorizer(max_features=30) 
tfidf_matrix = tfidf.fit_transform(data['cleaned_item'])

### 발주처 전처리 강화

> 부가단어 (CORPORATION, Corp, CO., Ltd, GmbH, Co., Inc, 주식회사, 상사, 공사, Co.,Ltd, Ltd, Pte Ltd, LLC) 제거

> 핵심 정보(회사명 직접 관련) emphasizing 함 > 증강함수 적용

In [12]:
def clean_supplier_name(name):
    name = name.lower()
    name = re.sub(r'coporation|coropration|coproration|corporration', 'corporation', name)
    name = re.sub(r'\(사용금지\)', '', name)
    name = re.sub(r'u\.s\.a', '_usa', name)
    name = re.sub(r'\.', '', name)
    suffixes = r'(corporation|corp|company|co|incorporated|inc|limited|ltd|상사|공사|엔지니어링|주식회사|주|gmbh|pte ltd|llc)'
    name = re.sub(suffixes, '', name, flags=re.IGNORECASE)
    name = re.sub(r'[^\w\s-]', '', name)
    name = re.sub(r'\s+', ' ', name).strip()
    return name

In [13]:
suppliers = [
    'MATSUI(U.S.A) COROPRATION', 'KTI', '대광기업(주)', 'K.TH MARCO',
    'HAEIN Coporation_Cheonan', 'KOREA UCD CO.,LTD.', 'EAST WIND Gmbh', '인스알파',
    'ICON INTERNATIONAL, INC', '한국쉘석유㈜', 'EURO KYTEX ENGINEERING BV', '대동베아링상사',
    'MARINE HYDROTEC CO.,LTD.', '금안상사', 'TEST COMPANY',
    'PORT RELIEF ENGINEERING CO.,LTD.',
    'Caterpillar Marine Asia Pacific Pte Ltd', '(주)혜인',
    'SANWA COMMERCIAL CO.,LTD.', 'yusinHR Co., Ltd.', '(주)선진종합', 'FURUNO',
    'NISSIN REFRIGERATION  ENGINEERIN', '(주)우림공사',
    'HAEIN Coporation_Cheonan(사용금지)', 'KEMEL', 'REXNORD LLC-FALK MARINE GROUP',
    '유신에이치알(사용금지)', 'GEA KOREA LTD', '주안에너지㈜', 'SUNJIN ETECH Co.,Ltd.',
    '디에스알제강주식회사', '(주)한국에프에이디', 'ALBERT GMBH', 'Wartsila Korea Ltd.',
    '(주)선진엔텍(사용금지)', 'PIRIOU NAVAL', '(주)프러스엔지니어링', 'Taeyoung Enterprise',
    'SHINA', 'INS ALFA', 'KEMEL(KOMARINE)', '누리엔지니어링', 'RNK TECH CO.,LTD',
    'OS SYSTEM CO.,LTD', '씨코리아엔지니어링(주)', '(주)두원알앤에이', '합동듸젤사', '하이에어코리아(주)',
    'DESMI PUMPING TECHNOLOGY(SUZHOU) CO.,LTD', '한국마이콤',
    'HUMAN & ENGINEERING CO.,LTD'
]

cleaned_suppliers = [clean_supplier_name(supplier) for supplier in suppliers]
print(cleaned_suppliers)


['matsui_usa', 'kti', '대광기업', 'kth mar', 'haein _cheonan', 'korea ucd', 'east wind', '인스알파', 'in international', '한국쉘석유', 'euro kytex engineering bv', '대동베아링', 'marine hydrotec', '금안', 'test', 'port relief engineering', 'caterpillar marine asia pacific', '혜인', 'sanwa mmercial', 'yusinhr', '선진종합', 'furuno', 'nissin refrigeration engineerin', '우림', 'haein _cheonan', 'kemel', 'rexnord -falk marine group', '유신에이치알', 'gea korea', '안에너지', 'sunjin etech', '디에스알제강', '한국에프에이디', 'albert', 'wartsila korea', '선진엔텍', 'piriou naval', '프러스', 'taeyoung enterprise', 'shina', 'ins alfa', 'kemelkomarine', '누리', 'rnk tech', 'os system', '씨코리아', '두원알앤에이', '합동듸젤사', '하이에어코리아', 'desmi pumping technologysuzhou', '한국마이콤', 'human engineering']


In [14]:
#  HAEIN Corporation => HAEIN
def extract_important_part(name):
    if re.search(r'[가-힣]', name):
        name = re.sub(r'(기업|상사|종합|공사)', '', name)
        important_part = name.split()[0]  # 첫 단어 추출
    else:
        # 영문 이름의 경우 첫 번째 단어만 추출
        important_part = name.split()[0]
    
    return important_part

# 한번더 반복 HAEIN HAEIN Corporation
def emphasize_supplier_name(name):
    important_part = extract_important_part(name)
    emphasized_name = f"{important_part} {important_part} {name}"  # 중요한 부분을 반복
    return emphasized_name

In [15]:
suppliers = ['MATSUI(U.S.A) COROPRATION', 'taeyoung enterprise','HAEIN Coporation_Cheonan(사용금지)']
cleaned_suppliers = [clean_supplier_name(supplier) for supplier in suppliers]  # 전처리
emphasized_suppliers = [emphasize_supplier_name(supplier) for supplier in cleaned_suppliers]  # 강조

print(emphasized_suppliers)

['matsui_usa matsui_usa matsui_usa', 'taeyoung taeyoung taeyoung enterprise', 'haein haein haein _cheonan']


### 결합

In [16]:
# 청구품목 전처리 (TF-IDF 벡터화 적용)
data['cleaned_item'] = data['청구품목'].apply(preprocess_text)

# 파트 넘버 전처리 (별도 전처리 없음)
data['Part No.1'] = data['Part No.1'].astype(str)

# 발주처 전처리
data['cleaned_supplier'] = data['발주처'].apply(clean_supplier_name)
# data['emphasized_supplier'] = data['cleaned_supplier'].apply(emphasize_supplier_name)

# 4. 청구품목 + Part No.1 + 발주처 결합 (증강 없이)
data['combined_text'] = data['cleaned_item'].fillna('') + " " + data['Part No.1'].fillna('') + " " + data['cleaned_supplier'].fillna('')



In [None]:
import re

# 6. combined_text에서 중요한 단어 강조 함수 (단어 경계 사용)
#def emphasize_important_words_in_combined_text(combined_text, important_words):
#    for word in important_words:
#        # 정확한 단어를 찾아서 반복 (단어 경계 \b 사용)
#        combined_text = re.sub(rf'\b{word}\b', f'{word} {word}', combined_text)
#    return combined_text

# 7. combined_text에서 중요한 단어 강조 적용
# data['emphasized_combined_text'] = data['combined_text'].apply(
#    lambda x: emphasize_important_words_in_combined_text(x, important_words)
#)

# 최종 출력 확인
#print(data[['combined_text', 'emphasized_combined_text']].tail(20))

In [21]:
from sklearn.feature_extraction.text import TfidfVectorizer

# 1. TF-IDF 벡터화 (combined_text)
tfidf = TfidfVectorizer(max_features=500)
X_tfidf = tfidf.fit_transform(data['combined_text'])  # TF-IDF 벡터화


In [22]:
from gensim.models import Word2Vec

# 2. Word2Vec 학습 (combined_text)
sentences = [text.split() for text in data['combined_text'].tolist()]
w2v_model = Word2Vec(sentences, vector_size=120, window=5, min_count=1, sg=1)


In [23]:

# 3. TF-IDF 가중치를 적용한 Word2Vec 임베딩 계산
tfidf_feature_names = tfidf.get_feature_names_out()

def get_weighted_word2vec(text, model, tfidf_vector, feature_names):
    words = text.split()
    weighted_embedding = np.zeros(model.vector_size)
    total_weight = 0.0
    
    for word in words:
        if word in model.wv:
            # 단어가 TF-IDF 벡터에서 존재할 경우 가중치 부여
            try:
                idx = feature_names.tolist().index(word)
                weight = tfidf_vector[0, idx]  # TF-IDF 가중치
                weighted_embedding += weight * model.wv[word]
                total_weight += weight
            except ValueError:
                continue
    
    if total_weight > 0:
        weighted_embedding /= total_weight  # 가중합 계산
    
    return weighted_embedding

# 4. TF-IDF와 Word2Vec 결합 임베딩 생성
combined_embeddings = []

for i, row in data.iterrows():
    tfidf_vector = X_tfidf[i]
    embedding = get_weighted_word2vec(row['combined_text'], w2v_model, tfidf_vector, tfidf_feature_names)
    combined_embeddings.append(embedding)

# 5. Tensor로 변환
combined_embeddings_tensor = torch.tensor(combined_embeddings)

print(combined_embeddings_tensor.shape)

torch.Size([13882, 120])


  combined_embeddings_tensor = torch.tensor(combined_embeddings)


In [24]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import MinMaxScaler
from sklearn.feature_extraction.text import TfidfVectorizer
import torch


# 2. 레이블 준비 (Machinery, Asembly 등)
machinery_labels = data['Machinery'].values
assembly_labels = data['Assembly'].values

label_encoder_machinery = LabelEncoder()
y_machinery = label_encoder_machinery.fit_transform(machinery_labels)

label_encoder_assembly = LabelEncoder()
y_assembly = label_encoder_assembly.fit_transform(assembly_labels)

# 3. 스케일러 
# TF-IDF 벡터는 이미 스케일링된 값(0~1)

X=combined_embeddings_tensor

# 4. Train-Test Split
X_train_val, X_test, y_train_val_machinery, y_test_machinery, y_train_val_assembly, y_test_assembly = train_test_split(
    X, y_machinery, y_assembly, test_size=0.2, random_state=42, stratify=y_machinery
)

X_train, X_val, y_train_machinery, y_val_machinery, y_train_assembly, y_val_assembly = train_test_split(
    X_train_val, y_train_val_machinery, y_train_val_assembly, test_size=0.2, random_state=42, stratify=y_train_val_machinery
)

# 5. 텐서 정리 해놓기(이미 텐서 형태임)
X_train_tensor = torch.tensor(X_train, dtype=torch.float32)
X_val_tensor = torch.tensor(X_val, dtype=torch.float32)
X_test_tensor = torch.tensor(X_test, dtype=torch.float32)

y_train_tensor_machinery = torch.tensor(y_train_machinery, dtype=torch.long)
y_val_tensor_machinery = torch.tensor(y_val_machinery, dtype=torch.long)
y_test_tensor_machinery = torch.tensor(y_test_machinery, dtype=torch.long)

y_train_tensor_assembly = torch.tensor(y_train_assembly, dtype=torch.long)
y_val_tensor_assembly = torch.tensor(y_val_assembly, dtype=torch.long)
y_test_tensor_assembly = torch.tensor(y_test_assembly, dtype=torch.long)


# MLP나 XGBoost용 NumPy 배열로 변환
X_train_np = X_train_tensor.numpy()
X_val_np = X_val_tensor.numpy()
X_test_np = X_test_tensor.numpy()

y_train_np_machinery = y_train_machinery  # 이미 NumPy 배열 상태
y_val_np_machinery = y_val_machinery
y_test_np_machinery = y_test_machinery

y_train_np_assembly = y_train_assembly
y_val_np_assembly = y_val_assembly
y_test_np_assembly = y_test_assembly

print(f"X_train_np shape: {X_train_np.shape}")
print(f"X_train_tensor shape: {X_train_tensor.shape}")
print(f"y_train_machinery shape: {y_train_np_machinery.shape}")
print(f"y_train_assembly shape: {y_train_np_assembly.shape}")

X_train_np shape: (8884, 120)
X_train_tensor shape: torch.Size([8884, 120])
y_train_machinery shape: (8884,)
y_train_assembly shape: (8884,)


  X_train_tensor = torch.tensor(X_train, dtype=torch.float32)
  X_val_tensor = torch.tensor(X_val, dtype=torch.float32)
  X_test_tensor = torch.tensor(X_test, dtype=torch.float32)


### MLP

In [40]:
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout, BatchNormalization
from tensorflow.keras.callbacks import EarlyStopping, ReduceLROnPlateau
from tensorflow.keras.optimizers import Adam

def build_mlp_model(dropout_rate=[0.3, 0.3, 0.3], learning_rate=0.005):
    model = Sequential([
        Dense(512, activation='relu', input_shape=(X_train.shape[1],)),  # 입력 크기 맞춤
        Dropout(dropout_rate[0]),  # 첫 번째 Dropout 비율
        BatchNormalization(),
        
        Dense(256, activation='relu'),
        Dropout(dropout_rate[1]),  # 두 번째 Dropout 비율
        BatchNormalization(),
        
        Dense(128, activation='relu'),
        Dropout(dropout_rate[2]),  # 세 번째 Dropout 비율

        Dense(64, activation='relu'),
        Dropout(dropout_rate[1]),  
        BatchNormalization(),
        
        Dense(62, activation='softmax')  # Machinery 클래스 수
    ])

    # 옵티마이저와 학습률 설정
    optimizer = Adam(learning_rate=0.005)
    model.compile(optimizer=optimizer, loss='sparse_categorical_crossentropy', metrics=['accuracy'])

    return model


In [41]:
# 모델 학습 및 평가
def train_and_evaluate(dropout_rate=[0.3, 0.3, 0.3], learning_rate=0.005, batch_size=32):
    model = build_mlp_model(dropout_rate=dropout_rate, learning_rate=learning_rate)
    
    # 콜백 설정
    early_stopping = EarlyStopping(monitor='val_loss', patience=5, restore_best_weights=True)
    reduce_lr = ReduceLROnPlateau(monitor='val_loss', factor=0.5, patience=3, min_lr=0.00001)

    # 모델 학습
    history = model.fit(
        X_train_np, y_train_machinery, 
        epochs=100, batch_size=batch_size, 
        validation_data=(X_val, y_val_machinery), 
        callbacks=[early_stopping, reduce_lr],
        verbose=1  # 학습 진행 상황을 출력
    )

    # 성능 평가
    loss, accuracy = model.evaluate(X_test_np, y_test_machinery, verbose=1)
    print(f"Test Loss: {loss}, Test Accuracy: {accuracy}")

    return model, history

model, history = train_and_evaluate()


Epoch 1/100
[1m278/278[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 3ms/step - accuracy: 0.4286 - loss: 2.4623 - val_accuracy: 0.5385 - val_loss: 1.6300 - learning_rate: 0.0050
Epoch 2/100
[1m278/278[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 2ms/step - accuracy: 0.5266 - loss: 1.7535 - val_accuracy: 0.5543 - val_loss: 1.5286 - learning_rate: 0.0050
Epoch 3/100
[1m278/278[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 2ms/step - accuracy: 0.5403 - loss: 1.6200 - val_accuracy: 0.5592 - val_loss: 1.4859 - learning_rate: 0.0050
Epoch 4/100
[1m278/278[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 2ms/step - accuracy: 0.5432 - loss: 1.6046 - val_accuracy: 0.5723 - val_loss: 1.4349 - learning_rate: 0.0050
Epoch 5/100
[1m278/278[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 2ms/step - accuracy: 0.5478 - loss: 1.5399 - val_accuracy: 0.5606 - val_loss: 1.4536 - learning_rate: 0.0050
Epoch 6/100
[1m278/278[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0

In [32]:
print("X_train type:", type(X_train))
print("X_train_np type:", type(X_train_np))
print("X_train shape:", X_train.shape)
print("X_train_np shape:", X_train_np.shape)

X_train type: <class 'torch.Tensor'>
X_train_np type: <class 'numpy.ndarray'>
X_train shape: torch.Size([8884, 120])
X_train_np shape: (8884, 120)


In [37]:
# 하이퍼파라미터 조정
dropout_rates = [[0.3, 0.4, 0.5], [0.2, 0.3, 0.4]] 
learning_rates = [0.0001, 0.0005, 0.001]
batch_sizes = [32, 64]

# 여러 하이퍼파라미터 조합에 대해 학습 및 평가
for dropout_rate in dropout_rates:
    for learning_rate in learning_rates:
        for batch_size in batch_sizes:
            print(f"\nTraining with dropout_rate={dropout_rate}, learning_rate={learning_rate}, batch_size={batch_size}")
            train_and_evaluate(dropout_rate, learning_rate, batch_size)


Training with dropout_rate=[0.3, 0.4, 0.5], learning_rate=0.0001, batch_size=32
Epoch 1/100
[1m278/278[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 2ms/step - accuracy: 0.3476 - loss: 2.8334 - val_accuracy: 0.5493 - val_loss: 1.8859 - learning_rate: 0.0010
Epoch 2/100
[1m278/278[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 2ms/step - accuracy: 0.4986 - loss: 1.9063 - val_accuracy: 0.5687 - val_loss: 1.4998 - learning_rate: 0.0010
Epoch 3/100
[1m278/278[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 2ms/step - accuracy: 0.5375 - loss: 1.6818 - val_accuracy: 0.5727 - val_loss: 1.4658 - learning_rate: 0.0010
Epoch 4/100
[1m278/278[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 2ms/step - accuracy: 0.5508 - loss: 1.5990 - val_accuracy: 0.5844 - val_loss: 1.4090 - learning_rate: 0.0010
Epoch 5/100
[1m278/278[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 2ms/step - accuracy: 0.5533 - loss: 1.5509 - val_accuracy: 0.5691 - val_loss: 1.3807 - learni

KeyboardInterrupt: 