In [1]:
import torch

print("CUDA available:", torch.cuda.is_available())

print("Current device:", torch.cuda.current_device())
print("Device name:", torch.cuda.get_device_name(torch.cuda.current_device()))

CUDA available: True
Current device: 0
Device name: NVIDIA GeForce GTX 1650


In [2]:
import pandas as pd
import numpy as np
import re

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, accuracy_score

## 모델이 클래스 특성을 학습하기에 충분한 표본 갯수로 데이터 제거

> Machinery에서 데이터가 30개 이하인 클래스 수: 100
> 
> Assembly에서 데이터가 30개 이하인 클래스 수: 1583
>
> 제거 후, 남은 데이터: 13882, MACHINERY : 62 ASSEMBLY:209

In [3]:
data=pd.read_excel('filtered_dataset_30.xlsx')

In [4]:
print(len(data['Machinery'].unique()),len(data['Assembly'].unique()))

62 209


In [5]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 13882 entries, 0 to 13881
Data columns (total 32 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   청구서번호        13882 non-null  object 
 1   No.          13882 non-null  int64  
 2   Subject      13872 non-null  object 
 3   Machinery    13882 non-null  object 
 4   Assembly     13882 non-null  object 
 5   청구품목         13882 non-null  object 
 6   Unnamed: 6   0 non-null      float64
 7   Part No.1    13881 non-null  object 
 8   Part No.2    2430 non-null   object 
 9   청구량          13818 non-null  float64
 10  견적           13698 non-null  object 
 11  견적수량         13818 non-null  float64
 12  견적화폐         13818 non-null  object 
 13  견적단가         13882 non-null  float64
 14  발주번호         13882 non-null  object 
 15  발주처          13882 non-null  object 
 16  발주           13882 non-null  object 
 17  발주수량         13818 non-null  float64
 18  발주금액         13818 non-null  float64
 19  D/T 

In [6]:
import re

def preprocess_text(text):
    # 괄호 안의 내용 제거
    text = re.sub(r'\([^)]*\)', '', text)  
    # 알파벳, 숫자, 필요한 특수문자 (/, *, -, +, ., ,, #, &, 등)만 허용
    text = re.sub(r'[^\w\s\*/\-\+.,#&]', '', text) 
    # 여러 공백을 언더스코어로 변환
    text = re.sub(r'\s+', '_', text)
    # 텍스트 중간의 연속된 언더스코어를 하나로 줄임
    text = re.sub(r'_+', '_', text) 
    # 불필요한 단어 제거
    text = re.sub(r'\b(사용금지|사)\b', '', text, flags=re.IGNORECASE)
    # 중간에 언더스코어가 불필요하게 남아있는 경우 처리
    text = re.sub(r'(?<!\w)_(?!\w)', '', text)
    # 언더스코어 앞뒤로 존재하는 특수문자 제거
    text = re.sub(r'_([^\w]+)_', '_', text)
    text = re.sub(r'_([^\w]+)$', '', text)
    text = re.sub(r'^([^\w]+)_', '', text)
    # 텍스트 끝부분의 불필요한 언더스코어 제거
    text = re.sub(r'_+$', '', text)
    # 영어 단어는 소문자로 변환
    text = ' '.join([word.lower() if re.match(r'[A-Za-z]', word) else word for word in text.split()])
    text = text.strip()
    return text

def clean_supplier_name(name):
    # 접미사 제거
    suffixes = r'\b(Corp\.?|Corporation|Company|Co\.?|Incorporated|Inc\.?|Limited|Ltd\.?|GmbH|S\.L\.|SDN\. BHD\.)\b'
    name = re.sub(suffixes, '', name, flags=re.IGNORECASE)
    # 특수 문자 제거
    name = re.sub(r'[^\w\s]', '', name)
    # 불필요한 단어 제거
    name = re.sub(r'\b(사용금지|사)\b', '', name, flags=re.IGNORECASE)
    # 공백 정리
    name = re.sub(r'\s+', ' ', name).strip()
    # 오타 수정 및 문자열 정리
    name = re.sub(r'coporation|coropration|coproration|corporration', 'corporation', name, flags=re.IGNORECASE)
    name = name.lower().strip()
    return name

In [7]:
# 각 칼럼 전처리
data['cleaned_item'] = data['청구품목'].apply(preprocess_text)
data['Part No.1'] = data['Part No.1'].astype(str)
data['cleaned_part']=data['Part No.1'].apply(preprocess_text)
data['cleaned_supplier'] = data['발주처'].apply(clean_supplier_name)

# 전처리된 칼럼 결합
data['combined_text'] =data['cleaned_item'].fillna('') + " " + data['cleaned_part'] +  " " + data['cleaned_supplier'].fillna('')


In [8]:
print(data[['combined_text']])

                                           combined_text
0      ge_power_pack_fork_e7 40028340 matsuiusa corpo...
1      ge_power_pack_fork_e7 40028340 matsuiusa corpo...
2      samson_super_strong_double_braid_rope_1_3/4,_3...
3      wire_rope_g6x19_a3_cmp_slpp_28mm_x_400m 6X19X2...
4      wire_rope_g6x19_a3_cmp_slpp_25mm_x_400m 6X19X2...
...                                                  ...
13877       pin-spring 7M-5130 haein corporation_cheonan
13878     kit-bearing 342-2409 haein corporation_cheonan
13879            seal 127-4374 haein corporation_cheonan
13880    sleeve-shaft 206-5967 haein corporation_cheonan
13881  bearing-ball,_6326zzsc3p6 154-3032 haein corpo...

[13882 rows x 1 columns]


In [9]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")

Using device: cuda


### 정수 인코딩 x + 정수 시퀀스 y

In [11]:

from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder

#  텍스트를 정수 시퀀스로 변환
tokenizer = Tokenizer(num_words=20000)
tokenizer.fit_on_texts(data['combined_text'])
sequences = tokenizer.texts_to_sequences(data['combined_text'])

# 시퀀스 패딩 (최대 길이는 데이터에 맞게 조정 가능)
max_len = 50
X = pad_sequences(sequences, maxlen=max_len)

# 정수형 레이블 (y)
machinery_labels = data['Machinery'].values
assembly_labels = data['Assembly'].values

label_encoder_machinery = LabelEncoder()
y_machinery = label_encoder_machinery.fit_transform(machinery_labels)

label_encoder_assembly = LabelEncoder()
y_assembly = label_encoder_assembly.fit_transform(assembly_labels)

# 2. Train-Test Split
X_train_val, X_test, y_train_val_machinery, y_test_machinery, y_train_val_assembly, y_test_assembly = train_test_split(
    X, y_machinery, y_assembly, test_size=0.2, random_state=42, stratify=y_machinery
)

# Validation 데이터로 다시 분할
X_train, X_val, y_train_machinery, y_val_machinery, y_train_assembly, y_val_assembly = train_test_split(
    X_train_val, y_train_val_machinery, y_train_val_assembly, test_size=0.2, random_state=42, stratify=y_train_val_machinery
)

### CatBoost 모델

In [12]:
from catboost import CatBoostClassifier
from catboost import Pool

# CatBoost 모델 설정
machinery_model_catboost = CatBoostClassifier(
    iterations=1000,
    learning_rate=0.1,
    depth=6,
    loss_function='MultiClass',
    random_state=42,
    verbose=200
)

# CatBoost 모델 학습
machinery_model_catboost.fit(X_train, y_train_machinery)

0:	learn: 3.0234347	total: 238ms	remaining: 3m 58s
200:	learn: 0.7861684	total: 21s	remaining: 1m 23s
400:	learn: 0.5913207	total: 41.7s	remaining: 1m 2s
600:	learn: 0.4825913	total: 1m 2s	remaining: 41.6s
800:	learn: 0.4154074	total: 1m 23s	remaining: 20.8s
999:	learn: 0.3708096	total: 1m 44s	remaining: 0us


<catboost.core.CatBoostClassifier at 0x23833189010>

In [13]:
# 예측값 추가
machinery_pred_train = machinery_model_catboost.predict(X_train)
machinery_pred_val = machinery_model_catboost.predict(X_val)

X_train_with_machinery_catboost = np.column_stack((X_train, machinery_pred_train))
X_val_with_machinery_catboost = np.column_stack((X_val, machinery_pred_val))


In [16]:
from collections import Counter
from imblearn.combine import SMOTETomek

# 2. Assembly 클래스별 support 확인
assembly_class_counts = Counter(y_train_assembly)
small_assembly_classes = {cls: count for cls, count in assembly_class_counts.items() if count <= 30}
print(small_assembly_classes) 

{4: 25, 180: 29, 186: 30, 166: 24, 156: 21, 20: 19, 12: 23, 47: 21, 197: 25, 56: 19, 28: 27, 165: 22, 107: 23, 37: 18, 139: 22, 44: 30, 51: 28, 100: 27, 54: 27, 41: 29, 89: 29, 48: 30, 127: 27, 184: 29, 193: 24, 114: 21, 50: 21, 187: 27, 117: 25, 183: 27, 123: 29, 119: 19, 67: 26, 152: 23, 55: 23, 73: 26, 26: 25, 131: 21, 161: 25, 14: 28, 137: 23, 3: 27, 101: 21, 71: 29, 143: 30, 113: 29, 206: 22, 150: 23, 68: 24, 103: 19, 63: 28, 31: 27, 177: 23, 208: 23, 1: 28, 146: 26, 83: 28, 81: 21, 77: 19, 118: 28, 65: 15, 132: 30, 0: 24, 35: 27, 104: 26, 202: 21, 92: 29, 24: 19, 86: 26, 99: 25, 136: 18, 97: 19, 45: 25, 173: 25, 162: 15, 207: 25, 30: 17, 87: 22, 121: 19, 39: 19, 205: 27, 38: 18, 203: 25, 145: 20, 43: 19, 42: 14, 110: 16, 201: 23, 176: 23, 109: 18, 7: 19, 19: 19, 23: 30, 163: 22, 70: 21}


In [17]:

# Assembly 모델 설정 및 학습
assembly_model_catboost = CatBoostClassifier(
    iterations=1000,
    learning_rate=0.1,
    depth=6,
    loss_function='MultiClass',
    random_state=42,
    verbose=200
)

# SMOTE + Tomek Links 적용
smote_tomek = SMOTETomek(sampling_strategy=small_assembly_classes, random_state=42)
X_resampled_with_machinery_catboost, y_resampled_assembly = smote_tomek.fit_resample(X_train_with_machinery_catboost, y_train_assembly)

# Assembly 모델 학습
assembly_model_catboost.fit(X_resampled_with_machinery_catboost, y_resampled_assembly)

# Assembly 검증 성능 평가
assembly_pred_catboost = assembly_model_catboost.predict(X_val_with_machinery_catboost)
assembly_accuracy_catboost = accuracy_score(y_val_assembly, assembly_pred_catboost)
print(f'Assembly Validation Accuracy (CatBoost): {assembly_accuracy_catboost:.4f}')
print(classification_report(y_val_assembly, assembly_pred_catboost))

# Machinery 검증 성능 평가
machinery_pred_val_catboost = machinery_model_catboost.predict(X_val)
machinery_accuracy_catboost = accuracy_score(y_val_machinery, machinery_pred_val_catboost)
print(f'Machinery Validation Accuracy (CatBoost): {machinery_accuracy_catboost:.4f}')
print(classification_report(y_val_machinery, machinery_pred_val_catboost))

0:	learn: 4.8101727	total: 495ms	remaining: 8m 14s
200:	learn: 1.0450737	total: 1m 27s	remaining: 5m 48s
400:	learn: 0.6628693	total: 2m 54s	remaining: 4m 20s
600:	learn: 0.4857694	total: 4m 21s	remaining: 2m 53s
800:	learn: 0.3887814	total: 5m 47s	remaining: 1m 26s
999:	learn: 0.3285902	total: 7m 14s	remaining: 0us
Assembly Validation Accuracy (CatBoost): 0.7154
              precision    recall  f1-score   support

           0       1.00      0.60      0.75        10
           1       0.43      1.00      0.60         3
           2       0.56      1.00      0.71        10
           3       0.50      0.20      0.29         5
           4       0.86      1.00      0.92         6
           5       0.43      0.25      0.32        12
           6       0.88      0.97      0.92        30
           7       0.25      0.08      0.12        13
           8       0.58      0.78      0.67         9
           9       0.69      0.73      0.71        15
          10       0.33      0.71      

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [21]:
from catboost import CatBoostClassifier
from sklearn.model_selection import RandomizedSearchCV
from scipy.stats import randint, uniform

# 하이퍼파라미터 설정
param_distributions = {
    'iterations': randint(1000, 1500),
    'learning_rate': uniform(0.05, 0.1),
    'depth': randint(6, 8),
    'l2_leaf_reg': randint(1, 5),
    'border_count': randint(32, 128)
}

# Machinery 모델
machinery_model_catboost = CatBoostClassifier(
    loss_function='MultiClass',
    random_state=42,
    verbose=200
)

random_search_machinery = RandomizedSearchCV(
    estimator=machinery_model_catboost,
    param_distributions=param_distributions,
    n_iter=20,  # 시도할 하이퍼파라미터 조합의 수
    scoring='accuracy',  # 필요에 따라 적절한 평가 지표로 변경
    cv=3,  # 교차 검증 fold 수
    n_jobs=-1,  # 모든 CPU 코어 사용
    random_state=42  # 랜덤 시드를 설정하여 재현성 유지
)

random_search_machinery.fit(X_train, y_train_machinery)

print("Best parameters for machinery model:", random_search_machinery.best_params_)
print("Best score for machinery model:", random_search_machinery.best_score_)

KeyboardInterrupt: 

In [None]:
# Assembly 모델
assembly_model_catboost = CatBoostClassifier(
    loss_function='MultiClass',
    random_state=42,
    verbose=200
)

grid_search_assembly = GridSearchCV(
    estimator=assembly_model_catboost,
    param_grid=param_grid,
    scoring='accuracy',  # 필요에 따라 적절한 평가 지표로 변경
    cv=3,  # 교차 검증 fold 수
    n_jobs=-1  # 모든 CPU 코어 사용
)

grid_search_assembly.fit(X_train, y_train_assembly)

print("Best parameters for assembly model:", grid_search_assembly.best_params_)
print("Best score for assembly model:", grid_search_assembly.best_score_)