In [1]:
import torch

print("CUDA available:", torch.cuda.is_available())

print("Current device:", torch.cuda.current_device())
print("Device name:", torch.cuda.get_device_name(torch.cuda.current_device()))

CUDA available: True
Current device: 0
Device name: NVIDIA GeForce GTX 1650


In [2]:
import pandas as pd
import numpy as np
import re

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, accuracy_score

## 모델이 클래스 특성을 학습하기에 충분한 표본 갯수로 데이터 제거

> Machinery에서 데이터가 30개 이하인 클래스 수: 100
> 
> Assembly에서 데이터가 30개 이하인 클래스 수: 1583
>
> 제거 후, 남은 데이터: 13882, MACHINERY : 62 ASSEMBLY:209

In [3]:
data=pd.read_excel('filtered_dataset_30.xlsx')

In [4]:
print(len(data['Machinery'].unique()),len(data['Assembly'].unique()))

62 209


In [5]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 13882 entries, 0 to 13881
Data columns (total 32 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   청구서번호        13882 non-null  object 
 1   No.          13882 non-null  int64  
 2   Subject      13872 non-null  object 
 3   Machinery    13882 non-null  object 
 4   Assembly     13882 non-null  object 
 5   청구품목         13882 non-null  object 
 6   Unnamed: 6   0 non-null      float64
 7   Part No.1    13881 non-null  object 
 8   Part No.2    2430 non-null   object 
 9   청구량          13818 non-null  float64
 10  견적           13698 non-null  object 
 11  견적수량         13818 non-null  float64
 12  견적화폐         13818 non-null  object 
 13  견적단가         13882 non-null  float64
 14  발주번호         13882 non-null  object 
 15  발주처          13882 non-null  object 
 16  발주           13882 non-null  object 
 17  발주수량         13818 non-null  float64
 18  발주금액         13818 non-null  float64
 19  D/T 

In [6]:
import re

def preprocess_text(text):
    # 괄호 안의 내용 제거
    text = re.sub(r'\([^)]*\)', '', text)  
    # 알파벳, 숫자, 필요한 특수문자 (/, *, -, +, ., ,, #, &, 등)만 허용
    text = re.sub(r'[^\w\s\*/\-\+.,#&]', '', text) 
    # 여러 공백을 언더스코어로 변환
    text = re.sub(r'\s+', '_', text)
    # 텍스트 중간의 연속된 언더스코어를 하나로 줄임
    text = re.sub(r'_+', '_', text) 
    # 불필요한 단어 제거
    text = re.sub(r'\b(사용금지|사)\b', '', text, flags=re.IGNORECASE)
    # 중간에 언더스코어가 불필요하게 남아있는 경우 처리
    text = re.sub(r'(?<!\w)_(?!\w)', '', text)
    # 언더스코어 앞뒤로 존재하는 특수문자 제거
    text = re.sub(r'_([^\w]+)_', '_', text)
    text = re.sub(r'_([^\w]+)$', '', text)
    text = re.sub(r'^([^\w]+)_', '', text)
    # 텍스트 끝부분의 불필요한 언더스코어 제거
    text = re.sub(r'_+$', '', text)
    # 영어 단어는 소문자로 변환
    text = ' '.join([word.lower() if re.match(r'[A-Za-z]', word) else word for word in text.split()])
    text = text.strip()
    return text

def clean_supplier_name(name):
    # 접미사 제거
    suffixes = r'\b(Corp\.?|Corporation|Company|Co\.?|Incorporated|Inc\.?|Limited|Ltd\.?|GmbH|S\.L\.|SDN\. BHD\.)\b'
    name = re.sub(suffixes, '', name, flags=re.IGNORECASE)
    # 특수 문자 제거
    name = re.sub(r'[^\w\s]', '', name)
    # 불필요한 단어 제거
    name = re.sub(r'\b(사용금지|사)\b', '', name, flags=re.IGNORECASE)
    # 공백 정리
    name = re.sub(r'\s+', ' ', name).strip()
    # 오타 수정 및 문자열 정리
    name = re.sub(r'coporation|coropration|coproration|corporration', 'corporation', name, flags=re.IGNORECASE)
    name = name.lower().strip()
    return name

In [7]:
# 각 칼럼 전처리
data['cleaned_item'] = data['청구품목'].apply(preprocess_text)
data['Part No.1'] = data['Part No.1'].astype(str)
data['cleaned_part']=data['Part No.1'].apply(preprocess_text)
data['cleaned_supplier'] = data['발주처'].apply(clean_supplier_name)

# 전처리된 칼럼 결합
data['combined_text'] =data['cleaned_item'].fillna('') + " " + data['cleaned_part'] +  " " + data['cleaned_supplier'].fillna('')


In [8]:
print(data[['combined_text']])

                                           combined_text
0      ge_power_pack_fork_e7 40028340 matsuiusa corpo...
1      ge_power_pack_fork_e7 40028340 matsuiusa corpo...
2      samson_super_strong_double_braid_rope_1_3/4,_3...
3      wire_rope_g6x19_a3_cmp_slpp_28mm_x_400m 6X19X2...
4      wire_rope_g6x19_a3_cmp_slpp_25mm_x_400m 6X19X2...
...                                                  ...
13877       pin-spring 7M-5130 haein corporation_cheonan
13878     kit-bearing 342-2409 haein corporation_cheonan
13879            seal 127-4374 haein corporation_cheonan
13880    sleeve-shaft 206-5967 haein corporation_cheonan
13881  bearing-ball,_6326zzsc3p6 154-3032 haein corpo...

[13882 rows x 1 columns]


In [9]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")

Using device: cuda


### 정수 인코딩 x + 정수 시퀀스 y

In [9]:

from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder

#  텍스트를 정수 시퀀스로 변환
tokenizer = Tokenizer(num_words=20000)
tokenizer.fit_on_texts(data['combined_text'])
sequences = tokenizer.texts_to_sequences(data['combined_text'])

# 시퀀스 패딩 
max_len = 50
X = pad_sequences(sequences, maxlen=max_len)

# 정수형 레이블 (y)
machinery_labels = data['Machinery'].values
assembly_labels = data['Assembly'].values

label_encoder_machinery = LabelEncoder()
y_machinery = label_encoder_machinery.fit_transform(machinery_labels)

label_encoder_assembly = LabelEncoder()
y_assembly = label_encoder_assembly.fit_transform(assembly_labels)

# 2. Train-Test Split
X_train_val, X_test, y_train_val_machinery, y_test_machinery, y_train_val_assembly, y_test_assembly = train_test_split(
    X, y_machinery, y_assembly, test_size=0.2, random_state=42, stratify=y_machinery
)

X_train, X_val, y_train_machinery, y_val_machinery, y_train_assembly, y_val_assembly = train_test_split(
    X_train_val, y_train_val_machinery, y_train_val_assembly, test_size=0.2, random_state=42, stratify=y_train_val_machinery
)

### XGB 체인모델

> Machinery 예측 받아서 assembly 예측함

In [24]:
#!conda install conda-forge::imbalanced-learn -y

In [12]:
from xgboost import XGBClassifier

# 1. Machinery 예측을 위한 XGBoost 모델
machinery_model = XGBClassifier(
    objective='multi:softmax',  # 다중 클래스 분류
    num_class=62,  # machinery 클래스 수
    learning_rate=0.1,
    max_depth=6,
    n_estimators=100,
    subsample=0.8,
    colsample_bytree=0.8,
    reg_lambda=1,
    random_state=42,
    verbosity=1
)

# 2. Assembly 예측을 위한 XGBoost 모델
assembly_model = XGBClassifier(
    objective='multi:softmax',  # 다중 클래스 분류
    num_class=209,  # assembly 클래스 수 (예시로 200)
    learning_rate=0.1,
    max_depth=6,
    n_estimators=100,
    subsample=0.8,
    colsample_bytree=0.8,
    reg_lambda=1,
    random_state=42,
    verbosity=1
)


In [60]:

# 1. Machinery 클래스별 support 확인
machinery_class_counts = Counter(y_train_machinery)
small_machinery_classes = {cls: count for cls, count in machinery_class_counts.items() if count <= 30}
print(small_machinery_classes)  

{42: 29, 30: 24, 45: 21, 50: 22, 13: 27, 24: 26, 26: 18, 39: 22, 5: 19, 23: 22, 54: 27, 55: 8, 48: 7, 20: 30, 32: 21, 44: 17, 22: 21, 17: 26, 9: 26, 8: 30, 3: 11, 29: 18, 41: 14, 40: 15, 12: 6, 56: 6, 11: 23, 43: 6}


### imbalanced-learn library 사용 (assembly에만 적용)

In [10]:
from collections import Counter
from imblearn.combine import SMOTETomek

# 2. Assembly 클래스별 support 확인
assembly_class_counts = Counter(y_train_assembly)
small_assembly_classes = {cls: count for cls, count in assembly_class_counts.items() if count <= 30}
print(small_assembly_classes) 

{4: 25, 180: 29, 186: 30, 166: 24, 156: 21, 20: 19, 12: 23, 47: 21, 197: 25, 56: 19, 28: 27, 165: 22, 107: 23, 37: 18, 139: 22, 44: 30, 51: 28, 100: 27, 54: 27, 41: 29, 89: 29, 48: 30, 127: 27, 184: 29, 193: 24, 114: 21, 50: 21, 187: 27, 117: 25, 183: 27, 123: 29, 119: 19, 67: 26, 152: 23, 55: 23, 73: 26, 26: 25, 131: 21, 161: 25, 14: 28, 137: 23, 3: 27, 101: 21, 71: 29, 143: 30, 113: 29, 206: 22, 150: 23, 68: 24, 103: 19, 63: 28, 31: 27, 177: 23, 208: 23, 1: 28, 146: 26, 83: 28, 81: 21, 77: 19, 118: 28, 65: 15, 132: 30, 0: 24, 35: 27, 104: 26, 202: 21, 92: 29, 24: 19, 86: 26, 99: 25, 136: 18, 97: 19, 45: 25, 173: 25, 162: 15, 207: 25, 30: 17, 87: 22, 121: 19, 39: 19, 205: 27, 38: 18, 203: 25, 145: 20, 43: 19, 42: 14, 110: 16, 201: 23, 176: 23, 109: 18, 7: 19, 19: 19, 23: 30, 163: 22, 70: 21}


In [13]:
# 1. Machinery 학습 
machinery_model.fit(X_train, y_train_machinery)

# Machinery 예측값을 Assembly 추가 피처로 사용
machinery_pred_train = machinery_model.predict(X_train)
machinery_pred_val = machinery_model.predict(X_val)

X_train_with_machinery = np.column_stack((X_train, machinery_pred_train))  # 훈련 데이터
X_val_with_machinery = np.column_stack((X_val, machinery_pred_val))  # 검증 데이터

# 3. SMOTE + Tomek Links 적용 (support 30개 이하인 클래스만 증강)
smote_tomek = SMOTETomek(sampling_strategy=small_assembly_classes, random_state=42)
X_resampled_with_machinery, y_resampled_assembly = smote_tomek.fit_resample(X_train_with_machinery, y_train_assembly)

# 4. Assembly 모델 학습 (SMOTE + Tomek Links 적용된 데이터로)
assembly_model.fit(X_resampled_with_machinery, y_resampled_assembly)

# 5. Assembly 검증 성능 평가
assembly_pred = assembly_model.predict(X_val_with_machinery)
assembly_accuracy = accuracy_score(y_val_assembly, assembly_pred)
print(f'Assembly Validation Accuracy: {assembly_accuracy:.4f}')
print(classification_report(y_val_assembly, assembly_pred))

# 6. Machinery 검증 성능 평가
machinery_pred_val_final = machinery_model.predict(X_val)
machinery_accuracy = accuracy_score(y_val_machinery, machinery_pred_val_final)
print(f'Machinery Validation Accuracy: {machinery_accuracy:.4f}')
print(classification_report(y_val_machinery, machinery_pred_val_final))

Assembly Validation Accuracy: 0.7524
              precision    recall  f1-score   support

           0       0.91      1.00      0.95        10
           1       0.60      1.00      0.75         3
           2       0.56      1.00      0.71        10
           3       0.33      0.20      0.25         5
           4       0.86      1.00      0.92         6
           5       0.43      0.25      0.32        12
           6       0.91      1.00      0.95        30
           7       0.80      0.31      0.44        13
           8       0.70      0.78      0.74         9
           9       0.75      0.80      0.77        15
          10       0.33      0.71      0.45        14
          11       0.95      0.80      0.87        25
          12       1.00      0.86      0.92         7
          13       0.92      0.92      0.92        13
          14       1.00      0.90      0.95        10
          15       0.62      0.42      0.50        12
          16       0.50      0.88      0.64 

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


### 결과해석 및 개선방향
> assembly 클래스는 support 30개 이하 클래스를 증강하는 것으로 소수클래스에 대한 성능 향상이 이루어졌으나,

> machinery 클래스의 macro avg는 smote를 해도 성능 향상이 이루어지지 않음

> machinery는 소수 클래스에 대한 더 정제된 처리가 필요하나, 전체 정확도가 0.79이므로 그리드서치 진행해보기로 함

## 그리드서치 

### machinery optimized model 생성

In [65]:
from sklearn.model_selection import GridSearchCV

# 하이퍼파라미터 그리드 설정
param_grid_machinery = {
    'learning_rate': [0.05, 0.1],  # 추가된 0.05
    'max_depth': [4, 6, 8],
    'n_estimators': [100, 150],  # 추가된 150
    'subsample': [0.8, 1.0],  # 추가된 0.7
    'colsample_bytree': [0.8, 1.0],  # 추가된 0.7
}

grid_search_machinery = GridSearchCV(
    estimator=machinery_model,
    param_grid=param_grid_machinery,
    scoring='accuracy',
    cv=3,  
    verbose=1,
    n_jobs=-1  
)

grid_search_machinery.fit(X_train, y_train_machinery)

print(f"Best parameters for Machinery: {grid_search_machinery.best_params_}")

Fitting 3 folds for each of 48 candidates, totalling 144 fits
Best parameters for Machinery: {'colsample_bytree': 0.8, 'learning_rate': 0.1, 'max_depth': 8, 'n_estimators': 150, 'subsample': 1.0}


In [15]:
# gridsearch 한 결과로 모델 재학습함

machinery_optimized_model = XGBClassifier(
    objective='multi:softmax',
    num_class=62,
    learning_rate=0.1,
    max_depth=8,
    n_estimators=150,
    subsample=1.0,
    colsample_bytree=0.8,
    reg_lambda=1,
    random_state=42,
    verbosity=1
)

machinery_optimized_model.fit(X_train, y_train_machinery)

from sklearn.metrics import accuracy_score

y_pred_machinery = machinery_optimized_model.predict(X_test)
accuracy = accuracy_score(y_test_machinery, y_pred_machinery)
print(f"Accuracy of the optimized model: {accuracy:.4f}")

Accuracy of the optimized model: 0.8055


### Assembly optimized model 생성

In [68]:
param_grid_assembly = {
    'learning_rate': [0.05, 0.1],  
    'max_depth': [4, 6, 8],
    'n_estimators': [150, 180],  
    'subsample': [0.8, 1.0],  
    'colsample_bytree': [0.7, 0.8], 
}

grid_search_assembly = GridSearchCV(
    estimator=assembly_model,
    param_grid=param_grid_assembly,
    scoring='accuracy',
    cv=3,  
    verbose=1,
    n_jobs=-1 
)

grid_search_assembly.fit(X_train_with_machinery, y_train_assembly)

print(f"Best parameters for Assembly: {grid_search_assembly.best_params_}")

Fitting 3 folds for each of 48 candidates, totalling 144 fits
Best parameters for Assembly: {'colsample_bytree': 0.8, 'learning_rate': 0.05, 'max_depth': 8, 'n_estimators': 180, 'subsample': 0.8}


In [17]:
# 최적 파라미터로 optimized 해서 test로 확인함

assembly_optimized_model = XGBClassifier(
    objective='multi:softmax',
    num_class=209,
    learning_rate=0.05,
    max_depth=8,
    n_estimators=180,
    subsample=0.8,
    colsample_bytree=0.8,
    reg_lambda=1,
    random_state=42,
    verbosity=1
)

# 모델 재학습
assembly_optimized_model.fit(X_train, y_train_assembly)

# 테스트 데이터로 예측
y_pred_assembly = assembly_optimized_model.predict(X_test)

# 정확도 계산
accuracy = accuracy_score(y_test_assembly, y_pred_assembly)
print(f"Accuracy of the Assembly optimized model: {accuracy:.4f}")

Accuracy of the Assembly optimized model: 0.7832


In [74]:
import pickle
from xgboost import XGBClassifier

# 토크나이저 저장
with open('tokenizer.pkl', 'wb') as f:
    pickle.dump(tokenizer, f)


In [None]:
# 모델저장 
with open('machinery_optimized_model.pkl', 'wb') as f:
    pickle.dump(machinery_optimized_model, f)
    
with open('assembly_optimized_model.pkl', 'wb') as f:
    pickle.dump(assembly_optimized_model, f)


In [11]:
import pickle

# LabelEncoder 저장
with open('label_encoder_machinery.pkl', 'wb') as f:
    pickle.dump(label_encoder_machinery, f)

with open('label_encoder_assembly.pkl', 'wb') as f:
    pickle.dump(label_encoder_assembly, f)