In [1]:
import torch

print("CUDA available:", torch.cuda.is_available())

print("Current device:", torch.cuda.current_device())
print("Device name:", torch.cuda.get_device_name(torch.cuda.current_device()))

CUDA available: True
Current device: 0
Device name: NVIDIA GeForce GTX 1650


In [2]:
import pandas as pd
import numpy as np
import re

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, accuracy_score

In [7]:
data=pd.read_excel('filtered_dataset0.1.xlsx')

In [4]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 24664 entries, 0 to 24663
Data columns (total 32 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   청구서번호        24664 non-null  object 
 1   No.          24664 non-null  int64  
 2   Subject      24642 non-null  object 
 3   Machinery    24664 non-null  object 
 4   Assembly     24664 non-null  object 
 5   청구품목         24664 non-null  object 
 6   Unnamed: 6   0 non-null      float64
 7   Part No.1    24645 non-null  object 
 8   Part No.2    3599 non-null   object 
 9   청구량          24546 non-null  float64
 10  견적           24200 non-null  object 
 11  견적수량         24546 non-null  float64
 12  견적화폐         24546 non-null  object 
 13  견적단가         24664 non-null  float64
 14  발주번호         24664 non-null  object 
 15  발주처          24664 non-null  object 
 16  발주           24664 non-null  object 
 17  발주수량         24546 non-null  float64
 18  발주금액         24546 non-null  float64
 19  D/T 

In [5]:
import re

def preprocess_text(text):
    # 괄호 안의 내용 제거
    text = re.sub(r'\([^)]*\)', '', text)
    # 특수 문자 제거 (알파벳, 숫자, 일부 허용된 특수문자 제외)
    text = re.sub(r'[^\w\s\*\-\+/.,]', '', text)
    # 여러 공백을 언더스코어로 변환
    text = re.sub(r'\s+', '_', text)
    # 텍스트 중간의 연속된 언더스코어를 하나로 줄임
    text = re.sub(r'_+', '_', text)
    # 중간에 언더스코어가 불필요하게 남아있는 경우 처리
    text = re.sub(r'(?<!\w)_(?!\w)', '', text)
    # 언더스코어 앞뒤로 존재하는 특수문자 제거
    text = re.sub(r'_([^\w]+)_', '_', text)
    text = re.sub(r'_([^\w]+)$', '', text)
    text = re.sub(r'^([^\w]+)_', '', text)
    # 텍스트 끝부분의 불필요한 언더스코어 제거
    text = re.sub(r'_+$', '', text)
    # 영어 단어는 소문자로 변환
    text = ' '.join([word.lower() if re.match(r'[A-Za-z]', word) else word for word in text.split()])
    text = text.strip()
    return text

def clean_supplier_name(name):
    # 접미사 제거
    suffixes = r'\b(Corp\.?|Corporation|Company|Co\.?|Incorporated|Inc\.?|Limited|Ltd\.?|GmbH|S\.L\.|SDN\. BHD\.)\b'
    name = re.sub(suffixes, '', name, flags=re.IGNORECASE)
    # 특수 문자 제거
    name = re.sub(r'[^\w\s]', '', name)
    # 불필요한 단어 제거
    name = re.sub(r'\b(사용금지|사)\b', '', name, flags=re.IGNORECASE)
    # 공백 정리
    name = re.sub(r'\s+', ' ', name).strip()
    # 오타 수정 및 문자열 정리
    name = re.sub(r'coporation|coropration|coproration|corporration', 'corporation', name, flags=re.IGNORECASE)
    name = name.lower().strip()
    return name

In [6]:
# 각 칼럼 전처리
data['cleaned_item'] = data['청구품목'].apply(preprocess_text)
data['cleaned_supplier'] = data['발주처'].apply(clean_supplier_name)

# 전처리된 칼럼 결합
data['combined_text'] =data['cleaned_item'].fillna('') + " " + data['cleaned_supplier'].fillna('')


In [7]:
print(data[['combined_text']])

                                     combined_text
0      ge_power_pack_fork_e7 matsuiusa corporation
1      ge_power_pack_fork_e7 matsuiusa corporation
2                  nylon_54_4_1/4,_100md_50fms kti
3                  nylon_48_4_1/4,_100md_50fms kti
4                  nylon_42_4_1/4,_100md_50fms kti
...                                            ...
15157             ring-o haein corporation_cheonan
15158     ring-retaining haein corporation_cheonan
15159     sleeve-bearing haein corporation_cheonan
15160       bearing-ball haein corporation_cheonan
15161    bearing-ball_de haein corporation_cheonan

[15162 rows x 1 columns]


In [10]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")

Using device: cuda


In [11]:
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

# 데이터 준비
texts = data['combined_text'].values

# 토크나이저 설정
tokenizer = Tokenizer(num_words=10000)  # 고유한 단어 수 제한
tokenizer.fit_on_texts(texts)

# 텍스트를 정수 시퀀스로 변환
sequences = tokenizer.texts_to_sequences(texts)
# 패딩 추가 (시퀀스 길이를 동일하게 맞춤)
max_sequence_length = max(len(seq) for seq in sequences)
X = pad_sequences(sequences, maxlen=max_sequence_length, padding='post')

In [24]:


# 1. 데이터 준비 및 인코딩
machinery = data['Machinery'].values
assembly = data['Assembly'].values

machinery_encoder = LabelEncoder()
assembly_encoder = LabelEncoder()

machinery_labels = machinery_encoder.fit_transform(machinery)
assembly_labels = assembly_encoder.fit_transform(assembly)

# 레이블 인코딩
machinery_labels = machinery_encoder.fit_transform(data['Machinery'].values)
assembly_labels = assembly_encoder.fit_transform(data['Assembly'].values)

# 3. Train-Test Split (각 레이블에 대해 동일한 분할 사용)
X_train_val, X_test, y_train_val_machinery, y_test_machinery, y_train_val_assembly, y_test_assembly = train_test_split(
    X, machinery_labels, assembly_labels, test_size=0.2, random_state=42
)

X_train, X_val, y_train_machinery, y_val_machinery, y_train_assembly, y_val_assembly = train_test_split(
    X_train_val, y_train_val_machinery, y_train_val_assembly, test_size=0.2, random_state=42
)

In [25]:
# XGBoost 모델 학습 및 평가
from catboost import CatBoostClassifier
from sklearn.model_selection import RandomizedSearchCV
from sklearn.metrics import accuracy_score

# 하이퍼파라미터 그리드 설정
param_distributions = {
    'depth': [4, 6, 8, 10],
    'iterations': [500, 700, 1000],
    'learning_rate': [0.01, 0.05, 0.1],
    'l2_leaf_reg': [3, 5, 7, 9],
    'bagging_temperature': [0.5, 1, 2],
    'rsm': [0.8, 1.0]
}

# CatBoost 분류기 준비
machinery_model = CatBoostClassifier(
    loss_function='MultiClass',
    random_seed=42,
    od_type='Iter',
    verbose=10
)


random_search_machinery = RandomizedSearchCV(
    estimator=machinery_model,
    param_distributions=param_distributions,
    n_iter=10,  # 시도할 조합의 수
    scoring='accuracy',
    cv=3,  # 교차 검증 폴드 수
    verbose=2,
    random_state=42,
    n_jobs=-1  # 모든 CPU 코어 사용
)

random_search_machinery.fit(X_train, y_train_machinery)

print(f"Best Hyperparameters for Machinery: {random_search_machinery.best_params_}")
print(f"Best Score for Machinery: {random_search_machinery.best_score_}")



Fitting 3 folds for each of 10 candidates, totalling 30 fits




0:	learn: 3.3544795	total: 308ms	remaining: 5m 7s
10:	learn: 2.2148307	total: 3.42s	remaining: 5m 7s
20:	learn: 1.8971388	total: 6.3s	remaining: 4m 53s
30:	learn: 1.7520761	total: 9.26s	remaining: 4m 49s
40:	learn: 1.6553344	total: 12s	remaining: 4m 40s
50:	learn: 1.5980214	total: 15s	remaining: 4m 40s
60:	learn: 1.5384265	total: 18.1s	remaining: 4m 38s
70:	learn: 1.4776152	total: 21.1s	remaining: 4m 35s
80:	learn: 1.4282865	total: 24s	remaining: 4m 32s
90:	learn: 1.3763595	total: 27.2s	remaining: 4m 31s
100:	learn: 1.3168460	total: 30.3s	remaining: 4m 29s
110:	learn: 1.2569819	total: 33.5s	remaining: 4m 28s
120:	learn: 1.2036140	total: 36.5s	remaining: 4m 25s
130:	learn: 1.1611991	total: 39.5s	remaining: 4m 21s
140:	learn: 1.1250832	total: 42.5s	remaining: 4m 18s
150:	learn: 1.0976966	total: 45.5s	remaining: 4m 15s
160:	learn: 1.0657571	total: 48.5s	remaining: 4m 12s
170:	learn: 1.0404973	total: 51.6s	remaining: 4m 9s
180:	learn: 1.0174835	total: 54.6s	remaining: 4m 6s
190:	learn: 0.9

In [None]:
# 2. Assembly 모델 하이퍼파라미터 튜닝
assembly_model = CatBoostClassifier(
    loss_function='MultiClass',
    random_seed=42,
    od_type='Iter',
    verbose=10
)

random_search_assembly = RandomizedSearchCV(
    estimator=assembly_model,
    param_distributions=param_distributions,
    n_iter=10,  # 시도할 조합의 수
    scoring='accuracy',
    cv=3,  # 교차 검증 폴드 수
    verbose=2,
    random_state=42,
    n_jobs=-1  # 모든 CPU 코어 사용
)

random_search_assembly.fit(X_train, y_train_assembly)

print(f"Best Hyperparameters for Assembly: {random_search_assembly.best_params_}")
print(f"Best Score for Assembly: {random_search_assembly.best_score_}")

In [None]:

# Machinery 레이블에 대한 모델 훈련
machinery_model.fit(X_train, y_train_machinery, eval_set=(X_val, y_val_machinery))

# Assembly 레이블에 대한 모델 훈련
assembly_model.fit(X_train, y_train_assembly, eval_set=(X_val, y_val_assembly))

In [None]:
# 7. 모델 평가
machinery_preds = machinery_model.predict(X_test)
assembly_preds = assembly_model.predict(X_test)

# 8. 정확도 출력
machinery_accuracy = accuracy_score(y_test_machinery, machinery_preds)
assembly_accuracy = accuracy_score(y_test_assembly, assembly_preds)

print(f"Machinery Accuracy: {machinery_accuracy:.4f}")
print(f"Assembly Accuracy: {assembly_accuracy:.4f}")