In [28]:
import torch

print("Number of GPUs available:", torch.cuda.device_count())
for i in range(torch.cuda.device_count()):
    print(f"Device {i}: {torch.cuda.get_device_name(i)}")

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")


Number of GPUs available: 1
Device 0: NVIDIA GeForce GTX 1650


In [1]:
import pandas as pd
import numpy as np
import re

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, accuracy_score

## 모델이 클래스 특성을 학습하기에 충분한 표본 갯수로 데이터 제거

> Machinery에서 데이터가 30개 이하인 클래스 수: 100
> 
> Assembly에서 데이터가 30개 이하인 클래스 수: 1583
>
> 제거 후, 남은 데이터: 13882, MACHINERY : 62 ASSEMBLY:209

In [3]:
data=pd.read_excel('filtered_dataset_30.xlsx')

In [4]:
print(len(data['Machinery'].unique()),len(data['Assembly'].unique()))

62 209


In [5]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 13882 entries, 0 to 13881
Data columns (total 32 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   청구서번호        13882 non-null  object 
 1   No.          13882 non-null  int64  
 2   Subject      13872 non-null  object 
 3   Machinery    13882 non-null  object 
 4   Assembly     13882 non-null  object 
 5   청구품목         13882 non-null  object 
 6   Unnamed: 6   0 non-null      float64
 7   Part No.1    13881 non-null  object 
 8   Part No.2    2430 non-null   object 
 9   청구량          13818 non-null  float64
 10  견적           13698 non-null  object 
 11  견적수량         13818 non-null  float64
 12  견적화폐         13818 non-null  object 
 13  견적단가         13882 non-null  float64
 14  발주번호         13882 non-null  object 
 15  발주처          13882 non-null  object 
 16  발주           13882 non-null  object 
 17  발주수량         13818 non-null  float64
 18  발주금액         13818 non-null  float64
 19  D/T 

In [6]:
print(data['청구품목'].unique())

['GE POWER PACK FORK - E7(B)'
 'SAMSON SUPER STRONG DOUBLE BRAID ROPE 1 3/4", 300FT'
 'WIRE ROPE G)6X(S)19 A3 CMP SLPP 28MM X 400M' ... 'BRACKET '
 'WASHER, 10 ' 'COVER,MANIFOLD.EXH ']


### 전처리
1. 텍스트 클리닝
2. 결합 후 TF-IDF

> part.no.1 은 콤마 위치에 따른 세부적인 차이가 많은 텍스트이므로 특수기호 및 문자 유지 필요하다고 판단되어 별도 전처리 X

In [7]:
import re

def preprocess_text(text):
    text = text.lower()
    text = re.sub(r'\([^)]*\)', '', text)   
    text = re.sub(r'[^\w\s\*/\-\+.,#&]', '', text) 
    text = re.sub(r'\s+', ' ', text)    
    text = re.sub(r'\b(사용금지|사)\b', '', text, flags=re.IGNORECASE)    
    text = text.strip()    
    return text


In [8]:
# 청구품목 클리닝
data['cleaned_item'] = data['청구품목'].apply(preprocess_text)

> 청구품목 데이터에서는 각 단어 의미적 연관성보다 주요단어가 있는 것이므로, 가중치 부여하는 것으로 접근함

In [9]:
from sklearn.feature_extraction.text import TfidfVectorizer
# 확인용
claim_items = data['청구품목'].tolist() 

# TF-IDF 벡터화
tfidf = TfidfVectorizer(max_features=30)
tfidf_matrix = tfidf.fit_transform(claim_items)

# 중요한 단어 추출
important_words = tfidf.get_feature_names_out()

print("청구품목 내 주요단어:", important_words)

청구품목 내 주요단어: ['as' 'bearing' 'bolt' 'charges' 'core' 'cover' 'cylinder' 'for' 'fuel'
 'gasket' 'gear' 'gp' 'head' 'hex' 'in' 'kit' 'nut' 'oil' 'plate' 'pump'
 'ring' 'screw' 'seal' 'sensor' 'set' 'shaft' 'spring' 'valve' 'washer'
 'water']


In [10]:
from sklearn.feature_extraction.text import TfidfVectorizer

# 2. TF-IDF 벡터화
tfidf = TfidfVectorizer(max_features=30) 
tfidf_matrix = tfidf.fit_transform(data['cleaned_item'])

### 발주처 클리닝

> 부가단어 (CORPORATION, Corp, CO., Ltd, GmbH, Co., Inc, 주식회사, 상사, 공사, Co.,Ltd, Ltd, Pte Ltd, LLC) 제거

> 핵심 정보(회사명 직접 관련) emphasizing 함

In [11]:
def clean_supplier_name(name):
    name = name.lower()
    name = re.sub(r'coporation|coropration|coproration|corporration', 'corporation', name)
    name = re.sub(r'\(사용금지\)', '', name)
    name = re.sub(r'u\.s\.a', '_usa', name)
    name = re.sub(r'\.', '', name)
    suffixes = r'(corporation|corp|company|co|incorporated|inc|limited|ltd|상사|공사|엔지니어링|주식회사|주|gmbh|pte ltd|llc)'
    name = re.sub(suffixes, '', name, flags=re.IGNORECASE)
    name = re.sub(r'[^\w\s-]', '', name)
    name = re.sub(r'\s+', ' ', name).strip()
    return name

In [12]:
suppliers = [
    'MATSUI(U.S.A) COROPRATION', 'KTI', '대광기업(주)', 'K.TH MARCO',
    'HAEIN Coporation_Cheonan', 'KOREA UCD CO.,LTD.', 'EAST WIND Gmbh', '인스알파',
    'ICON INTERNATIONAL, INC', '한국쉘석유㈜', 'EURO KYTEX ENGINEERING BV', '대동베아링상사',
    'MARINE HYDROTEC CO.,LTD.', '금안상사', 'TEST COMPANY',
    'PORT RELIEF ENGINEERING CO.,LTD.',
    'Caterpillar Marine Asia Pacific Pte Ltd', '(주)혜인',
    'SANWA COMMERCIAL CO.,LTD.', 'yusinHR Co., Ltd.', '(주)선진종합', 'FURUNO',
    'NISSIN REFRIGERATION  ENGINEERIN', '(주)우림공사',
    'HAEIN Coporation_Cheonan(사용금지)', 'KEMEL', 'REXNORD LLC-FALK MARINE GROUP',
    '유신에이치알(사용금지)', 'GEA KOREA LTD', '주안에너지㈜', 'SUNJIN ETECH Co.,Ltd.',
    '디에스알제강주식회사', '(주)한국에프에이디', 'ALBERT GMBH', 'Wartsila Korea Ltd.',
    '(주)선진엔텍(사용금지)', 'PIRIOU NAVAL', '(주)프러스엔지니어링', 'Taeyoung Enterprise',
    'SHINA', 'INS ALFA', 'KEMEL(KOMARINE)', '누리엔지니어링', 'RNK TECH CO.,LTD',
    'OS SYSTEM CO.,LTD', '씨코리아엔지니어링(주)', '(주)두원알앤에이', '합동듸젤사', '하이에어코리아(주)',
    'DESMI PUMPING TECHNOLOGY(SUZHOU) CO.,LTD', '한국마이콤',
    'HUMAN & ENGINEERING CO.,LTD'
]

cleaned_suppliers = [clean_supplier_name(supplier) for supplier in suppliers]
print(cleaned_suppliers)


['matsui_usa', 'kti', '대광기업', 'kth mar', 'haein _cheonan', 'korea ucd', 'east wind', '인스알파', 'in international', '한국쉘석유', 'euro kytex engineering bv', '대동베아링', 'marine hydrotec', '금안', 'test', 'port relief engineering', 'caterpillar marine asia pacific', '혜인', 'sanwa mmercial', 'yusinhr', '선진종합', 'furuno', 'nissin refrigeration engineerin', '우림', 'haein _cheonan', 'kemel', 'rexnord -falk marine group', '유신에이치알', 'gea korea', '안에너지', 'sunjin etech', '디에스알제강', '한국에프에이디', 'albert', 'wartsila korea', '선진엔텍', 'piriou naval', '프러스', 'taeyoung enterprise', 'shina', 'ins alfa', 'kemelkomarine', '누리', 'rnk tech', 'os system', '씨코리아', '두원알앤에이', '합동듸젤사', '하이에어코리아', 'desmi pumping technologysuzhou', '한국마이콤', 'human engineering']


In [13]:
#  HAEIN Corporation => HAEIN
def extract_important_part(name):
    if re.search(r'[가-힣]', name):
        name = re.sub(r'(기업|상사|종합|공사)', '', name)
        important_part = name.split()[0]  # 첫 단어 추출
    else:
        # 영문 이름의 경우 첫 번째 단어만 추출
        important_part = name.split()[0]
    
    return important_part

# 한번더 반복 HAEIN HAEIN Corporation
def emphasize_supplier_name(name):
    important_part = extract_important_part(name)
    emphasized_name = f"{important_part} {important_part} {name}"  # 중요한 부분을 반복
    return emphasized_name

In [14]:
suppliers = ['MATSUI(U.S.A) COROPRATION', 'taeyoung enterprise','HAEIN Coporation_Cheonan(사용금지)']
cleaned_suppliers = [clean_supplier_name(supplier) for supplier in suppliers]  # 전처리
emphasized_suppliers = [emphasize_supplier_name(supplier) for supplier in cleaned_suppliers]  # 강조

print(emphasized_suppliers)

['matsui_usa matsui_usa matsui_usa', 'taeyoung taeyoung taeyoung enterprise', 'haein haein haein _cheonan']


### tf-idf / 정수 시퀀스 임베딩 

In [54]:
# 청구품목 전처리 (TF-IDF 벡터화 적용)
data['cleaned_item'] = data['청구품목'].apply(preprocess_text)

# 파트 넘버 전처리 (별도 전처리 없음)
data['Part No.1'] = data['Part No.1'].astype(str)

# 발주처 전처리
data['cleaned_supplier'] = data['발주처'].apply(clean_supplier_name)
# data['emphasized_supplier'] = data['cleaned_supplier'].apply(emphasize_supplier_name)

# 4. 청구품목 + Part No.1 + 발주처 결합 (증강 없이)
data['combined_text'] = data['cleaned_item'].fillna('') + " " + data['Part No.1'].fillna('') + " " + data['cleaned_supplier'].fillna('')



In [55]:
from sklearn.feature_extraction.text import TfidfVectorizer

# TF-IDF 벡터화
tfidf = TfidfVectorizer(max_features=500)
X_tfidf = tfidf.fit_transform(data['combined_text']).toarray()

In [56]:
from tensorflow.keras.preprocessing.text import Tokenizer

# 2. 정수 시퀀스 임베딩
tokenizer = Tokenizer(num_words=20000)
tokenizer.fit_on_texts(data['combined_text'])
sequences = tokenizer.texts_to_sequences(data['combined_text'])
max_length = max(len(seq) for seq in sequences)
X_sequences = np.array([np.pad(seq, (0, max_length - len(seq)), mode='constant') for seq in sequences])


In [57]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import MinMaxScaler
from sklearn.feature_extraction.text import TfidfVectorizer
import torch

# 2. 레이블 준비 
machinery_labels = data['Machinery'].values
assembly_labels = data['Assembly'].values

label_encoder_machinery = LabelEncoder()
y_machinery = label_encoder_machinery.fit_transform(machinery_labels)

label_encoder_assembly = LabelEncoder()
y_assembly = label_encoder_assembly.fit_transform(assembly_labels)

X_train_tfidf, X_test_tfidf, y_train_machinery, y_test_machinery = train_test_split(
    X_tfidf, y_machinery, test_size=0.2, random_state=42
)
X_train_sequences, X_test_sequences, _, _ = train_test_split(
    X_sequences, y_machinery, test_size=0.2, random_state=42
)

# 3. Tensor로 변환
X_train_tensor = torch.tensor(X_train, dtype=torch.float32)
X_val_tensor = torch.tensor(X_val, dtype=torch.float32)
X_test_tensor = torch.tensor(X_test, dtype=torch.float32)

y_train_tensor = torch.tensor(y_train_machinery, dtype=torch.long)
y_val_tensor = torch.tensor(y_val_machinery, dtype=torch.long)
y_test_tensor = torch.tensor(y_test_machinery, dtype=torch.long)

print(f"X_train_tensor shape: {X_train_tensor.shape}")
print(f"X_val_tensor shape: {X_val_tensor.shape}")
print(f"X_test_tensor shape: {X_test_tensor.shape}")

X_train_tensor shape: torch.Size([8884, 100])
X_val_tensor shape: torch.Size([2221, 100])
X_test_tensor shape: torch.Size([2777, 100])


  X_train_tensor = torch.tensor(X_train, dtype=torch.float32)
  X_val_tensor = torch.tensor(X_val, dtype=torch.float32)
  X_test_tensor = torch.tensor(X_test, dtype=torch.float32)


### MLP

In [73]:
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, Embedding, Flatten, Dense, Concatenate, Dropout, BatchNormalization
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.regularizers import l2
from tensorflow.keras.callbacks import EarlyStopping, ReduceLROnPlateau


input_tfidf = Input(shape=(X_tfidf.shape[1],), name='tfidf_input')
input_sequences = Input(shape=(max_length,), name='sequences_input')

embedding_layer = Embedding(input_dim=20000, output_dim=50, input_length=max_length)(input_sequences)
embedding_flattened = Flatten()(embedding_layer)

concat = Concatenate()([input_tfidf, embedding_flattened])

dense1 = Dense(512, activation='relu', kernel_regularizer=l2(0.001))(concat)  # L2 정규화 추가
batch_norm1 = BatchNormalization()(dense1)
dropout1 = Dropout(0.4)(batch_norm1)

dense2 = Dense(256, activation='relu', kernel_regularizer=l2(0.001))(dropout1)  # L2 정규화 추가
batch_norm2 = BatchNormalization()(dense2)
dropout2 = Dropout(0.4)(batch_norm2)

dense3 = Dense(128, activation='relu', kernel_regularizer=l2(0.001))(dropout2)
batch_norm3 = BatchNormalization()(dense3)
dropout3 = Dropout(0.4)(batch_norm2)

# 출력층
output = Dense(62, activation='softmax')(dropout3)  # 62 클래스 예측

model = Model(inputs=[input_tfidf, input_sequences], outputs=output)
model.compile(optimizer=Adam(learning_rate=0.001), loss='sparse_categorical_crossentropy', metrics=['accuracy'])



In [74]:

early_stopping = EarlyStopping(monitor='val_loss', patience=5, restore_best_weights=True)
reduce_lr = ReduceLROnPlateau(monitor='val_loss', factor=0.5, patience=3, min_lr=0.00001)


In [75]:

model.fit(
    [X_train_tfidf, X_train_sequences], y_train_machinery,
    validation_split=0.2,
    epochs=30,
    batch_size=64
)

loss, accuracy = model.evaluate([X_test_tfidf, X_test_sequences], y_test_machinery)
print(f"Test loss: {loss}, Test accuracy: {accuracy}")

Epoch 1/30
[1m139/139[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 15ms/step - accuracy: 0.4507 - loss: 3.5764 - val_accuracy: 0.2292 - val_loss: 3.7574
Epoch 2/30
[1m139/139[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 13ms/step - accuracy: 0.7370 - loss: 1.7063 - val_accuracy: 0.4791 - val_loss: 2.8044
Epoch 3/30
[1m139/139[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 13ms/step - accuracy: 0.8046 - loss: 1.3069 - val_accuracy: 0.7348 - val_loss: 1.7971
Epoch 4/30
[1m139/139[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 13ms/step - accuracy: 0.8482 - loss: 1.0340 - val_accuracy: 0.8001 - val_loss: 1.2401
Epoch 5/30
[1m139/139[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 13ms/step - accuracy: 0.8730 - loss: 0.8790 - val_accuracy: 0.8095 - val_loss: 1.1046
Epoch 6/30
[1m139/139[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 13ms/step - accuracy: 0.8831 - loss: 0.7894 - val_accuracy: 0.8073 - val_loss: 1.0771
Epoch 7/30
[1m139/139

### CNN

In [80]:
from tensorflow.keras.layers import Conv1D, GlobalMaxPooling1D
from tensorflow.keras.regularizers import l2

input_tfidf = Input(shape=(X_tfidf.shape[1],), name='tfidf_input')
input_sequences = Input(shape=(max_length,), name='sequences_input')

embedding_layer = Embedding(input_dim=20000, output_dim=50, input_length=max_length)(input_sequences)
conv_layer = Conv1D(128, 3, activation='relu')(embedding_layer)
pooling_layer = GlobalMaxPooling1D()(conv_layer)

concat = Concatenate()([input_tfidf, pooling_layer])
dense1 = Dense(512, activation='relu', kernel_regularizer=l2(0.001))(concat)
dropout1 = Dropout(0.5)(dense1)
dense2 = Dense(256, activation='relu', kernel_regularizer=l2(0.001))(dropout1)
dropout2 = Dropout(0.5)(dense2)
output = Dense(62, activation='softmax')(dropout2)

model = Model(inputs=[input_tfidf, input_sequences], outputs=output)
model.compile(optimizer=Adam(learning_rate=0.001), loss='sparse_categorical_crossentropy', metrics=['accuracy'])


In [81]:

early_stopping = EarlyStopping(monitor='val_loss', patience=5, restore_best_weights=True)
reduce_lr = ReduceLROnPlateau(monitor='val_loss', factor=0.5, patience=3, min_lr=0.00001)

In [82]:

model.fit(
    [X_train_tfidf, X_train_sequences], y_train_machinery,
    validation_split=0.2,
    epochs=30,
    batch_size=32
)

loss, accuracy = model.evaluate([X_test_tfidf, X_test_sequences], y_test_machinery)
print(f"Test loss: {loss}, Test accuracy: {accuracy}")

Epoch 1/30
[1m278/278[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 8ms/step - accuracy: 0.3601 - loss: 3.2431 - val_accuracy: 0.6047 - val_loss: 1.7203
Epoch 2/30
[1m278/278[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 8ms/step - accuracy: 0.6308 - loss: 1.5863 - val_accuracy: 0.6997 - val_loss: 1.3185
Epoch 3/30
[1m278/278[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 8ms/step - accuracy: 0.7404 - loss: 1.1440 - val_accuracy: 0.7492 - val_loss: 1.1122
Epoch 4/30
[1m278/278[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 8ms/step - accuracy: 0.7955 - loss: 0.9087 - val_accuracy: 0.7636 - val_loss: 1.0204
Epoch 5/30
[1m278/278[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 8ms/step - accuracy: 0.8154 - loss: 0.7701 - val_accuracy: 0.7816 - val_loss: 0.9662
Epoch 6/30
[1m278/278[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 8ms/step - accuracy: 0.8394 - loss: 0.6625 - val_accuracy: 0.7834 - val_loss: 0.9280
Epoch 7/30
[1m278/278[0m 

### RNN

In [86]:
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, Embedding, LSTM, Dense, Concatenate, Dropout
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.callbacks import EarlyStopping, ReduceLROnPlateau
from tensorflow.keras.regularizers import l2


# 1. 입력 레이어 정의
input_tfidf = Input(shape=(X_tfidf.shape[1],), name='tfidf_input')  # TF-IDF 입력
input_sequences = Input(shape=(max_length,), name='sequences_input')  # 정수 시퀀스 입력

# 2. 임베딩 레이어와 LSTM 레이어
embedding_layer = Embedding(input_dim=20000, output_dim=50, input_length=max_length)(input_sequences)
lstm_layer = LSTM(128, return_sequences=False, dropout=0.3, recurrent_dropout=0.3)(embedding_layer)

# 3. TF-IDF와 LSTM 출력을 결합
concat = Concatenate()([input_tfidf, lstm_layer])

# 4. 밀집 레이어
dense1 = Dense(512, activation='relu', kernel_regularizer=l2(0.001))(concat)
dropout1 = Dropout(0.4)(dense1)
dense2 = Dense(256, activation='relu', kernel_regularizer=l2(0.001))(dropout1)
dropout2 = Dropout(0.4)(dense2)

# 5. 출력 레이어 (62개 클래스 예측)
output = Dense(62, activation='softmax')(dropout2)

# 6. 모델 컴파일
model = Model(inputs=[input_tfidf, input_sequences], outputs=output)
model.compile(optimizer=Adam(learning_rate=0.0001), loss='sparse_categorical_crossentropy', metrics=['accuracy'])


In [87]:

# 7. EarlyStopping과 ReduceLROnPlateau 콜백 추가
early_stopping = EarlyStopping(monitor='val_loss', patience=5, restore_best_weights=True)
reduce_lr = ReduceLROnPlateau(monitor='val_loss', factor=0.5, patience=3, min_lr=0.00001)


In [88]:
# 8. 모델 학습
history = model.fit(
    [X_train_tfidf, X_train_sequences], y_train_machinery,
    validation_split=0.2,
    epochs=30,
    batch_size=16,
    callbacks=[early_stopping, reduce_lr]
)

# 9. 모델 평가
loss, accuracy = model.evaluate([X_test_tfidf, X_test_sequences], y_test_machinery)
print(f"Test loss: {loss}, Test accuracy: {accuracy}")

Epoch 1/30
[1m556/556[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m9s[0m 12ms/step - accuracy: 0.1842 - loss: 4.3134 - val_accuracy: 0.3147 - val_loss: 3.2659 - learning_rate: 1.0000e-04
Epoch 2/30
[1m556/556[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m7s[0m 12ms/step - accuracy: 0.3901 - loss: 3.1607 - val_accuracy: 0.5250 - val_loss: 2.5683 - learning_rate: 1.0000e-04
Epoch 3/30
[1m556/556[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m7s[0m 12ms/step - accuracy: 0.5089 - loss: 2.5526 - val_accuracy: 0.5471 - val_loss: 2.2050 - learning_rate: 1.0000e-04
Epoch 4/30
[1m556/556[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m7s[0m 12ms/step - accuracy: 0.5539 - loss: 2.1866 - val_accuracy: 0.5727 - val_loss: 2.0089 - learning_rate: 1.0000e-04
Epoch 5/30
[1m556/556[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m7s[0m 12ms/step - accuracy: 0.5712 - loss: 2.0271 - val_accuracy: 0.6020 - val_loss: 1.8777 - learning_rate: 1.0000e-04
Epoch 6/30
[1m556/556[0m [32m━━━━━━━━━━━━━

In [108]:
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, Embedding, Flatten, Dense, Concatenate, Dropout, BatchNormalization
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.regularizers import l1_l2
from tensorflow.keras.callbacks import EarlyStopping, ReduceLROnPlateau


input_tfidf = Input(shape=(X_tfidf.shape[1],), name='tfidf_input')
input_sequences = Input(shape=(max_length,), name='sequences_input')

embedding_layer = Embedding(input_dim=20000, output_dim=50, input_length=max_length)(input_sequences)
embedding_flattened = Flatten()(embedding_layer)

concat = Concatenate()([input_tfidf, embedding_flattened])

dense1 = Dense(128, activation='relu', kernel_regularizer=l2(0.001))(concat)  # 유닛 수를 줄임
batch_norm1 = BatchNormalization()(dense1)
dropout1 = Dropout(0.4)(batch_norm1)

# 출력층
output = Dense(62, activation='softmax')(dropout1)  

model = Model(inputs=[input_tfidf, input_sequences], outputs=output)
model.compile(optimizer=Adam(learning_rate=0.0005), loss='sparse_categorical_crossentropy', metrics=['accuracy'])



In [109]:

early_stopping = EarlyStopping(monitor='val_loss', patience=5, restore_best_weights=True)
reduce_lr = ReduceLROnPlateau(monitor='val_loss', factor=0.5, patience=3, min_lr=0.00001)


In [110]:

model.fit(
    [X_train_tfidf, X_train_sequences], y_train_machinery,
    validation_split=0.2,
    epochs=30,
    batch_size=64
)

loss, accuracy = model.evaluate([X_test_tfidf, X_test_sequences], y_test_machinery)
print(f"Test loss: {loss}, Test accuracy: {accuracy}")

Epoch 1/30
[1m139/139[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 8ms/step - accuracy: 0.3321 - loss: 3.3425 - val_accuracy: 0.6348 - val_loss: 3.6239
Epoch 2/30
[1m139/139[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 7ms/step - accuracy: 0.7263 - loss: 1.3002 - val_accuracy: 0.7425 - val_loss: 2.7700
Epoch 3/30
[1m139/139[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 7ms/step - accuracy: 0.7895 - loss: 0.9237 - val_accuracy: 0.7812 - val_loss: 1.7119
Epoch 4/30
[1m139/139[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 7ms/step - accuracy: 0.8329 - loss: 0.7267 - val_accuracy: 0.8019 - val_loss: 0.9696
Epoch 5/30
[1m139/139[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 7ms/step - accuracy: 0.8740 - loss: 0.5706 - val_accuracy: 0.7969 - val_loss: 0.8391
Epoch 6/30
[1m139/139[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 7ms/step - accuracy: 0.8830 - loss: 0.5038 - val_accuracy: 0.8086 - val_loss: 0.7772
Epoch 7/30
[1m139/139[0m 

### attention

In [115]:
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, Embedding, Dense, Concatenate, Dropout, BatchNormalization, Attention, GlobalAveragePooling1D
from tensorflow.keras.regularizers import l2

input_tfidf = Input(shape=(X_tfidf.shape[1],), name='tfidf_input')
input_sequences = Input(shape=(max_length,), name='sequences_input')

# 임베딩 레이어
embedding_layer = Embedding(input_dim=20000, output_dim=50, input_length=max_length)(input_sequences)

# Attention 메커니즘 적용
attention = Attention()([embedding_layer, embedding_layer])
attention_pooled = GlobalAveragePooling1D()(attention)

# TF-IDF와 Attention 결합
concat = Concatenate()([input_tfidf, attention_pooled])

# 밀집 레이어 추가
dense1 = Dense(512, activation='relu', kernel_regularizer=l2(0.001))(concat)
dropout1 = Dropout(0.4)(dense1)
dense2 = Dense(256, activation='relu', kernel_regularizer=l2(0.001))(dropout1)
dropout2 = Dropout(0.5)(dense2)

# 출력층
output = Dense(62, activation='softmax')(dropout2)

# 모델 컴파일
model = Model(inputs=[input_tfidf, input_sequences], outputs=output)
model.compile(optimizer=Adam(learning_rate=0.001), loss='sparse_categorical_crossentropy', metrics=['accuracy'])


In [116]:
# 모델 학습
model.fit([X_train_tfidf, X_train_sequences], y_train_machinery, validation_split=0.2, epochs=30, batch_size=32)


Epoch 1/30
[1m278/278[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 8ms/step - accuracy: 0.3479 - loss: 3.3222 - val_accuracy: 0.5912 - val_loss: 1.8494
Epoch 2/30
[1m278/278[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 7ms/step - accuracy: 0.6100 - loss: 1.7920 - val_accuracy: 0.6452 - val_loss: 1.5575
Epoch 3/30
[1m278/278[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 7ms/step - accuracy: 0.6639 - loss: 1.5056 - val_accuracy: 0.6812 - val_loss: 1.3965
Epoch 4/30
[1m278/278[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 8ms/step - accuracy: 0.7054 - loss: 1.3353 - val_accuracy: 0.7217 - val_loss: 1.2889
Epoch 5/30
[1m278/278[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 7ms/step - accuracy: 0.7303 - loss: 1.2109 - val_accuracy: 0.7380 - val_loss: 1.1968
Epoch 6/30
[1m278/278[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 7ms/step - accuracy: 0.7613 - loss: 1.1035 - val_accuracy: 0.7492 - val_loss: 1.1533
Epoch 7/30
[1m278/278[0m 

<keras.src.callbacks.history.History at 0x1ed5c0b32c0>

In [117]:
# 모델 평가
loss, accuracy = model.evaluate([X_test_tfidf, X_test_sequences], y_test_machinery)
print(f"Test loss: {loss}, Test accuracy: {accuracy}")

[1m87/87[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 1ms/step - accuracy: 0.8266 - loss: 0.7453
Test loss: 0.8006126880645752, Test accuracy: 0.8170687556266785
