In [1]:
import pandas as pd
import numpy as np
import datetime
import os
import re
import tensorflow as tf
from tensorflow.keras.utils import to_categorical
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.callbacks import EarlyStopping, ModelCheckpoint
from tensorflow.keras import layers
import random
import joblib
import pickle
from keras.models import model_from_json

## 파일 경로

In [2]:
# Directory layout for the persisted artefacts. Every value ends with a
# slash because the loaders build file paths by plain string concatenation.
base_path = "./save_model/"               # root folder of all saved artefacts
prediction_path = "prediction_model/"     # sub-folder holding the prediction models
stopwords_path = "stopwords/"             # sub-folder name for stop-word resources
tokenizer_path = "tokenizer_model/"       # sub-folder holding the pickled tokenizer

## CNN 구조 설정

In [3]:
class CNNClassifier(tf.keras.Model):
    """Convolutional text classifier with three parallel Conv1D branches.

    The forward pass embeds the input, applies dropout, runs three Conv1D
    branches (kernel sizes 3, 4 and 5), global-max-pools each branch,
    concatenates the pooled features and feeds them through two dense layers.
    The last dense layer uses a sigmoid activation.

    Keyword arguments (all are read with ``kargs[...]`` and are therefore
    required):
        model_name: name forwarded to ``tf.keras.Model``.
        vocab_size: ``input_dim`` of the embedding layer.
        embedding_size: ``output_dim`` of the embedding layer.
        num_filters: number of filters in each Conv1D branch.
        dropout_rate: rate given to the Dropout layer.
        hidden_dimension: number of units in the first dense layer.
        output_dimension: number of units in the final sigmoid layer.
    """

    def __init__(self, **kargs):
        """Create the layers; see the class docstring for the expected keys."""
        # NOTE(review): the layers below are created in a fixed order and
        # weights are later restored from an HDF5 file; presumably that
        # restore relies on this order -- confirm before reordering anything.
        super(CNNClassifier, self).__init__(name=kargs['model_name'])
        self.embedding=layers.Embedding(input_dim=kargs['vocab_size'],
                                       output_dim=kargs['embedding_size'])
        # One convolution per kernel size (3, 4, 5); 'valid' padding means no
        # padding is added by the convolution itself.
        self.conv_list=[layers.Conv1D(filters=kargs['num_filters'],
                                     kernel_size=kernel_size,
                                     padding='valid',
                                     activation=tf.keras.activations.relu,
                                     kernel_constraint=tf.keras.constraints.MaxNorm(max_value=3.))
                       for kernel_size in [3,4,5]]
        # A single pooling layer instance is shared by all three branches.
        self.pooling=layers.GlobalMaxPooling1D()
        self.dropout=layers.Dropout(kargs['dropout_rate'])
        self.fc1=layers.Dense(units=kargs['hidden_dimension'],
                             activation=tf.keras.activations.relu,
                             kernel_constraint=tf.keras.constraints.MaxNorm(max_value=3.))
        self.fc2=layers.Dense(units=kargs['output_dimension'],
                             activation=tf.keras.activations.sigmoid,
                             kernel_constraint=tf.keras.constraints.MaxNorm(max_value=3.))
        
    def call(self, x):
        """Run the forward pass and return the output of the sigmoid layer.

        Args:
            x: model input; presumably a batch of padded integer token ids of
                shape (batch, sequence_length) -- TODO confirm with callers.
        """
        x=self.embedding(x)
        # Dropout is applied once, directly on the embedded input.
        x=self.dropout(x)
        # Pool every convolution branch and join the results on the last axis.
        x=tf.concat([self.pooling(conv(x)) for conv in self.conv_list], axis=-1)
        x=self.fc1(x)
        x=self.fc2(x)
        return x

## 모델 로드 함수 정의

In [4]:
# Load the tokenizer
def get_tokenizer():
    """Unpickle and return the object stored in ``tokenizer.pickle``.

    The file is looked up under ``base_path + tokenizer_path``.

    NOTE: ``pickle.load`` can execute arbitrary code, so only point this at
    files that come from a trusted source.
    """
    tokenizer_file = base_path + tokenizer_path + 'tokenizer.pickle'
    with open(tokenizer_file, 'rb') as handle:
        return pickle.load(handle)

# Load the rfc model
def get_rfc_model():
    """Return the model that joblib restores from ``rfc_model.pkl``.

    The file is looked up under ``base_path + prediction_path``.
    """
    return joblib.load(base_path + prediction_path + 'rfc_model.pkl')

# Load the xgb model
def get_xgb_model():
    """Unpickle and return the model stored in ``xgb_model.pickle``.

    The file is resolved under ``base_path + prediction_path``, the same
    directory the other model loaders in this notebook read from, instead of
    a machine-specific absolute path. The file handle is closed as soon as
    the model has been read.

    NOTE: ``pickle.load`` can execute arbitrary code, so only point this at
    files that come from a trusted source.
    """
    model_file = base_path + prediction_path + "xgb_model.pickle"
    # The context manager closes the file even if unpickling raises.
    with open(model_file, "rb") as f:
        loaded_model = pickle.load(f)
    return loaded_model

# Load the rnn model structure & weights
def get_rnn_model():
    """Rebuild the RNN model from its JSON structure and load its weights.

    Both files are looked up under ``base_path + prediction_path``:
    ``rnn_model_structure.json`` is passed to ``model_from_json`` and
    ``rnn_model_weights.h5`` is passed to the model's ``load_weights``.

    Returns:
        The model returned by ``model_from_json`` after ``load_weights``.
    """
    model_dir = base_path + prediction_path
    # Use a context manager so the handle is released even if read() raises.
    with open(model_dir + 'rnn_model_structure.json', 'r') as json_file:
        loaded_rnn_structure = json_file.read()
    loaded_rnn_model = model_from_json(loaded_rnn_structure)
    loaded_rnn_model.load_weights(model_dir + "rnn_model_weights.h5")
    return loaded_rnn_model

# Load the cnn model
def get_cnn_model():
    """Instantiate ``CNNClassifier``, build it and load its saved weights.

    The vocabulary size is derived from the tokenizer returned by
    ``get_tokenizer()`` (``len(word_index) + 1``). The weights are read from
    ``cnn_model_weights.h5`` under ``base_path + prediction_path``.

    Returns:
        The ``CNNClassifier`` instance after ``load_weights``.
    """
    MAX_LEN=20  # sequence length the model is built with
    word_vocab=get_tokenizer().word_index
    # NOTE(review): these hyper-parameters presumably have to match the ones
    # used when the weights were saved -- confirm against the training code.
    kargs={'model_name':'cnn_classifier_kr',
          'vocab_size':len(word_vocab)+1,
          'embedding_size':64,
          'num_filters':100,
          'dropout_rate':0.2,
          'hidden_dimension':250,
          'output_dimension':1}

    loaded_cnn_model=CNNClassifier(**kargs)
    # Leave the batch dimension unspecified instead of hard-coding a size;
    # only the sequence length is needed to create the variables.
    loaded_cnn_model.build((None, MAX_LEN))
    loaded_cnn_model.load_weights(base_path+prediction_path+'cnn_model_weights.h5')
    return loaded_cnn_model

## 상품명 전처리 함수

In [5]:
# Preprocessing function
def preprocessing(title, remove_stopwords=True, stop_words=[]):
    """Strip non-Hangul characters from a title and optionally drop stop words.

    Args:
        title: raw text to clean.
        remove_stopwords: when true, split the cleaned text on single spaces
            and drop every token found in ``stop_words``.
        stop_words: collection of tokens to drop (never modified here).

    Returns:
        A list of tokens when ``remove_stopwords`` is true, otherwise the
        cleaned string with words separated by single spaces.
    """
    # Keep only spaces and Hangul (jamo and syllables); delete everything else.
    hangul_only = re.sub('[^ ㄱ-ㅣ가-힣]+', '', title)
    # Collapse runs of spaces into one and trim leading/trailing spaces.
    collapsed = ' '.join(piece for piece in hangul_only.split(' ') if piece)

    if not remove_stopwords:
        return collapsed

    return [token for token in collapsed.split(' ') if token not in stop_words]

## stop_words 로드 함수 정의

In [6]:
def get_stopwords():
    """Read ``stop_words.txt`` and return its lines as a list of strings.

    The file is opened relative to the current working directory with the
    platform default encoding. Each line is stripped of surrounding
    whitespace; blank lines are kept as empty strings.
    """
    with open("stop_words.txt", "r") as source:
        return [raw_line.strip() for raw_line in source]

## 결과 출력 함수 정의

In [7]:
def model_result(product_name, remove_stopwords=True):
    """Print the prediction of each saved model for one product name.

    The tokenizer, the four models and the stop-word list are loaded on every
    call. The product name is cleaned with ``preprocessing``, converted to a
    sequence by the tokenizer and post-padded to length 20 before it is
    passed to each model's ``predict``.

    Args:
        product_name: raw product title to classify.
        remove_stopwords: forwarded to ``preprocessing``.

    Returns:
        None. The four results are written to stdout.
    """
    max_len = 20

    tokenizer = get_tokenizer()
    rfc = get_rfc_model()
    xgb = get_xgb_model()
    rnn = get_rnn_model()
    cnn = get_cnn_model()
    stopword_list = get_stopwords()

    # The tokenizer works on a batch, so wrap the single title in a list.
    cleaned_batch = [
        preprocessing(product_name, remove_stopwords=remove_stopwords, stop_words=stopword_list)
    ]
    features = pad_sequences(
        tokenizer.texts_to_sequences(cleaned_batch),
        maxlen=max_len,
        padding='post',
    )

    rfc_label = rfc.predict(features)
    xgb_label = xgb.predict(features)
    # The neural models' outputs are thresholded at 0.5 to get a 0/1 label.
    rnn_label = (rnn.predict(features) > 0.5).astype("int32")
    cnn_label = (cnn.predict(features) > 0.5).astype("int32")

    report = (
        ('rfc 결과: ', rfc_label),
        ('xgb 결과: ', xgb_label),
        ('rnn 결과: ', rnn_label[0]),
        ('cnn 결과: ', cnn_label[0]),
    )
    for caption, value in report:
        print(caption, value)

In [8]:
# Example run: print every model's prediction for one sample product title.
model_result(
    '리베로 기저귀 밴드형 팬티형 구매시 뽀로로 NEW쇼핑카트 증정',
    remove_stopwords=True,
)



rfc 결과:  [0]
xgb 결과:  [1]
rnn 결과:  [1]
cnn 결과:  [1]
