## 벌금 예측

In [1]:
import os
import pandas as pd
import numpy as np
import tensorflow as tf
from sklearn.model_selection import train_test_split
import random

# Single seed shared by every random number source used in this notebook.
seed_value = 42

# Seed Python's own RNG, NumPy's legacy global RNG and TensorFlow's global RNG.
random.seed(seed_value)
np.random.seed(seed_value)
tf.random.set_seed(seed_value)

# NOTE(review): PYTHONHASHSEED is normally read at interpreter start-up, so
# assigning it here presumably only affects child processes - confirm.
os.environ["PYTHONHASHSEED"] = str(seed_value)

2023-12-12 11:59:56.816355: I tensorflow/tsl/cuda/cudart_stub.cc:28] Could not find cuda drivers on your machine, GPU will not be used.
2023-12-12 11:59:56.879195: I tensorflow/tsl/cuda/cudart_stub.cc:28] Could not find cuda drivers on your machine, GPU will not be used.
2023-12-12 11:59:56.880544: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


### 데이터셋 가져오기 및 분할

In [2]:
# Read the spreadsheet into a DataFrame using the openpyxl engine.
dataframe = pd.read_excel(
    io="./output/02-extract-nouns.xlsx",
    engine="openpyxl",
)

In [3]:
# Hold out 10% of the rows for the final evaluation; the fixed random_state
# keeps the partition identical between runs.
X_train, X_test, y_train, y_test = train_test_split(
    dataframe["extracted_nouns"],  # model input column
    dataframe["fine"],             # regression target column
    test_size=0.1,
    random_state=42,
)

### 토크나이저 정의 및 임베딩 행렬 생성

In [4]:
# Vocabulary cap handed to the tokenizer, and the fixed length every
# sequence is padded / truncated to.
max_features = 30000
sequence_length = 256

# Space-delimited tokenizer; '<unw>' is registered as the out-of-vocabulary token.
tokenizer = tf.keras.preprocessing.text.Tokenizer(
    num_words=max_features,
    split=' ',
    oov_token='<unw>',
)
# The vocabulary is fitted on the training texts only.
tokenizer.fit_on_texts(X_train)

# Texts -> integer id sequences -> arrays of width `sequence_length`.
# NOTE: X_train / X_test are rebound from raw text to padded arrays here, so
# re-running this cell requires re-running the split cell first.
X_train = tf.keras.preprocessing.sequence.pad_sequences(
    tokenizer.texts_to_sequences(X_train),
    maxlen=sequence_length,
)
X_test = tf.keras.preprocessing.sequence.pad_sequences(
    tokenizer.texts_to_sequences(X_test),
    maxlen=sequence_length,
)

# Pickle the fitted tokenizer so the inference environment can reuse it
# (currently disabled).
# import pickle

# with open('./model/tokenizer.pickle', 'wb') as handle:
#     pickle.dump(tokenizer, handle, protocol=pickle.HIGHEST_PROTOCOL)

In [None]:
# Parse the pre-trained vector file: every line is expected to be
# "<word> <coef_1> <coef_2> ..." separated by whitespace.
embeddings_index = {}
# The context manager guarantees the file is closed even when a line fails
# to parse (the previous open()/close() pair leaked the handle on error).
with open("./model/ft_1119.txt", encoding='utf-8') as f:
    for line in f:
        values = line.split()
        if not values:
            # Blank line: nothing to index (previously raised IndexError).
            continue
        word = values[0]
        coefs = np.asarray(values[1:], dtype='float32')
        embeddings_index[word] = coefs

word_index = tokenizer.word_index

# One row per usable token id, plus one extra row because ids are used
# directly as row indices into the matrix.
num_words = min(max_features, len(word_index)) + 1
print(num_words)
embedding_dim = 200  # must match the vector length stored in the file
num_filters = 100    # consumed by the convolution layers in the model cell

# Rows that are never assigned below stay all-zero.
embedding_matrix = np.zeros((num_words, embedding_dim))

for word, i in word_index.items():
    if i > max_features:
        # Id falls outside the matrix; skip it.
        continue
    embedding_vector = embeddings_index.get(word)
    if embedding_vector is not None:
        # Known word: copy its pre-trained vector.
        embedding_matrix[i] = embedding_vector
    else:
        # Word missing from the pre-trained file: start from Gaussian noise
        # drawn from NumPy's global RNG.
        embedding_matrix[i] = np.random.randn(embedding_dim)

12673


### 모델링

In [None]:
# Text CNN for regression: embedded tokens feed three parallel convolutions
# (window sizes 3, 4 and 5 tokens), each max-pooled over the whole sequence,
# then concatenated and passed through a dense stack ending in one unit.
input_3 = tf.keras.Input(shape=(sequence_length,), dtype=tf.float32)

# Embedding table initialised from `embedding_matrix` and left trainable.
embedding_layer_3 = tf.keras.layers.Embedding(
    num_words,
    embedding_dim,
    embeddings_initializer=tf.keras.initializers.Constant(embedding_matrix),
    input_length=sequence_length,
    trainable=True,
)(input_3)

# Append a channel axis so the sequence can be treated as a 2-D "image".
reshape_3 = tf.keras.layers.Reshape(
    (sequence_length, embedding_dim, 1)
)(embedding_layer_3)

# Each kernel spans the full embedding width, so it slides along the token
# axis only.
# NOTE(review): an L2 penalty factor of 3 is unusually large - confirm it is
# intended.
conv_0_3, conv_1_3, conv_2_3 = [
    tf.keras.layers.Conv2D(
        num_filters,
        kernel_size=(window, embedding_dim),
        activation=tf.nn.relu,
        kernel_regularizer=tf.keras.regularizers.l2(3),
    )(reshape_3)
    for window in (3, 4, 5)
]

# The pool height equals the length of the convolution output along the
# token axis, so each filter is reduced to a single maximum.
maxpool_0_3, maxpool_1_3, maxpool_2_3 = [
    tf.keras.layers.MaxPool2D(
        pool_size=(sequence_length - window + 1, 1),
        strides=(1, 1),
        padding='valid',
    )(feature_map)
    for window, feature_map in zip((3, 4, 5), (conv_0_3, conv_1_3, conv_2_3))
]

# Stack the three pooled branches, flatten them into one feature vector and
# apply dropout.
concatenated_tensor_3 = tf.keras.layers.Concatenate(axis=1)(
    [maxpool_0_3, maxpool_1_3, maxpool_2_3]
)
flatten_3 = tf.keras.layers.Flatten()(concatenated_tensor_3)
dropout_3 = tf.keras.layers.Dropout(rate=0.5)(flatten_3)

# Fully connected head: 256 -> 64 -> 16 units.
dense_layer_3 = tf.keras.layers.Dense(units=256, activation=tf.nn.relu)(dropout_3)
dense_layer_4 = tf.keras.layers.Dense(units=64, activation=tf.nn.relu)(dense_layer_3)
dense_layer_5 = tf.keras.layers.Dense(units=16, activation=tf.nn.relu)(dense_layer_4)

# Single-unit output; the ReLU keeps predictions non-negative.
output = tf.keras.layers.Dense(units=1, activation=tf.nn.relu)(dense_layer_5)

model = tf.keras.Model(inputs=input_3, outputs=output)

In [None]:
# Regression setup: optimise mean squared error with Adamax, and report
# both MSE and MAE during training.
model.compile(
    loss=tf.keras.losses.MeanSquaredError(),
    optimizer=tf.keras.optimizers.Adamax(learning_rate=0.001),
    metrics=['mse', 'mae'],
)
# Print the layer-by-layer overview of the compiled model.
model.summary()

Model: "model_1"
__________________________________________________________________________________________________
 Layer (type)                Output Shape                 Param #   Connected to                  
 input_2 (InputLayer)        [(None, 256)]                0         []                            
                                                                                                  
 embedding_1 (Embedding)     (None, 256, 200)             2534600   ['input_2[0][0]']             
                                                                                                  
 reshape_1 (Reshape)         (None, 256, 200, 1)          0         ['embedding_1[0][0]']         
                                                                                                  
 conv2d_3 (Conv2D)           (None, 254, 1, 100)          60100     ['reshape_1[0][0]']           
                                                                                            

 concatenate_1 (Concatenate  (None, 3, 1, 100)            0         ['max_pooling2d_3[0][0]',     
 )                                                                   'max_pooling2d_4[0][0]',     
                                                                     'max_pooling2d_5[0][0]']     
                                                                                                  
 flatten_1 (Flatten)         (None, 300)                  0         ['concatenate_1[0][0]']       
                                                                                                  
 dropout_1 (Dropout)         (None, 300)                  0         ['flatten_1[0][0]']           
                                                                                                  
 dense_4 (Dense)             (None, 256)                  77056     ['dropout_1[0][0]']           
                                                                                                  
 dense_5 (

### 모델 학습

In [17]:
# Stop once val_loss has not improved for 30 consecutive epochs.
# restore_best_weights=True rolls the model back to the weights of the best
# validation epoch when early stopping triggers; without it the model that
# gets saved afterwards keeps the weights from up to 30 epochs past the best.
callback = tf.keras.callbacks.EarlyStopping(
    monitor='val_loss',
    patience=30,
    restore_best_weights=True,
)
# 10% of the training data is held back by Keras as the validation set that
# drives the early-stopping decision.
history = model.fit(
    x=X_train,
    y=y_train,
    batch_size=32,
    epochs=100,
    verbose=1,
    validation_split=0.1,
    callbacks=[callback],
)

Epoch 1/10


Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


In [18]:
# Persist the trained model to disk; the evaluation cell reloads this file.
model.save(filepath='./model/fine_231122.keras')

### 최종 모델 평가

모델의 결과값이 회귀 형태로 도출되기 때문에 이를 분류 문제로 변경하기 위해 예측값과 실제값이 오차범위 내에 있다면 정답, 아니라면 오답이라 간주하여 정확도 측정

In [5]:
# 모델 평가
from keras.models import load_model

def calculate_accuracy(model, inputs, labels, error_torlerance=0.2):
    """Print and return the share of predictions that fall within a tolerance.

    A prediction counts as correct when its absolute difference from the
    matching label is less than or equal to ``error_torlerance``.

    Args:
        model: Object exposing ``predict(inputs)`` that yields one value per
            sample (shape ``(n,)`` or ``(n, 1)``).
        inputs: Samples forwarded unchanged to ``model.predict``.
        labels: Ground-truth values, one per sample. Any array-like is
            accepted and is matched to predictions by position, so a pandas
            Series with a non-default index is handled correctly.
        error_torlerance: Maximum absolute error still treated as a hit.
            (The parameter name's spelling is kept for backward
            compatibility with existing keyword callers.)

    Returns:
        The accuracy as a Python float in ``[0, 1]``. The same value is
        printed, as before.

    Raises:
        ValueError: If there are no predictions, or if the number of
            predictions differs from the number of labels.
    """
    # Flatten both sides to 1-D float64 so the comparison is a single
    # vectorised operation instead of a Python loop over per-row arrays.
    predictions = np.asarray(model.predict(inputs), dtype=np.float64).reshape(-1)
    targets = np.asarray(labels, dtype=np.float64).reshape(-1)

    if predictions.size == 0:
        raise ValueError("calculate_accuracy needs at least one sample")
    if predictions.shape != targets.shape:
        raise ValueError(
            "predictions and labels differ in length: "
            f"{predictions.size} vs {targets.size}"
        )

    hits = int(np.count_nonzero(np.abs(predictions - targets) <= error_torlerance))
    # Plain int / int keeps the printed value a built-in float.
    accuracy = hits / predictions.size
    print(accuracy)
    return accuracy

# Reload the model file written by the training section and score it on the
# held-out test split.
model = load_model("./model/fine_231122.keras")
calculate_accuracy(
    model,
    X_test,
    y_test.to_numpy(),
    error_torlerance=5,  # a prediction within 5 of the true value counts as correct
)

0.8929633300297324
