### Sequence 모델링

In [65]:
# 필요한 패키지 불러오지
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [66]:
df_1 = pd.read_csv('book.csv')

df_1.tail()

Unnamed: 0.1,Unnamed: 0,ID,Title,Rating,Author,Price,Pdate,Publisher,Btype,Salseindex
9996,9996,97053281,인류사를 가로지른 스마트한 발명들 50,,알프리트 슈미츠,6700.0,2014년 06월 10일,서해문집,old,
9997,9997,97053282,,,,,,,new,
9998,9998,97053283,,,,,,,new,
9999,9999,97053284,행복의 기원,,서은국,10500.0,2014년 05월 22일,21세기북스,old,
10000,10000,97053285,,,,,,,new,


### 전처리

In [67]:
# 결측치 제거 (제목과 가격이 있는 행만 유지)
df_1.dropna(subset=['Title','Publisher','Btype','Price'], inplace=True)

# price  object => 수치형
df_1['Price'] = pd.to_numeric(df_1['Price'].str.replace('[^\d.]', ''), errors='coerce')

In [68]:
df_1['Price'].dropna(inplace=True)

In [69]:
df = df_1[df_1['Title'].str.contains('[가-힣]', regex=True)]

In [70]:
df=df_1[["Price","Title"]].dropna()

In [71]:
X = df['Title']
y = df['Price'] # 목표, 타겟, 정답, 레이블

In [72]:
df['Price'].mean()

645.4022988505748

In [73]:
lens = []
for i in df.Title.str.split(" "):
    lens.append(len(i))
print("책 최대 길이 : ", max(lens))

name_len = 24

책 최대 길이 :  11


In [74]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Embedding, Flatten
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from sklearn.model_selection import train_test_split

In [75]:
# 텍스트 토큰화 및 시퀀스 변환
tokenizer = Tokenizer()
# Tokenizer 객체를 사용하여 텍스트를 토큰화, 각 토큰(단어)에 고유한 정수 인덱스 할당
tokenizer.fit_on_texts(X)
X_seq = tokenizer.texts_to_sequences(X)

In [76]:
len(tokenizer.word_index)

599

In [77]:
print(X.iloc[1])


삼국지 (상)


In [78]:
X_seq[0]

[31, 179, 180, 181, 182, 9]

In [79]:
# 패딩으로 시퀀스 길이 맞추기
X_pad = pad_sequences(X_seq, maxlen=name_len)

In [80]:
X_pad[0]

array([  0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
         0,   0,   0,   0,   0,  31, 179, 180, 181, 182,   9], dtype=int32)

In [81]:
# 훈련 데이터와 검증 데이터 분리
X_train, X_test, y_train, y_test = train_test_split(X_pad, y, test_size=0.2, random_state=42)

In [82]:
X_train[0]

array([  0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
         0,   0,   0,   0,   0,   0,   0,   2, 156, 157, 158], dtype=int32)

### 모델 설계 밑 구축

In [83]:
from tensorflow.keras.layers import LSTM, Dropout

# 모델 설계, 구축
model_fnn = Sequential()

model_fnn.add(Embedding(input_dim=len(tokenizer.word_index)+1, output_dim=128, input_length=name_len))
model_fnn.add(LSTM(64))
model_fnn.add(Dropout(0.5))
model_fnn.add(Dense(32, activation='relu'))
model_fnn.add(Dropout(0.5))
model_fnn.add(Dense(1, activation='linear'))

model_fnn.build(input_shape=(None, name_len))
model_fnn.summary()



In [84]:
# 임베딩 된 벡터 값 확인
# 모델의 임베딩 층 가중치 추출
embedding_layer_weights = model_fnn.layers[0].get_weights()[0]

# 가중치 배열의 형태 확인
print("Embedding layer weights shape:", embedding_layer_weights.shape)

# 예를 들어, 사전에 정의된 첫 번째 단어의 초기 밀집 벡터 값 확인
print("Initial vector for the first word:\n", (embedding_layer_weights.shape))

Embedding layer weights shape: (600, 128)
Initial vector for the first word:
 (600, 128)


In [85]:
# 모델 컴파일
model_fnn.compile(optimizer='adam', loss='mean_squared_error')

In [86]:
y_train

4293    400.0
2300    500.0
1021    900.0
2192    500.0
5843    900.0
        ...  
1165    900.0
4214    300.0
2310    800.0
2488    500.0
2453    900.0
Name: Price, Length: 208, dtype: float64

In [87]:
from tensorflow.keras.callbacks import EarlyStopping

EarlyStopping_ = EarlyStopping(patience=10)

# 모델 훈련
history = model_fnn.fit(X_train, y_train, epochs=100000, batch_size=32, validation_split=0.2, callbacks=[EarlyStopping_])

Epoch 1/100000


[1m6/6[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 31ms/step - loss: 490036.4688 - val_loss: 499105.6562
Epoch 2/100000
[1m6/6[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 7ms/step - loss: 503476.0000 - val_loss: 498200.4688
Epoch 3/100000
[1m6/6[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 7ms/step - loss: 469298.4062 - val_loss: 496620.2812
Epoch 4/100000
[1m6/6[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 7ms/step - loss: 488855.9375 - val_loss: 495325.7188
Epoch 5/100000
[1m6/6[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 7ms/step - loss: 485090.0625 - val_loss: 493937.9062
Epoch 6/100000
[1m6/6[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 8ms/step - loss: 490813.5625 - val_loss: 492304.9062
Epoch 7/100000
[1m6/6[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 7ms/step - loss: 463712.6875 - val_loss: 490373.3438
Epoch 8/100000
[1m6/6[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 8ms/step - loss: 488659.406

In [88]:
loss = model_fnn.evaluate(X_test, y_test, verbose=0)
print(f'Test Loss: {loss}')

Test Loss: 107542.2421875


### 예측 및 평가

In [89]:
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
import numpy as np

# 모델 평가: Keras에서 반환하는 손실값
loss = model_fnn.evaluate(X_test, y_test, verbose=0)
print(f'Test Loss: {loss}')

# 예측값 생성
y_pred = model_fnn.predict(X_test).flatten()

# MSE 계산
mse = mean_squared_error(y_test, y_pred)
print(f'Mean Squared Error (MSE): {mse}')

# RMSE 계산
rmse = np.sqrt(mse)
print(f'Root Mean Squared Error (RMSE): {rmse}')

# MAE 계산
mae = mean_absolute_error(y_test, y_pred)
print(f'Mean Absolute Error (MAE): {mae}')

# R² 계산
r2 = r2_score(y_test, y_pred)
print(f'R-squared (R²): {r2}')

Test Loss: 107542.2421875
[1m2/2[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 75ms/step
Mean Squared Error (MSE): 107542.24880247904
Root Mean Squared Error (RMSE): 327.9363487057802
Mean Absolute Error (MAE): 270.34268102106057
R-squared (R²): -0.22490244888773936
