<a href="https://colab.research.google.com/github/welcomeglory/CodingTest_Java/blob/master/LSTM.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
!pip install Korpora
!pip install konlpy
!pip install tensorflow==2.15.0

Collecting Korpora
  Downloading Korpora-0.2.0-py3-none-any.whl.metadata (26 kB)
Collecting dataclasses>=0.6 (from Korpora)
  Downloading dataclasses-0.6-py3-none-any.whl.metadata (3.0 kB)
Downloading Korpora-0.2.0-py3-none-any.whl (57 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m57.8/57.8 kB[0m [31m2.2 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading dataclasses-0.6-py3-none-any.whl (14 kB)
Installing collected packages: dataclasses, Korpora
Successfully installed Korpora-0.2.0 dataclasses-0.6


Collecting konlpy
  Downloading konlpy-0.6.0-py2.py3-none-any.whl.metadata (1.9 kB)
Collecting JPype1>=0.7.0 (from konlpy)
  Downloading JPype1-1.5.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (4.9 kB)
Downloading konlpy-0.6.0-py2.py3-none-any.whl (19.4 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m19.4/19.4 MB[0m [31m44.5 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading JPype1-1.5.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (488 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m488.6/488.6 kB[0m [31m22.9 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: JPype1, konlpy
Successfully installed JPype1-1.5.0 konlpy-0.6.0
Collecting tensorflow==2.15.0
  Downloading tensorflow-2.15.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (4.4 kB)
Collecting ml-dtypes~=0.2.0 (from tensorflow==2.15.0)
  Downloading ml_dtypes-0.2.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64

In [8]:
import tensorflow as tf
import pandas as pd
import numpy as np
from Korpora import Korpora
from tqdm import tqdm
tf.__version__

'2.15.0'

In [3]:
Korpora.fetch("nsmc")
corpus = Korpora.load("nsmc")

[nsmc] download ratings_train.txt: 14.6MB [00:00, 86.9MB/s]                           
[nsmc] download ratings_test.txt: 4.90MB [00:00, 29.8MB/s]                           



    Korpora 는 다른 분들이 연구 목적으로 공유해주신 말뭉치들을
    손쉽게 다운로드, 사용할 수 있는 기능만을 제공합니다.

    말뭉치들을 공유해 주신 분들에게 감사드리며, 각 말뭉치 별 설명과 라이센스를 공유 드립니다.
    해당 말뭉치에 대해 자세히 알고 싶으신 분은 아래의 description 을 참고,
    해당 말뭉치를 연구/상용의 목적으로 이용하실 때에는 아래의 라이센스를 참고해 주시기 바랍니다.

    # Description
    Author : e9t@github
    Repository : https://github.com/e9t/nsmc
    References : www.lucypark.kr/docs/2015-pyconkr/#39

    Naver sentiment movie corpus v1.0
    This is a movie review dataset in the Korean language.
    Reviews were scraped from Naver Movies.

    The dataset construction is based on the method noted in
    [Large movie review dataset][^1] from Maas et al., 2011.

    [^1]: http://ai.stanford.edu/~amaas/data/sentiment/

    # License
    CC0 1.0 Universal (CC0 1.0) Public Domain Dedication
    Details in https://creativecommons.org/publicdomain/zero/1.0/

[Korpora] Corpus `nsmc` is already installed at /root/Korpora/nsmc/ratings_train.txt
[Korpora] Corpus `nsmc` is already installed at /root/Korpora/nsmc/ra

In [4]:
# Define the LSTM model class
class LSTMModel(tf.keras.Model):
    def __init__(self, sequence_length, vocab_size, embedding_dim, hidden_size, num_of_class):
        super(LSTMModel, self).__init__()
        self.embedding_layer = tf.keras.layers.Embedding(vocab_size, embedding_dim, input_length=sequence_length)
        self.lstm_layer = tf.keras.layers.LSTM(hidden_size, return_sequences=False, stateful=False, recurrent_initializer='glorot_uniform')
        self.hidden_layer1 = tf.keras.layers.Dense(256, activation='relu')
        self.hidden_layer2 = tf.keras.layers.Dense(256, activation='relu')
        if num_of_class == 2:
            self.output_layer = tf.keras.layers.Dense(1, activation='sigmoid')  # 이진 분류
        else:
            self.output_layer = tf.keras.layers.Dense(num_of_class, activation='softmax')  # 다중 클래스 분류

    def call(self, x):
        embedded_input = self.embedding_layer(x)
        features = self.lstm_layer(embedded_input)
        hid1 = self.hidden_layer1(features)
        hid2 = self.hidden_layer2(hid1)
        logits = self.output_layer(hid2)
        return logits

# Loss function
@tf.function
def cross_entropy_loss(logits, y, num_of_class):
    if num_of_class == 2:  # 이진 분류
        loss = tf.reduce_mean(tf.keras.losses.binary_crossentropy(y, logits, from_logits=True))
    else:  # 다중 클래스 분류
        loss = tf.reduce_mean(tf.keras.losses.categorical_crossentropy(y, logits, from_logits=True))
    return loss

# Backward propagation
@tf.function
def backward(model, x, y, optimizer, num_of_class):
    with tf.GradientTape() as tape:
        logits = model(x)
        loss = cross_entropy_loss(logits, y, num_of_class)
    gradients = tape.gradient(loss, model.trainable_variables)
    optimizer.apply_gradients(zip(gradients, model.trainable_variables))
    return loss

# Accuracy function
@tf.function
def accuracy(predY, y, num_of_class):
    y = tf.cast(y, tf.float32)  # 라벨을 float32로 캐스팅
    if num_of_class == 2:  # 이진 분류
        predY = tf.round(predY)  # sigmoid 출력에서 예측 클래스 계산
        correct_predictions = tf.equal(predY, y)
    else:  # 다중 클래스 분류
        correct_predictions = tf.equal(tf.argmax(predY, 1), tf.argmax(y, 1))
    acc = tf.reduce_mean(tf.cast(correct_predictions, tf.float32))
    return acc

# Training function
def train_learning(epochs, frequence, train_dataset, model, test_dataset, optimizer, num_of_class):
    y_loss = []
    for epoch in tqdm(range(epochs)):
        for batch, (bx, by) in enumerate(train_dataset):
            loss = backward(model, bx, by, optimizer, num_of_class)
            y_loss.append(loss)
        if epoch % frequence == 0:
            acc_list = []
            for test_batch, (tx, ty) in enumerate(test_dataset):
                predY = model(tx)
                acc = accuracy(predY, ty, num_of_class)
                acc_list.append(acc)
            print(f"Epoch: {epoch+1} ===> Loss: {loss.numpy()}, accuracy: {np.mean([a.numpy() for a in acc_list])}")
    return y_loss

In [5]:
# Data preprocessing function
def get_train_test_data(df_train, df_test, num_of_class=2):
    train_x = df_train["text"].values
    train_y = df_train["labels"].values
    test_x = df_test["text"].values
    test_y = df_test["labels"].values

    train_x = np.array(train_x)
    if num_of_class > 2:
        train_y = tf.keras.utils.to_categorical(train_y, num_of_class)
        test_y = tf.keras.utils.to_categorical(test_y, num_of_class)
    else:
        train_y = np.expand_dims(train_y, axis=-1)  # 이진 분류에서 형상 맞추기
        test_y = np.expand_dims(test_y, axis=-1)

    return train_x, train_y, test_x, test_y

In [6]:
train_text = pd.DataFrame(corpus.train.texts, columns=['text'])
train_label = pd.DataFrame(corpus.train.labels, columns=['labels'])
train = pd.concat([train_text, train_label], axis=1)
# print(train)


test_text = pd.DataFrame(corpus.test.texts, columns=['text'])
test_label = pd.DataFrame(corpus.test.labels, columns=['labels'])
test = pd.concat([test_text, test_label], axis=1)
# print(train)
cnt_labels =2

X_train, y_train, X_test, y_test = get_train_test_data(train, test, cnt_labels)
print(X_train)


['아 더빙.. 진짜 짜증나네요 목소리' '흠...포스터보고 초딩영화줄....오버연기조차 가볍지 않구나'
 '너무재밓었다그래서보는것을추천한다' ... '이게 뭐요? 한국인은 거들먹거리고 필리핀 혼혈은 착하다?'
 '청춘 영화의 최고봉.방황과 우울했던 날들의 자화상' '한국 영화 최초로 수간하는 내용이 담긴 영화']


In [7]:
# Vectorization
vectorize_layer = tf.keras.layers.TextVectorization(max_tokens=max_features, output_mode='int', output_sequence_length=sequence_length)
vectorize_layer.adapt(tf.constant(X_train))
vocab_set = vectorize_layer.get_vocabulary()
vocab_size = len(vocab_set)
print(vocab_size)

# Model parameters
max_features = vocab_size
sequence_length = 10
embedding_dim = 256  # 의미 입려값을 전환값
hidden_size = 1024
num_of_class = 2  # 이진 분류: 2, 다중 클래스 분류: 그 이상
batch_size = 32

# Vectorize the input data
X_Train_vectorized = vectorize_layer(tf.constant(X_train))
X_Test_vectorized = vectorize_layer(tf.constant(X_test))

# Create TensorFlow datasets
train_dataset = tf.data.Dataset.from_tensor_slices((X_Train_vectorized, y_train)).shuffle(10000).batch(batch_size)
test_dataset = tf.data.Dataset.from_tensor_slices((X_Test_vectorized, y_test)).batch(batch_size)

# Model initialization
model = LSTMModel(sequence_length, vocab_size, embedding_dim, hidden_size, num_of_class)

# Optimizer
learning_rate = 1e-4
optimizer = tf.optimizers.Adam(learning_rate)

# Model training
epochs = 30
frequence = 2
y_loss = train_learning(epochs, frequence, train_dataset, model, test_dataset, optimizer, num_of_class)
path = "/content/drive/MyDrive/세종교육/LSTM/model_version/"
model.save(f"{path}Movie_LSTM_model_tf_{tf.__version__}_v0")
# # Evaluate the model on test data
# loss, accuracy = model.evaluate(test_dataset)
# print(f'Test Loss: {loss}, Test Accuracy: {accuracy}')

NameError: name 'max_features' is not defined