In [1]:
import pickle
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import re
import urllib.request
from konlpy.tag import Mecab
from tqdm import tqdm
from sklearn.model_selection import train_test_split
from collections import Counter

# Fetch the NSMC train/test rating files next to the notebook; urlretrieve
# rewrites the local file on every run.
for source_url, local_name in (
    ("https://raw.githubusercontent.com/e9t/nsmc/master/ratings_train.txt", "ratings_train.txt"),
    ("https://raw.githubusercontent.com/e9t/nsmc/master/ratings_test.txt", "ratings_test.txt"),
):
    urllib.request.urlretrieve(source_url, filename=local_name)

# Parse both downloads into DataFrames (read_table is tab-delimited by default).
train_data = pd.read_table('ratings_train.txt')
test_data = pd.read_table('ratings_test.txt')

In [2]:
# Clean the training reviews in one chain:
#   1. drop rows whose `document` text repeats an earlier row,
#   2. drop rows containing any missing value,
#   3. delete every character that is not Hangul (jamo or syllable) or a space,
#      then strip leading spaces,
#   4. discard reviews that ended up empty.
train_data = (
    train_data
    .drop_duplicates(subset=['document'])
    .dropna()
    .assign(
        document=lambda frame: frame['document']
        .str.replace("[^ㄱ-ㅎㅏ-ㅣ가-힣 ]", "", regex=True)
        .str.replace('^ +', "", regex=True)
    )
    .loc[lambda frame: frame['document'] != '']
)

In [3]:
# Remove repeated review texts first, then any row with a missing value.
test_data = test_data.drop_duplicates(subset=['document']).dropna()

# Delete everything except Hangul (jamo or syllables) and spaces, then strip
# the leading spaces that the deletion may leave behind.
test_documents = test_data['document'].str.replace("[^ㄱ-ㅎㅏ-ㅣ가-힣 ]", "", regex=True)
test_documents = test_documents.str.replace('^ +', "", regex=True)
test_data = test_data.assign(document=test_documents)

# Reviews with no remaining text carry no signal; drop them.
test_data = test_data[test_data['document'] != '']

In [4]:
from konlpy.tag import Komoran

# Morphological analyzer used for all tokenization below.
komoran = Komoran()

# Sanity-check POS tagging on the review with index label 1. The result is
# not displayed because this is not the last expression of the cell.
komoran.pos(train_data.document[1])

# Tokens removed after tokenization. Written as one whitespace-separated
# string for compactness; the resulting list is unchanged ('듯' appears twice).
stopwords = (
    '도 는 다 의 가 이 은 한 에 하 고 을 '
    '를 인 듯 과 와 네 들 듯 지 임 게 아 나 네요'
).split()

In [5]:
# Tokenize every training review into morphemes and filter out stopwords.
X_train = [
    [token for token in komoran.morphs(review) if token not in stopwords]
    for review in train_data.document
]

In [6]:
# Tokenize every test review into morphemes (with a progress bar) and filter
# out stopwords.
X_test = [
    [token for token in komoran.morphs(review) if token not in stopwords]
    for review in tqdm(test_data['document'])
]

100%|███████████████████████████████████████████████████████████████████████████| 48852/48852 [00:14<00:00, 3446.66it/s]


In [7]:
# Pull the `label` column of each split out as a NumPy array.
y_train, y_test = train_data['label'].values, test_data['label'].values

In [8]:
# Hold out 20% of the training data for validation, stratified by label.
# NOTE: this rebinds X_train / y_train, so re-running the cell splits again.
X_train, X_valid, y_train, y_valid = train_test_split(
    X_train,
    y_train,
    test_size=0.2,
    random_state=0,
    stratify=y_train,
)
# Class balance of the new training split (value is not displayed because
# this is not the last expression of the cell).
np.unique(y_train, return_counts=True)

# Flatten the training sentences into one token list and count occurrences.
word_list = [word for sent in X_train for word in sent]
word_counts = Counter(word_list)

# Tokens seen fewer than `threshold` times are considered rare.
threshold = 3
total_freq = sum(word_counts.values())  # total number of token occurrences
rare_occurrences = [count for count in word_counts.values() if count < threshold]
rare_cnt = len(rare_occurrences)   # number of distinct rare tokens
rare_freq = sum(rare_occurrences)  # occurrences contributed by rare tokens

# (token, count) pairs ordered from most to least frequent.
vocab = word_counts.most_common()

# Reserve index 0 for padding and index 1 for out-of-vocabulary tokens.
word_to_index = {'<PAD>': 0, '<UNK>': 1}

In [9]:
# Keep only the head of the frequency-sorted vocabulary.
# NOTE(review): 15955 presumably came from len(vocab) - rare_cnt (i.e. the
# rare tokens were cut off) for this particular tokenization; confirm it
# still matches before reusing with different data or tokenizer.
VOCAB_KEEP = 15955
vocab = vocab[:VOCAB_KEEP]

In [10]:
# Give each vocabulary token a consecutive index, starting at 2 because
# 0 and 1 are already taken by '<PAD>' and '<UNK>'.
word_to_index.update(
    {word: index for index, (word, _) in enumerate(vocab, start=2)}
)
# Next free index after the vocabulary has been assigned.
cnt = len(vocab) + 2

In [11]:
# Spot-check: how many times the token '야' occurs in the training split.
word_counts['야']

1902

In [12]:
#인코딩 - 단어에 해당하는 개수 별로 인코딩 실시 
def texts_to_sequences(token_data, word_to_index):
    encode_data = []
    for sent in token_data:
        index_seq = []
        for word in sent:
            try:
                index_seq.append(word_to_index[word])
            except:
                index_seq.append(word_to_index['<UNK>'])
        encode_data.append(index_seq)
    return encode_data

In [13]:
# Integer-encode the train, validation and test splits with the same vocabulary.
en_X_train, en_X_val, en_X_test = (
    texts_to_sequences(split, word_to_index)
    for split in (X_train, X_valid, X_test)
)

In [14]:
# Peek at the first encoded training sentence (a list of vocabulary indices).
en_X_train[0]

[120, 1820, 85, 83, 6, 40]

In [15]:
# Reverse lookup table: integer index -> token.
index_to_word = {index: word for word, index in word_to_index.items()}

In [19]:
def check(max_len, nested_list):
    """Print the share of entries in ``nested_list`` that are <= ``max_len``.

    Despite its name, ``nested_list`` is compared element-wise against
    ``max_len``, so callers pass a flat list of sentence lengths.

    Args:
        max_len: Length limit to test.
        nested_list: Non-empty list of sentence lengths.

    Raises:
        ZeroDivisionError: If ``nested_list`` is empty.
    """
    within_limit = sum(1 for length in nested_list if length <= max_len)
    print(f"{max_len} 단어가 차지하는 비율 : {within_limit / len(nested_list)}")

In [20]:
# Report, for several candidate maximum lengths, what share of the encoded
# training sentences would fit without truncation.
train_lengths = [len(sequence) for sequence in en_X_train]
for candidate_len in range(15, 40, 5):
    check(candidate_len, train_lengths)

15 단어가 차지하는 비율 : 0.7473047096652166
20 단어가 차지하는 비율 : 0.8449025912615851
25 단어가 차지하는 비율 : 0.8881132107914782
30 단어가 차지하는 비율 : 0.9173874168199873
35 단어가 차지하는 비율 : 0.9385800505528139


In [21]:
def pad_sequences(sentences, max_len):
    """Right-pad (or truncate) index sequences to a fixed length.

    Args:
        sentences: Sequence of integer sequences.
        max_len: Number of columns in the result.

    Returns:
        Integer array of shape ``(len(sentences), max_len)``. Each row holds
        the first ``max_len`` items of the matching sentence followed by
        zeros; empty sentences give an all-zero row.
    """
    features = np.zeros((len(sentences), max_len), dtype=int)
    for row, sentence in enumerate(sentences):
        if len(sentence) == 0:
            continue  # nothing to copy; the row stays all zeros
        clipped = np.array(sentence)[:max_len]
        features[row, :clipped.shape[0]] = clipped
    return features

In [23]:
import torch
import torch.nn as nn
import torch.nn.functional as F

# Wrap the label arrays of each split as torch tensors.
train_label_tensor, valid_label_tensor, test_label_tensor = (
    torch.tensor(np.array(labels)) for labels in (y_train, y_valid, y_test)
)
# Quick look at the first five training labels.
print(train_label_tensor[:5])

class TextClassifier(nn.Module):
    """Embedding -> single-layer LSTM -> linear classifier over token indices.

    The sentence representation is the LSTM output at the last *real* token of
    each sequence, so trailing padding does not influence the prediction.
    This assumes inputs are right-padded with ``padding_idx`` (as done by
    ``pad_sequences`` in this notebook, which pads with 0).

    Args:
        vocab_size: Number of rows in the embedding table.
        embedding_dim: Size of each token embedding.
        hidden_dim: Size of the LSTM hidden state.
        output_dim: Number of output logits.
        padding_idx: Index used for padding. Its embedding is kept at zero
            and it is ignored when locating the last token. Pass ``None`` to
            treat every position as a real token (the final LSTM state is
            then used).
    """

    def __init__(self, vocab_size, embedding_dim, hidden_dim, output_dim, padding_idx=0):
        super(TextClassifier, self).__init__()
        self.padding_idx = padding_idx
        self.embedding = nn.Embedding(vocab_size, embedding_dim, padding_idx=padding_idx)
        self.lstm = nn.LSTM(embedding_dim, hidden_dim, batch_first=True)
        self.fc = nn.Linear(hidden_dim, output_dim)

    def forward(self, x):
        """Return logits of shape (batch_size, output_dim).

        Args:
            x: Integer tensor of token indices, shape (batch_size, seq_length).
        """
        embedded = self.embedding(x)  # (batch_size, seq_length, embedding_dim)

        # The LSTM returns its per-step outputs plus a (hidden, cell) tuple.
        # lstm_out: (batch_size, seq_length, hidden_dim)
        # hidden:   (1, batch_size, hidden_dim)
        lstm_out, (hidden, _) = self.lstm(embedded)

        if self.padding_idx is None:
            last_hidden = hidden.squeeze(0)  # (batch_size, hidden_dim)
        else:
            # Number of non-pad tokens per row; clamp so an all-pad row falls
            # back to the first step instead of indexing position -1.
            lengths = (x != self.padding_idx).sum(dim=1).clamp(min=1)
            batch_index = torch.arange(x.size(0), device=x.device)
            # For a single-layer, unidirectional LSTM the output at step t is
            # the hidden state at step t, so this is the state right after
            # the last real token.
            last_hidden = lstm_out[batch_index, lengths - 1]  # (batch_size, hidden_dim)

        logits = self.fc(last_hidden)  # (batch_size, output_dim)
        return logits

tensor([0, 1, 1, 1, 1])
