<a href="https://colab.research.google.com/github/yejun2/ai_projects/blob/main/movie_review_based_textCNN.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

***영화 리뷰 데이터 링크***

https://github.com/e9t/nsmc/

***참조 블로그***

https://kaya-dev.tistory.com/6

https://wikidocs.net/50739

***데이터 읽어오기***

In [None]:
# Install a pinned torchtext release. Presumably 0.6.0 is required because the
# cells below import Field, TabularDataset and BucketIterator directly from
# torchtext.data -- confirm before upgrading.
!pip install -U torchtext==0.6.0

In [None]:
import pandas as pd
import re
from tqdm import tqdm
from sklearn.model_selection import train_test_split

# Read the train/test rating files as delimiter-separated tables.
# NOTE(review): these paths are relative to the working directory, while later
# cells use absolute /content/drive/MyDrive/... paths -- presumably the
# notebook runs from /content with Google Drive mounted; confirm.
train_data = pd.read_table("drive/MyDrive/Colab Notebooks/ratings_train.txt")
test_data = pd.read_table("drive/MyDrive/Colab Notebooks/ratings_test.txt")

데이터 양이 너무 많아 프로젝트 진행에 차질이 있어 데이터를 축소함
(아이디어: 토큰화한 데이터를 저장할 때 join으로 토큰을 쉼표로 구분한 문자열로 저장하고, 로드해서 사용하면 가진 데이터 전체를 이용할 수 있을 것 같음)

데이터 개수 확인

In [None]:
# Row counts of the full train and test tables, before any subsetting.
print(len(train_data))
print(len(test_data))

150000
50000


In [None]:
# Keep only the first 40,000 training rows and 5,000 test rows.
# .copy() makes each subset an independent frame: document_preprocessing()
# later modifies the frame in place, which on a bare slice triggers pandas'
# SettingWithCopyWarning.
train_data = train_data[:40000].copy()
test_data = test_data[:5000].copy()

축소 후 데이터 개수 확인

In [None]:
# Row counts after subsetting.
print(len(train_data))
print(len(test_data))

40000
5000


document를 전처리할 메소드 생성

In [None]:
def text_preprocessing(doc):
    """Return ``doc`` reduced to Hangul characters separated by single spaces.

    ``doc`` is converted with ``str()`` first, so non-string values are
    accepted. Tab/CR/LF/FF/VT characters and every character outside the
    ranges 가-힣, ㄱ-ㅎ and ㅏ-ㅣ are replaced by a space; runs of whitespace
    are then collapsed and leading/trailing whitespace is removed.
    """
    text = str(doc)
    # Control whitespace first, then everything that is not Hangul.
    for pattern in ('[\t\r\n\f\v]', '[^가-힣ㄱ-ㅎㅏ-ㅣ]'):
        text = re.sub(pattern, ' ', text)
    return " ".join(text.split())

def document_preprocessing(data):
    """Deduplicate, drop incomplete rows and clean the ``document`` column.

    The frame is modified in place and also returned, so both
    ``document_preprocessing(df)`` and ``df = document_preprocessing(df)``
    work.

    Args:
        data: DataFrame with a ``document`` column.

    Returns:
        The same DataFrame, with duplicate documents removed (first
        occurrence kept), rows containing missing values removed, and every
        document passed through ``text_preprocessing``.
    """
    data.drop_duplicates(subset=['document'], inplace=True)
    # dropna() returns a new frame unless inplace=True is given; without it
    # the rows with missing values are never actually removed.
    data.dropna(axis=0, inplace=True)
    data['document'] = [text_preprocessing(x) for x in tqdm(data['document'])]
    return data

train_data와 test_data를 전처리

In [None]:
# Clean both splits; each call shows one tqdm progress bar over its documents.
train_data = document_preprocessing(train_data)
test_data = document_preprocessing(test_data)

100%|██████████| 39350/39350 [00:00<00:00, 164708.21it/s]
100%|██████████| 4969/4969 [00:00<00:00, 157820.78it/s]


전처리 데이터 저장

In [None]:
# Write the cleaned splits to the working directory. The DataFrame index is
# written as the first column (no index=False here); the TabularDataset field
# list defined later names its first column 'idx'.
# NOTE(review): the files are later read from '/content/' -- this assumes the
# working directory is /content; confirm.
train_data.to_csv("preprocessed_train.csv")
test_data.to_csv("preprocessed_test.csv")

***colab에서 OKT를 사용할 수 있는 환경을 만들어주는 코드***

In [None]:
# Download and run a third-party Colab setup script (presumably installs
# KoNLPy and its dependencies -- confirm).
# NOTE(review): this pipes a remote script straight into bash from a mutable
# branch; review the script or pin it to a commit before running.
!curl -s https://raw.githubusercontent.com/teddylee777/machine-learning/master/99-Misc/01-Colab/mecab-colab.sh | bash

konlpy를 import하고 okt 변수 생성

In [None]:
from konlpy.tag import Okt

# Single shared Okt analyzer; used by tokenizing_method(), as the TEXT field's
# tokenizer, and by predict().
okt = Okt()

토큰화하는 메소드를 생성

In [None]:
def tokenizing_method(input_data):
    """Tokenize every sentence with Okt and drop stopwords.

    Args:
        input_data: Iterable of sentences.

    Returns:
        A list with one token list per sentence, in input order. Tokens come
        from ``okt.morphs(sentence, stem=True)``; tokens found in the
        stopword list are removed.
    """
    stopwords = ['의','가','이','은','들','는','좀','잘','걍','과','도','를','으로','자','에','와','한','하다']

    def strip_stopwords(morphemes):
        # Keep the original token order, minus stopwords.
        return [morpheme for morpheme in morphemes if morpheme not in stopwords]

    return [
        strip_stopwords(okt.morphs(sentence, stem=True))
        for sentence in input_data
    ]

tokenizing_method로 train_data['document']를 토큰화한 데이터를 생성

In [None]:
# One token list per training document, in the same row order as train_data.
tokenized_data = tokenizing_method(train_data['document'])

확인해보는 코드

In [None]:
# Sanity check: show the token lists of the first ten documents.
print(tokenized_data[:10])

토큰화된 데이터를 저장하는 코드

In [None]:
# Save the tokens as a CSV with one row per document.
# The DataFrame is built only for the export and `tokenized_data` is NOT
# rebound: the Word2Vec cell below needs it to stay an iterable of token lists
# (iterating a DataFrame yields its column labels, not its rows).
pd.DataFrame(tokenized_data).to_csv("/content/drive/MyDrive/tokenized_train_data.csv", index=False, encoding='utf-8-sig')

In [None]:
# Save the tokens together with their labels.
# The second positional argument of pd.DataFrame is `index`, not an extra
# column, so the labels are added explicitly as a 'label' column instead.
# .to_numpy() attaches them by position: train_data's index has gaps after
# drop_duplicates, while the token rows are numbered 0..n-1.
tokenized_data_label = pd.DataFrame(tokenized_data).copy()
tokenized_data_label.insert(0, 'label', train_data['label'].to_numpy())
tokenized_data_label.to_csv("/content/drive/MyDrive/tokenized_train_data_label.csv", index=False, encoding='utf-8-sig')

토큰화된 데이터를 불러오고 확인하는 코드

In [None]:
# Read the saved token CSV back and show its first ten rows.
# NOTE(review): rows come back as one token per column; shorter reviews are
# presumably padded with NaN -- confirm before feeding this to a model.
saved_tokenized_data = pd.read_csv("/content/drive/MyDrive/tokenized_train_data.csv")
print(saved_tokenized_data.head(10))

In [None]:
from gensim.models import Word2Vec

# Train Word2Vec on the tokenized training documents.
# `sentences` must be an iterable of token lists (the output of
# tokenizing_method); make sure `tokenized_data` has not been rebound to a
# DataFrame before this cell runs.
# sg=1 selects skip-gram; vector_size=100 matches the emb_dim passed to
# TextCNN later; min_count=1 keeps every token.
embedding_model = Word2Vec(sentences=tokenized_data,
                           sg=1,
                           vector_size=100,
                           window=2,
                           min_count=1,
                           workers=4
                           )


In [None]:
# Print the model summary and the nearest neighbours of one word as a quick
# quality check of the learned vectors.
print(embedding_model)
model_result = embedding_model.wv.most_similar('재미')
print(model_result)

In [None]:
from gensim.models import KeyedVectors


In [None]:
# Export only the word vectors in word2vec format; the same path is passed to
# torchtext's Vectors(...) later.
embedding_model.wv.save_word2vec_format('/content/drive/MyDrive/review_tokens_w2v') # save the model

In [None]:
# Reload the exported vectors and query them to verify the round trip.
loaded_model = KeyedVectors.load_word2vec_format('/content/drive/MyDrive/review_tokens_w2v') # load the model

model_result = loaded_model.most_similar("추천")
print(model_result)

[('강력', 0.8423237204551697), ('강추', 0.8385577201843262), ('적극', 0.8177915811538696), ('권하다', 0.7926265597343445), ('해드리다', 0.76430344581604), ('보삼', 0.7491391897201538), ('소장', 0.7400360703468323), ('남자라면', 0.7314027547836304), ('감사', 0.730396032333374), ('재방송', 0.7297847867012024)]


In [None]:
import torchtext
from torchtext.data import Field

In [None]:
from torchtext.data import TabularDataset

In [None]:
import torch

In [None]:
# Tokens filtered out by preprocess().
stopwords = ['의','가','이','은','들','는','좀','잘','걍','과','도','를','으로','자','에','와','한','하다']

def preprocess(text):
    """Return the items of ``text`` that are not in ``stopwords``.

    ``text`` is iterated item by item; order is preserved and the result is
    always a new list.
    """
    kept = []
    for token in text:
        if token in stopwords:
            continue
        kept.append(token)
    return kept

# Pass-through fields (non-sequential, no vocabulary) for the two leading CSV columns.
ID = Field(sequential= False, use_vocab = False)

IDX = Field(sequential = False, use_vocab = False)
# Review text: tokenized with Okt, then filtered by preprocess().
# NOTE(review): the tokenizer here is okt.morphs without stem=True, whereas
# tokenizing_method() (whose output trains Word2Vec) uses stem=True, so some
# vocabulary tokens may have no pre-trained vector -- confirm this is intended.
TEXT = Field(sequential = True, batch_first = True,
			is_target = False, use_vocab = True,
			tokenize = okt.morphs,
			preprocessing = preprocess) # morpheme analysis, then extra processing of the resulting tokens
# Label is loaded as float32 without a vocabulary; the training code casts it to an integer type.
LABEL = Field(sequential = False,batch_first = True,is_target = True,
			use_vocab = False,dtype = torch.float32)

# Field definitions; expected to follow the column order of the CSVs written
# earlier with to_csv (index column first) -- confirm.
field = [('idx', IDX),('id',ID),('document',TEXT),('label',LABEL)]

# Load the previously preprocessed documents and turn them into the datasets used for training.
train_dataset, validation_dataset = TabularDataset.splits(
    path = '/content/', # required!
    train = 'preprocessed_train.csv',
    validation = "preprocessed_test.csv",
    format = 'csv',
    fields = field,
    skip_header = True,

)

In [None]:
# Show the processed token list of the first training example.
print(train_dataset[0].document)

['아', '더빙', '진짜', '짜증나네요', '목소리']


https://wikidocs.net/60314

In [None]:
import torch
from torchtext.vocab import Vectors
from torchtext.data import BucketIterator

# Load the word vectors exported earlier with save_word2vec_format.
vectors = Vectors(name='/content/drive/MyDrive/review_tokens_w2v')

# Build the text vocabulary from the training split and attach the vectors.
TEXT.build_vocab(train_dataset, vectors = vectors, min_freq = 1, max_size = None)
# NOTE(review): LABEL, IDX and ID are declared with use_vocab=False, so these
# three vocabularies are presumably unused -- confirm before removing.
LABEL.build_vocab(train_dataset)
IDX.build_vocab(train_dataset)
ID.build_vocab(train_dataset)

vocab = TEXT.vocab

# Use the GPU when one is available.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(device)

# Batch iterators over both datasets, 20 examples per batch, placed on `device`.
train_iter, validation_iter = BucketIterator.splits(
    datasets = (train_dataset, validation_dataset),
    batch_size = 20,
    device = device,
    sort = False
)

# "Number and dimension of embedding vectors": shape of the matrix that
# TextCNN copies into its embedding layer.
print('임베딩 벡터의 개수와 차원 : {} '.format(TEXT.vocab.vectors.shape))

cuda
임베딩 벡터의 개수와 차원 : torch.Size([47156, 100]) 


여기까지

In [None]:
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F

class TextCNN(nn.Module):
    """Convolutional text classifier on top of pre-trained word embeddings.

    Token ids are embedded, convolved with one ``Conv2d`` per window size in
    ``kernel_wins`` (each kernel spans the full embedding width), max-pooled
    over the sequence axis, concatenated, passed through dropout and projected
    to ``num_class`` logits.

    Args:
        vocab_built: Object that supports ``len()`` and exposes ``.vectors``,
            a ``(len(vocab_built), emb_dim)`` tensor copied into the
            embedding layer as its initial weights.
        emb_dim: Width of each embedding vector.
        dim_channel: Number of output channels of every convolution.
        kernel_wins: Window heights (in tokens), one convolution per entry.
        num_class: Number of output logits.
    """

    def __init__(self, vocab_built, emb_dim, dim_channel, kernel_wins, num_class):

        super(TextCNN, self).__init__()

        # Initialise the embedding from the pre-trained vectors; it remains trainable.
        self.embed = nn.Embedding(len(vocab_built), emb_dim)
        self.embed.weight.data.copy_(vocab_built.vectors)

        self.convs = nn.ModuleList([nn.Conv2d(1, dim_channel, (w, emb_dim)) for w in kernel_wins])
        self.relu = nn.ReLU()
        self.dropout = nn.Dropout(0.4)
        self.fc = nn.Linear(len(kernel_wins)*dim_channel, num_class)

    def forward(self, x):
        """Return logits of shape ``(batch, num_class)``.

        Args:
            x: Integer tensor of token ids shaped ``(batch, seq_len)``.
        """
        emb_x = self.embed(x)  # (batch, seq_len, emb_dim)

        # A convolution cannot run on an input shorter than its window, so
        # zero-pad the sequence axis up to the largest window instead of
        # failing on very short batches. Derived from the conv layers so that
        # no extra state is needed on the module.
        longest_window = max((conv.kernel_size[0] for conv in self.convs), default=0)
        deficit = longest_window - emb_x.size(1)
        if deficit > 0:
            emb_x = F.pad(emb_x, (0, 0, 0, deficit))

        emb_x = emb_x.unsqueeze(1)  # add the single input channel

        # Each conv output is (batch, dim_channel, positions, 1); drop the last axis.
        con_x = [self.relu(conv(emb_x)).squeeze(-1) for conv in self.convs]
        # Max over all positions -> (batch, dim_channel) per window size.
        pool_x = [F.max_pool1d(feat, feat.size(2)).squeeze(-1) for feat in con_x]

        fc_x = torch.cat(pool_x, dim=1)
        fc_x = self.dropout(fc_x)

        logit = self.fc(fc_x)

        return logit

In [None]:
def train_model(model, device, train_itr, optimizer):
    """Train ``model`` for one epoch.

    Args:
        model: Module mapping a batch of token ids to class logits.
        device: Device the inputs and targets are moved to.
        train_itr: Iterable of batches exposing ``.document`` and ``.label``;
            must also expose ``.dataset`` with a length.
        optimizer: Optimizer over ``model``'s parameters.

    Returns:
        ``(mean_loss, accuracy)`` as plain floats: the per-example mean
        cross-entropy over the epoch and the accuracy in percent.
    """
    model.train()
    total_loss, corrects = 0.0, 0

    for batch in train_itr:
        text = batch.document.to(device)
        # cross_entropy needs integer class indices; cast on the target's own
        # device instead of round-tripping through a CPU LongTensor.
        target = batch.label.to(device).long()

        optimizer.zero_grad()
        logit = model(text)
        loss = F.cross_entropy(logit, target)
        loss.backward()
        optimizer.step()

        # cross_entropy returns the batch mean; weight it by the batch size so
        # that dividing by the dataset size yields a true per-example mean.
        total_loss += loss.item() * target.size(0)
        corrects += (logit.argmax(dim=1) == target).sum().item()

    num_examples = len(train_itr.dataset)
    train_loss = total_loss / num_examples
    accuracy = 100.0 * corrects / num_examples

    return train_loss, accuracy

In [None]:
def evaluate(model, device, itr):
    """Evaluate ``model`` on every batch of ``itr`` without updating it.

    Args:
        model: Module mapping a batch of token ids to class logits.
        device: Device the inputs and targets are moved to.
        itr: Iterable of batches exposing ``.document`` and ``.label``; must
            also expose ``.dataset`` with a length.

    Returns:
        ``(mean_loss, accuracy)`` as plain floats: the per-example mean
        cross-entropy and the accuracy in percent.
    """
    model.eval()
    total_loss, corrects = 0.0, 0

    # No gradients are needed for evaluation; skipping them saves memory.
    with torch.no_grad():
        for batch in itr:
            text = batch.document.to(device)
            # cross_entropy needs integer class indices.
            target = batch.label.to(device).long()

            logit = model(text)
            loss = F.cross_entropy(logit, target)

            # Batch mean -> batch sum, so the final division by the dataset
            # size gives a per-example mean.
            total_loss += loss.item() * target.size(0)
            corrects += (logit.argmax(dim=1) == target).sum().item()

    num_examples = len(itr.dataset)
    test_loss = total_loss / num_examples
    accuracy = 100.0 * corrects / num_examples

    return test_loss, accuracy

In [None]:
# Build the classifier: 100-dim embeddings (the Word2Vec vector_size), 10
# channels per convolution, window sizes 3/4/5, 2 output classes.
model = TextCNN(vocab, 100, 10, [3, 4, 5], 2).to(device)
print(model)

# NOTE(review): `device` is recomputed here with the same expression used
# when the iterators were created; the model above was already moved to it.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

optimizer = optim.Adam(model.parameters(), lr=0.001)

# Best validation accuracy seen so far; -1 guarantees the first epoch is saved.
best_test_acc = -1

# Train for 3 epochs, evaluating after each one.
for epoch in range(1, 3+1):

    tr_loss, tr_acc = train_model(model, device, train_iter, optimizer)
    #break
    print('Train Epoch: {} \t Loss: {} \t Accuracy: {}%'.format(epoch, tr_loss, tr_acc))

    val_loss, val_acc = evaluate(model, device, validation_iter)
    print('Valid Epoch: {} \t Loss: {} \t Accuracy: {}%'.format(epoch, val_loss, val_acc))

    # Checkpoint only when validation accuracy improves.
    if val_acc > best_test_acc:
        best_test_acc = val_acc

        print("model saves at {} accuracy".format(best_test_acc))
        torch.save(model.state_dict(), "/content/drive/MyDrive/TextCNN_Best_Validation")

    print('-----------------------------------------------------------------------------')

In [None]:
# Rebuild the architecture (on the CPU) and restore the best checkpoint.
model = TextCNN(vocab, 100, 10, [3, 4, 5], 2)
# map_location="cpu": the checkpoint may have been saved from a CUDA model;
# remapping lets it load on a CPU-only runtime and matches this CPU model.
model.load_state_dict(torch.load("/content/drive/MyDrive/TextCNN_Best_Validation", map_location="cpu"))
# Inference mode (disables dropout); as the last expression of the cell it
# also displays the module summary.
model.eval()

TextCNN(
  (embed): Embedding(47156, 100)
  (convs): ModuleList(
    (0): Conv2d(1, 10, kernel_size=(3, 100), stride=(1, 1))
    (1): Conv2d(1, 10, kernel_size=(4, 100), stride=(1, 1))
    (2): Conv2d(1, 10, kernel_size=(5, 100), stride=(1, 1))
  )
  (relu): ReLU()
  (dropout): Dropout(p=0.4, inplace=False)
  (fc): Linear(in_features=30, out_features=2, bias=True)
)

In [None]:
import torch.nn.functional as F

In [None]:
def predict(model,sentence):
    """Return the class logits of ``model`` for one raw review string.

    The sentence goes through the same steps the ``TEXT`` field is configured
    with (``okt.morphs`` followed by ``preprocess``), is mapped to vocabulary
    ids, padded to at least 6 ids and run as a batch of size one.

    Args:
        model: Trained classifier taking a ``(batch, seq_len)`` id tensor.
        sentence: Review text.

    Returns:
        Tensor of logits with a leading batch dimension of 1.
    """
    model.eval()
    with torch.no_grad():
        # Tokenize first, then drop stopwords from the resulting tokens.
        tokens = preprocess(okt.morphs(sentence))
        token_ids = [TEXT.vocab.stoi[token] for token in tokens]
        # Explicit dtype: an empty id list would otherwise become a float
        # tensor, which the embedding lookup rejects.
        sent = torch.tensor(token_ids, dtype=torch.long)
        if len(sent) < 6:
            # Pad short inputs (one id on the left, the rest on the right) so
            # that every convolution window fits.
            pad_id = TEXT.vocab.stoi[TEXT.pad_token]
            sent = F.pad(sent, pad=(1, 6 - len(sent) - 1), value=pad_id)
        sent = sent.unsqueeze(dim=0)  # add the batch dimension
        # Run on whichever device the model currently lives on.
        sent = sent.to(next(model.parameters()).device)
        output = model(sent)

        return output

In [None]:
device = 'cuda' if torch.cuda.is_available() else 'cpu'
# Interactive inference runs on the CPU; move the model there once, up front,
# instead of moving it to the GPU and back to the CPU on every iteration.
model = model.to('cpu')
# Read reviews until the user enters '0'; class 1 is reported as positive,
# anything else as negative.
while True:
  user = input("테스트 할 리뷰를 작성하세요 : ")
  if user == '0':
    break
  pred = predict(model,user)
  pred = torch.max(pred,1)[1].item()
  if (pred == 1) :
    print("긍정적인 리뷰입니다.")
  else :
    print("부정적인 리뷰입니다.")
  print(pred)

테스트 할 리뷰를 작성하세요 : 지루함
부정적인 리뷰입니다.
0
테스트 할 리뷰를 작성하세요 : 진짜 재미없음
부정적인 리뷰입니다.
0
테스트 할 리뷰를 작성하세요 : 다시는 보고싶지 않다
부정적인 리뷰입니다.
0
테스트 할 리뷰를 작성하세요 : 마음 따스해지는 영화였다
긍정적인 리뷰입니다.
1
테스트 할 리뷰를 작성하세요 : ㅋㅋㅋㅋㅋㅋ 웃긴 영화다 코미디 영화 찾는 분께 추천
긍정적인 리뷰입니다.
1


KeyboardInterrupt: Interrupted by user