<a href="https://colab.research.google.com/github/sunny0103/DeepLearning_nlp_projects/blob/main/Naver_movie_review%20/Naver_movie_review_cnn.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Naver Movie reviews text classification with a CNN in pytorch
## Install liabraries

In [1]:
!pip install konlpy
!pip install -U torchtext==0.6.0
!pip install -U torch



In [2]:
from konlpy.tag import Okt
from gensim.models import fasttext

import torch
import torchtext
from torchtext.data import TabularDataset, Field, BucketIterator


import copy
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import urllib.request
import re
import os
import random

from sklearn.model_selection import train_test_split

In [3]:
def seed_everything(seed):
    random.seed(seed)
    os.environ['PYTHONHASHSEED'] = str(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed_all(seed)


seed_everything(42)
SEED = 42

## Data Load and preprocessing

In [4]:
urllib.request.urlretrieve("https://raw.githubusercontent.com/e9t/nsmc/master/ratings_train.txt", filename="ratings_train.txt")
urllib.request.urlretrieve("https://raw.githubusercontent.com/e9t/nsmc/master/ratings_test.txt", filename="ratings_test.txt")

('ratings_test.txt', <http.client.HTTPMessage at 0x7da0cf988e20>)

In [5]:
train_df = pd.read_table('ratings_train.txt')
test_df = pd.read_table('ratings_test.txt')
print("Length of train data: ", len(train_df))
print("Length of test data: ", len(test_df))

Length of train data:  150000
Length of test data:  50000


In [6]:
def preprocess(df):
  df.drop('id', axis=1, inplace=True) # remove ID column
  df.drop(df[df['document'].isnull()==True].index, inplace=True) # remove null values in document column
  df = df.drop_duplicates(['document']).reset_index(drop=True) # remove duplicates
  df['document'] = df['document'].str.replace("[^ㄱ-ㅎㅏ-ㅣ가-힣 ]","") # leave only Korean characters
  df['document'] = df['document'].apply(lambda x: ' '.join([token for token in x.split() if len(token)>1]))
  df = df[df.document.apply(lambda x: len(str(x)) > 10 and len(str(x).split()) > 5)].reset_index(drop=True)
  return df

In [7]:
train_df = preprocess(train_df)
train_df.head()

  df['document'] = df['document'].str.replace("[^ㄱ-ㅎㅏ-ㅣ가-힣 ]","") # leave only Korean characters


Unnamed: 0,document,label
0,교도소 이야기구먼 솔직히 재미는 없다평점 조정,0
1,사이몬페그의 익살스런 연기가 돋보였던 영화스파이더맨에서 늙어보이기만 했던 커스틴 던...,1
2,걸음마 세부터 초등학교 학년생인 살용영화ㅋㅋㅋ별반개도 아까움,0
3,반개도 아깝다 욕나온다 이응경 길용우 연기생활이몇년인지정말 발로해도 그것보단 낫겟다...,0
4,액션이 없는데도 재미 있는 몇안되는 영화,1


In [8]:
test_df = preprocess(test_df)

  df['document'] = df['document'].str.replace("[^ㄱ-ㅎㅏ-ㅣ가-힣 ]","") # leave only Korean characters


In [9]:
labels = train_df['label'].values
num_labels = len(set(labels))
num_labels # number of labels

2

In [10]:

stopwords = ['의','가','이','은','들','는','좀','잘','걍','과','도','를','으로','자','에','와','한','하다']

#function for stopwords
def stopwords_process(text):
    word = [t for t in text if t not in stopwords]
    return word

In [11]:
train_df.to_csv("./train_data.csv", index=False)
test_df.to_csv("./test_data.csv", index=False)

In [12]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [13]:
cd "/content/drive/MyDrive/Data/navermovies"

/content/drive/MyDrive/Data/navermovies


In [14]:
tokenizer = Okt()

In [15]:
random_seed=42
np.random.seed(random_seed)
torch.manual_seed(random_seed)

<torch._C.Generator at 0x7da152720630>

In [16]:
pd.read_csv("./train_data.csv")

Unnamed: 0,document,label
0,교도소 이야기구먼 솔직히 재미는 없다평점 조정,0
1,사이몬페그의 익살스런 연기가 돋보였던 영화스파이더맨에서 늙어보이기만 했던 커스틴 던...,1
2,걸음마 세부터 초등학교 학년생인 살용영화ㅋㅋㅋ별반개도 아까움,0
3,반개도 아깝다 욕나온다 이응경 길용우 연기생활이몇년인지정말 발로해도 그것보단 낫겟다...,0
4,액션이 없는데도 재미 있는 몇안되는 영화,1
...,...,...
71090,공포나 재난영화가 아니라 아예 대놓고 비급 크리쳐개그물임ㅋㅋ 음악 완전 흥겹다ㅋ 점...,0
71091,디케이드 다음에 더블 다음에 오즈인데 더블은 조금밖에 안나오네요,1
71092,이게 뭐요 한국인은 거들먹거리고 필리핀 혼혈은 착하다,0
71093,청춘 영화의 최고봉방황과 우울했던 날들의 자화상,1


In [17]:
pd.read_csv("./test_data.csv")

Unnamed: 0,document,label
0,뭐야 평점들은 나쁘진 않지만 짜리는 더더욱 아니잖아,0
1,지루하지는 않은데 완전 막장임 돈주고 보기에는,0
2,아니었어도 다섯 줬을텐데 나와서 심기를 불편하게 하죠,0
3,마치 미국애니에서 튀어나온듯한 창의력없는 로봇디자인부터가고개를 젖게한다,0
4,갈수록 개판되가는 중국영화 유치하고 내용없음 폼잡다 끝남 말도안되는 무기에 유치한남...,0
...,...,...
23697,액션영화로 기대하지말고 스릴러영화라 생각하고 보면 괜찮은 영화인듯,1
23698,오랜만에 평점 로긴했네ㅋㅋ 킹왕짱 쌈뽕한 영화를 만났습니다 강렬하게 육쾌함,1
23699,의지 박약들이나 하는거다 탈영은 일단 주인공 김대희 닮았고 이등병 찐따,0
23700,그림도 좋고 완성도도 높았지만 보는 내내 불안하게 만든다,0


In [18]:
fix_length = 256

Text =  Field(sequential = True,
              tokenize = tokenizer.morphs,
              fix_length = fix_length,
              batch_first = True,
              is_target=False,
              use_vocab=True,
              tokenizer_language ='ko',
              preprocessing = stopwords_process,
              eos_token='<EOS>',
              include_lengths=False
              )

Label =  Field(sequential=False,
               batch_first= True,
               is_target=True,
               dtype = torch.float32,
               use_vocab=True,
               unk_token=None
               )

In [19]:
path = os. getcwd()

In [20]:
valid_ratio =.2

In [21]:
train_data, valid_data = TabularDataset(
        path=path+'/train_data.csv',format='csv',
        fields=[('document', Text), ('label', Label)], skip_header=True).split(split_ratio=(1 - valid_ratio))

In [22]:
test_data = TabularDataset(
    path=path+'/test_data.csv', format='csv',
    fields=[('document', Text), ('label', Label)])

In [23]:
device = 'cuda' if torch.cuda.is_available() else 'cpu'

In [24]:
vector = torchtext.vocab.Vectors(name='./cc.ko.300.vec.gz')

In [25]:
Text.build_vocab(train_data, max_size=10000, vectors=vector)
Label.build_vocab(train_data)

In [26]:
BATCH_SIZE = 16
# an iterator that batches examples of similar lengths together.
train_iter = BucketIterator(dataset=train_data, batch_size=BATCH_SIZE,
                            shuffle=True, device=device,
                            sort_key=lambda x: len(x.document),
                            sort_within_batch=True)
valid_iter = BucketIterator(dataset=valid_data, batch_size=BATCH_SIZE,
                            device=device,
                            sort_key=lambda x: len(x.document),
                            sort_within_batch=True)
test_iter = BucketIterator(dataset=test_data, batch_size=BATCH_SIZE,
                           device=device,
                           sort_key=lambda x: len(x.document),
                           sort_within_batch=True)

In [27]:
import torch.nn as nn
import torch.nn.functional as F

In [44]:
def accuracy_metric(predictions, true_vals):
  max_vals, max_indicies = torch.max(predictions, 1)
  accuracy = (max_indicies == true_vals).sum().data.cpu().numpy()/max_indicies.size()[0]
  return accuracy

In [45]:
import torch
import torch.nn as nn
import torch.nn.functional as F

class CNNClassifier(nn.Module):

    def __init__(
        self,
        input_size,
        word_vec_size,
        n_classes,
        use_batch_norm=False,
        dropout_p=.5,
        window_sizes=[3, 4, 5],
        n_filters=[100, 100, 100],
    ):
        self.input_size = input_size  # vocabulary size
        self.word_vec_size = word_vec_size
        self.n_classes = n_classes
        self.use_batch_norm = use_batch_norm
        self.dropout_p = dropout_p
        # window_size means that how many words a pattern covers.
        self.window_sizes = window_sizes
        # n_filters means that how many patterns to cover.
        self.n_filters = n_filters

        super().__init__()

        self.emb = nn.Embedding.from_pretrained(Text.vocab.vectors)
        # Use nn.ModuleList to register each sub-modules.
        self.feature_extractors = nn.ModuleList()
        for window_size, n_filter in zip(window_sizes, n_filters):
            self.feature_extractors.append(
                nn.Sequential(
                    nn.Conv2d(
                        in_channels=1, # We only use one embedding layer.
                        out_channels=n_filter,
                        kernel_size=(window_size, word_vec_size),
                    ),
                    nn.ReLU(),
                    nn.BatchNorm2d(n_filter) if use_batch_norm else nn.Dropout(dropout_p),
                )
            )

        # An input of generator layer is max values from each filter.
        self.generator = nn.Linear(sum(n_filters), n_classes)
        # We use LogSoftmax + NLLLoss instead of Softmax + CrossEntropy
        self.activation = nn.LogSoftmax(dim=-1)

    def forward(self, x):
        # |x| = (batch_size, length)
        x = self.emb(x)
        # |x| = (batch_size, length, word_vec_size)
        min_length = max(self.window_sizes)
        if min_length > x.size(1):
            # Because some input does not long enough for maximum length of window size,
            # add zero tensor for padding.
            pad = x.new(x.size(0), min_length - x.size(1), self.word_vec_size).zero_()
            # |pad| = (batch_size, min_length - length, word_vec_size)
            x = torch.cat([x, pad], dim=1)
            # |x| = (batch_size, min_length, word_vec_size)


        x = x.unsqueeze(1)
        # |x| = (batch_size, 1, length, word_vec_size)

        cnn_outs = []
        for block in self.feature_extractors:
            cnn_out = block(x)
            # |cnn_out| = (batch_size, n_filter, length - window_size + 1, 1)


            cnn_out = nn.functional.max_pool1d(
                input=cnn_out.squeeze(-1),
                kernel_size=cnn_out.size(-2)
            ).squeeze(-1)
            # |cnn_out| = (batch_size, n_filter)
            cnn_outs += [cnn_out]
        # Merge output tensors from each convolution layer.
        cnn_outs = torch.cat(cnn_outs, dim=-1)
        # |cnn_outs| = (batch_size, sum(n_filters))
        y = self.activation(self.generator(cnn_outs))
        # |y| = (batch_size, n_classes)

        return y


In [46]:
import torch.optim as optim
from tqdm import tqdm

In [47]:
num_batches = int(len(train_data) / BATCH_SIZE)

In [54]:
class Trainer():
  def __init__(self, model, optimizer, criterion):
    self.model = model
    self.optimizer = optimizer
    self.criterion = criterion


  def train(self, epochs=1):
    self.model.train()
    best_acc = float('-inf')
    train_acc = 0.0
    for epoch in range(1, epochs+1):
      running_loss = 0.0
      print('==============Epoch:{}/{}=============='.format(epoch, epochs))
      for batch_idx, batch in tqdm(enumerate(train_iter), total=num_batches, desc="Training progress"):
        input = batch.document
        label = batch.label
        label = label.long()


        self.optimizer.zero_grad()
        logits = self.model(input)
        loss = self.criterion(logits, label)
        running_loss += loss.item()

        loss.backward()
        self.optimizer.step()
        train_acc+=accuracy_metric(logits, label)

      train_loss = running_loss/len(train_data)
      train_acc = train_acc/(batch_idx+1)

      valid_loss, valid_acc = self.validation()
      if best_acc < valid_acc:
        best_acc = valid_acc
        torch.save(self.model.state_dict(), f'./CNN.model' )
      tqdm.write('Train Loss: {:.4f}| Train Accuracy:{:.4f}| Valid Loss: {:.4f}| Valid Accuracy:{:.4f}'.format(train_loss, train_acc, valid_loss, valid_acc))


  def validation(self):
    self.model.eval()
    running_loss = 0.0
    valid_acc =0.0
    with torch.no_grad():
      for batch_idx, batch in enumerate(valid_iter):
        input = batch.document
        label = batch.label
        label = label.long()

        logits = self.model(input)
        loss = self.criterion(logits, label)
        running_loss += loss.item()

        valid_acc += accuracy_metric(logits, label)

      valid_loss = running_loss/len(valid_data)
      valid_acc = valid_acc/(batch_idx+1)
      return valid_loss, valid_acc

In [55]:
input_size = len(Text.vocab.stoi)
word_vec_size = 300
n_classes = 2
num_epochs = 5

model = CNNClassifier(input_size = input_size, word_vec_size=word_vec_size, n_classes=n_classes).to(device)
optimizer = optim.Adam(model.parameters())
criterion = nn.NLLLoss()

In [56]:
trainer = Trainer(model, optimizer, criterion)
trainer.train(num_epochs)
trainer.validation()



Training progress: 3555it [00:12, 275.25it/s]                          


Train Loss: 0.0269| Train Accuracy:0.7977| Valid Loss: 0.0251| Valid Accuracy:0.8232


Training progress: 3555it [00:12, 282.97it/s]                          


Train Loss: 0.0212| Train Accuracy:0.8526| Valid Loss: 0.0233| Valid Accuracy:0.8310


Training progress: 3555it [00:12, 276.84it/s]                          


Train Loss: 0.0147| Train Accuracy:0.9045| Valid Loss: 0.0269| Valid Accuracy:0.8250


Training progress: 3555it [00:12, 281.52it/s]                          


Train Loss: 0.0081| Train Accuracy:0.9532| Valid Loss: 0.0314| Valid Accuracy:0.8209


Training progress: 3555it [00:12, 282.35it/s]                          


Train Loss: 0.0046| Train Accuracy:0.9745| Valid Loss: 0.0382| Valid Accuracy:0.8181


(0.03824016958236961, 0.8180923407301359)

In [57]:
Text.build_vocab(test_data, max_size=10000, vectors=vector)
Label.build_vocab(test_data)

In [58]:
model_load = '.CNN.model'

In [59]:
model.eval()
model.load_state_dict(torch.load(model_load, map_location=device))
with torch.no_grad():
  test_acc = 0.0
  for batch_idx, batch in enumerate(test_iter):
    input = batch.document
    label = batch.label
    label = label.long()

    logits = model(input)
    test_acc += accuracy_metric(logits, label)
  test_acc = test_acc/(batch_idx+1)

print('Accuracy:',float(test_acc))

Accuracy: 0.518971708116445
