In [1]:
import pandas as pd
import numpy as np
import unicodedata
import torch
from bs4 import BeautifulSoup
from pytorch_transformers import BertTokenizer, BertModel

import re

# Read the IMDB review CSV and keep only the rows selected by the label slice
# .loc[:20] (label-based, so the end label is inclusive) for a quick trial run.
df = pd.read_csv('IMDB_Dataset.csv', encoding='utf-8').loc[:20]
# Alternative input file to try things out with:
# df = pd.read_csv('test.csv', encoding='utf-8')

In [2]:
# Encode the sentiment column as integers: 1 where the label is 'positive',
# 0 for every other value.
df['sentiment'] = df['sentiment'].eq('positive').astype(int)

In [3]:
# Split every review into sentences.
# The pattern splits on whitespace that follows '.' or '?', except when the
# preceding text looks like "x.y." or like a capitalised two-letter
# abbreviation such as "Mr.". It is written as a raw string so that \w, \. and
# \s reach the regex engine without relying on Python's deprecated handling of
# unknown string escapes.
sentences_wide = df.review.str.split(
    r'(?<!\w\.\w.)(?<![A-Z][a-z]\.)(?<=\.|\?)\s', expand=True
)
# One row per sentence; the index still identifies the originating review.
sentences_long = (
    sentences_wide
    .stack()
    .reset_index(level=1, drop=True)
    .to_frame('review_single')
)
# Attach the review-level sentiment to each of its sentences.
df_pre = df[["sentiment"]].merge(sentences_long, left_index=True, right_index=True, how='left')
# Rows of [sentiment, sentence] as a plain array for the cells below.
df_reviews = df_pre.values

In [4]:
# Find the longest sentence by character count.
# The previous loop never updated MAX, so MAXS ended up holding the last
# non-empty sentence instead of the longest one.
# max() keeps the first of several equally long sentences, and default=''
# covers an empty df_reviews.
MAXS = max((row[1] for row in df_reviews), key=len, default='')
MAX = len(MAXS)

In [5]:
# Load the pretrained uncased BERT-base tokenizer and model weights.
PRETRAINED_NAME = 'bert-base-uncased'
tokenizer = BertTokenizer.from_pretrained(PRETRAINED_NAME)
model = BertModel.from_pretrained(PRETRAINED_NAME)
# Switch to evaluation mode; as the cell's last expression this also displays
# the module structure.
model.eval()

BertModel(
  (embeddings): BertEmbeddings(
    (word_embeddings): Embedding(30522, 768, padding_idx=0)
    (position_embeddings): Embedding(512, 768)
    (token_type_embeddings): Embedding(2, 768)
    (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
    (dropout): Dropout(p=0.1, inplace=False)
  )
  (encoder): BertEncoder(
    (layer): ModuleList(
      (0): BertLayer(
        (attention): BertAttention(
          (self): BertSelfAttention(
            (query): Linear(in_features=768, out_features=768, bias=True)
            (key): Linear(in_features=768, out_features=768, bias=True)
            (value): Linear(in_features=768, out_features=768, bias=True)
            (dropout): Dropout(p=0.1, inplace=False)
          )
          (output): BertSelfOutput(
            (dense): Linear(in_features=768, out_features=768, bias=True)
            (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
            (dropout): Dropout(p=0.1, inplace=False)
          

In [6]:
# Preprocessing
def unicodeToAscii(s):
    """Remove combining marks (accents) from ``s``.

    The string is decomposed with Unicode NFD normalisation and every
    character in category ``Mn`` (nonspacing mark) is dropped. Characters
    without a decomposition are left unchanged, so the result is not
    guaranteed to be pure ASCII.
    """
    decomposed = unicodedata.normalize('NFD', s)
    kept = [ch for ch in decomposed if unicodedata.category(ch) != 'Mn']
    return ''.join(kept)

def clean_review(raw_review: str) -> str:
    """Return the review text with accents stripped and HTML markup removed.

    Args:
        raw_review: Review text, possibly containing HTML tags.

    Returns:
        The text content extracted by BeautifulSoup (lxml parser) after the
        input has been passed through ``unicodeToAscii``.
    """
    # Accent stripping happens first, then the markup is parsed away.
    without_accents = unicodeToAscii(raw_review)
    soup = BeautifulSoup(without_accents, "lxml")
    return soup.get_text()

def preprocess(review: str, total: int, show_progress: bool = True) -> list:
    """Turn one sentence into a list of ``[state, action]`` tensor pairs.

    The text is cleaned with ``clean_review`` and tokenized with special
    tokens added. For every prefix ``tokens[:i + 1]`` (excluding the full
    sequence), the prefix plus the final token of the sequence is run through
    ``model``; the vector at the last position of the model's first output is
    the state. The action is the ``model.embeddings`` vector of the token that
    immediately follows the prefix.

    Args:
        review: Raw sentence text.
        total: Total number of sentences; only used in the progress message.
        show_progress: When true, increments the module-level ``counter`` and
            prints a progress line that overwrites itself.

    Returns:
        A list of ``[hidden, action]`` pairs, one per prefix. The list is
        empty when the tokenized sentence has 512 or more tokens, since such
        sentences are skipped.
    """
    global counter
    if show_progress:
        counter += 1
        print('Processing... %6i/%6i' % (counter, total), end='\r')

    # 1. Clean text
    review = clean_review(review)

    # 2. BERT
    token_ids = torch.tensor([tokenizer.encode(review, add_special_tokens=True)])
    seq_len = token_ids.shape[1]
    all_hiddens = []

    # Skip sentences that are too long (the model has 512 position embeddings).
    if seq_len >= 512:
        return all_hiddens

    # Final token of the full sequence, kept 2-D so it can be concatenated.
    # NOTE(review): presumably the [SEP] special token -- confirm against the
    # tokenizer in use.
    closing_id = token_ids[:, -1:]

    with torch.no_grad():
        for i in range(seq_len - 1):
            # Prefix tokens[0..i] followed by the closing token, shape (1, i + 2).
            prefix_ids = torch.cat([token_ids[:, :i + 1], closing_id], dim=1)
            # Id of the token that follows the prefix, shape (1, 1).
            next_id = token_ids[:, i + 1:i + 2]

            # State: vector at the last position of the model's first output.
            hidden = model(prefix_ids)[0][0][-1]
            # Action: embedding-layer output for the next token.
            action = model.embeddings(next_id)[0][0]

            all_hiddens.append([hidden, action])
    return all_hiddens

In [7]:
# Progress counter that preprocess() increments through `global counter`.
counter = 0

# Parallel lists: one entry per (state, action) pair produced by preprocess().
states, actions, codes = [], [], []

# num = 1

df_length = len(df_reviews)
for sentiment, sentence in df_reviews:
    for hidden, action in preprocess(sentence, df_length):
        states.append(hidden)
        actions.append(action)
        # Every pair inherits the sentiment label of its source review.
        codes.append(torch.tensor([sentiment]))

    # Code for saving the data in separate chunks (disabled):
    # if len(states) > batch_size:
    #     np.savez_compressed('IMDB_Dataset'+str(num)+'.npz',
    #                             states = states, actions = actions, codes = codes)
    #     states = []
    #     actions = []
    #     codes = []
    #     num += 1

# Collapse the per-sample tensors into one tensor per field.
states = torch.stack(states)
actions = torch.stack(actions)
codes = torch.stack(codes)

Processing...    140/   140

In [8]:
# Sanity check: display the shape of the stacked action tensor.
actions.shape

torch.Size([4562, 768])

In [9]:
# Sanity check: display the shape of the stacked state tensor.
states.shape

torch.Size([4562, 768])

In [10]:
# Sanity check: display the shape of the stacked sentiment-label tensor.
codes.shape

torch.Size([4562, 1])

In [11]:
# Persist the three tensors as named NumPy arrays in a single .npz archive.
np.savez(
    'IMDB_Dataset.npz',
    states=states.numpy(),
    actions=actions.numpy(),
    codes=codes.numpy(),
)