In [1]:
import pandas as pd
import numpy as np
import unicodedata
import torch
from bs4 import BeautifulSoup
from pytorch_transformers import BertTokenizer, BertModel

import re

# Load the IMDB review CSV and keep only a small slice for a trial run.
# .loc slices by label and includes the end label, so labels up to and including 25 are kept.
df = pd.read_csv('IMDB_Dataset.csv', encoding='utf-8').loc[:25]

In [2]:
# Encode the sentiment column as integers: 'positive' -> 1, every other value -> 0.
df['sentiment'] = df['sentiment'].eq('positive').astype(int)

In [3]:
# Split every review into sentences.
# The pattern matches one whitespace character that follows '.' or '?', unless the text
# just before it looks like an abbreviation ("e.g."-like: word char, dot, word char, any
# char) or a capitalised two-letter title ("Mr."-like).
# It is written as a raw string so that \w, \. and \s reach the regex engine unchanged
# instead of being (invalid) Python string escapes.
sentence_frame = (
    df.review.str.split(r'(?<!\w\.\w.)(?<![A-Z][a-z]\.)(?<=\.|\?)\s', expand=True)
    .stack()                          # one row per sentence; padding cells are dropped
    .reset_index(level=1, drop=True)  # keep only the original review index
    .to_frame('review_single')
)

# Attach each sentence to the sentiment of the review it came from (joined on the index).
df_pre = df[["sentiment"]].merge(sentence_frame, left_index=True, right_index=True, how='left')

# Plain array of [sentiment, sentence] rows, consumed by the cells below.
df_reviews = df_pre.values

In [4]:
# Find the longest sentence (by character count) among the split reviews.
# MAX holds the best length seen so far and MAXS the sentence that produced it.
MAX = 0
MAXS = ''
for df_review in df_reviews:
    sentence = df_review[1]
    if MAX < len(sentence):
        # Record the new best length too; otherwise every non-empty sentence
        # would compare as "longer" and MAXS would end up as the last one seen.
        MAX = len(sentence)
        MAXS = sentence

In [5]:
# Load the tokenizer and the encoder from the same pretrained checkpoint.
PRETRAINED_NAME = 'bert-base-uncased'
tokenizer = BertTokenizer.from_pretrained(PRETRAINED_NAME)
model = BertModel.from_pretrained(PRETRAINED_NAME)

In [6]:
# Preprocessing
def unicodeToAscii(s):
    """Strip combining marks from a string.

    The input is decomposed with NFD normalization and every character whose
    Unicode category is 'Mn' is dropped. All other characters are kept as they
    are, including non-ASCII ones that have no decomposition.
    """
    kept = []
    for ch in unicodedata.normalize('NFD', s):
        if unicodedata.category(ch) == 'Mn':
            continue
        kept.append(ch)
    return ''.join(kept)

def clean_review(raw_review: str) -> str:
    """Return the plain text of a raw review.

    Combining marks are stripped first with unicodeToAscii. The result is then
    parsed with BeautifulSoup's "lxml" parser and only its text content is kept.
    """
    without_marks = unicodeToAscii(raw_review)
    soup = BeautifulSoup(without_marks, "lxml")
    return soup.get_text()

def preprocess(review: str, total: int, show_progress: bool = True, max_seq_len: int = 512) -> list:
    """Turn one piece of review text into a list of [hidden, action] pairs.

    The text is cleaned with clean_review and encoded by the module-level
    `tokenizer` with special tokens added. For each step i, the first i + 1 token
    ids followed by the final token id are fed to the module-level `model`.
    `hidden` is the last position of the first model output for that input, and
    `action` is the token id at position i + 1 of the full sequence.

    Args:
        review: Raw text to process.
        total: Total number of items, used only in the progress message.
        show_progress: When true, increments the global `counter` (which must
            already exist) and prints a progress line.
        max_seq_len: Largest number of token ids ever passed to the model in one
            call. The default of 512 is assumed to match the position limit of the
            loaded BERT checkpoint.

    Returns:
        A list of [hidden, action] pairs, one per step.
    """
    global counter
    if show_progress:
        counter += 1
        print('Processing... %6i/%6i'% (counter, total), end='\r')

    # 1. Clean text
    review = clean_review(review)

    # 2. BERT
    token_ids = torch.tensor([tokenizer.encode(review, add_special_tokens=True)])
    sequence = token_ids[0]
    closing = sequence[-1:]

    # The model input at step i has i + 2 ids (prefix plus the closing id), so the
    # step count is capped at max_seq_len - 1 to keep every input within max_seq_len.
    n_steps = min(len(sequence) - 1, max_seq_len - 1)

    all_hiddens = []
    with torch.no_grad():
        for i in range(n_steps):
            temp_ids = torch.cat([sequence[:i + 1], closing], dim=0).view(1, -1)
            action = sequence[i + 1]
            hidden = model(temp_ids)[0][0][-1]
            all_hiddens.append([hidden, action])
    return all_hiddens

In [7]:
# Progress counter that preprocess() increments through `global counter`.
counter = 0

# Per-step accumulators: hidden vector, next-token id, and sentiment label.
states, actions, codes = [], [], []

df_length = len(df_reviews)
for sentiment, sentence in df_reviews:
    for hidden, next_token in preprocess(sentence, df_length):
        states.append(hidden)
        actions.append(torch.tensor([next_token]))
        # The sentence's sentiment is repeated for every step taken from it.
        codes.append(torch.tensor([sentiment]))

# Collapse the Python lists into single tensors.
states = torch.stack(states)
actions = torch.stack(actions)
codes = torch.stack(codes)

Processing...    168/   168

In [8]:
# Shape check: one next-token id per collected step (stacked from 1-element tensors).
actions.shape

torch.Size([5679, 1])

In [9]:
# Shape check: one hidden vector per collected step.
states.shape

torch.Size([5679, 768])

In [10]:
# Shape check: one sentiment label per collected step (stacked from 1-element tensors).
codes.shape

torch.Size([5679, 1])

In [11]:
# Write the three stacked tensors to one compressed .npz archive,
# stored under the keys 'states', 'actions' and 'codes'.
arrays_to_save = {'states': states, 'actions': actions, 'codes': codes}
np.savez_compressed('IMDB_Dataset.npz', **arrays_to_save)