In [None]:
import pandas as pd
import numpy as np
import unicodedata
import torch
from bs4 import BeautifulSoup
from pytorch_transformers import BertTokenizer, BertModel

import re

# Load the IMDB review dataset; later cells read its 'review' and 'sentiment' columns.
df = pd.read_csv('IMDB_Dataset.csv', encoding='utf-8')
# df = pd.read_csv('test.csv', encoding='utf-8')
# Swap in the commented-out line above to try things out on test.csv.

In [None]:
# Encode sentiment as an integer label: 1 for 'positive', 0 for anything else.
# NOTE: not idempotent - re-running this cell on the already-encoded column
# maps every row to 0, because no integer equals the string 'positive'.
df['sentiment'] = df['sentiment'].eq('positive').astype(int)

In [None]:
# Split each review into sentences, producing one row per sentence.
#
# The pattern splits on a whitespace character that directly follows '.' or '?',
# except when that whitespace is preceded by
#   - word char, '.', word char, any char   (e.g. "e.g.", "i.e."), or
#   - an uppercase letter, a lowercase letter and '.'   (e.g. "Mr.", "Dr.").
# '!' is not treated as a sentence terminator.
# A raw string is required here: '\w', '\.', '\?' and '\s' are not valid Python
# string escapes and trigger a warning (eventually an error) in a plain literal.
sentence_boundary = r'(?<!\w\.\w.)(?<![A-Z][a-z]\.)(?<=\.|\?)\s'
df_reviews = df.review.str.split(sentence_boundary, expand=True)
# Wide (one column per sentence) -> long; stack() drops the NaN padding cells.
# Dropping the inner index level leaves each sentence keyed by its review's index.
df_reviews = df_reviews.stack().reset_index(level=1, drop=True).to_frame('review_single')
# Attach each review's sentiment label to every one of its sentences.
df_pre = df[["sentiment"]].merge(df_reviews, left_index=True, right_index=True, how='left')
# From here on df_reviews is a 2-D object array of [sentiment, sentence] rows.
df_reviews = df_pre.values

In [None]:
# Load the pretrained 'bert-base-uncased' tokenizer; preprocess() calls its encode() method.
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

In [None]:
# Preprocessing helpers
def unicodeToAscii(s):
    """Strip combining marks (accents) from ``s``.

    The string is NFD-normalised so accented characters decompose into a base
    character plus combining marks, and every character in Unicode category
    'Mn' is then dropped. Despite the name, characters that have no such
    decomposition (e.g. 'ß') are returned unchanged, so the result is not
    guaranteed to be pure ASCII.
    """
    decomposed = unicodedata.normalize('NFD', s)
    kept = [ch for ch in decomposed if unicodedata.category(ch) != 'Mn']
    return ''.join(kept)

def clean_review(raw_review: str) -> str:
    """Return ``raw_review`` with accents and HTML markup removed.

    Combining marks are stripped first (``unicodeToAscii``); the result is then
    parsed by BeautifulSoup with the "lxml" parser and only its text content
    is returned.
    """
    without_accents = unicodeToAscii(raw_review)
    soup = BeautifulSoup(without_accents, "lxml")
    return soup.get_text()

def preprocess(review: str, total: int, show_progress: bool = True,
               max_len: int = 100) -> 'torch.Tensor | None':
    """Clean one sentence and encode it as a column of BERT token ids.

    Args:
        review: Raw sentence text; it is passed through ``clean_review`` first.
        total: Total number of sentences, used only in the progress message.
        show_progress: When true, increment the module-level ``counter`` and
            print a progress line on every 100th call.
        max_len: Sentences whose encoding (special tokens included) has this
            many ids or more are skipped. Defaults to 100.

    Returns:
        An integer tensor of shape ``(n_ids - 1, 1)`` holding every encoded id
        except the first one, or ``None`` when the sentence is too long.
    """
    if show_progress:
        # The caller must initialise the global ``counter`` before the first call.
        global counter
        counter += 1
        if counter % 100 == 0:
            print('Processing... %6i/%6i'% (counter, total))

    # 1. Clean text
    review = clean_review(review)

    # 2. BERT
    ids = tokenizer.encode(review, add_special_tokens=True)

    # Skip over-long sentences before building any tensor.
    if len(ids) >= max_len:
        return None

    # Drop the first id (presumably the [CLS] marker added by
    # add_special_tokens - confirm against the tokenizer) and return the rest
    # as a column vector.
    return torch.tensor(ids, dtype=torch.long)[1:].unsqueeze(1)

In [None]:
# Open the archive 'IMDB_Dataset_n233.npz' and display the shape of its 'states' array.
# NOTE(review): this file must already exist when the cell runs - presumably it is the
# output of an earlier run of the chunk loop below; confirm.
data = np.load('IMDB_Dataset_n233.npz')
data['states'].shape

In [None]:
# Add BERT token ids ("actions_ids") to the saved chunk archives, one chunk at a time.
counter = 0  # progress counter that preprocess() updates through `global counter`

actions_ids = []  # (n_ids, 1) tensors for the sentences of the chunk being built

cur_size = 0      # number of token ids accumulated for the current chunk
batch_size = 512  # a chunk is flushed once it holds at least this many ids
num = 1           # index of the chunk archive currently being processed

df_length = len(df_reviews)
for df_review in df_reviews:
    # Each row is [sentiment, sentence]; only the sentence is encoded.
    parts = preprocess(df_review[1], df_length)
    # preprocess() returns None for sentences it skips as too long.
    if parts is None:
        continue

    actions_ids.append(parts)
    cur_size += len(parts)
    if cur_size >= batch_size:
        # NOTE(review): the original name was missing the '.' before 'npz';
        # confirm that the input archives are named 'IMDB_Dataset<num>.npz'.
        with np.load('IMDB_Dataset' + str(num) + '.npz') as data:
            states = data['states']
            actions = data['actions']
            codes = data['codes']
        # savez_compressed appends '.npz', giving 'IMDB_Dataset_n<num>.npz'.
        np.savez_compressed('IMDB_Dataset_n' + str(num),
                            states=states,
                            actions=actions,
                            codes=codes,
                            actions_ids=torch.cat(actions_ids, dim=0).numpy())
        # Start the next chunk with a fresh list (not 0), so that .append()
        # keeps working on the following iteration.
        actions_ids = []
        cur_size = 0
        num += 1

# NOTE(review): sentences accumulated after the last full chunk are not written
# anywhere; confirm whether a final partial chunk is expected.

In [None]:
# Shape of the 'actions' array most recently loaded by the chunk loop above.
actions.shape

In [None]:
# Shape of the 'states' array most recently loaded by the chunk loop above.
states.shape

In [None]:
# Shape of the 'codes' array most recently loaded by the chunk loop above.
codes.shape

In [None]:
# Write the current states / actions / codes arrays (as last assigned in the chunk
# loop above) to the uncompressed archive 'IMDB_Dataset.npz'.
np.savez('IMDB_Dataset.npz', states = states, actions = actions, codes = codes)