In [16]:
import pandas as pd
import numpy as np
from bs4 import BeautifulSoup
from pytorch_transformers import BertTokenizer 

import re

df = pd.read_csv('IMDB_Dataset.csv')
# df = df.loc[:10] 시험삼아 해보세용..

In [18]:
# Sentiment 1, 0으로 바꾸기
df['sentiment'] = (df['sentiment'] == 'positive').astype(int)

In [19]:
# 문장 단위로 쪼개기
df_reviews = df.review.str.split('(?<!\w\.\w.)(?<![A-Z][a-z]\.)(?<=\.|\?)\s', expand=True)
df_reviews = df_reviews.stack().reset_index(level=1, drop=True).to_frame('review_single')
df_pre = df[["sentiment"]].merge(df_reviews, left_index=True, right_index=True, how='left')
df_reviews = df.values

In [20]:
counter = 0
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

In [21]:
# 전처리

def clean_review(raw_review: str) -> str:
    # 1. Remove HTML
    review_text = BeautifulSoup(raw_review, "lxml").get_text()
    return review_text

def preprocess(review: str, sentiment: int, total: int, show_progress: bool = True) -> list:
    if show_progress:
        global counter
        counter += 1
        print('Processing... %6i/%6i'% (counter, total), end='\r')
    # 1. Clean text
    review = '<CLS> ' + clean_review(review)
    
    # 2. BERT
    tokens = tokenizer.tokenize(review)
    all_tokens = []
    for i in range(len(tokens)):
        token_ids = tokenizer.convert_tokens_to_ids(tokens[:i+1])
        all_tokens.append([token_ids, sentiment])
    return all_tokens

In [22]:
df_reviews = np.array(list(map(lambda x: preprocess(x[0], x[1], len(df_reviews)), df_reviews)))

Processing...     11/    11

In [24]:
df_reviews[0]

[[[1026], 1],
 [[1026, 18856], 1],
 [[1026, 18856, 2015], 1],
 [[1026, 18856, 2015, 1028], 1],
 [[1026, 18856, 2015, 1028, 2028], 1],
 [[1026, 18856, 2015, 1028, 2028, 1997], 1],
 [[1026, 18856, 2015, 1028, 2028, 1997, 1996], 1],
 [[1026, 18856, 2015, 1028, 2028, 1997, 1996, 2060], 1],
 [[1026, 18856, 2015, 1028, 2028, 1997, 1996, 2060, 15814], 1],
 [[1026, 18856, 2015, 1028, 2028, 1997, 1996, 2060, 15814, 2038], 1],
 [[1026, 18856, 2015, 1028, 2028, 1997, 1996, 2060, 15814, 2038, 3855], 1],
 [[1026, 18856, 2015, 1028, 2028, 1997, 1996, 2060, 15814, 2038, 3855, 2008],
  1],
 [[1026,
   18856,
   2015,
   1028,
   2028,
   1997,
   1996,
   2060,
   15814,
   2038,
   3855,
   2008,
   2044],
  1],
 [[1026,
   18856,
   2015,
   1028,
   2028,
   1997,
   1996,
   2060,
   15814,
   2038,
   3855,
   2008,
   2044,
   3666],
  1],
 [[1026,
   18856,
   2015,
   1028,
   2028,
   1997,
   1996,
   2060,
   15814,
   2038,
   3855,
   2008,
   2044,
   3666,
   2074],
  1],
 [[1026,
   18

In [25]:
np.savez('./IMDB_pre_Dataset', df_reviews)