In [None]:
from google.colab import auth
auth.authenticate_user()

from google.colab import drive
drive.mount('/content/gdrive')

!pip install pytorch_transformers

In [None]:
import pandas as pd
import numpy as np
import unicodedata
import torch
from bs4 import BeautifulSoup
from pytorch_transformers import BertTokenizer, BertModel

import re

# Load the review CSV from the mounted Drive; later cells read its `review`
# and `sentiment` columns.
df = pd.read_csv('/content/gdrive/My Drive/IMDB_Dataset.csv', encoding='utf-8')
# df = pd.read_csv('test.csv', encoding='utf-8')
# Uncomment the line above instead to try the pipeline out on a small test file.

In [None]:
# Convert the sentiment labels to integers: 1 for 'positive', 0 for anything else.
# The guard makes this cell safe to re-run: once the column already holds
# integers, comparing it with 'positive' again would silently set every label to 0.
if not pd.api.types.is_integer_dtype(df['sentiment']):
    df['sentiment'] = (df['sentiment'] == 'positive').astype(int)

In [None]:
# Split every review into sentences, producing one row per sentence.
# The pattern is a raw string so that \w, \. and \s reach the regex engine as
# written instead of being parsed as (invalid) Python string escapes.
# It splits on a whitespace character that follows '.' or '?', unless the text
# before it looks like "x.y." (word char, dot, word char, any char) or like
# an uppercase letter + lowercase letter + dot (e.g. "Mr.").
df_reviews = df.review.str.split(r'(?<!\w\.\w.)(?<![A-Z][a-z]\.)(?<=\.|\?)\s', expand=True)
# Wide -> long: stack the sentence columns and keep only the original row index,
# so each sentence can be joined back to its review.
df_reviews = df_reviews.stack().reset_index(level=1, drop=True).to_frame('review_single')
# Attach the review's sentiment label to each of its sentences (join on the index).
df_pre = df[["sentiment"]].merge(df_reviews, left_index=True, right_index=True, how='left')
# From here on `df_reviews` is a 2-column array of [sentiment, sentence] rows.
df_reviews = df_pre.values

In [None]:
# Load the pretrained 'bert-base-uncased' tokenizer and model, place the model
# on the GPU and switch it to evaluation mode.
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
model = BertModel.from_pretrained('bert-base-uncased').to('cuda')
model.eval()

In [None]:
import pickle
import gzip

# Load the gzip-compressed pickle from Drive into Top5000_BtoA. preprocess()
# looks every BERT token id up in it and skips sentences that contain an id
# the lookup does not have.
# NOTE(review): pickle.load can execute arbitrary code while unpickling, so
# only load this file if its origin is trusted.
with gzip.open('/content/gdrive/My Drive/Top5000_BtoA.pickle','rb') as f:
    Top5000_BtoA = pickle.load(f)

In [None]:
# Preprocessing helpers
def unicodeToAscii(s):
    """Return `s` with combining marks (e.g. accents) removed.

    The string is NFD-normalized so accented letters decompose into a base
    character plus combining marks, and every character whose Unicode category
    is 'Mn' is dropped. Characters without such a decomposition are kept as
    they are, so the result is not guaranteed to be pure ASCII.
    """
    decomposed = unicodedata.normalize('NFD', s)
    kept = [ch for ch in decomposed if unicodedata.category(ch) != 'Mn']
    return ''.join(kept)

def clean_review(raw_review: str) -> str:
    """Return the plain text of a raw review.

    Combining marks are stripped first (via `unicodeToAscii`), then the result
    is parsed as HTML with the lxml parser and only its text content is kept.
    """
    # Step 1: drop accents / combining marks.
    without_marks = unicodeToAscii(raw_review)

    # Step 2: parse as HTML and keep the text only.
    return BeautifulSoup(without_marks, "lxml").get_text()

def preprocess(review: str, total: int, show_progress: bool = True) -> "tuple | None":
    """Turn one sentence into per-prefix BERT states and next-token actions.

    The sentence is cleaned and tokenized with special tokens added, giving L
    token ids. For every i in 0..L-2 an input row is built from the first i+1
    tokens followed by the final token of the sequence, zero-padded to length
    L. All L-1 rows are run through `model` in one batch.

    Args:
        review: Sentence text (may contain HTML and accented characters).
        total: Total number of sentences; only used in the progress message.
        show_progress: When true, increment the global `counter` and print a
            progress line every 10000 calls.

    Returns:
        None when the sentence is skipped (non-string input, 100 or more
        tokens, or a token id missing from `Top5000_BtoA`); otherwise a tuple
        `(hidden, action, action_ids)`:
          * hidden: the model's first output taken at sequence position -2,
            one row per prefix (L-1 rows), on the GPU.
          * action: `model.embeddings` of tokens 1..L-1, one row per prefix,
            on the GPU.
          * action_ids: `Top5000_BtoA` values for all L token ids.

    NOTE(review): position -2 is a real token only for the two longest
    prefixes; for shorter rows it is a padded position. Confirm this is the
    intended state.
    NOTE(review): `action_ids` has L entries (the first token included) while
    `hidden`/`action` have L-1 rows. Confirm the downstream alignment.
    """
    if show_progress:
        global counter
        counter += 1
        if counter % 10000 == 0:
            print('Processing... %6i/%6i'% (counter, total))

    # Rows without text (e.g. NaN) cannot be cleaned or tokenized; skip them.
    if not isinstance(review, str):
        return None

    # 1. Clean text
    review = clean_review(review)

    # 2. BERT
    token_ids = [tokenizer.encode(review, add_special_tokens=True)]

    L = len(token_ids[0])

    # Skip sentences that are too long.
    if L >= 100:
        return None

    # Skip sentences containing a token that is not in the top-5000 lookup.
    # Only lookup failures are caught (KeyError / IndexError), so unrelated
    # errors and KeyboardInterrupt are no longer silently swallowed.
    try:
        mapped_ids = [Top5000_BtoA[x] for x in token_ids[0]]
    except LookupError:
        return None

    action_ids = torch.tensor(mapped_ids)
    token_ids = torch.tensor(token_ids)

    # Integer tensors from the start; no float round-trip for the token ids.
    toks = torch.zeros((L - 1, L), dtype=torch.long)
    mask = torch.zeros((L - 1, L), dtype=torch.long)
    # The "action" for prefix i is the token that follows it (tokens 1..L-1).
    actions = token_ids[0, 1:L].view(L - 1, 1)

    for i in range(L - 1):
        # Row i: the first i+1 tokens, then the sequence's final token.
        toks[i, :i + 2] = torch.cat([token_ids[0, :i + 1], token_ids[0, -1:]], dim=0)
        mask[i, :i + 2] = 1

    with torch.no_grad():
        # Pass the mask by keyword: in pytorch_transformers the second
        # positional parameter of BertModel.forward is token_type_ids, so a
        # positional mask would be used as segment ids and the padding would
        # not be masked out of attention.
        hidden = model(toks.to('cuda'), attention_mask=mask.to('cuda'))[0][:, -2]
        action = model.embeddings(actions.to('cuda'))[:, 0]

    return hidden, action, action_ids

In [None]:
# Run every sentence through preprocess() and write the results to Drive in
# batch files holding at least `batch_size` state rows each.
counter = 0  # progress counter incremented inside preprocess()

# Pending per-sentence tensors that have not been written to disk yet.
states = []
actions = []
codes = []
action_ids = []

cur_size = 0      # number of state rows currently pending
batch_size = 512  # flush once at least this many rows are pending
num = 1           # index used in the next batch file name


def _save_batch(batch_num, states, actions, codes, action_ids):
    """Concatenate the pending tensors and write them as one compressed .npz batch file."""
    np.savez_compressed('/content/gdrive/My Drive/batch/IMDB_Dataset'+str(batch_num),
                        states = torch.cat(states, dim=0).to('cpu'),
                        actions = torch.cat(actions, dim=0).to('cpu'),
                        codes = torch.cat(codes, dim=0).to('cpu'),
                        action_ids = torch.cat(action_ids, dim=0).to('cpu'))


df_length = len(df_reviews)
for df_review in df_reviews:
    # Each row is [sentiment, sentence].
    parts = preprocess(df_review[1], df_length)

    # preprocess() returns None for sentences it skips.
    if parts is None:
        continue

    states.append(parts[0])
    actions.append(parts[1])
    action_ids.append(parts[2])
    # One sentiment label per state row of this sentence.
    codes.append(torch.full((len(parts[0]), 1), df_review[0]).long())

    cur_size += len(parts[0])

    if cur_size >= batch_size:
        _save_batch(num, states, actions, codes, action_ids)
        # Reset ALL pending lists. `action_ids` must be cleared as well,
        # otherwise every later batch file would repeat all earlier ids.
        states = []
        actions = []
        codes = []
        action_ids = []
        cur_size = 0
        num += 1

# Flush the final partial batch so the trailing sentences are not lost.
if states:
    _save_batch(num, states, actions, codes, action_ids)
    states = []
    actions = []
    codes = []
    action_ids = []
    cur_size = 0
    num += 1

In [None]:
# `actions` is a plain list of still-pending per-sentence tensors (a list has
# no .shape), so show the shape of their concatenation; empty when nothing is pending.
torch.cat(actions, dim=0).shape if actions else torch.Size([0])

In [None]:
# `states` is a plain list of still-pending per-sentence tensors (a list has
# no .shape), so show the shape of their concatenation; empty when nothing is pending.
torch.cat(states, dim=0).shape if states else torch.Size([0])

In [None]:
# `codes` is a plain list of still-pending per-sentence tensors (a list has
# no .shape), so show the shape of their concatenation; empty when nothing is pending.
torch.cat(codes, dim=0).shape if codes else torch.Size([0])

In [None]:
# Save whatever is still pending after the batching loop. The pending values
# are lists of per-sentence tensors (the states/actions live on the GPU), which
# np.savez cannot convert directly, so each list is concatenated and moved to
# the CPU first. An empty list is stored as an empty array.
pending = {
    name: torch.cat(chunks, dim=0).to('cpu').numpy() if chunks else np.empty((0,))
    for name, chunks in (('states', states), ('actions', actions), ('codes', codes))
}
np.savez('IMDB_Dataset.npz', **pending)