In [21]:
from google.colab import auth
auth.authenticate_user()

from google.colab import drive
drive.mount('/content/gdrive')

!pip install pytorch_transformers

Mounted at /content/gdrive


In [0]:
import gzip
import pickle
import unicodedata
from typing import Optional, Tuple

import numpy as np
import pandas as pd
import torch
from bs4 import BeautifulSoup
from pytorch_transformers import BertTokenizer, BertModel

def _read_gzip_pickle(path):
    """Return the object stored in the gzip-compressed pickle file at ``path``.

    Unpickling can execute arbitrary code, so only point this at trusted files.
    """
    with gzip.open(path, 'rb') as handle:
        return pickle.load(handle)


# Reviews to process; later cells read item[0] as the text and item[1] as its code.
pre_reviews = _read_gzip_pickle('/content/gdrive/My Drive/pre_reviews.pickle')

# Lookup indexed by tokenizer token id; a failed lookup makes preprocess() skip the review.
Top5000_BtoA = _read_gzip_pickle('/content/gdrive/My Drive/Top5000_BtoA.pickle')

In [4]:
# Load the pretrained 'bert-base-uncased' tokenizer and encoder.
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

# Prefer the GPU, but fall back to the CPU so the model still loads on a
# runtime without CUDA instead of raising.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model = BertModel.from_pretrained('bert-base-uncased').to(device)

# Switch to evaluation mode (dropout disabled). eval() returns the module, so
# as the last expression it also displays the model structure.
model.eval()

100%|██████████| 231508/231508 [00:00<00:00, 421488.61B/s]
100%|██████████| 313/313 [00:00<00:00, 131901.65B/s]
100%|██████████| 440473133/440473133 [00:37<00:00, 11854686.38B/s]


BertModel(
  (embeddings): BertEmbeddings(
    (word_embeddings): Embedding(30522, 768, padding_idx=0)
    (position_embeddings): Embedding(512, 768)
    (token_type_embeddings): Embedding(2, 768)
    (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
    (dropout): Dropout(p=0.1, inplace=False)
  )
  (encoder): BertEncoder(
    (layer): ModuleList(
      (0): BertLayer(
        (attention): BertAttention(
          (self): BertSelfAttention(
            (query): Linear(in_features=768, out_features=768, bias=True)
            (key): Linear(in_features=768, out_features=768, bias=True)
            (value): Linear(in_features=768, out_features=768, bias=True)
            (dropout): Dropout(p=0.1, inplace=False)
          )
          (output): BertSelfOutput(
            (dense): Linear(in_features=768, out_features=768, bias=True)
            (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
            (dropout): Dropout(p=0.1, inplace=False)
          

In [0]:
def preprocess(
    review: str, total: int, show_progress: bool = True
) -> Optional[Tuple[torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor]]:
    """Turn one review into per-step (state, action) rows using the global BERT model.

    The review is tokenized with special tokens added (for BERT, ``[CLS]`` first
    and ``[SEP]`` last). With ``n = len(tokens) - 2``, row ``i`` (``0 <= i < n``)
    is built from the prefix ``tokens[:i + 2]`` followed by the final token; the
    "action" for that row is the next token, ``tokens[i + 2]``.

    Relies on the module-level ``tokenizer``, ``model``, ``Top5000_BtoA`` and,
    when ``show_progress`` is true, ``counter``.

    Args:
        review: Text of the review.
        total: Total number of reviews; only used in the progress message.
        show_progress: If true, increment the global ``counter`` and print a
            progress line every 10000 calls.

    Returns:
        ``None`` if the review is skipped: it has 50 or more tokens, fewer than
        3 tokens (no row can be built), or contains a token id that
        ``Top5000_BtoA`` cannot look up. Otherwise a tuple of:

        * ``hidden``: ``(n, hidden_size)`` model output at the last token before
          the closing special token of each prefix (on the model's device).
        * ``action``: ``(n, embedding_size)`` output of ``model.embeddings`` for
          the next token of each prefix (on the model's device).
        * ``action_ids``: ``(n, 1)`` ``Top5000_BtoA`` values of the next tokens.
        * ``prev_action_ids``: ``(n, 1)`` ``Top5000_BtoA`` values of the last
          prefix tokens.
    """
    if show_progress:
        global counter
        counter += 1
        if counter % 10000 == 0:
            print('Processing... %6i/%6i' % (counter, total))

    # BERT token ids, including the special tokens.
    token_ids = tokenizer.encode(review, add_special_tokens=True)
    n_tokens = len(token_ids)

    # Skip reviews that are too long, or too short to produce a single row.
    if n_tokens >= 50 or n_tokens < 3:
        return None

    # Skip reviews containing a token that is missing from Top5000_BtoA. Only
    # lookup failures are caught, so unrelated errors (and Ctrl-C) still surface.
    try:
        mapped_ids = [Top5000_BtoA[tok] for tok in token_ids]
    except (KeyError, IndexError):
        return None

    n_steps = n_tokens - 2
    prev_action_ids = torch.tensor(mapped_ids[1:-1]).view(n_steps, 1)
    action_ids = torch.tensor(mapped_ids[2:]).view(n_steps, 1)
    token_ids = torch.tensor(token_ids)
    actions = token_ids[2:].view(n_steps, 1)

    # Row i holds tokens[:i + 2] + [last token], right-padded with zeros; the
    # mask marks the i + 3 real positions.
    toks = torch.zeros((n_steps, n_tokens), dtype=torch.long)
    mask = torch.zeros((n_steps, n_tokens), dtype=torch.long)
    for i in range(n_steps):
        toks[i, :i + 2] = token_ids[:i + 2]
        toks[i, i + 2] = token_ids[-1]
        mask[i, :i + 3] = 1

    # Run on whatever device the model lives on instead of assuming CUDA.
    device = next(model.parameters()).device
    rows = torch.arange(n_steps, device=device)

    with torch.no_grad():
        # The mask is passed by keyword: the position of attention_mask in
        # BertModel.forward differs between pytorch_transformers releases.
        sequence_output = model(toks.to(device), attention_mask=mask.to(device))[0]
        # Last word of each prefix. Rows are right-padded, so in row i that word
        # sits at column i + 1; a fixed column -2 would only be right for the
        # final (unpadded) row and would read a padding position elsewhere.
        hidden = sequence_output[rows, rows + 1]
        action = model.embeddings(actions.to(device))[:, 0]

    return hidden, action, action_ids, prev_action_ids

In [0]:
# Run every review through preprocess() and write the resulting rows to Drive
# as compressed .npz shards holding at least `batch_size` rows each.
counter = 0  # progress counter that preprocess() increments via `global counter`

# Per-shard buffers: each list receives one tensor per accepted review.
states = []
actions = []
codes = []
action_ids = []
prev_action_ids = []

cur_size = 0      # rows buffered since the last shard was written
batch_size = 512  # flush threshold in rows; a shard can exceed it because reviews are never split
num = 1           # index appended to the shard file name

length = len(pre_reviews)


def _save_shard(shard_num, **buffers):
    """Concatenate each buffered tensor list along dim 0 and write shard ``shard_num``.

    Every tensor is detached, moved to the CPU and converted to a NumPy array
    explicitly, so the buffers may hold GPU tensors.
    """
    arrays = {
        key: torch.cat(chunks, dim=0).detach().cpu().numpy()
        for key, chunks in buffers.items()
    }
    np.savez_compressed('/content/gdrive/My Drive/batch_2/IMDB_Dataset' + str(shard_num),
                        **arrays)


for review in pre_reviews:
    # review[0] is the text handed to preprocess(); review[1] is stored as the code of its rows.
    parts = preprocess(review[0], length)

    if parts is None:  # preprocess() skipped this review
        continue

    n_rows = len(parts[0])
    states.append(parts[0])
    actions.append(parts[1])
    action_ids.append(parts[2])
    prev_action_ids.append(parts[3])
    # Explicit dtype: torch.full's handling of an integer fill value without a
    # dtype differs between torch versions.
    codes.append(torch.full((n_rows, 1), int(review[1]), dtype=torch.long))

    cur_size += n_rows

    if cur_size >= batch_size:
        _save_shard(num,
                    states=states,
                    actions=actions,
                    codes=codes,
                    action_ids=action_ids,
                    prev_action_ids=prev_action_ids)
        states = []
        actions = []
        codes = []
        action_ids = []
        prev_action_ids = []

        cur_size = 0
        num += 1

# Write the rows still buffered after the last review; without this final flush
# up to `batch_size - 1` rows would be silently dropped.
if cur_size > 0:
    _save_shard(num,
                states=states,
                actions=actions,
                codes=codes,
                action_ids=action_ids,
                prev_action_ids=prev_action_ids)

In [0]:
# NOTE(review): after the batching cell, `actions` is a Python list of tensors, which has
# no `.shape`; this looks like a leftover from a version where it was a single tensor — confirm.
actions.shape

In [0]:
# NOTE(review): after the batching cell, `states` is a Python list of tensors, which has
# no `.shape`; this looks like a leftover from a version where it was a single tensor — confirm.
states.shape

In [0]:
# NOTE(review): after the batching cell, `codes` is a Python list of tensors, which has
# no `.shape`; this looks like a leftover from a version where it was a single tensor — confirm.
codes.shape

In [0]:
# As left by the batching cell, `states`, `actions` and `codes` are lists of per-review
# tensors (possibly on the GPU), which np.savez cannot convert directly. They are
# concatenated and converted to NumPy arrays first; nothing is written when they are empty.
# NOTE(review): confirm this local copy is still wanted alongside the shards on Drive.
if states:
    np.savez('IMDB_Dataset.npz',
             states=torch.cat(states, dim=0).detach().cpu().numpy(),
             actions=torch.cat(actions, dim=0).detach().cpu().numpy(),
             codes=torch.cat(codes, dim=0).detach().cpu().numpy())