# CSV Data Loading

In [1]:
def load_text(path):
    data = []
    with open(path) as f:
        lines = f.readlines()
        for i in range(1, len(lines)):
             record = {}
             line = lines[i].split(',')
             record['reviewerID'] = line[0]
             record['asin'] = line[1]
             record['rating'] = line[2]
             record['review'] = "".join(line[3:])
             data.append(record)
    return data

reviews = load_text('dataset/toy_3-core_80_20_with_text.csv')
reviews[0]

def get_pure_text(review):
    return review['review']

pure_text = list(map(get_pure_text, reviews))

# Convert text to vocabulary token

In [2]:

from transformers import BertTokenizer, BertForPreTraining
import torch

tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
model = BertForPreTraining.from_pretrained('bert-base-uncased')
inputs = tokenizer(pure_text, return_tensors="pt", padding="longest", truncation=True)['input_ids']
inputs.shape


Downloading:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/226k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/455k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/570 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/420M [00:00<?, ?B/s]

Some weights of BertForPreTraining were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['cls.predictions.decoder.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


torch.Size([111069, 512])

# Setup Bert Pretraining Model

In [3]:

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
from transformers import BertModel
model = BertModel.from_pretrained('bert-base-uncased')
emb_dims = model.config.hidden_size

model.to(device)

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertModel: ['cls.predictions.decoder.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.seq_relationship.weight', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


BertModel(
  (embeddings): BertEmbeddings(
    (word_embeddings): Embedding(30522, 768, padding_idx=0)
    (position_embeddings): Embedding(512, 768)
    (token_type_embeddings): Embedding(2, 768)
    (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
    (dropout): Dropout(p=0.1, inplace=False)
  )
  (encoder): BertEncoder(
    (layer): ModuleList(
      (0): BertLayer(
        (attention): BertAttention(
          (self): BertSelfAttention(
            (query): Linear(in_features=768, out_features=768, bias=True)
            (key): Linear(in_features=768, out_features=768, bias=True)
            (value): Linear(in_features=768, out_features=768, bias=True)
            (dropout): Dropout(p=0.1, inplace=False)
          )
          (output): BertSelfOutput(
            (dense): Linear(in_features=768, out_features=768, bias=True)
            (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
            (dropout): Dropout(p=0.1, inplace=False)
          

# Generate Embedded Review Vectors

In [4]:
import pandas as pd
import numpy as np
import time 

batch_size = 10 
outputs = torch.zeros((len(pure_text), emb_dims))
outputs = np.zeros((len(pure_text), emb_dims))

torch.cuda.empty_cache()
batch_num = len(inputs) // batch_size + 1

start_time = time.time()
for batch in range(batch_num):
    if not batch % 100:
        current_time = time.time()
        print(f'{batch*batch_size} reviews processed, cost {current_time - start_time}s')
    
    start_index = batch * batch_size
    end_index = min(start_index+batch_size, len(inputs))
    selected_inputs = inputs[start_index:end_index]
    
    # Move input to GPU
    selected_inputs = selected_inputs.cuda()
    output = model(selected_inputs).last_hidden_state.mean(dim=1)
    output = output.cpu().detach().numpy()
    outputs[start_index:start_index+batch_size, :] = output
    del output
    torch.cuda.empty_cache()

df = pd.DataFrame(outputs)




0 reviews processed, cost 9.989738464355469e-05s


RuntimeError: CUDA out of memory. Tried to allocate 120.00 MiB (GPU 0; 3.82 GiB total capacity; 2.07 GiB already allocated; 30.62 MiB free; 2.11 GiB reserved in total by PyTorch) If reserved memory is >> allocated memory try setting max_split_size_mb to avoid fragmentation.  See documentation for Memory Management and PYTORCH_CUDA_ALLOC_CONF

# Output Embedded Data

In [None]:
df = pd.DataFrame(outputs)
df.to_csv('dataset/toy_embedded_review.csv', header=False, index=False)