# CSV Data Loading

In [8]:
import pandas as pd
from transformers import BertTokenizer, BertForPreTraining
from transformers import BertModel
import torch
import numpy as np
import time

input_path = 'dataset/toy_3-core_80_20_with_text.csv'
output_path = 'dataset/toy_embdded_review.csv'

def load_text(path):
    data = []
    with open(path) as f:
        lines = f.readlines()
        for i in range(1, len(lines)):
             record = {}
             line = lines[i].split(',')
             record['reviewerID'] = line[0]
             record['asin'] = line[1]
             record['rating'] = line[2]
             record['review'] = "".join(line[3:])
             data.append(record)
    return data

def get_pure_text(review):
    return review['review']

def get_review_info(review):
    review = review.copy()
    review.pop('review')
    return review

def text2vector(input_path, output_path):
    reviews = load_text(input_path)
    pure_text = list(map(get_pure_text, reviews))
    df_info = pd.DataFrame(list(map(get_review_info, reviews)))
    

    def tokenize_text(reviews, batch_size):
        """
        Parameters
        ----------
        reviews: a list of string
            contains the text to be converted
        batch_size: int
            text numbers
        
        Returns
        -------
        embedding: tensor
            embedded text
        """
        tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
        current_index = 0
        batch_num = len(reviews) // batch_size
        embedding = tokenizer(pure_text[current_index:(current_index+batch_size)], return_tensors="pt", padding="longest", truncation=True)['input_ids']
        current_index += batch_size
        for i in range(1,batch_num):
            current_embedding = tokenizer(pure_text[current_index:(current_index+batch_size)], return_tensors="pt", padding="longest", truncation=True)['input_ids']
            current_index += batch_size
            torch.cat((embedding, current_embedding), dim=0)
        
        return embedding

    inputs = tokenize_text(pure_text, 10000)
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    model = BertModel.from_pretrained('bert-base-uncased')
    emb_dims = model.config.hidden_size
    model.to(device)
    batch_size = 10 
    
    outputs = np.zeros((len(pure_text), emb_dims))
    torch.cuda.empty_cache()
    batch_num = len(inputs) // batch_size + 1
    start_time = time.time()
    for batch in range(batch_num):
        if not batch % 100:
            current_time = time.time()
            print(f'{batch*batch_size} reviews processed, cost {current_time - start_time}s')
        
        start_index = batch * batch_size
        end_index = min(start_index+batch_size, len(inputs))
        selected_inputs = inputs[start_index:end_index]
        
        # Move input to GPU
        selected_inputs = selected_inputs.cuda()
        output = model(selected_inputs).last_hidden_state.mean(dim=1)
        try:
            output = output.cpu().detach().numpy()
            outputs[start_index:start_index+batch_size, :] = output
        except:
            continue
        del output
        torch.cuda.empty_cache()

    df = pd.DataFrame(outputs)
    df = pd.concat([df_info, df], axis=1)
    df.to_csv(output_path, index=False)
    

In [9]:

for k in ['train', 'val', 'test']:
    input_path = f'dataset/toy_3-core_80_20_{k}_with_text.csv'
    output_path = f'dataset/toy_embedded_review_{k}.csv'
    text2vector(input_path, output_path)
    print(f'{k} data processed')


Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertModel: ['cls.predictions.transform.dense.bias', 'cls.predictions.bias', 'cls.seq_relationship.bias', 'cls.predictions.decoder.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.seq_relationship.weight', 'cls.predictions.transform.dense.weight']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


0 reviews processed, cost 0.0s
1000 reviews processed, cost 20.020386457443237s
2000 reviews processed, cost 39.91488766670227s
3000 reviews processed, cost 59.89401650428772s
4000 reviews processed, cost 79.91717219352722s
5000 reviews processed, cost 99.93721461296082s
6000 reviews processed, cost 120.02234935760498s
7000 reviews processed, cost 140.1390676498413s
8000 reviews processed, cost 160.25284481048584s
9000 reviews processed, cost 180.35614442825317s
10000 reviews processed, cost 200.46010303497314s
train data processed


Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertModel: ['cls.predictions.transform.dense.bias', 'cls.predictions.bias', 'cls.seq_relationship.bias', 'cls.predictions.decoder.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.seq_relationship.weight', 'cls.predictions.transform.dense.weight']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


0 reviews processed, cost 0.0s
1000 reviews processed, cost 19.943925142288208s
2000 reviews processed, cost 39.82104563713074s
3000 reviews processed, cost 59.751686334609985s
4000 reviews processed, cost 79.74381566047668s
5000 reviews processed, cost 99.75318551063538s
6000 reviews processed, cost 119.7707028388977s
7000 reviews processed, cost 139.83673810958862s
8000 reviews processed, cost 159.90662026405334s
9000 reviews processed, cost 180.0421531200409s
10000 reviews processed, cost 200.10855340957642s
val data processed


Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertModel: ['cls.predictions.transform.dense.bias', 'cls.predictions.bias', 'cls.seq_relationship.bias', 'cls.predictions.decoder.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.seq_relationship.weight', 'cls.predictions.transform.dense.weight']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


0 reviews processed, cost 0.0s
1000 reviews processed, cost 20.000468492507935s
2000 reviews processed, cost 39.92395997047424s
3000 reviews processed, cost 59.91825866699219s
4000 reviews processed, cost 79.97499775886536s
5000 reviews processed, cost 100.09404921531677s
6000 reviews processed, cost 120.1779944896698s
7000 reviews processed, cost 140.2665615081787s
8000 reviews processed, cost 160.38254499435425s
9000 reviews processed, cost 180.52363848686218s
10000 reviews processed, cost 200.66764736175537s
test data processed
