In [None]:
import pandas as pd 
import timeit
import re
import numpy as np
import torch
from transformers import BertTokenizer, BertModel, get_linear_schedule_with_warmup
import pyarrow as pa
import pyarrow.parquet as pq
import multiprocessing

# Import the huge dataset as JSON and convert it to .parquet

In [None]:
# Read the raw Yelp review dump (one JSON record per line), keep only the
# columns used by the rest of the notebook and persist them as Parquet so
# later runs can skip the slow JSON parse.
relevant_cols = ['review_id','text','stars']
df = pd.read_json('yelp_academic_dataset_review.json', lines=True)
df_relevant = df.loc[:, relevant_cols]

# Go through an Arrow table explicitly, then write it to disk.
table = pa.Table.from_pandas(df_relevant)
pq.write_table(table, 'data.parquet')

# Convert each of the star values to [0,4] rather than [1,5] (will come in handy later)

In [None]:
# Rebind `df` to the contents of 'data.parquet' (the file written by the
# JSON -> Parquet conversion cell), replacing any previously loaded frame.
df = pd.read_parquet('data.parquet')

In [None]:
def decrease_star(star_number):
    """Return the star rating shifted down by one.

    The argument is first coerced with ``int()``, so integer-like strings
    and floats are accepted (floats are truncated toward zero).

    Args:
        star_number: A value convertible with ``int()``.

    Returns:
        int: ``int(star_number) - 1``.

    Raises:
        ValueError / TypeError: Propagated from ``int()`` when the value
        cannot be converted.
    """
    return int(star_number) - 1
# Shift the ratings from [1, 5] to [0, 4] in a single vectorized operation
# instead of calling a Python function once per row.
#
# This cell mutates `df` in place, so running it twice would silently shift
# the labels a second time (to [-1, 3]). The range check below turns that
# mistake into an immediate error: after the first run the column contains
# 0s and the check fails.
stars_as_int = df['stars'].astype(int)
if not stars_as_int.between(1, 5).all():
    raise ValueError(
        "df['stars'] must only contain values in [1, 5] before shifting; "
        "reload df if this cell has already been run"
    )
df['stars'] = stars_as_int - 1

# Define our tokenizer and our model

In [None]:
# Tokenizer for the 'bert-base-uncased' checkpoint; the encoding loop further
# down uses it to turn each review text into fixed-length token ids.
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

In [None]:
# Load the 'bert-base-uncased' BertModel.
# NOTE(review): none of the cells visible here reference `model` -- presumably
# it is consumed by later cells; confirm before relying on it.
model = BertModel.from_pretrained('bert-base-uncased')

# Tokenize and encode the data in batches of 1000 rows each

In [None]:
# Number of rows handed to the tokenizer per loop iteration.
batch_size = 1000

# Ceiling division: a final, partially filled batch must be counted as well.
# The previous int(len(df)/batch_size) rounded down, so up to batch_size - 1
# trailing reviews were never processed. Integer arithmetic also avoids
# float rounding on very large row counts.
num_batches = (len(df) + batch_size - 1) // batch_size
print(f"Number of batches to run: {num_batches}")

In [None]:
# Three independent, initially empty accumulators:
#   input_ids_list      -> encoded value of the tokens
#   attention_mask_list -> attention masks
#   token_type_ids_list -> segment IDs
input_ids_list, attention_mask_list, token_type_ids_list = [], [], []

# Empty placeholder frame; rebuilt each time encoded rows are written out.
final_df = pd.DataFrame()

In [None]:
# Tokenize the reviews batch by batch and stream the encoded token ids to
# 'final_data.parquet' (single column 'input_ids', one list of 512 ids per row).
#
# Differences from the previous version of this cell:
#   * The rows buffered after the last multiple-of-100 batch are now written
#     too (previously they were dropped, and nothing at all was written when
#     there were 100 batches or fewer).
#   * Chunks are appended through a ParquetWriter instead of re-reading and
#     rewriting the whole file on every flush, which was quadratic in I/O and
#     required the full table to fit in memory.
#   * Each batch is tokenized with one tokenizer call that returns plain
#     Python lists, instead of one tensor round-trip per row.
#   * The attention mask and token type ids were never written to the file,
#     so they are no longer accumulated in memory.

flush_every = 100  # number of batches buffered in memory between writes

# Fixed schema so every chunk written to the file has the same column type.
output_schema = pa.schema([('input_ids', pa.list_(pa.int64()))])

# Start from an empty buffer so re-running this cell cannot duplicate rows
# left over from an interrupted run.
input_ids_list = []

# The context manager closes the file even if the loop is interrupted, so the
# chunks written so far remain readable.
with pq.ParquetWriter('final_data.parquet', output_schema) as writer:
    for i in range(num_batches):

        # Prints out a nice message
        print(f"Processing batch {i}/{num_batches}")

        # Only looks at the section of the huge df that we're interested in for this batch
        batch_df = df.iloc[i*batch_size:(i+1)*batch_size]

        # Encode the whole batch at once; every row is truncated/padded to 512 ids
        batch_texts = batch_df.text.tolist()
        if batch_texts:
            tokens = tokenizer(
                batch_texts,
                truncation=True,
                max_length=512,
                padding='max_length',
            )
            input_ids_list.extend(tokens['input_ids'])

        # Flush after every `flush_every` batches and once more after the very
        # last batch so no buffered rows are lost.
        is_last_batch = i == num_batches - 1
        if ((i + 1) % flush_every == 0 or is_last_batch) and input_ids_list:
            # Make a new df that contains the encoded data for this set of batches
            final_df = pd.DataFrame()
            final_df['input_ids'] = pd.Series(input_ids_list)

            # Convert the DataFrame to a PyArrow table and append it to the file
            table = pa.Table.from_pandas(final_df, schema=output_schema, preserve_index=False)
            writer.write_table(table)

            # Reset the buffer
            input_ids_list = []
