In [1]:
import pandas as pd 
import timeit
import re
import numpy as np
import torch
from transformers import BertTokenizer, BertModel, get_linear_schedule_with_warmup
import pyarrow as pa
import pyarrow.parquet as pq

# Import the huge dataset as JSON and convert it to .parquet

In [None]:
df = pd.read_json('yelp_academic_dataset_review.json', lines=True)
relevant_cols = ['review_id','text','stars']
df_relevant = df[relevant_cols]
table = pa.Table.from_pandas(df_relevant)
pq.write_table(table, 'data.parquet')

# Convert each of the star values to [0,4] rather than [1,5] (will come in handy later)

In [2]:
df = pd.read_parquet('data.parquet')

In [3]:
def decrease_star(star_number):
    star_number = int(star_number)
    return star_number -1
df['stars'] = df.stars.apply(decrease_star)

# Define our tokenizer and our model

In [4]:
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

In [5]:
model = BertModel.from_pretrained('bert-base-uncased')

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertModel: ['cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.bias', 'cls.predictions.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.weight', 'cls.seq_relationship.bias', 'cls.seq_relationship.weight', 'cls.predictions.decoder.weight']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


# Tokenize and encode the data in batches of 1000 rows each

In [6]:
batch_size = 1000
num_batches = int(np.ceil(len(df) / batch_size))
print(f"Number of batches to be processed: {num_batches}")

Number of batches to be processed: 6991


In [None]:
# Set the batch size and number of batches
batch_size = 1000
num_batches = int(np.ceil(len(df) / batch_size))

# Create empty arrays to store the tokenized values
input_ids = np.zeros((len(df), 512), dtype=np.int64)
attention_mask = np.zeros((len(df), 512), dtype=np.int64)
token_type_ids = np.zeros((len(df), 512), dtype=np.int64)

# Tokenize the text in batches
for i in range(num_batches):
    batch_df = df[i*batch_size:(i+1)*batch_size]
    print(f"Processing batch {i+1}/{num_batches}")
    batch_tokens = tokenizer.batch_encode_plus(
        batch_df.text.tolist(),
        truncation=True,
        max_length=512,
        padding='max_length',
        return_attention_mask=True,
        return_token_type_ids=True,
        return_tensors='np',
    )

    # Store the tokenized values in the arrays
    input_ids[i*batch_size:(i+1)*batch_size] = batch_tokens['input_ids']
    attention_mask[i*batch_size:(i+1)*batch_size] = batch_tokens['attention_mask']
    token_type_ids[i*batch_size:(i+1)*batch_size] = batch_tokens['token_type_ids']

# Convert the arrays to PyArrow tables
input_ids_table = pa.Table.from_arrays([pa.array(input_ids)])
attention_mask_table = pa.Table.from_arrays([pa.array(attention_mask)])
token_type_ids_table = pa.Table.from_arrays([pa.array(token_type_ids)])

# Write the tables to a Parquet file
pq.write_table(input_ids_table, 'input_ids.parquet')
pq.write_table(attention_mask_table, 'attention_mask.parquet')
pq.write_table(token_type_ids_table, 'token_type_ids.parquet')
# Read the parquet files
table1 = pq.read_table('input_ids.parquet')
table2 = pq.read_table('attention_mask.parquet')
table3 = pq.read_table('token_type_ids.parquet')

# Concatenate the tables along the columns
table = pa.concat_tables([table1, table2, table3])

# Write the new table to a parquet file
pq.write_table(table, 'data_final.parquet')

Processing batch 1/6991
Processing batch 2/6991
Processing batch 3/6991
Processing batch 4/6991
Processing batch 5/6991
Processing batch 6/6991
Processing batch 7/6991
Processing batch 8/6991
Processing batch 9/6991
Processing batch 10/6991
Processing batch 11/6991
Processing batch 12/6991
Processing batch 13/6991
Processing batch 14/6991
Processing batch 15/6991
Processing batch 16/6991
Processing batch 17/6991
Processing batch 18/6991
Processing batch 19/6991
Processing batch 20/6991
Processing batch 21/6991
Processing batch 22/6991
Processing batch 23/6991
Processing batch 24/6991
Processing batch 25/6991
Processing batch 26/6991
Processing batch 27/6991
Processing batch 28/6991
Processing batch 29/6991
Processing batch 30/6991
Processing batch 31/6991
Processing batch 32/6991
Processing batch 33/6991
Processing batch 34/6991
Processing batch 35/6991
Processing batch 36/6991
Processing batch 37/6991
Processing batch 38/6991
Processing batch 39/6991
Processing batch 40/6991
Processin

In [None]:
input_ids_list = [] # Add list to store the encoded value of the tokens
attention_mask_list = []  # Add list to store attention masks
token_type_ids_list = []  # Add list to store segment IDs

for i in range(num_batches):
    batch_df = df[i*batch_size:(i+1)*batch_size]
    print(i)
    for txt in batch_df.text:
        tokens = tokenizer.encode_plus(
            txt,
            truncation=True,
            max_length=512,
            padding='max_length',
            return_tensors='pt',
        )
        
        # Append the input IDs, attention mask, and segment IDs to their respective lists
        input_ids_list.append(tokens['input_ids'][0].tolist())
        attention_mask_list.append(tokens['attention_mask'][0].tolist())
        token_type_ids_list.append(tokens['token_type_ids'][0].tolist())
    

df['input_ids'] = pd.Series(input_ids_list)
df['attention_mask'] = pd.Series(attention_mask_list)  # Add new column for attention masks
df['token_type_ids'] = pd.Series(token_type_ids_list)  # Add new column for segment IDs

# Save the dataframe to a new Parquet file
df.to_parquet('data_tokenized.parquet', index=False)


0
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
27

NameError: name 'df' is not defined