In [1]:
import re
import pandas as pd
import polars as pl
from tqdm import tqdm
from typing import Optional
import os 


from tokenizers import Tokenizer
from tokenizers.models import BPE
from tokenizers.trainers import BpeTrainer
from tokenizers.pre_tokenizers import Whitespace


### Pre-processing of Book Corpus Refined 
The dataset is from Kaggle, `nishantsingh96/refined-bookcorpus-dataset`.\
We preprocess the data, even though the author claims its been already preprocessed. We'll use the polars library for faster processing. 

In [13]:
def clean_text(text):
    # remove unnecessary symbols (keep punctuation)
    text = re.sub(r"[^a-zA-Z0-9.,!?;:'\"()\- ]+", " ", text)
    
    # normalize white space
    text = re.sub(r"\s+", " ", text).strip()
    
    
    # text has allegedly been lowered for us. But, we do this so that we can apply it on the downstream dataset
    text = text.lower()
    
    return text
def filter_text(text: str, min_length: int = 10, max_length: int = 10000) -> bool:
    """Filter text based on length criteria."""
    if not isinstance(text, str):
        return False
    
    text_length = len(text.strip())
    return min_length <= text_length <= max_length

In [3]:
    
data_path = os.path.expanduser("~/BookCorpus/BookCorpus3.csv")
# Example: reading CSV
df = pl.read_csv(data_path)  


In [3]:
df

0
str
"""she began getting up first thi…"
"""caitrin hid the fact that she …"
"""she considered strapping down …"
"""as hard as it had been for cai…"
"""as her treks took her farther …"
…
"""you can't fight him she shoute…"
"""a ha war eagle exclaimed. he l…"
"""it was the next afternoon befo…"
"""the story of zorana has been t…"


In [5]:
df_processed = (
    df.with_columns([
        pl.col('0').map_elements(clean_text, return_dtype=pl.Utf8).alias("cleaned_text")
    ]).filter(
            # Filter out empty strings and texts outside length bounds
            (pl.col("cleaned_text").str.len_chars() >= 10) &
            (pl.col("cleaned_text").str.len_chars() <= 10000) &
            (pl.col("cleaned_text") != "")
        )
)
# save it immediately 
df_processed.write_csv('bookcorpus.txt')

### Training a tokenizer 
We'll do a BPE tokenizer. 


In [7]:
tokenizer = Tokenizer(BPE(unk_token="[UNK]"))
tokenizer.pre_tokenizer = Whitespace()

trainer = BpeTrainer(
    vocab_size=30000, 
    special_tokens=["[PAD]", "[UNK]", "[CLS]", "[SEP]", "[MASK]"]
)

files = ["bookcorpus.txt"]


In [8]:
# train and save the tokenizer when ready! 
tokenizer.train(files, trainer)
tokenizer.save("bpe_tokenizer.json")







In [None]:
''' 
Example usage:
from transformers import PreTrainedTokenizerFast

hf_tokenizer = PreTrainedTokenizerFast(tokenizer_file="bpe_tokenizer.json")
hf_tokenizer.pad_token = "[PAD]"
'''

In [9]:
from transformers import PreTrainedTokenizerFast

hf_tokenizer = PreTrainedTokenizerFast(tokenizer_file="bpe_tokenizer.json")
hf_tokenizer.pad_token = "[PAD]"

  from .autonotebook import tqdm as notebook_tqdm


In [11]:
sample_text = "the quick brown fox jumps over the lazy dog."
encoded = hf_tokenizer.encode(sample_text)
print("Encoded IDs:", encoded)
decoded = hf_tokenizer.decode(encoded)
print("Decoded Text:", decoded)

Encoded IDs: [52, 772, 1995, 6113, 12040, 259, 52, 8817, 1701, 10]
Decoded Text: the quick brown fox jumps over the lazy dog .


### Pre-processing IMDb dataset


In [15]:
imdb_path = os.path.expanduser("~/datasets/imdb_spoiler/IMDB_reviews.json")
ds_df = pl.read_ndjson(imdb_path)


### Splitting into Train and Test 



In [18]:
ds_df = ds_df.sample(fraction=1.0, shuffle=True, seed=42)  # reproducible split

# Train-test split (e.g., 80/20)
split_idx = int(0.8 * len(ds_df))
train_df = ds_df[:split_idx]
test_df = ds_df[split_idx:]

In [20]:
train_df

review_date,movie_id,user_id,is_spoiler,review_text,rating,review_summary
str,str,str,bool,str,str,str
"""31 July 2001""","""tt0126029""","""ur0341961""",false,"""this movie was so good. I me…","""10""","""And I went without little peop…"
"""19 December 2003""","""tt0167260""","""ur2513127""",false,"""I am saddened to learn that Se…","""10""","""Fantastic!!! No other words"""
"""15 June 2007""","""tt0449088""","""ur15706936""",false,"""Why are they making more Pirat…","""1""","""What the?"""
"""11 April 2002""","""tt0137523""","""ur1685153""",false,"""David Fincher, Edward Norton, …","""10""","""What a ride!"""
"""10 July 2015""","""tt1823672""","""ur48448272""",false,"""Chappie 2015 is an extremely a…","""10""","""simply amazing!"""
…,…,…,…,…,…,…
"""8 August 2006""","""tt0454848""","""ur9034662""",true,"""Well, he doesn't have me foole…","""2""","""don't bother"""
"""24 November 2010""","""tt0464154""","""ur19669890""",true,"""Fish. Tits. Fish. Tits. Fish. …","""3""","""A Portion Of Fish And Tits, Pl…"
"""16 June 2011""","""tt1605783""","""ur2444068""",true,"""Gil (Owen Wilson) is a self-de…","""8""","""A dream ride in a classic Fren…"
"""23 November 2012""","""tt0454876""","""ur6983748""",false,"""Astounding movie experience!! …","""10""","""Movie of the Year 2012"""


In [21]:
train_df_processed = (
    train_df.with_columns([
        pl.col('review_text').map_elements(clean_text, return_dtype=pl.Utf8).alias("cleaned_text")
    ]).filter(
            # Filter out empty strings and texts outside length bounds
            (pl.col("cleaned_text").str.len_chars() >= 10) &
            (pl.col("cleaned_text").str.len_chars() <= 10000) &
            (pl.col("cleaned_text") != "")
        )
)

In [22]:
test_df_processed = (
    test_df.with_columns([
        pl.col('review_text').map_elements(clean_text, return_dtype=pl.Utf8).alias("cleaned_text")
    ]).filter(
            # Filter out empty strings and texts outside length bounds
            (pl.col("cleaned_text").str.len_chars() >= 10) &
            (pl.col("cleaned_text").str.len_chars() <= 10000) &
            (pl.col("cleaned_text") != "")
        )
)

In [24]:
# tokenize the datasets so we don't need to keep processing them!

from datasets import Dataset

# Convert Polars → Pandas → Hugging Face Dataset
train_hf = Dataset.from_pandas(train_df_processed.to_pandas())
test_hf = Dataset.from_pandas(test_df_processed.to_pandas())

# Tokenize
def tokenize_batch(batch):
    return hf_tokenizer(
        batch["cleaned_text"],
        truncation=True,
        padding="max_length",
        max_length=512
    )

train_hf = train_hf.map(tokenize_batch, batched=True)
test_hf = test_hf.map(tokenize_batch, batched=True)

# Save to disk
train_hf.save_to_disk("imdb_train_tokenized")
test_hf.save_to_disk("imdb_test_tokenized")


Map: 100%|██████████| 459109/459109 [04:12<00:00, 1819.44 examples/s]
Map: 100%|██████████| 114780/114780 [01:03<00:00, 1814.93 examples/s]
Saving the dataset (6/6 shards): 100%|██████████| 459109/459109 [00:08<00:00, 56998.88 examples/s]
Saving the dataset (2/2 shards): 100%|██████████| 114780/114780 [00:01<00:00, 65123.68 examples/s]


In [None]:
'''
next time, we can load the dataset as such:

from datasets import load_from_disk

train_hf = load_from_disk("imdb_train_tokenized")
test_hf = load_from_disk("imdb_test_tokenized")
'''

In [25]:
from transformers import BertTokenizer, BertModel
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
model = BertModel.from_pretrained("bert-base-uncased")


In [26]:
model

BertModel(
  (embeddings): BertEmbeddings(
    (word_embeddings): Embedding(30522, 768, padding_idx=0)
    (position_embeddings): Embedding(512, 768)
    (token_type_embeddings): Embedding(2, 768)
    (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
    (dropout): Dropout(p=0.1, inplace=False)
  )
  (encoder): BertEncoder(
    (layer): ModuleList(
      (0-11): 12 x BertLayer(
        (attention): BertAttention(
          (self): BertSdpaSelfAttention(
            (query): Linear(in_features=768, out_features=768, bias=True)
            (key): Linear(in_features=768, out_features=768, bias=True)
            (value): Linear(in_features=768, out_features=768, bias=True)
            (dropout): Dropout(p=0.1, inplace=False)
          )
          (output): BertSelfOutput(
            (dense): Linear(in_features=768, out_features=768, bias=True)
            (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
            (dropout): Dropout(p=0.1, inplace=False

In [28]:
hf_tokenizer

PreTrainedTokenizerFast(name_or_path='', vocab_size=30000, model_max_length=1000000000000000019884624838656, is_fast=True, padding_side='right', truncation_side='right', special_tokens={'pad_token': '[PAD]'}, clean_up_tokenization_spaces=False, added_tokens_decoder={
	0: AddedToken("[PAD]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	1: AddedToken("[UNK]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	2: AddedToken("[CLS]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	3: AddedToken("[SEP]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	4: AddedToken("[MASK]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
}
)

In [34]:
hf_tokenizer.vocab_size

30000