Tokenizing with the DistilBERT base uncased model:

In [16]:
import torch
from transformers import DistilBertTokenizer, DistilBertForSequenceClassification
from datasets import DatasetDict

# Assuming you have loaded your dataset previously
%store -r new_ds 

# Select the DistilBERT tokenizer
tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-uncased')

# Tokenization callback handed to Dataset.map
def tokenize_function(example):
    """Encode the 'text' field of `example` with the module-level `tokenizer`.

    The tokenizer is called with truncation enabled, padding="max_length" and
    max_length=128, so every encoded sequence is cut or padded to 128 tokens.

    Args:
        example: mapping with a 'text' entry (a single string, or a list of
            strings when called through a batched ``map``).

    Returns:
        The tokenizer's output for the given text.
    """
    encoding_options = dict(padding="max_length", truncation=True, max_length=128)
    return tokenizer(example['text'], **encoding_options)

# Run the tokenizer over both splits, 1000 rows per batch, so that each call
# to `tokenize_function` handles a bounded chunk of the data.
tokenized_train_dataset, tokenized_test_dataset = [
    new_ds[split_name].map(tokenize_function, batched=True, batch_size=1000)
    for split_name in ('train', 'test')
]

# Rename "label" -> "labels" on both splits
# NOTE(review): presumably "labels" is the column name the downstream
# model/training code expects - confirm against the training cell.
tokenized_train_dataset, tokenized_test_dataset = [
    split_ds.rename_column("label", "labels")
    for split_ds in (tokenized_train_dataset, tokenized_test_dataset)
]

# Show the first tokenized training row as a sanity check
print("Sample tokenized text:", tokenized_train_dataset[0])

# Pull the raw text and label columns out of each split
train_texts, train_labels = [
    tokenized_train_dataset[column] for column in ('text', 'labels')
]
test_texts, test_labels = [
    tokenized_test_dataset[column] for column in ('text', 'labels')
]


Map:   0%|          | 0/11314 [00:00<?, ? examples/s]

Map:   0%|          | 0/7532 [00:00<?, ? examples/s]

Sample tokenized text: {'text': 'I was wondering if anyone out there could enlighten me on this car I saw\nthe other day. It was a 2-door sports car, looked to be from the late 60s/\nearly 70s. It was called a Bricklin. The doors were really small. In addition,\nthe front bumper was separate from the rest of the body. This is \nall I know. If anyone can tellme a model name, engine specs, years\nof production, where this car is made, history, or whatever info you\nhave on this funky looking car, please e-mail.', 'labels': 7, 'label_text': 'rec.autos', 'cleaned_text': 'I was wondering if anyone out there could enlighten me on this car I saw the other day It was a door sports car looked to be from the late s early s It was called a Bricklin The doors were really small In addition the front bumper was separate from the rest of the body This is all I know If anyone can tellme a model name engine specs years of production where this car is made history or whatever info you have on this funky

In [18]:
# Persist both tokenized splits in IPython's %store database so that a later
# session or notebook can restore them with `%store -r <name>`.
%store tokenized_train_dataset
%store tokenized_test_dataset

Stored 'tokenized_train_dataset' (Dataset)
Stored 'tokenized_test_dataset' (Dataset)
