# Testing different tokenizers

In [1]:
# Import necessary libraries
import os

import pandas as pd
import spacy
from sklearn.feature_extraction.text import CountVectorizer
from tokenizers import Tokenizer, models, pre_tokenizers, decoders, trainers, processors, normalizers

In [2]:
# Load the post data.
# The CSV location can be overridden with the PREPROCESSED_POSTS_CSV
# environment variable so the notebook is not tied to one machine; when the
# variable is unset, the original local path is used unchanged.
PREPROCESSED_POSTS_PATH = os.environ.get(
    "PREPROCESSED_POSTS_CSV",
    'c:/Users/Jannik Wirtheim/OneDrive/Dokumente/Privat/Bildung/M. Sc. Social and Economic Data Science/2. Semester/Deep Learning for Social Sciences/Project/data/preprocessed_posts.csv',
)
preprocessed_posts = pd.read_csv(PREPROCESSED_POSTS_PATH)

## Byte-Pair Encoding (BPE)

In [4]:
# Work on an independent copy of the loaded posts
df_bpe = preprocessed_posts.copy()

# Initialize a BPE tokenizer. "<unk>" is set as the model's unknown token so
# that input the trained vocabulary cannot represent has an explicit fallback;
# the same token is reserved in the trainer's special tokens below.
tokenizer_bpe = Tokenizer(models.BPE(unk_token="<unk>"))

# Use a pre-tokenizer to split text into words before BPE merges are applied
tokenizer_bpe.pre_tokenizer = pre_tokenizers.Whitespace()

# Define a trainer: vocabulary capped at 10,000 entries, a minimum frequency
# of 2, and the listed special tokens reserved in the vocabulary
trainer = trainers.BpeTrainer(vocab_size=10000, min_frequency=2, special_tokens=["<unk>", "<s>", "</s>", "<pad>", "<mask>"])

# Train the tokenizer on the dataframe's lemmatized text column
texts = df_bpe["title_and_text_lemmatized"].tolist()
tokenizer_bpe.train_from_iterator(texts, trainer)

# The trained vocabulary (token -> id) and its size, used in the comparison
vocab_bpe = tokenizer_bpe.get_vocab()
vocab_size_bpe = len(vocab_bpe)

# Tokenize the column and add tokens as a new column
df_bpe['tokens'] = df_bpe["title_and_text_lemmatized"].apply(lambda x: tokenizer_bpe.encode(x).tokens)

## WordPiece

In [5]:
# Work on an independent copy of the loaded posts
df_wp = preprocessed_posts.copy()

# Initialize a WordPiece tokenizer. The keyword must be spelled `unk_token`;
# "[UNK]" is also reserved in the trainer's special tokens below.
tokenizer_wp = Tokenizer(models.WordPiece(unk_token="[UNK]"))

# Normalize the text with the BERT normalizer (optional but recommended)
tokenizer_wp.normalizer = normalizers.BertNormalizer()

# Use a pre-tokenizer to split text into words
tokenizer_wp.pre_tokenizer = pre_tokenizers.Whitespace()

# Define a trainer: vocabulary capped at 10,000 entries, a minimum frequency
# of 2, and the listed special tokens reserved in the vocabulary
trainer = trainers.WordPieceTrainer(vocab_size=10000, min_frequency=2, special_tokens=["[UNK]", "[CLS]", "[SEP]", "[PAD]", "[MASK]"])

# Train the tokenizer on the dataframe's lemmatized text column
texts = df_wp['title_and_text_lemmatized'].tolist()
tokenizer_wp.train_from_iterator(texts, trainer)

# The trained vocabulary (token -> id) and its size, used in the comparison
vocab_wordpiece = tokenizer_wp.get_vocab()
vocab_size_wordpiece = len(vocab_wordpiece)

# Tokenize the column and add tokens as a new column
df_wp['tokens'] = df_wp['title_and_text_lemmatized'].apply(lambda x: tokenizer_wp.encode(x).tokens)

## spaCy

In [8]:
# Load the small English spaCy pipeline used for tokenization
nlp = spacy.load("en_core_web_sm")

# Work on an independent copy of the loaded posts
df_spacy = preprocessed_posts.copy()

def tokenize_spacy(text):
    """
    Split a text into tokens with the module-level spaCy pipeline ``nlp``.

    Parameters:
    text (str): The text to run through the pipeline.

    Returns:
    list: The ``text`` of every token that is neither punctuation nor
    whitespace, in document order.
    """
    kept_tokens = []
    for tok in nlp(text):
        # Skip anything spaCy flags as punctuation or whitespace
        if tok.is_punct or tok.is_space:
            continue
        kept_tokens.append(tok.text)
    return kept_tokens

# Tokenize the lemmatized title/text column and store the token lists
df_spacy['tokens'] = df_spacy['title_and_text_lemmatized'].apply(tokenize_spacy)

# Build the vocabulary: collect every distinct token across all posts
vocab = set()
for post_tokens in df_spacy['tokens']:
    vocab.update(post_tokens)

# The vocabulary size is the number of distinct tokens
vocab_size_spacy = len(vocab)

## Unigram (word-level, CountVectorizer)

In [None]:
# Work on an independent copy of the loaded posts
df_unigram = preprocessed_posts.copy()
unigram_corpus = df_unigram['title_and_text_lemmatized']

# CountVectorizer with its default settings (unigrams)
vectorizer = CountVectorizer()

# Learn the vocabulary from the lemmatized text and build the count matrix
X = vectorizer.fit_transform(unigram_corpus)

# The learned feature names are the vocabulary; its length is the vocabulary size
vocab = vectorizer.get_feature_names_out()
vocab_size_unigram = len(vocab)

## Comparison of tokenizers

In [None]:
# Pair each tokenizer label with the vocabulary size measured for it
vocab_size_by_tokenizer = [
    ("BPE", vocab_size_bpe),
    ("Unigram", vocab_size_unigram),
    ("WordPiece", vocab_size_wordpiece),
    ("spaCy", vocab_size_spacy),
]

# Reshape the pairs into the column-oriented mapping the DataFrame is built from
results = {
    "Tokenizer": [label for label, size in vocab_size_by_tokenizer],
    "Vocab Size": [size for label, size in vocab_size_by_tokenizer],
}

# Build and show the comparison table
df_results = pd.DataFrame(results)
print(df_results)

   Tokenizer  Vocab Size
0        BPE       10000
1    Unigram       14797
2  WordPiece       10000
3      spaCy       14818
