# Text preprocessing: PyTorch, NLTK and scikit-learn

## Tokenization

Extracting tokens (words) from the text.

In [1]:
from torchtext.data.utils import get_tokenizer

In [2]:
tokenizer = get_tokenizer('basic_english')

In [3]:
corpus = """
A large language model (LLM) is a language model notable for its ability 
to achieve general-purpose language generation and other natural language processing tasks such as classification. 
LLMs acquire these abilities by learning statistical relationships from text documents during a computationally 
intensive self-supervised and semi-supervised training process.
"""

In [4]:
tokens = tokenizer(corpus)

In [5]:
print(tokens)

['a', 'large', 'language', 'model', '(', 'llm', ')', 'is', 'a', 'language', 'model', 'notable', 'for', 'its', 'ability', 'to', 'achieve', 'general-purpose', 'language', 'generation', 'and', 'other', 'natural', 'language', 'processing', 'tasks', 'such', 'as', 'classification', '.', 'llms', 'acquire', 'these', 'abilities', 'by', 'learning', 'statistical', 'relationships', 'from', 'text', 'documents', 'during', 'a', 'computationally', 'intensive', 'self-supervised', 'and', 'semi-supervised', 'training', 'process', '.']


## Stop word removal

In [None]:
import nltk
nltk.download('stopwords')

In [7]:
from nltk.corpus import stopwords

In [8]:
stop_words = set(stopwords.words("english"))

In [9]:
# Drop every token that appears in the English stop-word set.
# NOTE(review): the set difference also removes duplicate tokens and does not
# keep the original word order, so the later steps (stemming, frequency
# counts) operate on unique tokens only.
tokens = list(set(tokens) - set(stop_words))

In [10]:
print(tokens)

['abilities', 'large', 'generation', '(', '.', 'classification', 'process', 'acquire', 'language', 'training', 'statistical', 'self-supervised', 'relationships', 'tasks', 'model', 'learning', 'natural', 'intensive', 'text', ')', 'semi-supervised', 'ability', 'achieve', 'notable', 'llms', 'llm', 'computationally', 'documents', 'general-purpose', 'processing']


## Stemming

In [11]:
from nltk.stem import PorterStemmer

In [12]:
stemmer = PorterStemmer()

In [13]:
tokens = [stemmer.stem(i) for i in tokens]

In [14]:
print(tokens)

['abil', 'larg', 'gener', '(', '.', 'classif', 'process', 'acquir', 'languag', 'train', 'statist', 'self-supervis', 'relationship', 'task', 'model', 'learn', 'natur', 'intens', 'text', ')', 'semi-supervis', 'abil', 'achiev', 'notabl', 'llm', 'llm', 'comput', 'document', 'general-purpos', 'process']


## Rare word removal

Removing infrequent words

In [15]:
from nltk.probability import FreqDist

In [16]:
tokens_freq = FreqDist(tokens)

In [17]:
tokens_freq

FreqDist({'abil': 2, 'process': 2, 'llm': 2, 'larg': 1, 'gener': 1, '(': 1, '.': 1, 'classif': 1, 'acquir': 1, 'languag': 1, ...})

In [18]:
threshold = 2

In [19]:
tokens = [i for i in tokens if tokens_freq[i]>=threshold]

In [20]:
print(tokens)

['abil', 'process', 'abil', 'llm', 'llm', 'process']


## Encoding

* One-hot encoding: words mapped into a binary vector
* Bag of words (BOW): word frequency disregarding the order
* TF-IDF: word uniqueness and importance
* Embeddings: convert words into vectors

### One-hot encoding

In [21]:
import torch

In [22]:
vocab = ['achieve', 'natural', 'ability']

In [23]:
one_hot_vectors = torch.eye(len(vocab))

In [24]:
one_hot_dict = {word: one_hot_vectors[i] for i, word in enumerate(vocab)}

In [25]:
one_hot_dict

{'achieve': tensor([1., 0., 0.]),
 'natural': tensor([0., 1., 0.]),
 'ability': tensor([0., 0., 1.])}

### BOW

In [26]:
from sklearn.feature_extraction.text import CountVectorizer

In [27]:
# Bag-of-words counts: one row per piece of the corpus, one column per word.
# NOTE: splitting on '.' yields three pieces for this corpus -- the two
# sentences plus the whitespace-only remainder after the final period --
# which is why the matrix below has 3 rows.
vectorizer = CountVectorizer()
X = vectorizer.fit_transform(corpus.split('.'))

In [28]:
X.toarray().shape

(3, 41)

In [29]:
X.toarray()[:1]

array([[0, 1, 1, 0, 1, 1, 0, 1, 0, 0, 0, 1, 0, 1, 1, 0, 1, 1, 4, 1, 0, 1,
        0, 2, 1, 1, 1, 0, 1, 1, 0, 0, 0, 0, 1, 0, 1, 0, 0, 1, 0]])

In [30]:
print(vectorizer.get_feature_names_out())

['abilities' 'ability' 'achieve' 'acquire' 'and' 'as' 'by'
 'classification' 'computationally' 'documents' 'during' 'for' 'from'
 'general' 'generation' 'intensive' 'is' 'its' 'language' 'large'
 'learning' 'llm' 'llms' 'model' 'natural' 'notable' 'other' 'process'
 'processing' 'purpose' 'relationships' 'self' 'semi' 'statistical' 'such'
 'supervised' 'tasks' 'text' 'these' 'to' 'training']


### TF-IDF

In [31]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [32]:
# TF-IDF weights: one row per piece of the corpus, one column per word.
# NOTE: splitting on '.' yields three pieces for this corpus -- the two
# sentences plus the whitespace-only remainder after the final period --
# which is why the matrix below has 3 rows.
vectorizer = TfidfVectorizer()
X = vectorizer.fit_transform(corpus.split('.'))

In [33]:
# shape (sentence, word)
X.toarray().shape

(3, 41)

In [34]:
X.toarray()[:1]

array([[0.        , 0.15895379, 0.15895379, 0.        , 0.12088845,
        0.15895379, 0.        , 0.15895379, 0.        , 0.        ,
        0.        , 0.15895379, 0.        , 0.15895379, 0.15895379,
        0.        , 0.15895379, 0.15895379, 0.63581516, 0.15895379,
        0.        , 0.15895379, 0.        , 0.31790758, 0.15895379,
        0.15895379, 0.15895379, 0.        , 0.15895379, 0.15895379,
        0.        , 0.        , 0.        , 0.        , 0.15895379,
        0.        , 0.15895379, 0.        , 0.        , 0.15895379,
        0.        ]])

In [35]:
print(vectorizer.get_feature_names_out())

['abilities' 'ability' 'achieve' 'acquire' 'and' 'as' 'by'
 'classification' 'computationally' 'documents' 'during' 'for' 'from'
 'general' 'generation' 'intensive' 'is' 'its' 'language' 'large'
 'learning' 'llm' 'llms' 'model' 'natural' 'notable' 'other' 'process'
 'processing' 'purpose' 'relationships' 'self' 'semi' 'statistical' 'such'
 'supervised' 'tasks' 'text' 'these' 'to' 'training']


# PyTorch Dataset and DataLoader

* Dataset is a container for preprocessed text
* DataLoader is a batch loader with shuffle and multiprocessing 

In [36]:
from torch.utils.data import Dataset, DataLoader

In [37]:
class CustomDataset(Dataset):
    """Minimal dataset that wraps an indexable collection.

    The wrapped object is stored unchanged in ``self.text``; length and item
    lookup are delegated to it.
    """

    def __init__(self, text):
        # Keep a reference to the caller's collection; nothing is copied.
        self.text = text

    def __getitem__(self, idx):
        """Return the element of the wrapped collection at position ``idx``."""
        sample = self.text[idx]
        return sample

    def __len__(self):
        """Return the number of elements in the wrapped collection."""
        size = len(self.text)
        return size

In [None]:
# NOTE(review): `encoded_text` is not defined in any visible cell of this
# notebook; it has to be assigned (for example to an encoded matrix produced
# by one of the vectorizers) before this cell can run.
dataset = CustomDataset(encoded_text)

In [None]:
dataloader = DataLoader(dataset, batch_size=2, shuffle=True)

# Example Shakespeare dataset

In [39]:
import io

In [40]:
text = io.open('./data/shakespeare.txt', encoding='utf-8')

In [41]:
with text as file:
    lines = [line.rstrip() for line in file]

In [42]:
lines = [line.lower() for line in lines if line!='']

In [43]:
len(lines)

2158

In [44]:
lines[:5]

['the sonnets',
 'by william shakespeare',
 'from fairest creatures we desire increase,',
 "that thereby beauty's rose might never die,",
 'but as the riper should by time decease,']

In [45]:
# Create a list of stopwords
stop_words = set(stopwords.words("english"))

# Initialize the tokenizer and stemmer
tokenizer = get_tokenizer("basic_english")
stemmer = PorterStemmer() 

# Preprocess sentences: lowercase, tokenize, remove stop words, and stem
def preprocess_sentences(sentences):
    """Normalise each sentence and return the cleaned sentences as strings.

    Every sentence is lower-cased, split with the module-level ``tokenizer``,
    stripped of tokens found in ``stop_words`` and reduced to stems with
    ``stemmer``; the surviving stems are re-joined with single spaces.

    Args:
        sentences: Iterable of sentence strings.

    Returns:
        A list with one processed string per input sentence, in input order.
    """
    def _clean(raw):
        # Tokenize first, then filter and stem in a single pass.
        words = tokenizer(raw.lower())
        stems = [stemmer.stem(word) for word in words if word not in stop_words]
        return ' '.join(stems)

    return [_clean(raw) for raw in sentences]

processed_shakespeare = preprocess_sentences(lines)
print(processed_shakespeare[:5]) 

['sonnet', 'william shakespear', 'fairest creatur desir increas ,', "therebi beauti ' rose might never die ,", 'riper time deceas ,']


In [46]:
class ShakespeareDataset(Dataset):
    """Dataset wrapper around an indexable collection of encoded sentences.

    The collection is kept unchanged in ``self.text``; ``len()`` and
    indexing are forwarded to it.
    """

    def __init__(self, text):
        # Store the caller's collection directly; no copy is made.
        self.text = text

    def __getitem__(self, idx):
        """Return the item stored at position ``idx``."""
        item = self.text[idx]
        return item

    def __len__(self):
        """Return how many items the wrapped collection holds."""
        count = len(self.text)
        return count

def encode_sentences(sentences):
    """Encode sentences as bag-of-words count vectors.

    A fresh ``CountVectorizer`` is fitted on ``sentences`` and used to
    transform them in the same step.

    Args:
        sentences: Iterable of sentence strings to fit and encode.

    Returns:
        A ``(counts, vectorizer)`` tuple: the dense count array produced by
        ``toarray()`` and the fitted vectorizer.
    """
    bow_vectorizer = CountVectorizer()
    sparse_counts = bow_vectorizer.fit_transform(sentences)
    dense_counts = sparse_counts.toarray()
    return dense_counts, bow_vectorizer

def text_processing_pipeline(sentences, *, batch_size=2, shuffle=True):
    """Turn raw sentences into a batched DataLoader of bag-of-words vectors.

    The sentences are cleaned with ``preprocess_sentences``, encoded with
    ``encode_sentences``, wrapped in a ``ShakespeareDataset`` and served
    through a ``DataLoader``.

    Args:
        sentences: Iterable of raw sentence strings. Preprocessing is applied
            inside this function, so pass unprocessed text.
        batch_size: Number of encoded sentences per batch. Defaults to 2,
            the value that was previously hard-coded.
        shuffle: Whether the DataLoader shuffles the samples. Defaults to
            True, the value that was previously hard-coded.

    Returns:
        A ``(dataloader, vectorizer)`` tuple: the DataLoader over the encoded
        sentences and the fitted vectorizer returned by ``encode_sentences``.
    """
    processed_sentences = preprocess_sentences(sentences)
    encoded_sentences, vectorizer = encode_sentences(processed_sentences)
    dataset = ShakespeareDataset(encoded_sentences)
    dataloader = DataLoader(dataset, batch_size=batch_size, shuffle=shuffle)
    return dataloader, vectorizer

In [47]:
# Feed the raw lines to the pipeline: it calls preprocess_sentences itself, so
# passing processed_shakespeare would remove stop words and stem a second time.
dataloader, vectorizer = text_processing_pipeline(lines)

# Print the first 5 feature names and the first 5 components of the first
# item in one (shuffled) batch
print(vectorizer.get_feature_names_out()[:5])
print(next(iter(dataloader))[0, :5])

['ab' 'abhor' 'abid' 'abl' 'absenc']
tensor([0, 0, 0, 0, 0])
