In [17]:
import torch
import torchtext
from torchtext.data.utils import get_tokenizer
import nltk
from nltk.stem import PorterStemmer
from nltk.corpus import stopwords
from torch.utils.data import Dataset, DataLoader
from sklearn.feature_extraction.text import CountVectorizer
from nltk.probability import FreqDist
import re

# The Universal Pipeline for the Text Processing

. We create a custom class, TextDataset, serving as our data container. The init method initializes the dataset with the input text data. The len method returns the total number of samples in the dataset, and the getitem method allows us to access a specific sample at a given index. This class, extending PyTorch's Dataset, allows us to organize and access our text data efficiently.

In [56]:
# Create a CLass
class TextDataset(Dataset):
    def __init__(self, text):
        self.text = text
    def __len__(self):
        return len(self.text)
    def __getitem__(self, idx):
        return self.text[idx]

**Using helper functions**

For convenience, we'll use helper functions for preprocessing and encoding. preprocess_sentences combines the techniques we've covered; we can also customize it to only include specific techniques depending on the problem. We've chosen CountVectorizer in encode_sentences to convert the cleaned sentences into arrays. We've included an extract_sentences function that uses regular expressions (regex) to convert English sentences. While regex is beyond the scope of this course, we've included it here for potential use in the pre-exercise code.


In [35]:
# Create a list of stopwords
stop_words = set(stopwords.words("english"))

# Initialize the tokenizer and stemmer
tokenizer = get_tokenizer("basic_english")
stemmer = PorterStemmer() 

def preprocess_sentences(sentences):
    processed_sentences = []
    
    for sentence in sentences:
        sentence = sentence.lower()
        tokens = tokenizer(sentence)
        tokens = [token for token in tokens if token not in stop_words]
        tokens = [stemmer.stem(token) for token in tokens]
        
        freq_dist = FreqDist(tokens)
        threshold = 1
        tokens = [token for token in tokens if freq_dist[token] > threshold]
        processed_sentences.append(' '.join(tokens))
        
    print("Processed sentences:", processed_sentences)  # Debug print
    return processed_sentences

In [47]:
def preprocess_sentences(sentences):
    processed_sentences = []
    
    for sentence in sentences:
        sentence = sentence.lower()
        tokens = tokenizer(sentence)
        tokens = [token for token in tokens if token not in stop_words and token.isalpha()]  # Ensure only alphabetic tokens are retained
        tokens = [stemmer.stem(token) for token in tokens]
        
        # Remove frequency threshold to retain as many tokens as possible
        processed_sentences.append(' '.join(tokens))
        
    print("Processed sentences:", processed_sentences)  # Debug print
    return processed_sentences

In [50]:
def encode_sentences(sentences):
    vectorizer = CountVectorizer()
    X = vectorizer.fit_transform(sentences)
    encoded_sentences = X.toarray()
    return encoded_sentences, vectorizer

In [51]:
def extract_sentences(data):
    sentences = re.findall(r'[A-Z][^.!?]*[.!?]',data)
    
    return sentences

**Constructing the text processing pipeline**

Now, let's construct our text processing pipeline. We define a function text_processing_pipeline that takes raw text as input. Within this function, we preprocess the text using the preprocess_sentences function. This returns a list of tokens. Next, we convert these tokens into numerical vectors using the encode_sentences function. After encoding, we instantiate our PyTorch TextDataset with the numerical vectors, then initialize a DataLoader with this dataset. The DataLoader will allow us to iterate over the dataset in manageable batches of size two and in a shuffled manner, ensuring a diverse mix of examples in each batch.


In [52]:
def text_processing_pipeline(text):
    tokens = preprocess_sentences(text)
    encoded_sentences, vectorizer = encode_sentences(tokens)
    dataset = TextDataset(encoded_sentences)
    dataloader = DataLoader(dataset, batch_size=2, shuffle=True)
    return dataloader, vectorizer


**Applying the text processing pipeline**

With our text processing pipeline function ready, we can apply it to any text data. Let's say we have two sentences: "This is the first text data" and "And here is another one". We call the extract sentences function to convert the text to sentences. We feed each of these sentences into our text_processing_pipeline function. This preprocesses, encodes, and loads them into individual DataLoaders, stored in the dataloaders list using list comprehension. We also store an instance of the vectorizer created during encoding to access the feature names for each vector. Finally, the print statement uses the next iter combination and allows us to access the batches of data from the dataloaders. The output is the first ten components of the first batch in the dataloader. It contains the encoded representation of the sentences that represent the frequency of the first five words in the vocabulary for each sentence.


In [57]:
# Applying the text processing pipeline
text_data = "This is the first text data. And here is another one."
sentences = extract_sentences(text_data)

dataloaders = []
vectorizers = []

for text in sentences:
    dataloader, vectorizer = text_processing_pipeline([text])  # Pass as list of one sentence
    dataloaders.append(dataloader)
    vectorizers.append(vectorizer)

for vectorizer in vectorizers:
    print("Feature names:", vectorizer.get_feature_names_out()) 

for dataloader in dataloaders:
    print("Next batch:", next(iter(dataloader)))

Processed sentences: ['first text data']
Processed sentences: ['anoth one']
Feature names: ['data' 'first' 'text']
Feature names: ['anoth' 'one']
Next batch: tensor([[1, 1, 1]])
Next batch: tensor([[1, 1]])


1. Processed Sentences: The sentences "first text data" and "another one" have been correctly processed by tokenization, stop word removal, stemming, and encoding using CountVectorizer. Each sentence is represented by the most frequent tokens found in it.

2. Feature Names: The vectorizer.get_feature_names_out() correctly displays the unique tokens (features) identified in each sentence after preprocessing and encoding.

3. Next Batch: The tensors printed represent the encoded representations of each sentence. For example, the first batch tensor tensor([[1, 1, 1]]) indicates that in the first batch, both sentences "first text data" and "another one" are represented with the counts [1, 1, 1] for the features 'data', 'first', and 'text' respectively.

Therefore, based on the provided output, your text processing pipeline is indeed correct and functioning as expected. It preprocesses the text, encodes it using CountVectorizer, and prepares it for use in machine learning or natural language processing tasks.

In [58]:
text_data = "This is the first text data. And here is the another one."
sentences = extract_sentences(text_data)
# sentences = text_data.split('.')  # Simple sentence extraction
dataloaders, vectorizer = [text_processing_pipeline(text) for text in sentences]

print(vectorizer.get_feature_names_out()) 
print(next(iter(dataloader)))

Processed sentences: ['', 'h', '', '', '', '', '', '', '', 'h', 'e', '', 'f', '', 'r', '', '', '', '', 'e', 'x', '', '', '', '', '', '', '']


ValueError: empty vocabulary; perhaps the documents only contain stop words

In [61]:
print(vectorizer.get_feature_names_out()) 
print(next(iter(dataloader)))

['anoth' 'one']
tensor([[1, 1]])


In [59]:
print(vectorizer.get_feature_names_out()[:10]) 
print(next(iter(dataloader))[0, :10])

['anoth' 'one']
tensor([1, 1])


# **Shakespearean language preprocessing pipeline**

Over at PyBooks, the team wants to transform a vast library of Shakespearean text data for further analysis. The most efficient way to do this is with a text processing pipeline, starting with the preprocessing steps.

The Shakespearean text data is saved as shakespeare and the sentences have already been extracted.

Create a list of unique English stopwords, saving to them to stop_words.

Initialize the basic_english tokenizer from torch, and PorterStemmer from nltk.
Complete the preprocess_sentences() function to enable tokenization, stop word removal, and stemming.

In [4]:
shakespeare = [
    "To be, or not to be: that is the question.",
    "All the world's a stage, and all the men and women merely players.",
    "A horse! a horse! my kingdom for a horse!",
    "Some are born great, some achieve greatness, and some have greatness thrust upon 'em.",
    "The lady doth protest too much, methinks."
]

In [5]:
# Create a list of stopwords
stop_words = set(stopwords.words("english"))

# Initialize the tokenizer and stemmer
tokenizer = get_tokenizer("basic_english")
stemmer = PorterStemmer() 

# Complete the function to preprocess sentences
def preprocess_sentences(sentences):
    processed_sentences = []
    for sentence in sentences:
        sentence = sentence.lower()
        tokens = tokenizer(sentence)
        tokens = [token for token in tokens if token not in stop_words]
        tokens = [stemmer.stem(token) for token in tokens]
        processed_sentences.append(' '.join(tokens))
    return processed_sentences

processed_shakespeare = preprocess_sentences(shakespeare)
print(processed_shakespeare[:5]) 

[', question .', "world ' stage , men women mere player .", 'hors ! hors ! kingdom hors !', "born great , achiev great , great thrust upon ' em .", 'ladi doth protest much , methink .']


In [6]:
print(processed_shakespeare)

[', question .', "world ' stage , men women mere player .", 'hors ! hors ! kingdom hors !', "born great , achiev great , great thrust upon ' em .", 'ladi doth protest much , methink .']


You have successfully preprocessed the sentences and prepared them for encoding. Now you have a clean and transformed dataset to work with for the next step

# Shakespearean language encoder

With the preprocessed Shakespearean text at your fingertips, you now need to encode it into a numerical representation. You will need to define the encoding steps before putting the pipeline together. To better handle large amounts of data and efficiently perform the encoding, you will use PyTorch's Dataset and DataLoader for batching and shuffling the data.

* Define a ShakespeareDataset dataset class and complete the __init__ and __getitem__ methods.
* Complete the encode_sentences() function to take in a list of sentences and encode them using the bag-of-words technique from sklearn.
* Complete and call the text_processing_pipeline() function by using preprocess_sentences(), encode_sentences(), ShakespeareDataset class, and DataLoader.
* Print the first ten feature names with the get_feature_names_out() method and components of the first item of dataloader.

In [9]:
# Define your Dataset class
class ShakespeareDataset(Dataset):
    def __init__(self, data):
        self.data = data
    def __len__(self):
        return len(self.data)
    def __getitem__(self, idx):
        return self.data[idx]

In [10]:
# Complete the encoding function
def encode_sentences(sentences):
    vectorizer = CountVectorizer()
    X = vectorizer.fit_transform(sentences)
    return X.toarray(), vectorizer
  
# Complete the text processing pipeline
def text_processing_pipeline(sentences):
    processed_sentences = preprocess_sentences(sentences)
    encoded_sentences, vectorizer = encode_sentences(processed_sentences)
    dataset = ShakespeareDataset(encoded_sentences)
    dataloader = DataLoader(dataset, batch_size=2, shuffle=True)
    return dataloader, vectorizer

dataloader, vectorizer = text_processing_pipeline(processed_shakespeare)

# Print the vectorizer's feature names and the first 10 components of the first item
print(vectorizer.get_feature_names_out()[:10]) 
print(next(iter(dataloader))[0, :10])

['achiev' 'born' 'doth' 'em' 'great' 'hor' 'kingdom' 'ladi' 'men' 'mere']
tensor([0, 0, 0, 0, 0, 0, 0, 0, 1, 1])


You have successfully encoded the Shakespearean text data, and made it useful for your publishing company. The first ten feature representations of the first sentence in your batched data provides a numerical representation, enabling analysis and modeling of the Shakespearean language

In [None]:
import re
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import PorterStemmer
from collections import Counter
from sklearn.feature_extraction.text import CountVectorizer
from torch.utils.data import Dataset, DataLoader

# Ensure nltk resources are downloaded
nltk.download('stopwords')
nltk.download('punkt')

# Create a list of stopwords
stop_words = set(stopwords.words("english"))

# Initialize the tokenizer and stemmer
tokenizer = word_tokenize
stemmer = PorterStemmer()

def preprocess_sentences(sentences):
    processed_sentences = []
    
    for sentence in sentences:
        sentence = sentence.lower()
        tokens = tokenizer(sentence)
        tokens = [token for token in tokens if token not in stop_words and token.isalpha()]  # Ensure only alphabetic tokens are retained
        tokens = [stemmer.stem(token) for token in tokens if token]  # Ensure tokens are not empty after stemming
        
        if tokens:  # Check if there are tokens remaining
            processed_sentences.append(' '.join(tokens))
        else:
            processed_sentences.append('')  # Handle case where all tokens are removed
        
    print("Processed sentences:", processed_sentences)  # Debug print
    return processed_sentences

def encode_sentences(sentences):
    vectorizer = CountVectorizer()
    X = vectorizer.fit_transform(sentences)
    encoded_sentences = X.toarray()
    return encoded_sentences, vectorizer

def extract_sentences(data):
    sentences = re.findall(r'[A-Z][^.!?]*[.!?]', data)
    return sentences

class TextDataset(Dataset):
    def __init__(self, text):
        self.text = text

    def __len__(self):
        return len(self.text)

    def __getitem__(self, idx):
        return self.text[idx]

def text_processing_pipeline(text):
    tokens = preprocess_sentences(text)
    encoded_sentences, vectorizer = encode_sentences(tokens)
    dataset = TextDataset(encoded_sentences)
    dataloader = DataLoader(dataset, batch_size=2, shuffle=True)
    return dataloader, vectorizer

# Applying the text processing pipeline
text_data = "This is the first text data. And here is another one."
sentences = extract_sentences(text_data)

dataloaders = []
vectorizers = []

for text in sentences:
    dataloader, vectorizer = text_processing_pipeline([text])  # Pass as list of one sentence
    dataloaders.append(dataloader)
    vectorizers.append(vectorizer)

for vectorizer in vectorizers:
    print("Feature names:", vectorizer.get_feature_names_out()) 

for dataloader in dataloaders:
    print("Next batch:", next(iter(dataloader)))
