In [1]:
# Toy dataset of (sentence, label) pairs; label 1 marks the positive-sounding
# sentences and 0 the negative-sounding ones.
data = [
    ("I love this movie", 1),
    ("This film is terrible", 0),
    ("What a great movie", 1),
    ("I hated this film", 0),
    ("Amazing acting and good story", 1),
    ("Bad plot and boring", 0),
]

In [2]:
# Lowercase every sentence and split it on whitespace.
tokens = [sentence.lower().split() for sentence, _label in data]

# Distinct words across the whole corpus.
unique_tokens = set(word for sentence in tokens for word in sentence)

# Word indices start at 2 because 0 and 1 are reserved for the special tokens.
# The words are sorted first so the same word gets the same index on every
# run; iterating the set directly gives an order that can change between
# interpreter sessions.
vocab = {word: idx for idx, word in enumerate(sorted(unique_tokens), start=2)}

# PAD and UNK need distinct indices, otherwise an unknown word is
# indistinguishable from padding.
vocab['PAD'], vocab['UNK'] = 0, 1

In [3]:
import spacy
# Load spaCy's small English pipeline
nlp = spacy.load("en_core_web_sm")

# One list of token strings per sentence, in the original casing
nlp_tokens = [[tok.text for tok in nlp(text)] for text, _label in data]

In [4]:
from transformers import BertTokenizer

# Instantiate the pretrained tokenizer for the 'bert-base-uncased' checkpoint
tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")

# Tokenize the lowercased text of every (sentence, label) pair
tt_tokens = [tokenizer.tokenize(text.lower()) for text, _label in data]

In [5]:
from tensorflow.keras.preprocessing.text import Tokenizer

# Keras tokenizer with default settings
tokenizer = Tokenizer()

# Build the word index from the lowercased sentences ...
tokenizer.fit_on_texts([text.lower() for text, _label in data])

# ... then encode the same sentences as lists of word indices
tk_tokens = tokenizer.texts_to_sequences([text.lower() for text, _label in data])

In [7]:
from transformers import BertTokenizer

# Instantiate the pretrained tokenizer for the 'bert-base-uncased' checkpoint
tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")

def tokenize_function(text):
    """Lowercase ``text`` and tokenize it with the module-level ``tokenizer``.

    Args:
        text: The string to tokenize.

    Returns:
        Whatever ``tokenizer.tokenize`` returns for the lowercased text.
    """
    lowered = text.lower()
    return tokenizer.tokenize(lowered)

def tokenize_data(data):
    """Tokenize the sentence of every ``(sentence, label)`` pair.

    Args:
        data: Iterable of ``(sentence, label)`` pairs.

    Returns:
        A list of ``(tokens, label)`` pairs in input order, where ``tokens``
        is the result of ``tokenize_function(sentence)``.
    """
    tokenized = []
    for sentence, label in data:
        tokenized.append((tokenize_function(sentence), label))
    return tokenized

# Example usage:
# Replace `data` with your actual dataset (list of tuples with sentences and labels)
# tokenized_texts = tokenize_data(data)

# For illustration, here's how you would print the tokenized sentences.
# Each item of `tokenized_texts` is a (tokens, label) pair:
# for tokens, label in tokenized_texts:
#     print(f"Tokenized: {tokens}")
#     print(f"Label: {label}")

In [None]:
from tensorflow.keras.layers import TextVectorization

# TensorFlow's TextVectorization layer is optimized for performance and
# integrates directly into TensorFlow models. It can be used for large
# datasets and avoids manually handling the vocabulary.

# Build the input here instead of relying on a `texts` variable left behind by
# another cell: in notebook order `texts` is only assigned further down, so
# this cell would otherwise fail on a fresh kernel.
texts = [sentence[0].lower() for sentence in data]

# Create TextVectorization layer
vectorizer = TextVectorization(max_tokens=10000, output_mode='int', output_sequence_length=100)

# Fit the vectorizer (learns the vocabulary from the texts)
vectorizer.adapt(texts)

# Transform text to sequence of integers
sequences = vectorizer(texts)

In [None]:
from tensorflow.keras.layers import TextVectorization

# Set up the TextVectorization layer.
# The layer has no `oov_token` argument (passing one is rejected as an unknown
# keyword argument). In 'int' mode it handles out-of-vocabulary words itself:
# index 0 is reserved for padding and index 1 for the built-in "[UNK]" token.
vectorizer = TextVectorization(
    max_tokens=10000,           # Limit vocabulary size to 10,000 most frequent words
    output_mode='int',          # Return sequences of integers
    output_sequence_length=50,  # Ensure uniform sequence length (you can adjust this)
)

# NOTE(review): `flat_texts` is not defined in this cell. In this notebook it
# is only assigned by the spaCy cleaning cells further down, so one of those
# has to run before this cell.
# Fit on the dataset (adapt to learn the vocabulary)
vectorizer.adapt(flat_texts)

# Transform text to sequences
padded_sequences = vectorizer(flat_texts)
print(padded_sequences)

In [None]:
from concurrent.futures import ProcessPoolExecutor

def clean_data_parallel(data_batch):
    """Clean and tokenize every text in one batch.

    This is the unit of work handed to a worker when preprocessing is
    parallelized. For very large datasets, spreading loading/preprocessing
    across workers (joblib, concurrent.futures, multiprocessing) can
    significantly speed up the pipeline.

    Args:
        data_batch: Iterable of texts; each item is passed unchanged to
            ``clean_and_tokenize``.

    Returns:
        A list with one ``clean_and_tokenize`` result per input item, in order.
    """
    cleaned = []
    for text in data_batch:
        cleaned.append(clean_and_tokenize(text))
    return cleaned

# Split the sentences into batches for parallel processing.
# `data` holds (sentence, label) tuples, but clean_data_parallel hands every
# batch item straight to clean_and_tokenize, which calls .lower() on it. Only
# the sentence strings are therefore put into the batches.
batch_size = 500
batches = [
    [sentence for sentence, _label in data[i:i + batch_size]]
    for i in range(0, len(data), batch_size)
]

# Use ProcessPoolExecutor to clean data in parallel.
# NOTE(review): ProcessPoolExecutor needs the worker function to be importable
# by the child processes; functions defined in a notebook cell may not be on
# platforms that spawn workers. Confirm on the target platform, or move the
# helpers into a .py module.
with ProcessPoolExecutor() as executor:
    results = list(executor.map(clean_data_parallel, batches))

# Flatten the list of per-batch results into one list of cleaned texts
cleaned_texts = [item for sublist in results for item in sublist]

In [9]:
from tensorflow.keras.preprocessing.text import Tokenizer

# Prepare data (lowercase text)
texts = [sentence[0].lower() for sentence in data]

# Initialize tokenizer
tokenizer = Tokenizer(oov_token="<UNK>")  # For handling out-of-vocabulary words
tokenizer.fit_on_texts(texts)

# Create vocabulary (words mapped to integer indices).
# Copy word_index rather than aliasing it: writing 'PAD' into the tokenizer's
# own dictionary would leave it out of step with tokenizer.index_word.
vocab = dict(tokenizer.word_index)

# Add PAD token at index 0 (the Tokenizer never assigns 0 to a word)
vocab['PAD'] = 0

In [None]:
from tensorflow.keras.preprocessing.text import Tokenizer

# Toy dataset of (sentence, label) pairs
data = [
    ("I love this movie", 1),
    ("This film is terrible", 0),
    ("What a great movie", 1),
    ("I hated this film", 0),
    ("Amazing acting and good story", 1),
    ("Bad plot and boring", 0),
]

# Prepare text data (extract only the sentences and convert to lowercase)
texts = [sentence[0].lower() for sentence in data]

# Initialize the Tokenizer
tokenizer = Tokenizer(oov_token="<UNK>")  # Out-of-vocabulary token

# Fit the tokenizer on the text data (builds the vocabulary)
tokenizer.fit_on_texts(texts)

# Create the vocabulary (words -> index mapping).
# Copy word_index rather than aliasing it: writing 'PAD' into the tokenizer's
# own dictionary would leave it out of step with tokenizer.index_word.
vocab = dict(tokenizer.word_index)

# Add padding at index 0, the value pad_sequences fills with by default
vocab['PAD'] = 0

# Show the vocabulary (word -> index mapping)
print("Vocabulary:", vocab)

# Convert the texts into sequences (word indices)
sequences = tokenizer.texts_to_sequences(texts)

# Show sequences (list of word indices)
print("Sequences:", sequences)

from tensorflow.keras.preprocessing.sequence import pad_sequences

# Pad the sequences (at the end) so that they all have the same length
padded_sequences = pad_sequences(sequences, padding='post')

print("Padded Sequences:", padded_sequences)

In [None]:
import spacy #with spacy
from tensorflow.keras.preprocessing.text import Tokenizer as tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences as padding

# spaCy pipeline used by clean_and_tokenize in this cell
nlp = spacy.load("en_core_web_sm")

# Toy dataset of (sentence, label) pairs
data = [
    ("I love this movie", 1),
    ("This film is terrible", 0),
    ("What a great movie", 1),
    ("I hated this film", 0),
    ("Amazing acting and good story", 1),
    ("Bad plot and boring", 0),
]

def clean_and_tokenize(text):
    """Lowercase ``text``, run it through spaCy and keep the content tokens.

    Tokens flagged by spaCy as stop words or punctuation are dropped; every
    other token is kept as-is (there is no alphanumeric filter).

    Args:
        text: The raw sentence.

    Returns:
        A list with the text of each kept token, in order.
    """
    kept = []
    for token in nlp(text.lower()):
        if token.is_stop or token.is_punct:
            continue
        kept.append(token.text)
    return kept

# Clean and tokenize the text data
texts = [clean_and_tokenize(sentence[0]) for sentence in data]

# Initialize the Tokenizer.
# num_words does not shrink word_index (it still lists every word seen); it
# caps what texts_to_sequences emits to the most frequent words, and anything
# beyond that is encoded as the OOV token.
# NOTE: this rebinds the imported class alias `tokenizer` to an instance.
tokenizer = tokenizer(num_words=10000, oov_token="<UNK>")  # Out-of-vocabulary token

# Fit the tokenizer on the cleaned text data
flat_texts = [" ".join(sentence) for sentence in texts]  # Join words back into a sentence
tokenizer.fit_on_texts(flat_texts)

# Create the vocabulary (word -> index mapping).
# Copy word_index rather than aliasing it: writing 'PAD' into the tokenizer's
# own dictionary would leave it out of step with tokenizer.index_word.
vocab = dict(tokenizer.word_index)

# Add padding token with index 0
vocab['PAD'] = 0

# Show the vocabulary (word -> index mapping)
print("Vocabulary:", vocab)

# Convert cleaned and tokenized sentences into sequences (word indices)
sequences = tokenizer.texts_to_sequences(flat_texts)

# Pad the sequences (at the end) so they all have the same length
padded_sequences = padding(sequences, padding='post')

# Show sequences and padded sequences
print("Sequences:", sequences)
print("Padded Sequences:", padded_sequences)


In [None]:
import spacy #ngrams
from tensorflow.keras.preprocessing.text import Tokenizer as tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences as padding
from nltk.util import ngrams

# spaCy pipeline used by clean_and_tokenize in this cell
nlp = spacy.load("en_core_web_sm")

# Toy dataset of (sentence, label) pairs
data = [
    ("I love this movie", 1),
    ("This film is terrible", 0),
    ("What a great movie", 1),
    ("I hated this film", 0),
    ("Amazing acting and good story", 1),
    ("Bad plot and boring", 0),
]

def clean_and_tokenize(text):
    """Lowercase ``text``, run it through spaCy and keep the content tokens.

    Tokens flagged by spaCy as stop words or punctuation are dropped; every
    other token is kept as-is (there is no alphanumeric filter).

    Args:
        text: The raw sentence.

    Returns:
        A list with the text of each kept token, in order.
    """
    kept = []
    for token in nlp(text.lower()):
        if token.is_stop or token.is_punct:
            continue
        kept.append(token.text)
    return kept

def generate_ngrams(tokens, n=2):
    """Build space-joined n-grams from a token sequence.

    Args:
        tokens: Sequence of token strings.
        n: Size of each n-gram (2 = bigrams, 3 = trigrams, ...).

    Returns:
        A list of strings, one per n-gram produced by ``ngrams(tokens, n)``,
        with the tokens of each n-gram joined by a single space.
    """
    joined = []
    for gram in ngrams(tokens, n):
        joined.append(' '.join(gram))
    return joined

# Clean and tokenize the text data
texts = [clean_and_tokenize(sentence[0]) for sentence in data]

# Generate n-grams (for example, bigrams)
n = 2  # For bigrams, change to 3 for trigrams, etc.
texts_with_ngrams = []

for text in texts:
    # Generate n-grams and append to the tokenized sentence
    ngram_tokens = generate_ngrams(text, n)
    texts_with_ngrams.append(text + ngram_tokens)  # Combine words and n-grams

# Initialize the Tokenizer
# NOTE: this rebinds the imported class alias `tokenizer` to an instance.
tokenizer = tokenizer(oov_token="<UNK>")  # Out-of-vocabulary token

# Space-joined view of each sentence (words followed by n-grams), kept for
# display and for other cells that read `flat_texts`.
flat_texts = [" ".join(sentence) for sentence in texts_with_ngrams]

# Fit on the token lists, not on flat_texts. Each n-gram is itself a
# space-joined string ("love movie"), so a flat string would be split on
# whitespace again and no n-gram would ever get its own vocabulary entry.
# Given a list of token lists, the Tokenizer treats every element as one token.
tokenizer.fit_on_texts(texts_with_ngrams)

# Create the vocabulary (word / n-gram -> index mapping).
# Copy word_index rather than aliasing it: writing 'PAD' into the tokenizer's
# own dictionary would leave it out of step with tokenizer.index_word.
vocab = dict(tokenizer.word_index)

# Add padding token with index 0
vocab['PAD'] = 0

# Show the vocabulary (word -> index mapping)
print("Vocabulary:", vocab)

# Convert the token lists (words + n-grams) into sequences of indices
sequences = tokenizer.texts_to_sequences(texts_with_ngrams)

# Pad the sequences (at the end) so they all have the same length
padded_sequences = padding(sequences, padding='post')

# Show sequences and padded sequences
print("Sequences:", sequences)
print("Padded Sequences:", padded_sequences)


In [None]:
import spacy #parts of speech tagger
from tensorflow.keras.preprocessing.text import Tokenizer as tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences as padding

# spaCy pipeline used by clean_and_tokenize_with_pos in this cell
nlp = spacy.load("en_core_web_sm")

# Toy dataset of (sentence, label) pairs
data = [
    ("I love this movie", 1),
    ("This film is terrible", 0),
    ("What a great movie", 1),
    ("I hated this film", 0),
    ("Amazing acting and good story", 1),
    ("Bad plot and boring", 0),
]

def clean_and_tokenize_with_pos(text):
    """Lowercase ``text``, run it through spaCy and tag the content tokens.

    Tokens flagged by spaCy as stop words or punctuation are dropped.

    Args:
        text: The raw sentence.

    Returns:
        A list of ``(token text, POS tag)`` tuples for the kept tokens, in
        order; the tag is the token's ``pos_`` attribute.
    """
    tagged = []
    for token in nlp(text.lower()):
        if token.is_stop or token.is_punct:
            continue
        tagged.append((token.text, token.pos_))
    return tagged

# Clean and tokenize the text data with POS tags
texts_with_pos = [clean_and_tokenize_with_pos(sentence[0]) for sentence in data]

# Show tokenized text with POS tags
print("Tokenized and POS-tagged Texts:", texts_with_pos)

# Initialize the Tokenizer
# NOTE: this rebinds the imported class alias `tokenizer` to an instance.
tokenizer = tokenizer(oov_token="<UNK>")  # Out-of-vocabulary token

# Prepare the text data: one space-joined string per sentence, built from the
# words only (the POS tags are not fed to the Tokenizer)
flat_texts = [" ".join([token[0] for token in sentence]) for sentence in texts_with_pos]

# Fit the tokenizer on the cleaned text data
tokenizer.fit_on_texts(flat_texts)

# Create the vocabulary (word -> index mapping).
# Copy word_index rather than aliasing it: writing 'PAD' into the tokenizer's
# own dictionary would leave it out of step with tokenizer.index_word.
vocab = dict(tokenizer.word_index)

# Add padding token with index 0
vocab['PAD'] = 0

# Show the vocabulary (word -> index mapping)
print("Vocabulary:", vocab)

# Convert cleaned and tokenized sentences into sequences (word indices)
sequences = tokenizer.texts_to_sequences(flat_texts)

# Pad the sequences (at the end) so they all have the same length
padded_sequences = padding(sequences, padding='post')

# Show sequences and padded sequences
print("Sequences:", sequences)
print("Padded Sequences:", padded_sequences)


In [None]:
import spacy #name entity recognizer
from tensorflow.keras.preprocessing.text import Tokenizer as tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences as padding

# spaCy pipeline used by clean_and_tokenize_with_ner in this cell
nlp = spacy.load("en_core_web_sm")

# Toy dataset of (sentence, label) pairs
data = [
    ("I love this movie", 1),
    ("This film is terrible", 0),
    ("What a great movie", 1),
    ("I hated this film", 0),
    ("Amazing acting and good story", 1),
    ("Bad plot and boring", 0),
]

def clean_and_tokenize_with_ner(text):
    """Tokenize ``text`` with spaCy and return its content tokens and entities.

    The text is handed to spaCy in its original casing and the token texts are
    lowercased afterwards. Lowercasing before parsing would strip the
    capitalization that entity recognition uses as a cue, so names such as
    places or people would mostly go undetected.

    Args:
        text: The raw sentence.

    Returns:
        A ``(tokens, entities)`` tuple where ``tokens`` is a list of
        ``(lowercased token text, POS tag)`` for every token that is neither a
        stop word nor punctuation, and ``entities`` is a list of
        ``(entity text, entity label)`` taken from ``doc.ents``.
    """
    doc = nlp(text)  # keep original casing for the tagger / entity recognizer
    tokens = [
        (token.text.lower(), token.pos_)
        for token in doc
        if not token.is_stop and not token.is_punct
    ]
    # Extract named entities as (entity text, entity type)
    entities = [(ent.text, ent.label_) for ent in doc.ents]
    return tokens, entities

# Clean, tokenize, and extract NER for each sentence.
# Each entry is a (tokens, entities) tuple as returned by clean_and_tokenize_with_ner.
texts_with_pos_and_entities = [clean_and_tokenize_with_ner(sentence[0]) for sentence in data]

# Show tokenized text with POS tags and Named Entities
print("Tokenized Texts with POS and Named Entities:", texts_with_pos_and_entities)

# Initialize the Tokenizer
# NOTE: this rebinds the imported class alias `tokenizer` to an instance.
tokenizer = tokenizer(oov_token="<UNK>")  # Out-of-vocabulary token

# Prepare the text data: one space-joined string per sentence, built from the
# words only (sentence[0] is the token list; POS tags and entities are not used)
flat_texts = [" ".join([token[0] for token in sentence[0]]) for sentence in texts_with_pos_and_entities]

# Fit the tokenizer on the cleaned text data
tokenizer.fit_on_texts(flat_texts)

# Create the vocabulary (word -> index mapping).
# Copy word_index rather than aliasing it: writing 'PAD' into the tokenizer's
# own dictionary would leave it out of step with tokenizer.index_word.
vocab = dict(tokenizer.word_index)

# Add padding token with index 0
vocab['PAD'] = 0

# Show the vocabulary (word -> index mapping)
print("Vocabulary:", vocab)

# Convert cleaned and tokenized sentences into sequences (word indices)
sequences = tokenizer.texts_to_sequences(flat_texts)

# Pad the sequences (at the end) so they all have the same length
padded_sequences = padding(sequences, padding='post')

# Show sequences and padded sequences
print("Sequences:", sequences)
print("Padded Sequences:", padded_sequences)