## Text Preprocessing and Tokenization 

1. Make a dataset
2. Tokenization
3. Padding
4. Stop words
5. Grammar tagging (Part of Speech)


In [18]:


import tensorflow as tf

from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
# nltk

from nltk.corpus import stopwords
from nltk import pos_tag

from nltk.tokenize import word_tokenize
import nltk

nltk.download('averaged_perceptron_tagger')

[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     C:\Users\Field Employee\AppData\Roaming\nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


True

## Make a dataset

In [19]:


# Example text data

texts = ["Hello, how are you?", "I am doing great!", "What about you?"]

## Tokenization

In [20]:
# Tokenization

tokenizer = Tokenizer()

tokenizer.fit_on_texts(texts)

sequences = tokenizer.texts_to_sequences(texts)

## Padding

In [21]:
# Padding

max_len = max(len(sequence) for sequence in sequences)

padded_sequences = pad_sequences(sequences, maxlen=max_len)

## Stop words

In [22]:
# Stop words removal
nltk.download('stopwords')
stop_words = set(stopwords.words('english'))

filtered_sequences = []
for sequence in sequences:
    filtered_sequence = [word for word in sequence if word not in stop_words]
    filtered_sequences.append(filtered_sequence)

[nltk_data] Downloading package stopwords to C:\Users\Field
[nltk_data]     Employee\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


## Grammar tagging (Part of Speech)

In [23]:
# Grammar tagging
tagged_sequences = []
for sequence in filtered_sequences:
    words = tokenizer.sequences_to_texts([sequence])[0].split()  # Convert sequence back to words
    tagged_sequence = pos_tag(words)
    tagged_sequences.append(tagged_sequence)

print(tagged_sequences)

[[('hello', 'VB'), ('how', 'WRB'), ('are', 'VBP'), ('you', 'PRP')], [('i', 'NN'), ('am', 'VBP'), ('doing', 'VBG'), ('great', 'JJ')], [('what', 'WP'), ('about', 'IN'), ('you', 'PRP')]]
