In [None]:
import nltk
from nltk.tokenize import word_tokenize
nltk.download('punkt')

# Sample text
text = "Tokenization is the process of breaking down a text into individual words or tokens."

# Tokenize the text
tokens = word_tokenize(text)

# Print the tokens
print(tokens)


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


['Tokenization', 'is', 'the', 'process', 'of', 'breaking', 'down', 'a', 'text', 'into', 'individual', 'words', 'or', 'tokens', '.']


**Stemming using NLTK:**

In [None]:
import nltk
from nltk.stem import PorterStemmer

# Sample words to stem
words = ["running", "flies", "jumping", "friendly"]

# Initialize the Porter Stemmer

stemmer = PorterStemmer()

# Stem the words
stemmed_words = [stemmer.stem(word) for word in words]

# Print the stemmed words
print(stemmed_words)


['run', 'fli', 'jump', 'friendli']


**Lemmatization using spaCy:**

In [None]:
import spacy

# Load spaCy's English language model
nlp = spacy.load("en_core_web_sm")

# Sample words to lemmatize
words = ["running", "flies", "jumping", "friendly"]

# Lemmatize the words
lemmatized_words = [token.lemma_ for token in nlp(" ".join(words))]

# Print the lemmatized words
print(lemmatized_words)


['run', 'fly', 'jump', 'friendly']


**Text Normalization:**

In [None]:
# Sample text with variations
text = "I luv metapi. Plz gr8 job! AI is amazin'."

# Define a simple text normalization function
def text_normalization(text):
    # Replace abbreviations and acronyms
    text = text.replace("luv", "love")
    text = text.replace("Plz", "Please")
    text = text.replace("gr8", "great")
    text = text.replace("amazin'", "amazing")

    # Convert to lowercase (optional)
    text = text.lower()

    return text

# Apply text normalization
normalized_text = text_normalization(text)

# Print the normalized text
print(normalized_text)


i love metapi. please great job! ai is amazing.


**Removing Redundancy**


In [None]:
import difflib

# Sample list of text entries with potential duplicates
text_entries = [
    "This is an example sentence.",
    "Another example sentence.",
    "This is an example sentence.",  # Duplicate
    "Yet another example sentence.",
    "Some unique text here.",
    "This is an example sentence."   # Duplicate
]

# Function to remove duplicates based on similarity threshold
def remove_redundancy(text_entries, similarity_threshold=0.8):
    # Initialize a list to store the deduplicated entries
    deduplicated_entries = []

    # Iterate through the text entries
    for entry in text_entries:
        # Flag to check if the entry is a duplicate
        is_duplicate = False

        # Compare the entry with each deduplicated entry
        for dedup_entry in deduplicated_entries:
            similarity = difflib.SequenceMatcher(None, entry, dedup_entry).ratio()
            if similarity >= similarity_threshold:
                is_duplicate = True
                break

        # If the entry is not a duplicate, add it to the deduplicated list
        if not is_duplicate:
            deduplicated_entries.append(entry)

    return deduplicated_entries

# Specify a similarity threshold (adjust as needed)
similarity_threshold = 0.8

# Remove redundancy
deduplicated_text_entries = remove_redundancy(text_entries, similarity_threshold)

# Print the deduplicated entries
for entry in deduplicated_text_entries:
    print(entry)

This is an example sentence.
Another example sentence.
Some unique text here.


In [7]:
text = "Cats and dogs are friends."
tokens = text.split()

In [8]:
vocab = set(tokens)
one_hot_encoded = []
for token in tokens:
    one_hot_vector = [1 if token == word else 0 for word in vocab]
    one_hot_encoded.append(one_hot_vector)


In [9]:
one_hot_encoded

[[0, 0, 0, 1, 0],
 [0, 0, 0, 0, 1],
 [0, 0, 1, 0, 0],
 [1, 0, 0, 0, 0],
 [0, 1, 0, 0, 0]]

In [10]:
flattened_vector = [item for sublist in one_hot_encoded for item in sublist]
print(flattened_vector)

[0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0]


In [11]:
from sklearn.feature_extraction.text import CountVectorizer

text = "Tokenization is the process of breaking down a text into individual words or tokens."

# Create an instance of CountVectorizer
vectorizer = CountVectorizer(binary=True)

# Fit the vectorizer on the text and transform the text into a one-hot encoded matrix
one_hot_matrix = vectorizer.fit_transform([text])

# Get the feature names (words)
feature_names = vectorizer.get_feature_names_out()

# Convert the one-hot matrix to a dense array
one_hot_array = one_hot_matrix.toarray()

# Create a dictionary to map words to their one-hot encoding
one_hot_encoding = {word: one_hot_array[0][i] for i, word in enumerate(feature_names)}

# Print the one-hot encoding for each word
for word, encoding in one_hot_encoding.items():
    print(f"{word}: {encoding}")


breaking: 1
down: 1
individual: 1
into: 1
is: 1
of: 1
or: 1
process: 1
text: 1
the: 1
tokenization: 1
tokens: 1
words: 1


In [12]:
from sklearn.feature_extraction.text import CountVectorizer

text = "Tokenization is the process of breaking down a text into individual words or tokens."

# Create an instance of CountVectorizer
vectorizer = CountVectorizer(binary=True)

# Fit the vectorizer on the text and transform the text into a one-hot encoded matrix
one_hot_matrix = vectorizer.fit_transform([text])

# Get the feature names (words)
feature_names = vectorizer.get_feature_names_out()

# Convert the one-hot matrix to a dense array
one_hot_vectors = one_hot_matrix.toarray()

# Create a dictionary to map words to their one-hot encoding
one_hot_encoding = {word: vector for word, vector in zip(feature_names, one_hot_vectors)}

# Print the one-hot encoding vectors for each word
for word, encoding_vector in one_hot_encoding.items():
    print(f"{word}: {encoding_vector}")


breaking: [1 1 1 1 1 1 1 1 1 1 1 1 1]
