In [1]:
import nltk # Natural Language Toolkit
from nltk.tokenize import sent_tokenize, word_tokenize # Tokenizers
from nltk.corpus import stopwords # Stopwords
import pandas as pd # Import the Pandas library
import numpy as np # Import the NumPy library
from nltk.stem import PorterStemmer # Import the PorterStemmer function
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer # import the TfidfVectorizer and CountVectorizer

In [2]:
# Toy corpus: three short documents consumed by the preprocessing cells below
text = [
    "The cat sat on the mat!",
    "Dogs are sitting on the log. Dogs love logs.",
    "Cats are lying on the rug, but the cat prefers the mat.",
]

# Preprocessing

In [3]:
# Step 1: lowercase every document so that case differences do not create distinct tokens
preprocessed_text = list(map(str.lower, text))
preprocessed_text

['the cat sat on the mat!',
 'dogs are sitting on the log. dogs love logs.',
 'cats are lying on the rug, but the cat prefers the mat.']

In [4]:
# Step 2: split each lowercased document into a list of word and punctuation tokens
# NOTE: this rebinds preprocessed_text (list of strings -> list of token lists),
# so the cell is meant to run exactly once, directly after the lowercasing cell.
preprocessed_text = list(map(word_tokenize, preprocessed_text))
preprocessed_text

[['the', 'cat', 'sat', 'on', 'the', 'mat', '!'],
 ['dogs',
  'are',
  'sitting',
  'on',
  'the',
  'log',
  '.',
  'dogs',
  'love',
  'logs',
  '.'],
 ['cats',
  'are',
  'lying',
  'on',
  'the',
  'rug',
  ',',
  'but',
  'the',
  'cat',
  'prefers',
  'the',
  'mat',
  '.']]

In [5]:
# Step 3: keep only purely alphabetic tokens (drops punctuation such as '!', '.' and ',')
preprocessed_text = [
    list(filter(str.isalpha, tokens))
    for tokens in preprocessed_text
]
preprocessed_text

[['the', 'cat', 'sat', 'on', 'the', 'mat'],
 ['dogs', 'are', 'sitting', 'on', 'the', 'log', 'dogs', 'love', 'logs'],
 ['cats',
  'are',
  'lying',
  'on',
  'the',
  'rug',
  'but',
  'the',
  'cat',
  'prefers',
  'the',
  'mat']]

In [6]:
# Step 4: remove English stop words
# A set gives constant-time membership checks while filtering.
stop_words = set(stopwords.words("english"))
preprocessed_text = [
    [word for word in tokens if word not in stop_words]
    for tokens in preprocessed_text
]
preprocessed_text

[['cat', 'sat', 'mat'],
 ['dogs', 'sitting', 'log', 'dogs', 'love', 'logs'],
 ['cats', 'lying', 'rug', 'cat', 'prefers', 'mat']]

# Write a Function to Preprocess (and Stem) Each Individual Document in `text`

In [7]:
# Wrap the raw documents (a list of strings) in a single-column DataFrame
data = pd.DataFrame({'text': text})

# Define the preprocessing function
def clean_text(text):
    """Lowercase, tokenize, filter and stem a single document.

    Parameters
    ----------
    text : str
        The raw document. ``None`` and float NaN (the usual missing-value
        markers in a DataFrame column) are accepted and treated as an
        empty document.

    Returns
    -------
    str
        The Porter stems of every token that is purely alphabetic and is not
        an English stop word, in original order, joined by single spaces.
        The result has no leading or trailing whitespace; it is the empty
        string when no token survives the filtering.
    """
    # Missing values: None, or NaN (the only float that is not equal to itself).
    if text is None or (isinstance(text, float) and text != text):
        return ""

    stemmer = PorterStemmer()  # Reduces words to their stem, e.g. "dogs" -> "dog"
    stop_words = set(stopwords.words("english"))  # Set for fast membership checks

    # Lowercase first so that stop-word matching is case-insensitive
    tokens = word_tokenize(text.lower())

    # Join once rather than concatenating inside a loop; this also avoids the
    # stray trailing space that "word + ' '" accumulation leaves behind.
    return " ".join(
        stemmer.stem(token)
        for token in tokens
        if token.isalpha() and token not in stop_words
    )

# Clean every document and store the result next to the original text
data['cleaned_text'] = data['text'].map(clean_text)

# Show the raw and cleaned text side by side
print("Preprocessed Data:")
data

Preprocessed Data:


Unnamed: 0,text,cleaned_text
0,The cat sat on the mat!,cat sat mat
1,Dogs are sitting on the log. Dogs love logs.,dog sit log dog love log
2,"Cats are lying on the rug, but the cat prefers...",cat lie rug cat prefer mat


# Complete CountVectorizer and TF-IDF Vectorizer Matrices

In [8]:
# Bag-of-words counts: one row per document, one column per vocabulary term
cvectorizer = CountVectorizer()  # Default settings
X2 = cvectorizer.fit_transform(data['cleaned_text'])  # Learn the vocabulary and build the sparse count matrix
# toarray() returns a plain ndarray; todense() would return the discouraged np.matrix type
X_dense = X2.toarray()
print(f"CountVector Matrix:\n{X_dense}\n\nFeatures\n{cvectorizer.get_feature_names_out()}\n\nShape: {X2.shape}\n")
print(cvectorizer.vocabulary_)  # Mapping of each term to its column index in the matrix

CountVector Matrix:
[[1 0 0 0 0 1 0 0 1 0]
 [0 2 0 2 1 0 0 0 0 1]
 [2 0 1 0 0 1 1 1 0 0]]

Features
['cat' 'dog' 'lie' 'log' 'love' 'mat' 'prefer' 'rug' 'sat' 'sit']

Shape: (3, 10)

{'cat': 0, 'sat': 8, 'mat': 5, 'dog': 1, 'sit': 9, 'log': 3, 'love': 4, 'lie': 2, 'rug': 7, 'prefer': 6}


In [10]:
# Label the count matrix columns with their vocabulary terms so it is readable as a table
doc_matrix1 = pd.DataFrame(
    data=X2.toarray(),
    columns=cvectorizer.get_feature_names_out(),
)
doc_matrix1

Unnamed: 0,cat,dog,lie,log,love,mat,prefer,rug,sat,sit
0,1,0,0,0,0,1,0,0,1,0
1,0,2,0,2,1,0,0,0,0,1
2,2,0,1,0,0,1,1,1,0,0


In [11]:
# TF-IDF weights over the same cleaned documents, using the vectorizer's default settings
vectorizer = TfidfVectorizer()

# Learn the vocabulary and IDF weights, then build the sparse document-term matrix
X = vectorizer.fit_transform(data['cleaned_text'])

# Dense ndarray copy, used for printing here and for the DataFrame view later
X_dense = X.toarray()

print(f"Tf-IDF Matrix:\n{X_dense}\n\nFeatures\n{vectorizer.get_feature_names_out()}\n\nShape: {X.shape}\n")

# Mapping of each term to its column index in the matrix
print(vectorizer.vocabulary_)

Tf-IDF Matrix:
[[0.51785612 0.         0.         0.         0.         0.51785612
  0.         0.         0.68091856 0.        ]
 [0.         0.63245553 0.         0.63245553 0.31622777 0.
  0.         0.         0.         0.31622777]
 [0.62663214 0.         0.41197298 0.         0.         0.31331607
  0.41197298 0.41197298 0.         0.        ]]

Features
['cat' 'dog' 'lie' 'log' 'love' 'mat' 'prefer' 'rug' 'sat' 'sit']

Shape: (3, 10)

{'cat': 0, 'sat': 8, 'mat': 5, 'dog': 1, 'sit': 9, 'log': 3, 'love': 4, 'lie': 2, 'rug': 7, 'prefer': 6}


In [12]:
# Label the TF-IDF matrix columns with their vocabulary terms so it is readable as a table
doc_matrix = pd.DataFrame(
    data=X_dense,
    columns=vectorizer.get_feature_names_out(),
)
doc_matrix

Unnamed: 0,cat,dog,lie,log,love,mat,prefer,rug,sat,sit
0,0.517856,0.0,0.0,0.0,0.0,0.517856,0.0,0.0,0.680919,0.0
1,0.0,0.632456,0.0,0.632456,0.316228,0.0,0.0,0.0,0.0,0.316228
2,0.626632,0.0,0.411973,0.0,0.0,0.313316,0.411973,0.411973,0.0,0.0
