## Task: Convert the sentences in the variable `document` to vectorized form.

### Import CountVectorizer and TfidfVectorizer from sklearn.feature_extraction.text, along with pandas

In [38]:
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
import pandas as pd
import numpy as np
import re
import warnings
warnings.filterwarnings('ignore')

In [39]:
# Toy corpus: one sentence per document. The rows of every matrix built from it
# follow this order. The last word is spelled 'hmmmmmmm' so that re-running the
# notebook reproduces the feature names shown in the recorded outputs.
document = [
    'The cat sat on the mat',
    'The dog is barking at the cat',
    'The cat and the dog are friends',
    'A quick brown fox jumps over the lazy dog',
    'Black dogs are wild hmmmmmmm'
]

In [40]:
document[0]

'The cat sat on the mat'

## Approach 1: Using default parameters

In [41]:
cv = CountVectorizer()
# All parameters are left at their defaults; the default ngram_range of (1, 1)
# means each individual word is treated as one token (unigrams only).

In [42]:
# Learn the vocabulary from `document` and build the document-term count matrix in one step.
X = cv.fit_transform(document)

In [43]:
# Convert the matrix returned by fit_transform into a dense NumPy array so it can be displayed.
# Note: this cell rebinds X, so re-running it on its own fails (an ndarray has no .toarray()).
X = X.toarray()

In [44]:
X

array([[0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 1, 2, 0],
       [0, 0, 1, 1, 0, 0, 1, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 2, 0],
       [1, 1, 0, 0, 0, 0, 1, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 0],
       [0, 0, 0, 0, 0, 1, 0, 1, 0, 1, 0, 0, 0, 1, 1, 0, 0, 1, 1, 0, 1, 0],
       [0, 1, 0, 0, 1, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1]],
      dtype=int64)

In [45]:
# Vocabulary terms learned by cv; used below as the column labels for X.
columns = cv.get_feature_names_out()

In [46]:
columns

array(['and', 'are', 'at', 'barking', 'black', 'brown', 'cat', 'dog',
       'dogs', 'fox', 'friends', 'hmmmmmmm', 'is', 'jumps', 'lazy', 'mat',
       'on', 'over', 'quick', 'sat', 'the', 'wild'], dtype=object)

In [47]:
# One row per sentence in `document`, one column per vocabulary term, values are counts.
df = pd.DataFrame(X,columns=columns)

In [48]:
df

Unnamed: 0,and,are,at,barking,black,brown,cat,dog,dogs,fox,...,is,jumps,lazy,mat,on,over,quick,sat,the,wild
0,0,0,0,0,0,0,1,0,0,0,...,0,0,0,1,1,0,0,1,2,0
1,0,0,1,1,0,0,1,1,0,0,...,1,0,0,0,0,0,0,0,2,0
2,1,1,0,0,0,0,1,1,0,0,...,0,0,0,0,0,0,0,0,2,0
3,0,0,0,0,0,1,0,1,0,1,...,0,1,1,0,0,1,1,0,1,0
4,0,1,0,0,1,0,0,0,1,0,...,0,0,0,0,0,0,0,0,0,1


Tokenization is the process of breaking down a text into smaller units called tokens. These tokens can be words, phrases, symbols, or other meaningful elements, depending on the context and the task at hand. Tokenization is a fundamental step in many natural language processing (NLP) tasks because it allows machines to understand and process human language.

Note that here, we have tokenized a sentence by breaking it into individual words.

Now, how do we capture the information in a sequence of words, for example "brown fox" or "quick brown fox"?

We can use n-grams to do that.

## Approach 2: Add ngram_range to the CountVectorizer

In [49]:
# ngram_range=(1, 3): extract unigrams, bigrams and trigrams as features.
cv = CountVectorizer(ngram_range=(1,3))

In [50]:
# Refit on the same corpus; the vocabulary now also contains the 2- and 3-word n-grams.
X = cv.fit_transform(document)

In [51]:
# Dense array for display. This rebinds X, so the cell cannot be re-run on its own.
X = X.toarray()

In [52]:
X

array([[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0,
        1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 1, 1, 1, 2, 1, 0, 1, 0, 0, 0, 0,
        0, 1, 0, 0],
       [0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0,
        0, 0, 1, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 1, 0, 0, 1, 0, 1, 0,
        0, 0, 0, 0],
       [1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1,
        0, 0, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 1, 1, 0, 1, 1, 0, 0,
        0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 0, 0, 0,
        0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 0, 0, 0, 0, 0, 1, 1, 1, 1,
        1, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1,
        1, 0, 0, 0],
       [0, 0, 0, 1, 0, 1, 1,

In [53]:
# Label the count columns with the 1- to 3-gram vocabulary learned by cv.
df = pd.DataFrame(X,columns=cv.get_feature_names_out())

In [54]:
df

Unnamed: 0,and,and the,and the dog,are,are friends,are wild,are wild hmmmmmmm,at,at the,at the cat,...,the cat and,the cat sat,the dog,the dog are,the dog is,the lazy,the lazy dog,the mat,wild,wild hmmmmmmm
0,0,0,0,0,0,0,0,0,0,0,...,0,1,0,0,0,0,0,1,0,0
1,0,0,0,0,0,0,0,1,1,1,...,0,0,1,0,1,0,0,0,0,0
2,1,1,1,1,1,0,0,0,0,0,...,1,0,1,1,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,1,1,0,0,0
4,0,0,0,1,0,1,1,0,0,0,...,0,0,0,0,0,0,0,0,1,1


To reduce the number of features, we can pass a stop-word list so that very common words are dropped.

## Approach 3: Add stopwords

In [55]:
# Count 1- to 3-grams after discarding scikit-learn's built-in English stop words,
# then wrap the dense counts in a DataFrame labelled with the learned vocabulary.
cv = CountVectorizer(
    ngram_range=(1, 3),
    stop_words='english',
)
X = cv.fit_transform(document)
X = X.toarray()
df = pd.DataFrame(data=X, columns=cv.get_feature_names_out())

In [56]:
df

Unnamed: 0,barking,barking cat,black,black dogs,black dogs wild,brown,brown fox,brown fox jumps,cat,cat dog,...,lazy,lazy dog,mat,quick,quick brown,quick brown fox,sat,sat mat,wild,wild hmmmmmmm
0,0,0,0,0,0,0,0,0,1,0,...,0,0,1,0,0,0,1,1,0,0
1,1,1,0,0,0,0,0,0,1,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,1,1,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,1,1,1,0,0,...,1,1,0,1,1,1,0,0,0,0
4,0,0,1,1,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,1,1


In [57]:
# Default TF-IDF vectorizer. The next cell rebinds `tf` to a configured instance
# before this one is ever fitted, so this assignment has no effect on later results.
tf = TfidfVectorizer()

In [58]:
# TF-IDF counterpart of Approach 3: same stop-word removal and 1- to 3-gram range,
# but each term carries a TF-IDF weight instead of a raw count.
tf = TfidfVectorizer(stop_words='english',ngram_range=(1,3))
X1 = tf.fit_transform(document).toarray()
# Take the column labels from `tf`, the vectorizer that actually produced X1.
# Using the separately fitted CountVectorizer `cv` only lines up while both
# happen to share identical settings and would mislabel columns otherwise.
df1 = pd.DataFrame(X1,columns=tf.get_feature_names_out())

In [59]:
df1

Unnamed: 0,barking,barking cat,black,black dogs,black dogs wild,brown,brown fox,brown fox jumps,cat,cat dog,...,lazy,lazy dog,mat,quick,quick brown,quick brown fox,sat,sat mat,wild,wild hmmmmmmm
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.286912,0.0,...,0.0,0.0,0.428411,0.0,0.0,0.0,0.428411,0.428411,0.0,0.0
1,0.451891,0.451891,0.0,0.0,0.0,0.0,0.0,0.0,0.302637,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.302637,0.451891,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.26308,0.26308,0.26308,0.0,0.0,...,0.26308,0.26308,0.0,0.26308,0.26308,0.26308,0.0,0.0,0.0,0.0
4,0.0,0.0,0.333333,0.333333,0.333333,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.333333,0.333333


## Approach 4: Add custom pre-processing steps

In [60]:
import nltk
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
from nltk.stem import WordNetLemmatizer

# Fetch every NLTK resource this notebook relies on so it also runs on a fresh
# environment: 'stopwords' backs stopwords.words() below, and 'wordnet' backs
# the WordNetLemmatizer used by customPreprocess.
nltk.download('stopwords')
nltk.download('wordnet')

# English stop-word list consulted by customPreprocess.
stop = stopwords.words('english')

[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\SUSHI\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [61]:
def customPreprocess(text):
    """Normalise one raw document into a cleaned, lemmatized string.

    The text is lowercased, every character that is not an ASCII letter is
    replaced by a space, the result is split on whitespace, words found in
    the module-level ``stop`` list are dropped, each remaining word is
    lemmatized with ``WordNetLemmatizer`` and the words are re-joined with
    single spaces.

    Parameters
    ----------
    text : str
        Raw document text.

    Returns
    -------
    str
        Space-separated lemmatized words.
    """
    lemmatizer = WordNetLemmatizer()
    # Digits, punctuation and any other non-letter characters become spaces.
    letters_only = re.sub(r'[^A-Za-z]', ' ', text.lower())
    # Stop words are filtered out before lemmatization, not after.
    kept = [token for token in letters_only.split() if token not in stop]
    return ' '.join(lemmatizer.lemmatize(token) for token in kept)

In [62]:
customPreprocess(document[4])

'black dog wild hmmmmmmm'

In [63]:
# Same configuration as Approach 3, except each document is first passed
# through customPreprocess before tokenization.
cv = CountVectorizer(
    preprocessor=customPreprocess,
    ngram_range=(1, 3),
    stop_words='english',
)
X = cv.fit_transform(document)
X = X.toarray()
df = pd.DataFrame(data=X, columns=cv.get_feature_names_out())

In [64]:
df

Unnamed: 0,barking,barking cat,black,black dog,black dog wild,brown,brown fox,brown fox jump,cat,cat dog,...,lazy,lazy dog,mat,quick,quick brown,quick brown fox,sat,sat mat,wild,wild hmmmmmmm
0,0,0,0,0,0,0,0,0,1,0,...,0,0,1,0,0,0,1,1,0,0
1,1,1,0,0,0,0,0,0,1,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,1,1,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,1,1,1,0,0,...,1,1,0,1,1,1,0,0,0,0
4,0,0,1,1,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,1,1


## Now that we understand the basics of vectorization, vectorize the documents in the variable `document` using the TF-IDF vectorizer. Use English stop words, the custom preprocessor and an ngram_range of (1,2).

In [65]:
# Final exercise: TF-IDF weights over unigrams and bigrams, with English stop
# words removed and customPreprocess applied to each document first.
tf = TfidfVectorizer(
    preprocessor=customPreprocess,
    ngram_range=(1, 2),
    stop_words='english',
)
X1 = tf.fit_transform(document)
X1 = X1.toarray()
df1 = pd.DataFrame(data=X1, columns=tf.get_feature_names_out())

In [66]:
df1

Unnamed: 0,barking,barking cat,black,black dog,brown,brown fox,cat,cat dog,cat sat,dog,...,jump lazy,lazy,lazy dog,mat,quick,quick brown,sat,sat mat,wild,wild hmmmmmmm
0,0.0,0.0,0.0,0.0,0.0,0.0,0.317527,0.0,0.474125,0.0,...,0.0,0.0,0.0,0.474125,0.0,0.0,0.474125,0.474125,0.0,0.0
1,0.515306,0.515306,0.0,0.0,0.0,0.0,0.345106,0.0,0.0,0.290314,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.345106,0.515306,0.0,0.290314,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.311326,0.311326,0.0,0.0,0.0,0.175395,...,0.311326,0.311326,0.311326,0.0,0.311326,0.311326,0.0,0.0,0.0,0.0
4,0.0,0.0,0.39786,0.39786,0.0,0.0,0.0,0.0,0.0,0.224148,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.39786,0.39786
