### Tfidf Vectorizer and CountWord Vectorizer, what they do and how to use them ###

In [1]:
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer # import the TfidfVectorizer and CountVectorizer
import pandas as pd # import the pandas library
import numpy as np # import the numpy library

In [2]:
corpus = ['This is the first document.','This document is the second document.','And this is the third one.','Is this the first document?']
# The above is a corpus of text data
# Each string can be considered a seperate document

#### Example 1 - tfidf Vectorizer ####

In [3]:
vectorizer = TfidfVectorizer() # Instantiate the object from the class
X = vectorizer.fit_transform(corpus) # fit it to the data
print(f"Sparseform\n{X}\n\nFeatures\n{vectorizer.get_feature_names_out()}\n\nShape: {X.shape}\n") # print the sparse matrix, features, and shape
# Because items in the matrix are 0's, it generates a simple vector.
# The matrix is a sparse matrix, meaning most of the values are 0, and only a few are meaningful.
# This sparse matrix saves memory.
# The shape of the matrix is 4x9, because there are 4 documents and 9 unique words in the corpus.

Sparseform
  (0, 1)	0.46979138557992045
  (0, 2)	0.5802858236844359
  (0, 6)	0.38408524091481483
  (0, 3)	0.38408524091481483
  (0, 8)	0.38408524091481483
  (1, 5)	0.5386476208856763
  (1, 1)	0.6876235979836938
  (1, 6)	0.281088674033753
  (1, 3)	0.281088674033753
  (1, 8)	0.281088674033753
  (2, 4)	0.511848512707169
  (2, 7)	0.511848512707169
  (2, 0)	0.511848512707169
  (2, 6)	0.267103787642168
  (2, 3)	0.267103787642168
  (2, 8)	0.267103787642168
  (3, 1)	0.46979138557992045
  (3, 2)	0.5802858236844359
  (3, 6)	0.38408524091481483
  (3, 3)	0.38408524091481483
  (3, 8)	0.38408524091481483

Features
['and' 'document' 'first' 'is' 'one' 'second' 'the' 'third' 'this']

Shape: (4, 9)



In [4]:
# Vocabulary generated by tfidvectorizer
# This it does automatically and acn be accessed through the method of the Class.
vectorizer.vocabulary_ # creates a dictionary of the words and their index in the matrix

{'this': 8,
 'is': 3,
 'the': 6,
 'first': 2,
 'document': 1,
 'second': 5,
 'and': 0,
 'third': 7,
 'one': 4}

In [5]:
# You can convert the tf-idf vector into a matrix of rows and columns
# This the Term Document Matrix (TDM) that we discussed in the slides

X_dense = X.toarray() # You can also do X.todense() to get a numpy array 
print(f"Dense Matrix:\n{X_dense}\n\n of shape: {X_dense.shape}") # print the dense matrix and its shape

Dense Matrix:
[[0.         0.46979139 0.58028582 0.38408524 0.         0.
  0.38408524 0.         0.38408524]
 [0.         0.6876236  0.         0.28108867 0.         0.53864762
  0.28108867 0.         0.28108867]
 [0.51184851 0.         0.         0.26710379 0.51184851 0.
  0.26710379 0.51184851 0.26710379]
 [0.         0.46979139 0.58028582 0.38408524 0.         0.
  0.38408524 0.         0.38408524]]

 of shape: (4, 9)


In [6]:
# Dataframe creation

doc_matrix = pd.DataFrame(X_dense,columns=vectorizer.get_feature_names_out()) # Convert it into a dataframe to visualize
print(doc_matrix) # print the dataframe

        and  document     first        is       one    second       the  \
0  0.000000  0.469791  0.580286  0.384085  0.000000  0.000000  0.384085   
1  0.000000  0.687624  0.000000  0.281089  0.000000  0.538648  0.281089   
2  0.511849  0.000000  0.000000  0.267104  0.511849  0.000000  0.267104   
3  0.000000  0.469791  0.580286  0.384085  0.000000  0.000000  0.384085   

      third      this  
0  0.000000  0.384085  
1  0.000000  0.281089  
2  0.511849  0.267104  
3  0.000000  0.384085  


#### Example 2 - including stopward removal ####

In [7]:
vect_stop = TfidfVectorizer(stop_words='english') # Using the stopwords parameter while vectorizing (before creating a matrix).
# Check the docuementation for parameters

In [8]:
# Now tf-idf will be generated after removing stopwords

X1 = vect_stop.fit_transform(corpus) # fit the vectorizer to the data
print(X1,'\n',vect_stop.get_feature_names_out(),'\n',X1.shape,'\n') # print the sparse matrix, features, and shape

  (0, 0)	1.0
  (1, 1)	0.6166684570284895
  (1, 0)	0.78722297610404
  (3, 0)	1.0 
 ['document' 'second'] 
 (4, 2) 



#### Example 3 - CountWord Vectorizer ####

In [8]:
cvectorizer = CountVectorizer() # Instantiate the object from the class
X2 = cvectorizer.fit_transform(corpus) # fit it to the data
print(f"Sparseform\n{X2}\n\nFeatures\n{cvectorizer.get_feature_names_out()}\n\nShape: {X2.shape}\n") # generates a sparse matrix
print(cvectorizer.vocabulary_) # creates a dictionary of the words and their index in the matrix

Sparseform
  (0, 8)	1
  (0, 3)	1
  (0, 6)	1
  (0, 2)	1
  (0, 1)	1
  (1, 8)	1
  (1, 3)	1
  (1, 6)	1
  (1, 1)	2
  (1, 5)	1
  (2, 8)	1
  (2, 3)	1
  (2, 6)	1
  (2, 0)	1
  (2, 7)	1
  (2, 4)	1
  (3, 8)	1
  (3, 3)	1
  (3, 6)	1
  (3, 2)	1
  (3, 1)	1

Features
['and' 'document' 'first' 'is' 'one' 'second' 'the' 'third' 'this']

Shape: (4, 9)

{'this': 8, 'is': 3, 'the': 6, 'first': 2, 'document': 1, 'second': 5, 'and': 0, 'third': 7, 'one': 4}


In [9]:
# Dataframe creation 

doc_matrix1 = pd.DataFrame(X2.toarray(),columns=cvectorizer.get_feature_names_out()) # Convert it into a array to visualize
print(doc_matrix1) # print the dataframe

   and  document  first  is  one  second  the  third  this
0    0         1      1   1    0       0    1      0     1
1    0         2      0   1    0       1    1      0     1
2    1         0      0   1    1       0    1      1     1
3    0         1      1   1    0       0    1      0     1


#### Example 4 - How to use the vectorizers when building classifiers - shown with tfidf, but applies to CountVectorizer too ####

In [10]:
# Data which could have labels attached to them as spam or not spam

train = ('The sky is blue.','The sun is bright.') # The data which could have labels attached to them as spam or not spam
test = ('The sun in the sky is bright', 'We can see the shining sun, the bright sun.') # The data which could have labels attached to them as spam or not spam

In [11]:
# instantiate the vectorizer object
# use analyzer is word and stop_words is english which are responsible for remove stop words and create word vocabulary

tfidfvectorizer = TfidfVectorizer(analyzer='word' , stop_words='english') # Instantiate the object from the class

In [12]:
# Fit to training data and use the fitted vectorizer to transform training and test data
# Note that we have to use the same, fitted, vectorizer object for all the text data

tfidfvectorizer.fit(train) # fit the vectorizer to the training data
tfidf_train = tfidfvectorizer.transform(train) # transform the training data
tfidf_test  = tfidfvectorizer.transform(test) # transform the test data
print(tfidfvectorizer.vocabulary_) # creates a dictionary of the words and their index in the matrix

{'sky': 2, 'blue': 0, 'sun': 3, 'bright': 1}


In [14]:
print("Sparse Matrix form train and test data : \n") # print the sparse matrix
print(f"The training data matrix:\n{tfidf_train.toarray()}\n\nThe test data matrix:\n{tfidf_test.toarray()}") # print the training and test data matrices

Sparse Matrix form train and test data : 

The training data matrix:
[[0.70710678 0.         0.70710678 0.        ]
 [0.         0.70710678 0.         0.70710678]]

The test data matrix:
[[0.         0.57735027 0.57735027 0.57735027]
 [0.         0.4472136  0.         0.89442719]]


In [13]:
y_train = np.array([1,0]).reshape(-1,1) # let us add some fake labels to the training data
print(y_train) # print the labels

[[1]
 [0]]


In [14]:
# So no we can do the usualmodel building process

from sklearn.linear_model import LogisticRegression # import the logistic regression model
lreg = LogisticRegression() # instantiate the object from the class
lreg.fit(tfidf_train.toarray(),y_train) # fit the model to the training data
lreg.predict(tfidf_test.toarray()) # predict the test data

  y = column_or_1d(y, warn=True)


array([0, 0])