In [3]:
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
# NOTE: read about the vectorizers
import numpy as np

In [4]:
# Three short example sentences used as the corpus for every vectorizer in this notebook.
sentences = [
    'I have a credit card account',
    'My account card,debit card is lost',
    'My credit card stopped working',
]

# Learn the vocabulary and build the sparse document-term count matrix in one step.
vectorizer = CountVectorizer()
countvec = vectorizer.fit_transform(sentences)

In [5]:
# Dense view of the sparse count matrix: one row per sentence, one column per vocabulary term.
# `.toarray()` is the supported spelling; the `.A` shorthand is deprecated in SciPy.
countvec.toarray()

array([[1, 1, 1, 0, 1, 0, 0, 0, 0, 0],
       [1, 2, 0, 1, 0, 1, 1, 1, 0, 0],
       [0, 1, 1, 0, 0, 0, 0, 1, 1, 1]])

In [6]:
# Vocabulary terms in column order.
# `get_feature_names()` was removed in scikit-learn 1.2; `get_feature_names_out()`
# (scikit-learn >= 1.0) replaces it, and `.tolist()` keeps the plain-list display.
vectorizer.get_feature_names_out().tolist()

['account',
 'card',
 'credit',
 'debit',
 'have',
 'is',
 'lost',
 'my',
 'stopped',
 'working']

In [7]:
# Cap the vocabulary at 4 terms; max_features keeps the terms with the highest corpus-wide counts.
vectorizer = CountVectorizer(max_features=4)
countvec = vectorizer.fit_transform(sentences)
# `.toarray()` replaces the deprecated `.A` shorthand on SciPy sparse matrices.
print(countvec.toarray())
# `get_feature_names()` was removed in scikit-learn 1.2; `.tolist()` keeps the printed list format.
print(vectorizer.get_feature_names_out().tolist())

[[1 1 1 0]
 [1 2 0 1]
 [0 1 1 1]]
['account', 'card', 'credit', 'my']


In [8]:
# Same 4-term cap, but with the built-in English stop-word list applied first,
# so a stop word such as 'my' no longer takes up a vocabulary slot.
vectorizer = CountVectorizer(max_features=4, stop_words='english')
countvec = vectorizer.fit_transform(sentences)
# `.toarray()` replaces the deprecated `.A` shorthand on SciPy sparse matrices.
print(countvec.toarray())
# `get_feature_names()` was removed in scikit-learn 1.2; `.tolist()` keeps the printed list format.
print(vectorizer.get_feature_names_out().tolist())

[[1 1 1 0]
 [1 2 0 1]
 [0 1 1 0]]
['account', 'card', 'credit', 'debit']


In [9]:
# ngram_range=(1, 2) adds two-word phrases (bigrams) to the single-word vocabulary;
# max_features=6 then keeps only six of the resulting terms.
vectorizer = CountVectorizer(max_features=6, ngram_range=(1, 2))
countvec = vectorizer.fit_transform(sentences)
# `.toarray()` replaces the deprecated `.A` shorthand on SciPy sparse matrices.
print(countvec.toarray())
# `get_feature_names()` was removed in scikit-learn 1.2; `.tolist()` keeps the printed list format.
print(vectorizer.get_feature_names_out().tolist())

[[1 1 1 1 0 0]
 [1 2 0 0 1 0]
 [0 1 1 1 1 1]]
['account', 'card', 'credit', 'credit card', 'my', 'stopped']


In [10]:
# Unigrams + bigrams with English stop words removed, capped at six terms.
# Stop words are dropped before bigrams are formed, which is why 'card account' appears
# for the first sentence ("... credit card account").
vectorizer = CountVectorizer(max_features=6, stop_words='english', ngram_range=(1, 2))
countvec = vectorizer.fit_transform(sentences)
# `.toarray()` replaces the deprecated `.A` shorthand on SciPy sparse matrices.
print(countvec.toarray())
# `get_feature_names()` was removed in scikit-learn 1.2; `.tolist()` keeps the printed list format.
print(vectorizer.get_feature_names_out().tolist())

[[1 0 1 1 1 1]
 [1 1 2 0 0 0]
 [0 0 1 0 1 1]]
['account', 'account card', 'card', 'card account', 'credit', 'credit card']


In [11]:
# TF-IDF, part 1: term frequency only (use_idf=False), L1-normalised so each row sums to 1.
# TF = (occurrences of the term in the document) / (total number of counted tokens in the document)
# Note: the default tokenizer ignores single-character tokens ('I', 'a'), so the first
# sentence contributes 4 tokens rather than 6, giving 1/4 = 0.25 per term.
vectorizer = TfidfVectorizer(use_idf=False, norm='l1')
tfvec = vectorizer.fit_transform(sentences)
# `.toarray()` replaces the deprecated `.A` shorthand on SciPy sparse matrices.
print(tfvec.toarray())
# `get_feature_names()` was removed in scikit-learn 1.2; `.tolist()` keeps the printed list format.
print(vectorizer.get_feature_names_out().tolist())

[[0.25       0.25       0.25       0.         0.25       0.
  0.         0.         0.         0.        ]
 [0.14285714 0.28571429 0.         0.14285714 0.         0.14285714
  0.14285714 0.14285714 0.         0.        ]
 [0.         0.2        0.2        0.         0.         0.
  0.         0.2        0.2        0.2       ]]
['account', 'card', 'credit', 'debit', 'have', 'is', 'lost', 'my', 'stopped', 'working']


In [12]:
# Term frequency with L2 (Euclidean) normalisation: each row is divided by the
# square root of the sum of its squared counts, so every row has unit length.
vectorizer = TfidfVectorizer(use_idf=False, norm='l2')
tfvec = vectorizer.fit_transform(sentences)
# `.toarray()` replaces the deprecated `.A` shorthand on SciPy sparse matrices.
print(tfvec.toarray())
# `get_feature_names()` was removed in scikit-learn 1.2; `.tolist()` keeps the printed list format.
print(vectorizer.get_feature_names_out().tolist())

[[0.5        0.5        0.5        0.         0.5        0.
  0.         0.         0.         0.        ]
 [0.33333333 0.66666667 0.         0.33333333 0.         0.33333333
  0.33333333 0.33333333 0.         0.        ]
 [0.         0.4472136  0.4472136  0.         0.         0.
  0.         0.4472136  0.4472136  0.4472136 ]]
['account', 'card', 'credit', 'debit', 'have', 'is', 'lost', 'my', 'stopped', 'working']


In [15]:
# Sanity check of the L2-normalised weights for the second sentence: its raw counts are
# (1, 2, 1, 1, 1, 1), whose squares sum to 9, so a count of 1 is scaled to 1 / sqrt(9).
print(1.0 / np.sqrt(9.0))

0.3333333333333333


In [16]:
# With IDF weighting and normalisation both switched off, TfidfVectorizer yields the raw
# term counts (as floats), i.e. it behaves like CountVectorizer.
vectorizer = TfidfVectorizer(use_idf=False, norm=None)
tfvec = vectorizer.fit_transform(sentences)
# `.toarray()` replaces the deprecated `.A` shorthand on SciPy sparse matrices.
print(tfvec.toarray())
# `get_feature_names()` was removed in scikit-learn 1.2; `.tolist()` keeps the printed list format.
print(vectorizer.get_feature_names_out().tolist())

[[1. 1. 1. 0. 1. 0. 0. 0. 0. 0.]
 [1. 2. 0. 1. 0. 1. 1. 1. 0. 0.]
 [0. 1. 1. 0. 0. 0. 0. 1. 1. 1.]]
['account', 'card', 'credit', 'debit', 'have', 'is', 'lost', 'my', 'stopped', 'working']


In [17]:
# Raw term counts again (no IDF, no normalisation), this time with English stop words
# removed, which shrinks the vocabulary from 10 terms to 7.
vectorizer = TfidfVectorizer(use_idf=False, stop_words='english', norm=None)
tfvec = vectorizer.fit_transform(sentences)
# `.toarray()` replaces the deprecated `.A` shorthand on SciPy sparse matrices.
print(tfvec.toarray())
# `get_feature_names()` was removed in scikit-learn 1.2; `.tolist()` keeps the printed list format.
print(vectorizer.get_feature_names_out().tolist())

[[1. 1. 1. 0. 0. 0. 0.]
 [1. 2. 0. 1. 1. 0. 0.]
 [0. 1. 1. 0. 0. 1. 1.]]
['account', 'card', 'credit', 'debit', 'lost', 'stopped', 'working']


In [19]:
# TF-IDF, part 2: inverse document frequency.
# With smooth_idf=False: IDF(t) = ln(number of documents / number of documents containing t) + 1
# IDF raises the weight of terms that occur in few documents and lowers the weight of
# terms that occur in many. TfidfVectorizer computes both parts; the final weight is
# TF * IDF (followed by the default row normalisation).
vectorizer_idf = TfidfVectorizer(smooth_idf=False)
tfidfvec = vectorizer_idf.fit_transform(sentences)


In [20]:
# Learned IDF weight of every vocabulary term, followed by the terms in the same order.
print(vectorizer_idf.idf_)
# `get_feature_names()` was removed in scikit-learn 1.2; `.tolist()` keeps the printed list format.
print(vectorizer_idf.get_feature_names_out().tolist())

[1.40546511 1.         1.40546511 2.09861229 2.09861229 2.09861229
 2.09861229 1.40546511 2.09861229 2.09861229]
['account', 'card', 'credit', 'debit', 'have', 'is', 'lost', 'my', 'stopped', 'working']


In [21]:
# Reproduce the IDF of a term found in 2 of the 3 sentences (e.g. 'account'): ln(3/2) + 1.
# Why the "+ 1"? A term present in every document has ln(1) = 0; adding 1 keeps such a
# term (e.g. 'card', IDF = 1) from being zeroed out entirely.
print(1 + np.log(3 / 2))

1.4054651081081644


In [22]:
# Dense TF-IDF matrix: one row per sentence, one column per vocabulary term.
# `.toarray()` is the supported spelling; the `.A` shorthand is deprecated in SciPy.
tfidfvec.toarray()

array([[0.45951737, 0.3269504 , 0.45951737, 0.        , 0.68614212,
        0.        , 0.        , 0.        , 0.        , 0.        ],
       [0.3055129 , 0.43474989, 0.        , 0.45618573, 0.        ,
        0.45618573, 0.45618573, 0.3055129 , 0.        , 0.        ],
       [0.        , 0.26959162, 0.37890161, 0.        , 0.        ,
        0.        , 0.        , 0.37890161, 0.56576828, 0.56576828]])