In [1]:
from nltk.tokenize import word_tokenize
import pandas as pd
from nltk.corpus import stopwords

In [2]:
# Two toy sentences that make up the whole corpus used in this notebook
sent1, sent2 = (
    'It is a good practice for us. ',
    'It was also good to know about it. ',
)


In [3]:
# Lower-case each sentence first, then split it into NLTK word tokens
tokens1, tokens2 = (word_tokenize(text.lower()) for text in (sent1, sent2))

In [4]:
# Vocabulary: every distinct token that occurs in either sentence
tokens = {*tokens1, *tokens2}

In [5]:
from collections import Counter

In [6]:
# Term frequencies for sentence 1: maps each token to how often it occurs
x = Counter(tokens1)

In [7]:
# Term frequencies for sentence 2: maps each token to how often it occurs
x1 = Counter(tokens2)

In [8]:
x, x1

(Counter({'it': 1,
          'is': 1,
          'a': 1,
          'good': 1,
          'practice': 1,
          'for': 1,
          'us': 1,
          '.': 1}),
 Counter({'it': 2,
          'was': 1,
          'also': 1,
          'good': 1,
          'to': 1,
          'know': 1,
          'about': 1,
          '.': 1}))

In [9]:
# Method 1: one row per sentence, one column per token seen in either Counter.
# A token that a sentence does not contain is left as NaN at this point.
df = pd.DataFrame(data = [x, x1])

In [10]:
# A token missing from a sentence is NaN in the frame; treat it as a count of 0.
# Rebind the name instead of using inplace=True so the frame created by the
# previous cell is not mutated behind its back and the cell stays re-runnable.
df = df.fillna(0)
df

Unnamed: 0,it,is,a,good,practice,for,us,.,was,also,to,know,about
0,1,1.0,1.0,1,1.0,1.0,1.0,1,0.0,0.0,0.0,0.0,0.0
1,2,0.0,0.0,1,0.0,0.0,0.0,1,1.0,1.0,1.0,1.0,1.0


In [11]:
# Display-only cast to integers: the result is not assigned, so df itself is unchanged
df.astype('int32')

Unnamed: 0,it,is,a,good,practice,for,us,.,was,also,to,know,about
0,1,1,1,1,1,1,1,1,0,0,0,0,0
1,2,0,0,1,0,0,0,1,1,1,1,1,1


In [54]:
# Method 2: fill the table by hand

# Empty frame with one row per sentence (labelled 1 and 2) and one column per token
df1 = pd.DataFrame(columns=list(tokens), index=[1, 2])

In [56]:
# Extract the counts of words for sentences.
# Iterate over df1's own columns so that every count lines up with the column it
# is written into afterwards; df is a different frame with a different column order.
counts1 = [tokens1.count(word) for word in df1.columns]
counts2 = [tokens2.count(word) for word in df1.columns]

In [58]:
# Write each sentence's counts into its row of df1, addressed by position
for row_pos, row_counts in enumerate((counts1, counts2)):
    df1.iloc[row_pos, :] = row_counts

In [60]:
df1

Unnamed: 0,practice,is,also,us,for,was,about,know,it,a,.,good,to
1,1,1,0,1,1,0,0,0,1,1,1,1,0
2,0,0,1,0,0,1,1,1,2,0,1,1,1


# Using a library implementation

## CountVectorizer
- Imported from `sklearn.feature_extraction.text`.
- Builds a bag-of-words count matrix using the vocabulary learned from the data it was fitted on.

In [65]:
from sklearn.feature_extraction.text import CountVectorizer

In [67]:
# Default settings: text is lower-cased, tokens need 2+ word characters, unigrams only
cvt= CountVectorizer()

In [70]:
# Learn the vocabulary from both sentences and count every term per sentence
x_new= cvt.fit_transform([sent1, sent2])

In [76]:
x_new # fit_transform returns a SciPy sparse matrix (CSR), not a dense array

<2x11 sparse matrix of type '<class 'numpy.int64'>'
	with 13 stored elements in Compressed Sparse Row format>

In [78]:
x_new.toarray()

array([[0, 0, 1, 1, 1, 1, 0, 1, 0, 1, 0],
       [1, 1, 0, 1, 0, 2, 1, 0, 1, 0, 1]], dtype=int64)

In [80]:
cvt.get_feature_names_out()

array(['about', 'also', 'for', 'good', 'is', 'it', 'know', 'practice',
       'to', 'us', 'was'], dtype=object)

In [82]:
# Dense counts labelled with the learned vocabulary.
# NOTE: this rebinds `df`, replacing the frame built from the Counters earlier on.
df=pd.DataFrame(data= x_new.toarray(), columns = cvt.get_feature_names_out())

In [84]:
df

Unnamed: 0,about,also,for,good,is,it,know,practice,to,us,was
0,0,0,1,1,1,1,0,1,0,1,0
1,1,1,0,1,0,2,1,0,1,0,1


In [86]:
new = 'It was good for us.'

In [88]:
# transform() only: reuse the vocabulary fitted on sent1/sent2 rather than refitting
new_features = cvt.transform([new])

In [90]:
new_features.toarray()

array([[0, 0, 1, 1, 0, 1, 0, 0, 0, 1, 1]], dtype=int64)

In [96]:
# Counts for the new sentence under the same vocabulary columns.
# NOTE: this rebinds `df` once more, discarding the two-sentence frame.
df= pd.DataFrame(data= new_features.toarray(), columns= cvt.get_feature_names_out())

In [98]:
df

Unnamed: 0,about,also,for,good,is,it,know,practice,to,us,was
0,0,0,1,1,0,1,0,0,0,1,1


#### N-grams

In [101]:
help(CountVectorizer)

Help on class CountVectorizer in module sklearn.feature_extraction.text:

class CountVectorizer(_VectorizerMixin, sklearn.base.BaseEstimator)
 |  CountVectorizer(*, input='content', encoding='utf-8', decode_error='strict', strip_accents=None, lowercase=True, preprocessor=None, tokenizer=None, stop_words=None, token_pattern='(?u)\\b\\w\\w+\\b', ngram_range=(1, 1), analyzer='word', max_df=1.0, min_df=1, max_features=None, vocabulary=None, binary=False, dtype=<class 'numpy.int64'>)
 |
 |  Convert a collection of text documents to a matrix of token counts.
 |
 |  This implementation produces a sparse representation of the counts using
 |  scipy.sparse.csr_matrix.
 |
 |  If you do not provide an a-priori dictionary and you do not use an analyzer
 |  that does some kind of feature selection then the number of features will
 |  be equal to the vocabulary size found by analyzing the data.
 |
 |  For an efficiency comparison of the different feature extractors, see
 |  :ref:`sphx_glr_auto_examp

In [103]:
# ngram_range=(1, 2): extract single words (unigrams) and word pairs (bigrams)
ngram=  CountVectorizer(ngram_range=(1,2))

In [109]:
# Fit the unigram + bigram vocabulary on both sentences and count each n-gram
new_ng=ngram.fit_transform([sent1,sent2])

In [117]:
# Dense view of the n-gram counts, one column per learned unigram or bigram;
# the column index is shown alongside because the frame repr truncates it
ndf=pd.DataFrame(data=new_ng.toarray(), columns= ngram.get_feature_names_out())
ndf, ndf.columns

(   about  about it  also  also good  for  for us  good  good practice  \
 0      0         0     0          0    1       1     1              1   
 1      1         1     1          1    0       0     1              0   
 
    good to  is  ...  it was  know  know about  practice  practice for  to  \
 0        0   1  ...       0     0           0         1             1   0   
 1        1   0  ...       1     1           1         0             0   1   
 
    to know  us  was  was also  
 0        0   1    0         0  
 1        1   0    1         1  
 
 [2 rows x 23 columns],
 Index(['about', 'about it', 'also', 'also good', 'for', 'for us', 'good',
        'good practice', 'good to', 'is', 'is good', 'it', 'it is', 'it was',
        'know', 'know about', 'practice', 'practice for', 'to', 'to know', 'us',
        'was', 'was also'],
       dtype='object'))

In [113]:
sent1, sent2

('It is a good practice for us. ', 'It was also good to know about it. ')

In [None]:
# 'is good' is listed as a bigram even though sent1 reads "is a good": the default
# token_pattern only keeps tokens of two or more word characters, so "a" is
# dropped before the bigrams are formed and "is" ends up next to "good".
'is good' in ndf.columns