In [1]:
import numpy as np
import pandas as pd
from nltk.tokenize import word_tokenize

In [2]:
s1= "The quick brown fox jumps over the lazy dog."
s2= "The cat chases the mouse and it squeaks loudely."

In [3]:
s1=s1.lower()
s2=s2.lower()

In [4]:
w1=word_tokenize(s1)
w1

['the', 'quick', 'brown', 'fox', 'jumps', 'over', 'the', 'lazy', 'dog', '.']

In [5]:
w2=word_tokenize(s2)
w2

['the',
 'cat',
 'chases',
 'the',
 'mouse',
 'and',
 'it',
 'squeaks',
 'loudely',
 '.']

In [6]:
tokens = w1+w2
tokens=set(tokens)
tokens

{'.',
 'and',
 'brown',
 'cat',
 'chases',
 'dog',
 'fox',
 'it',
 'jumps',
 'lazy',
 'loudely',
 'mouse',
 'over',
 'quick',
 'squeaks',
 'the'}

In [7]:
df = pd.DataFrame(index=[1,2],columns=list(tokens))
df

Unnamed: 0,fox,jumps,.,quick,chases,cat,mouse,and,lazy,squeaks,over,it,loudely,the,dog,brown
1,,,,,,,,,,,,,,,,
2,,,,,,,,,,,,,,,,


In [8]:
counts1 = [w1.count(x) for x in df.columns]
counts2 = [w2.count(x) for x in df.columns]

In [9]:
df.iloc[0,:] = counts1
df.iloc[1,:] = counts2

In [10]:
df

Unnamed: 0,fox,jumps,.,quick,chases,cat,mouse,and,lazy,squeaks,over,it,loudely,the,dog,brown
1,1,1,1,1,0,0,0,0,1,0,1,0,0,2,1,1
2,0,0,1,0,1,1,1,1,0,1,0,1,1,2,0,0


## Count Vectorizer

In [11]:
from sklearn.feature_extraction.text import CountVectorizer

In [12]:
cv = CountVectorizer()

In [13]:
vect_data = cv.fit_transform([s1,s2])
vect_data

<2x15 sparse matrix of type '<class 'numpy.int64'>'
	with 16 stored elements in Compressed Sparse Row format>

In [14]:
vect_data.toarray()

array([[0, 1, 0, 0, 1, 1, 0, 1, 1, 0, 0, 1, 1, 0, 2],
       [1, 0, 1, 1, 0, 0, 1, 0, 0, 1, 1, 0, 0, 1, 2]])

In [15]:
cv.get_feature_names_out()

array(['and', 'brown', 'cat', 'chases', 'dog', 'fox', 'it', 'jumps',
       'lazy', 'loudely', 'mouse', 'over', 'quick', 'squeaks', 'the'],
      dtype=object)

In [16]:
df = pd.DataFrame(vect_data.toarray(),
                 columns=cv.get_feature_names_out())
df

Unnamed: 0,and,brown,cat,chases,dog,fox,it,jumps,lazy,loudely,mouse,over,quick,squeaks,the
0,0,1,0,0,1,1,0,1,1,0,0,1,1,0,2
1,1,0,1,1,0,0,1,0,0,1,1,0,0,1,2


In [17]:
# Adding a new sentence
s3 = "The lazy cat jumps over the dog."

In [18]:
vect_data1 = cv.fit_transform([s1,s2,s3])
vect_data1

<3x15 sparse matrix of type '<class 'numpy.int64'>'
	with 22 stored elements in Compressed Sparse Row format>

In [19]:
df1 = pd.DataFrame(vect_data1.toarray(),
                 columns=cv.get_feature_names_out())
df1

Unnamed: 0,and,brown,cat,chases,dog,fox,it,jumps,lazy,loudely,mouse,over,quick,squeaks,the
0,0,1,0,0,1,1,0,1,1,0,0,1,1,0,2
1,1,0,1,1,0,0,1,0,0,1,1,0,0,1,2
2,0,0,1,0,1,0,0,1,1,0,0,1,0,0,2


## Getting file from local machine and vectorizing it

In [47]:
file = open('india.txt')
data = file.readlines()

In [48]:
data

['1.India, officially the Republic of India is a country in South Asia. \n',
 "2.It is the seventh-largest country by area; the most populous country as of June 2023; and from the time of its independence in 1947, the world's most populous democracy.\n",
 '3.Bounded by the Indian Ocean on the south, the Arabian Sea on the southwest, and the Bay of Bengal on the southeast, it shares land borders with Pakistan to the west; China, Nepal, and Bhutan to the north; and Bangladesh and Myanmar to the east. \n',
 '4.In the Indian Ocean, India is in the vicinity of Sri Lanka and the Maldives; its Andaman and Nicobar Islands share a maritime border with Thailand, Myanmar, and Indonesia.\n']

In [49]:
cv_india = CountVectorizer()

In [50]:
vector_data = cv_india.fit_transform(data)

In [51]:
df = pd.DataFrame(vector_data.toarray(),
                 columns=cv_india.get_feature_names_out())
df

Unnamed: 0,1947,2023,and,andaman,arabian,area,as,asia,bangladesh,bay,...,southwest,sri,thailand,the,time,to,vicinity,west,with,world
0,0,0,0,0,0,0,0,1,0,0,...,0,0,0,1,0,0,0,0,0,0
1,1,1,1,0,0,1,1,0,0,0,...,0,0,0,4,1,0,0,0,0,1
2,0,0,4,0,1,0,0,0,1,1,...,1,0,0,9,0,3,0,1,1,0
3,0,0,3,1,0,0,0,0,0,0,...,0,1,1,3,0,0,1,0,1,0
