In [3]:
import pyforest
import pandas as pd

In [11]:
# Toy corpus: four short documents, each paired with a binary label.
data = pd.DataFrame(
    {
        "text": [
            "people watch ineuron",
            "ineuron watch ineuron",
            "people write comment",
            "ineuron write comment",
        ],
        "output": [1, 1, 0, 0],
    }
)
data

Unnamed: 0,text,output
0,people watch ineuron,1
1,ineuron watch ineuron,1
2,people write comment,0
3,ineuron write comment,0


In [6]:
# Document-term matrix (Bag of Words, BoW)

# Reference: https://scikit-learn.org/stable/modules/generated/sklearn.feature_extraction.text.CountVectorizer.html

In [9]:
from sklearn.feature_extraction.text import CountVectorizer

# Bag-of-words vectorizer; no arguments are passed, so scikit-learn's defaults apply.
BOW = CountVectorizer()

In [18]:
# Learn the vocabulary from the text column and count token occurrences per
# document in one call; the result is a sparse matrix (documents x terms).
document_matrix = BOW.fit_transform(data['text'])
document_matrix

<4x5 sparse matrix of type '<class 'numpy.int64'>'
	with 11 stored elements in Compressed Sparse Row format>

In [19]:
# Learned vocabulary: maps each token to its column index in document_matrix.
BOW.vocabulary_

{'people': 2, 'watch': 3, 'ineuron': 1, 'write': 4, 'comment': 0}

In [22]:
# Dense count vector for the first document, "people watch ineuron".
# Columns follow the alphabetical vocabulary order:
#   [comment, ineuron, people, watch, write]
# so the 1s mark ineuron, people and watch.
document_matrix[0].toarray()

array([[0, 1, 1, 1, 0]])

In [23]:
# Second document ("ineuron watch ineuron"): "ineuron" appears twice,
# so its column holds a count of 2.
document_matrix[1].toarray()

array([[0, 2, 0, 1, 0]])

In [24]:
# Third document ("people write comment"): one count each for
# comment, people and write.
document_matrix[2].toarray()

array([[1, 0, 1, 0, 1]])

In [25]:
# Fourth document ("ineuron write comment"): one count each for
# comment, ineuron and write.
document_matrix[3].toarray()

array([[1, 1, 0, 0, 1]])

In [34]:
# Bigram-only model: ngram_range=(2, 2) keeps just the two-word sequences.
bigram = CountVectorizer(ngram_range=(2, 2))
# Despite its name, `bigramvocab` is the sparse document-term count matrix;
# the vocabulary itself lives on `bigram.vocabulary_`.
bigramvocab = bigram.fit_transform(data["text"])
bigramvocab

<4x6 sparse matrix of type '<class 'numpy.int64'>'
	with 8 stored elements in Compressed Sparse Row format>

In [35]:
# Each bigram found in the corpus mapped to its column index in bigramvocab.
bigram.vocabulary_

{'people watch': 2,
 'watch ineuron': 4,
 'ineuron watch': 0,
 'people write': 3,
 'write comment': 5,
 'ineuron write': 1}

In [36]:
# Trigram-only model: ngram_range=(3, 3) keeps just the three-word sequences.
# Every document here is exactly three words long, so each yields one trigram.
trigram = CountVectorizer(ngram_range=(3, 3))
# `trigramvocab` is the sparse document-term count matrix, not the vocabulary.
trigramvocab = trigram.fit_transform(data["text"])
trigramvocab

<4x4 sparse matrix of type '<class 'numpy.int64'>'
	with 4 stored elements in Compressed Sparse Row format>

In [37]:
# Each trigram found in the corpus mapped to its column index in trigramvocab.
trigram.vocabulary_

{'people watch ineuron': 2,
 'ineuron watch ineuron': 0,
 'people write comment': 3,
 'ineuron write comment': 1}

In [38]:
# Mixed model: ngram_range=(1, 2) extracts both single words and bigrams.
mix = CountVectorizer(ngram_range=(1, 2))
# `mixvocab` is the sparse document-term count matrix, not the vocabulary.
mixvocab = mix.fit_transform(data["text"])
mixvocab

<4x11 sparse matrix of type '<class 'numpy.int64'>'
	with 19 stored elements in Compressed Sparse Row format>

In [39]:
# Unigrams and bigrams together, each mapped to its column index in mixvocab.
mix.vocabulary_

{'people': 4,
 'watch': 7,
 'ineuron': 1,
 'people watch': 5,
 'watch ineuron': 8,
 'ineuron watch': 2,
 'write': 9,
 'comment': 0,
 'people write': 6,
 'write comment': 10,
 'ineuron write': 3}

In [40]:
# Term Frequency - Inverse Document Frequency (TF-IDF)

In [41]:
from sklearn.feature_extraction.text import TfidfVectorizer

# TF-IDF vectorizer; no arguments are passed, so scikit-learn's defaults apply.
tfidf = TfidfVectorizer()

In [42]:
# Fit TF-IDF on the text column and display the weights as a dense array
# (rows = documents, columns = vocabulary terms). The result is only shown,
# not stored in a variable.
tfidf.fit_transform(data['text']).toarray()

array([[0.        , 0.49681612, 0.61366674, 0.61366674, 0.        ],
       [0.        , 0.8508161 , 0.        , 0.52546357, 0.        ],
       [0.57735027, 0.        , 0.57735027, 0.        , 0.57735027],
       [0.61366674, 0.49681612, 0.        , 0.        , 0.61366674]])

In [44]:
# Vocabulary terms in column order (alphabetical here); these label the
# columns of the TF-IDF matrix.
tfidf.get_feature_names_out()

array(['comment', 'ineuron', 'people', 'watch', 'write'], dtype=object)

In [46]:
# Learned inverse document frequency for each term, in feature-name order.
# "ineuron" has the lowest value because it occurs in the most documents (3 of 4).
tfidf.idf_

array([1.51082562, 1.22314355, 1.51082562, 1.51082562, 1.51082562])