# Bag of words

In [3]:
import pandas as pd

In [4]:
# Toy corpus: four short documents, each paired with a binary label in `output`.
df = pd.DataFrame(
    {
        'text': [
            'people watch campusx',
            'campusx watch campusx',
            'people write comment',
            'campusx write comment',
        ],
        'output': [1, 1, 0, 0],
    }
)

In [5]:
# Display the toy corpus: four rows with a `text` column and a 0/1 `output` column.
df

Unnamed: 0,text,output
0,people watch campusx,1
1,campusx watch campusx,1
2,people write comment,0
3,campusx write comment,0


In [6]:
from sklearn.feature_extraction.text import CountVectorizer
# Bag-of-words vectorizer created with no arguments, i.e. the library defaults.
cv = CountVectorizer()

In [7]:
# Learn the vocabulary from the `text` column and, in the same call, encode
# every row as a vector of per-term counts.
bow = cv.fit_transform(df['text'])

In [8]:
# Inspect the learned vocabulary: a dict mapping each term to its column index
# in the count matrix.
cv.vocabulary_

{'people': 2, 'watch': 3, 'campusx': 0, 'write': 4, 'comment': 1}

In [9]:
# Show the matrix summary; fit_transform returned a sparse matrix, so only its
# shape, dtype and number of stored elements are printed, not the values.
bow

<4x5 sparse matrix of type '<class 'numpy.int64'>'
	with 11 stored elements in Compressed Sparse Row format>

In [10]:
# Convert the sparse matrix to a dense array so the individual counts are visible
# (one row per document, one column per vocabulary term).
bow.toarray()

array([[1, 0, 1, 1, 0],
       [2, 0, 0, 1, 0],
       [0, 1, 1, 0, 1],
       [1, 1, 0, 0, 1]], dtype=int64)

In [11]:
# Encode a new sentence with the already-fitted vocabulary (no refit).
# Words absent from that vocabulary ('and', 'of') have no column and are dropped,
# so the result still has one column per learned term.
cv.transform(['campusx watch and write comment of campusx']).toarray()

array([[2, 1, 0, 1, 1]], dtype=int64)


# n-gram model

In [12]:
# Show the same toy corpus again before building n-gram features from it.
df

Unnamed: 0,text,output
0,people watch campusx,1
1,campusx watch campusx,1
2,people write comment,0
3,campusx write comment,0


In [13]:
from sklearn.feature_extraction.text import CountVectorizer

In [14]:
# ngram_range=(2,2): lower and upper n are both 2, so only two-word sequences
# (bigrams) become features. NOTE: this rebinds `cv`, replacing the vectorizer
# used in the bag-of-words section.
cv = CountVectorizer(ngram_range=(2,2))
# Echo the configured vectorizer.
cv

In [15]:
# Fit the current `cv` on the corpus and rebind `bow` to the new count matrix.
bow = cv.fit_transform(df['text'])
# Dense view of the counts, one column per learned feature.
bow.toarray()

array([[0, 0, 1, 0, 1, 0],
       [1, 0, 0, 0, 1, 0],
       [0, 0, 0, 1, 0, 1],
       [0, 1, 0, 0, 0, 1]], dtype=int64)

In [16]:
# Feature -> column index mapping learned by the most recent fit of `cv`.
cv.vocabulary_

{'people watch': 2,
 'watch campusx': 4,
 'campusx watch': 0,
 'people write': 3,
 'write comment': 5,
 'campusx write': 1}

In [17]:
# ngram_range=(1,2): features include both single words (n=1) and two-word
# sequences (n=2). Rebinds `cv` again.
cv = CountVectorizer(ngram_range=(1,2))

In [18]:
# Refit on the corpus with the current `cv` and rebind `bow`.
bow = cv.fit_transform(df['text'])
# Dense view of the counts, one column per learned feature.
bow.toarray()

array([[1, 0, 0, 0, 1, 1, 0, 1, 1, 0, 0],
       [2, 1, 0, 0, 0, 0, 0, 1, 1, 0, 0],
       [0, 0, 0, 1, 1, 0, 1, 0, 0, 1, 1],
       [1, 0, 1, 1, 0, 0, 0, 0, 0, 1, 1]], dtype=int64)

In [19]:
# Feature -> column index mapping for the most recent fit of `cv`.
cv.vocabulary_

{'people': 4,
 'watch': 7,
 'campusx': 0,
 'people watch': 5,
 'watch campusx': 8,
 'campusx watch': 1,
 'write': 9,
 'comment': 3,
 'people write': 6,
 'write comment': 10,
 'campusx write': 2}

# tf-idf

In [20]:
from sklearn.feature_extraction.text import TfidfVectorizer
# TF-IDF vectorizer created with no arguments, i.e. the library defaults.
tfidf = TfidfVectorizer()
# Fit on the corpus and show the dense matrix of tf-idf weights
# (floats rather than the raw integer counts produced by CountVectorizer).
tfidf.fit_transform(df['text']).toarray()

array([[0.49681612, 0.        , 0.61366674, 0.61366674, 0.        ],
       [0.8508161 , 0.        , 0.        , 0.52546357, 0.        ],
       [0.        , 0.57735027, 0.57735027, 0.        , 0.57735027],
       [0.49681612, 0.61366674, 0.        , 0.        , 0.61366674]])

In [21]:
# Per-term inverse-document-frequency weights learned during fit, in column order.
# 'campusx' occurs in three of the four documents, so it has the lowest weight;
# every other term occurs in two documents and shares the same higher weight.
tfidf.idf_

array([1.22314355, 1.51082562, 1.51082562, 1.51082562, 1.51082562])

In [22]:
# Feature names in column order; position i here labels column i of the tf-idf
# matrix and entry i of `idf_`.
tfidf.get_feature_names_out()

array(['campusx', 'comment', 'people', 'watch', 'write'], dtype=object)