#### Bag of Words (BOW) Model

In [9]:
docs = ['SUPERB, I AM IN LOVE IN THIS PHONE', 'I hate this phone']

words = list(set([word for doc in docs for word in doc.lower().split()]))

vectors =[]
for doc in docs:
    vectors.append([1 if word in doc.lower().split() else 0 for word in words])
print(f'vocabulary: {words}')
print(f'vectors: {vectors}')

vocabulary: ['in', 'this', 'superb,', 'love', 'am', 'phone', 'i', 'hate']
vectors: [[1, 1, 1, 1, 1, 1, 1, 0], [0, 1, 0, 0, 0, 1, 1, 1]]


#### Word Counts with CountVectorizer (sklearn)

In [11]:
from sklearn.feature_extraction.text import CountVectorizer

# list of documents
docs = ['SUPERB, I AM IN LOVE IN THIS PHONE', 'I hate this phone']

# create the transform
vectorizer = CountVectorizer()

# tokenize and build vocab
vectorizer.fit(docs)
print(f'vocabulary: {vectorizer.vocabulary_}')

# encode document
vector = vectorizer.transform(docs)

# summarize encoded vector
print(f'shape: {vector.shape}')
print(f'vectors:\n{vector.toarray()}')

vocabulary: {'superb': 5, 'am': 0, 'in': 2, 'love': 3, 'this': 6, 'phone': 4, 'hate': 1}
shape: (2, 7)
vectors:
[[1 0 2 1 1 1 1]
 [0 1 0 0 1 0 1]]


#### Word Frequency with TFIDFVectorizer (sklearn)

In [15]:
from sklearn.feature_extraction.text import TfidfVectorizer

# list of documents
docs = ['SUPERB, I AM IN LOVE IN THIS PHONE', 'I hate this phone']

# create the transform
vectorizer = TfidfVectorizer()

# tokenize and build vocab
vectorizer.fit(docs)

# summarize
print(f'vocabulary: {vectorizer.vocabulary_}')
print(f'idfs: {vectorizer.idf_}')

# encode document
vector = vectorizer.transform([docs[0]])

# summarize encoded vector
print(f'vectors: {vector.toarray()}')

vocabulary: {'superb': 5, 'am': 0, 'in': 2, 'love': 3, 'this': 6, 'phone': 4, 'hate': 1}
idfs: [1.40546511 1.40546511 1.40546511 1.40546511 1.         1.40546511
 1.        ]
vectors: [[0.35327777 0.         0.70655553 0.35327777 0.25136004 0.35327777
  0.25136004]]
